当前位置: 首页>>代码示例>>Java>>正文


Java XHTMLContentHandler.characters方法代码示例

本文整理汇总了Java中org.apache.tika.sax.XHTMLContentHandler.characters方法的典型用法代码示例。如果您正苦于以下问题:Java XHTMLContentHandler.characters方法的具体用法?Java XHTMLContentHandler.characters怎么用?Java XHTMLContentHandler.characters使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在org.apache.tika.sax.XHTMLContentHandler的用法示例。


在下文中一共展示了XHTMLContentHandler.characters方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: extractOutput

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Copies the text read from the given stream into the given XHTML content
 * handler as the character data of a single {@code div} element, framed by
 * its own start/end document events. The stream is decoded with the
 * {@code UTF_8} charset and is closed when the read loop ends, whether it
 * ends normally or with an exception.
 *
 * @param stream stream holding the OCR result
 * @param xhtml XHTML content handler that receives the text
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws IOException if an input error occurred
 */
private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {
    xhtml.startDocument();
    xhtml.startElement("div");

    try (Reader input = new InputStreamReader(stream, UTF_8)) {
        final char[] chunk = new char[1024];
        int count;
        while ((count = input.read(chunk)) != -1) {
            // Nothing to forward when a read delivers no characters.
            if (count <= 0) {
                continue;
            }
            xhtml.characters(chunk, 0, count);
        }
    }

    xhtml.endElement("div");
    xhtml.endDocument();
}
 
开发者ID:fiohol,项目名称:theSemProject,代码行数:25,代码来源:TesseractOCRParser.java

示例2: textRunsToText

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Writes the text of every non-null run that has non-null text to the
 * handler, each followed by an empty {@code br} element. A {@code null}
 * array is treated as "nothing to write".
 *
 * @param xhtml XHTML content handler that receives the text
 * @param runs text runs to write; may be {@code null} and may contain
 *             {@code null} entries
 * @throws SAXException if the XHTML SAX events could not be handled
 */
private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs)
    throws SAXException {
  if (runs == null) {
    return;
  }

  for (int i = 0; i < runs.length; i++) {
    TextRun current = runs[i];
    if (current == null) {
      continue;
    }

    // Leaving in wisdom from TIKA-712 for easy revert.
    // Avoid boiler-plate text on the master slide (0
    // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE):
    // if (!isMaster || (run.getRunType() != 0 && run.getRunType()
    // != 1)) {
    String content = current.getText();
    if (content == null) {
      continue;
    }

    xhtml.characters(content);
    xhtml.startElement("br");
    xhtml.endElement("br");
  }
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:24,代码来源:HSLFExtractor.java

示例3: parse

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Sets the content type metadata to {@code application/vnd.ms-htmlhelp}
 * and, between start/end document events, emits as character data the
 * result of {@code extract(...)} for every directory entry whose name ends
 * in {@code .html} or {@code .htm}.
 *
 * @param stream stream handed to the {@code ChmExtractor}
 * @param handler SAX handler wrapped by the XHTML content handler
 * @param metadata metadata object that receives the content type
 * @param context parse context (not used by this method)
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
    ParseContext context) throws IOException, SAXException, TikaException {
  ChmExtractor chmExtractor = new ChmExtractor(stream);

  // metadata
  metadata.set(Metadata.CONTENT_TYPE, "application/vnd.ms-htmlhelp");

  // content
  XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
  xhtml.startDocument();

  for (DirectoryListingEntry entry : chmExtractor.getChmDirList().getDirectoryListingEntryList()) {
    String name = entry.getName();
    boolean htmlEntry = name.endsWith(".html") || name.endsWith(".htm");
    if (!htmlEntry) {
      continue;
    }
    xhtml.characters(extract(chmExtractor.extractChmEntry(entry)));
  }

  xhtml.endDocument();
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:21,代码来源:ChmParser.java

示例4: extractOutput

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Copies the contents of the given stream to the given XHTML content
 * handler as the character data of a single {@code p} element inside a
 * complete document. The stream is read on the calling thread and is
 * closed when this method returns, whether normally or by exception.
 *
 * @param stream stream to read; decoded with the platform default charset
 * @param xhtml XHTML content handler
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws IOException if an input error occurred
 */
private void extractOutput(InputStream stream, XHTMLContentHandler xhtml)
        throws SAXException, IOException {
    // try-with-resources closes the reader (and with it the stream) and,
    // unlike a manual finally, keeps a failing close() from hiding the
    // exception that actually aborted the copy.
    try (Reader reader = new InputStreamReader(stream)) {
        xhtml.startDocument();
        xhtml.startElement("p");
        char[] buffer = new char[1024];
        for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
            xhtml.characters(buffer, 0, n);
        }
        xhtml.endElement("p");
        xhtml.endDocument();
    }
}
 
开发者ID:AlfrescoLabs,项目名称:tika-ffmpeg,代码行数:27,代码来源:WaitingExternalParser.java

示例5: handleDates

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Renders a date range as a {@code div} whose class is
 * {@code "fromTo <what-in-lower-case>Dates"} and whose text reads
 * {@code "<what> from <start> to <finish>"}, followed by
 * {@code " taking <duration>"} when a duration is given. Nothing is
 * written when both dates are {@code null}.
 *
 * @param what label of the range; also used, lower-cased, in the CSS class
 * @param start start date, formatted with {@code buildDate}
 * @param finish finish date, formatted with {@code buildDate}
 * @param duration optional duration, formatted with {@code buildDuration};
 *                 skipped when {@code null}
 * @param xhtml XHTML content handler that receives the output
 * @throws SAXException if the XHTML SAX events could not be handled
 */
protected static void handleDates(String what, Date start, Date finish,
        Duration duration, XHTMLContentHandler xhtml) throws SAXException {
    if (start == null && finish == null) {
        // Assume there's nothing there, and skip
        return;
    }

    // Lower-case with Locale.ROOT so the class name does not depend on the
    // JVM default locale (e.g. Turkish maps "I" to a dotless "ı").
    String cls = what.toLowerCase(java.util.Locale.ROOT) + "Dates";

    xhtml.startElement("div", "class", "fromTo " + cls);
    xhtml.characters(what);
    xhtml.characters(" from ");
    xhtml.characters(buildDate(start));
    xhtml.characters(" to ");
    xhtml.characters(buildDate(finish));

    if (duration != null) {
        xhtml.characters(" taking ");
        xhtml.characters(buildDuration(duration));
    }

    xhtml.endElement("div");
}
 
开发者ID:Gagravarr,项目名称:MPXJ-Tika,代码行数:27,代码来源:ProjectFileProcessor.java

示例6: readFully

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Drains the given reader into the given XHTML content handler as
 * character data, 1024 characters at a time. The reader is not closed
 * by this method.
 *
 * @param reader source of the characters
 * @param xhtml XHTML content handler that receives the characters
 * @throws IOException if reading fails
 * @throws SAXException if the handler rejects the characters
 */
private void readFully(final Reader reader, final XHTMLContentHandler xhtml) throws IOException, SAXException {
	final char[] chunk = new char[1024];
	int count = reader.read(chunk);

	while (count != -1) {
		// Skip reads that delivered no characters.
		if (count > 0) {
			xhtml.characters(chunk, 0, count);
		}
		count = reader.read(chunk);
	}
}
 
开发者ID:ICIJ,项目名称:extract,代码行数:10,代码来源:CachingTesseractOCRParser.java

示例7: extractSDT

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Writes the text of the given structured document tag's content as a
 * single {@code p} element.
 *
 * @param element structured document tag whose content text is written
 * @param xhtml XHTML content handler that receives the paragraph
 */
private void extractSDT(XWPFSDT element, XHTMLContentHandler xhtml)
    throws SAXException, XmlException, IOException {
  final XWPFSDTContent sdtContent = element.getContent();

  xhtml.startElement("p");
  final String text = sdtContent.getText();
  xhtml.characters(text);
  xhtml.endElement("p");
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:9,代码来源:XWPFWordExtractorDecorator.java

示例8: parse

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Parses the stream as plain text: detects its character encoding,
 * records the resulting content type and encoding in the metadata, and
 * emits the whole text as one {@code p} element inside a complete
 * document. The caller's stream is wrapped in a
 * {@code CloseShieldInputStream} before being handed to the reader that
 * this method closes.
 *
 * @param stream stream to read the text from
 * @param handler SAX handler wrapped by the XHTML content handler
 * @param metadata metadata object that receives content type and encoding
 * @param context parse context used to look up the {@code ServiceLoader}
 */
public void parse(InputStream stream, ContentHandler handler,
    Metadata metadata, ParseContext context) throws IOException,
    SAXException, TikaException {
  // Automatically detect the character encoding
  CloseShieldInputStream shielded = new CloseShieldInputStream(stream);
  ServiceLoader loader = context.get(ServiceLoader.class, LOADER);
  AutoDetectReader reader = new AutoDetectReader(shielded, metadata, loader);
  try {
    Charset charset = reader.getCharset();
    String contentType = new MediaType(MediaType.TEXT_PLAIN, charset).toString();
    metadata.set(Metadata.CONTENT_TYPE, contentType);
    // deprecated, see TIKA-431
    metadata.set(Metadata.CONTENT_ENCODING, charset.name());

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.startElement("p");

    char[] chunk = new char[4096];
    for (int count = reader.read(chunk); count != -1; count = reader.read(chunk)) {
      xhtml.characters(chunk, 0, count);
    }

    xhtml.endElement("p");
    xhtml.endDocument();
  } finally {
    reader.close();
  }
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:31,代码来源:TXTParser.java

示例9: addFieldString

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Writes one form field as an {@code li} element whose text is
 * {@code "<partial name>: <value>"} and whose optional {@code altName}
 * attribute carries the field's alternate name. Signature fields are
 * delegated to {@code handleSignature} instead. Nothing is written when
 * there is neither text nor an attribute.
 *
 * @param field form field to render
 * @param handler XHTML content handler that receives the list item
 * @throws SAXException if the XHTML SAX events could not be handled
 */
private void addFieldString(PDField field, XHTMLContentHandler handler)
    throws SAXException {
  // Pick partial name to present in content and altName for attribute
  // Ignoring FullyQualifiedName for now
  final String partName = field.getPartialName();
  final String altName = field.getAlternateFieldName();

  final StringBuilder text = new StringBuilder();
  final AttributesImpl attrs = new AttributesImpl();

  if (partName != null) {
    text.append(partName);
    text.append(": ");
  }
  if (altName != null) {
    attrs.addAttribute("", "altName", "altName", "CDATA", altName);
  }

  // return early if PDSignature field
  if (field instanceof PDSignatureField) {
    handleSignature(attrs, (PDSignatureField) field, handler);
    return;
  }

  try {
    // getValue can throw an IOException if there is no value
    final String value = field.getValue();
    if (value != null && !"null".equals(value)) {
      text.append(value);
    }
  } catch (Exception ignored) {
    // Deliberately best-effort: a field whose value cannot be read is
    // still listed, just without a value.
  }

  final boolean nothingToWrite = attrs.getLength() == 0 && text.length() == 0;
  if (nothingToWrite) {
    return;
  }

  handler.startElement("li", attrs);
  handler.characters(text.toString());
  handler.endElement("li");
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:38,代码来源:PDF2XHTML.java

示例10: parse

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Parses an ENVI header as plain text: sets the content type metadata to
 * {@code ENVI_MIME_TYPE}, detects the character encoding of the stream,
 * records that encoding in the metadata, and emits the whole text as one
 * {@code p} element inside a complete document. The caller's stream is
 * wrapped in a {@code CloseShieldInputStream} before being handed to the
 * reader that this method closes.
 *
 * @param stream stream to read the header text from
 * @param handler SAX handler wrapped by the XHTML content handler
 * @param metadata metadata object that receives content type and encoding
 * @param context parse context (not used by this method)
 */
public void parse(InputStream stream, ContentHandler handler, 
        Metadata metadata, ParseContext context)
                throws IOException, SAXException, TikaException {

    //Only outputting the MIME type as metadata
    metadata.set(Metadata.CONTENT_TYPE, ENVI_MIME_TYPE);

    // The following code was taken from the TXTParser
    // Automatically detect the character encoding
    AutoDetectReader reader = 
            new AutoDetectReader(new CloseShieldInputStream(stream), metadata);

    try {
        Charset charset = reader.getCharset();
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, charset.name());

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);

        xhtml.startDocument();

        //text contents of the xhtml
        xhtml.startElement("p");
        char[] buffer = new char[4096];
        int n = reader.read(buffer);
        while (n != -1) {
            xhtml.characters(buffer, 0, n);
            n = reader.read(buffer);
        }
        xhtml.endElement("p");

        xhtml.endDocument();
    } finally {
        reader.close();
    }

}
 
开发者ID:abburgess,项目名称:ENVIJava,代码行数:40,代码来源:EnviHeaderParser.java

示例11: addTextToHandler

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Writes the collected image texts to the given handler and then clears
 * the collection.
 *
 * Each text is first passed through {@code St.removeRareCharacters}; only
 * results longer than five characters are written. All written texts are
 * wrapped in one {@code div} with class {@code IMAGE_CONTAINER_CLASS},
 * which is opened before the first text that is actually written. When
 * both {@code page} and {@code allPagesCount} are given, each text is
 * preceded by a {@code span} whose {@code page} attribute is
 * {@code "<page>:<allPagesCount>"}. Every written text except the one at
 * the last index is followed by an {@code imageDivider} span.
 *
 * @param handler XHTML content handler that receives the output
 * @param page current page number, or {@code null} to omit the page span
 * @param allPagesCount total page count, or {@code null} to omit the page
 *                      span
 * @throws SAXException if the XHTML SAX events could not be handled
 */
public void addTextToHandler(XHTMLContentHandler handler, Integer page, Integer allPagesCount)
    throws SAXException {
  // TODO: optionally emit a "page_indicator" paragraph
  // ("Images on page: <page>") before the texts.

  boolean containerOpen = false;
  final int lastIndex = this.imageText.size() - 1;

  for (int i = 0; i <= lastIndex; i++) {
    String text = St.removeRareCharacters(this.imageText.get(i));
    if (text.length() <= 5) {
      continue;
    }

    // Open the container on the first text that is really written, not
    // only when that text happens to sit at index 0; otherwise later
    // texts would end up outside any container.
    if (!containerOpen) {
      handler.startElement("div", "class", IMAGE_CONTAINER_CLASS);
      containerOpen = true;
    }

    // TODO: pagination
    if (page != null && allPagesCount != null) {
      handler.startElement("span", "page", page + ":" + allPagesCount);
      handler.characters(" ");
      handler.endElement("span");
    }

    handler.characters(text);

    if (i != lastIndex) {
      handler.startElement("span", "class", "imageDivider");
      handler.characters(" ");
      handler.endElement("span");
    }
  }

  if (containerOpen) {
    handler.characters(" ");
    handler.endElement("div");
  }

  this.imageText.clear();
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:51,代码来源:TikaImageHelper.java

示例12: render

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Writes this cell's text to the handler as character data.
 *
 * @param handler XHTML content handler that receives the text
 * @throws SAXException if the handler rejects the characters
 */
public void render(XHTMLContentHandler handler) throws SAXException {
    handler.characters(this.text);
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:4,代码来源:TextCell.java

示例13: processRun

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Writes one run to the handler. The open {@code b}/{@code i} elements
 * are first brought in line with the run's bold/italic flags, updating
 * {@code tfmtg} accordingly. The run text and one {@code img} element per
 * embedded picture with picture data are then written, wrapped in an
 * {@code a} element when the run is a hyperlink run with a URL or a
 * non-empty anchor.
 *
 * @param run run to write
 * @param paragraph paragraph owning the run; pictures are only written
 *                  when its document is not {@code null}
 * @param xhtml XHTML content handler that receives the output
 * @param tfmtg formatting state carried from the previous run
 * @return the same {@code tfmtg} instance, updated to this run's state
 */
private TmpFormatting processRun(XWPFRun run, XWPFParagraph paragraph,
    XHTMLContentHandler xhtml, TmpFormatting tfmtg) throws SAXException,
    XmlException, IOException {
  final boolean bold = run.isBold();
  if (bold != tfmtg.isBold()) {
    // Close an open <i> before touching <b>; the italic check below
    // reopens it if this run is italic.
    if (tfmtg.isItalic()) {
      xhtml.endElement("i");
      tfmtg.setItalic(false);
    }
    if (bold) {
      xhtml.startElement("b");
    } else {
      xhtml.endElement("b");
    }
    tfmtg.setBold(bold);
  }

  final boolean italic = run.isItalic();
  if (italic != tfmtg.isItalic()) {
    if (italic) {
      xhtml.startElement("i");
    } else {
      xhtml.endElement("i");
    }
    tfmtg.setItalic(italic);
  }

  // Work out the link target, if any: an external URL wins over an
  // in-document anchor.
  String href = null;
  if (run instanceof XWPFHyperlinkRun) {
    XWPFHyperlinkRun linkRun = (XWPFHyperlinkRun) run;
    XWPFHyperlink link = linkRun.getHyperlink(document);
    if (link != null && link.getURL() != null) {
      href = link.getURL();
    } else if (linkRun.getAnchor() != null
        && linkRun.getAnchor().length() > 0) {
      href = "#" + linkRun.getAnchor();
    }
  }
  final boolean insideLink = href != null;
  if (insideLink) {
    xhtml.startElement("a", "href", href);
  }

  xhtml.characters(run.toString());

  // If we have any pictures, output them
  for (XWPFPicture picture : run.getEmbeddedPictures()) {
    if (paragraph.getDocument() == null) {
      continue;
    }
    XWPFPictureData data = picture.getPictureData();
    if (data == null) {
      continue;
    }

    AttributesImpl attr = new AttributesImpl();
    attr.addAttribute("", "src", "src", "CDATA",
        "embedded:" + data.getFileName());
    attr.addAttribute("", "alt", "alt", "CDATA", picture.getDescription());

    xhtml.startElement("img", attr);
    xhtml.endElement("img");
  }

  if (insideLink) {
    xhtml.endElement("a");
  }

  return tfmtg;
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:66,代码来源:XWPFWordExtractorDecorator.java

示例14: processSDTRun

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Writes the text of the given structured document tag's content to the
 * handler as character data, without any surrounding element.
 *
 * @param run structured document tag whose content text is written
 * @param xhtml XHTML content handler that receives the text
 */
private void processSDTRun(XWPFSDT run, XHTMLContentHandler xhtml)
    throws SAXException, XmlException, IOException {
  final String text = run.getContent().getText();
  xhtml.characters(text);
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:5,代码来源:XWPFWordExtractorDecorator.java

示例15: render

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Writes this cell's number, formatted with this cell's format, to the
 * handler as character data.
 *
 * @param handler XHTML content handler that receives the formatted number
 * @throws SAXException if the handler rejects the characters
 */
public void render(XHTMLContentHandler handler) throws SAXException {
    handler.characters(this.format.format(this.number));
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:4,代码来源:NumberCell.java


注:本文中的org.apache.tika.sax.XHTMLContentHandler.characters方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。