当前位置: 首页>>代码示例>>Java>>正文


Java XHTMLContentHandler类代码示例

本文整理汇总了Java中org.apache.tika.sax.XHTMLContentHandler的典型用法代码示例。如果您正苦于以下问题：Java XHTMLContentHandler类的具体用法？Java XHTMLContentHandler怎么用？Java XHTMLContentHandler使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。


XHTMLContentHandler类属于org.apache.tika.sax包,在下文中一共展示了XHTMLContentHandler类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: buildXHTML

import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Emits the document content to the given handler in a fixed order:
 * headers, body elements, footers, and finally the image-text step when
 * image OCR is enabled in the configuration.
 *
 * @param xhtml handler that receives the generated XHTML events
 * @throws SAXException declared; propagated from the extraction helpers
 * @throws XmlException declared; propagated from the extraction helpers
 * @throws IOException declared; propagated from the extraction helpers
 * @see org.apache.poi.xwpf.extractor.XWPFWordExtractor#getText()
 */
@Override
protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
    XmlException, IOException {
  final XWPFHeaderFooterPolicy policy = document.getHeaderFooterPolicy();
  final boolean hasPolicy = policy != null;

  // Headers come first; skipped when the document has no header/footer policy.
  if (hasPolicy) {
    extractHeaders(xhtml, policy);
  }

  // Body content, in the order in which it occurs in the document.
  extractIBodyText(document, xhtml);

  // Footers are emitted after the body.
  if (hasPolicy) {
    extractFooters(xhtml, policy);
  }

  // Image OCR is opt-in through the configuration flag.
  if (Config.inst().getProp(ConfigBool.ENABLE_IMAGE_OCR)) {
    extractImageText(xhtml);
  }
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:26,代码来源:XWPFWordExtractorDecorator.java

示例2: extractHeaders

import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Hands the first-page, even-page and default headers of the policy, in
 * that order, to {@code extractHeaderText}. Headers that the policy does
 * not provide are skipped; a {@code null} policy results in no output.
 *
 * @param xhtml handler that receives the header content
 * @param hfPolicy header/footer policy of the document, may be {@code null}
 * @throws SAXException declared; propagated from {@code extractHeaderText}
 * @throws XmlException declared; propagated from {@code extractHeaderText}
 * @throws IOException declared; propagated from {@code extractHeaderText}
 */
private void extractHeaders(XHTMLContentHandler xhtml,
    XWPFHeaderFooterPolicy hfPolicy) throws SAXException, XmlException,
    IOException {
  if (hfPolicy != null) {
    // First-page header.
    if (hfPolicy.getFirstPageHeader() != null) {
      extractHeaderText(xhtml, hfPolicy.getFirstPageHeader());
    }

    // Even-page header.
    if (hfPolicy.getEvenPageHeader() != null) {
      extractHeaderText(xhtml, hfPolicy.getEvenPageHeader());
    }

    // Default header.
    if (hfPolicy.getDefaultHeader() != null) {
      extractHeaderText(xhtml, hfPolicy.getDefaultHeader());
    }
  }
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:19,代码来源:XWPFWordExtractorDecorator.java

示例3: parse

import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Runs the image and Jempbox metadata extractors over the stream, filling
 * {@code metadata}, and then emits an empty XHTML document to the handler.
 * The {@code context} argument is not used by this method.
 *
 * @param stream input to read the JPEG data from
 * @param handler receiver of the (empty) XHTML document
 * @param metadata metadata object handed to both extractors
 * @param context parse context (unused here)
 * @throws IOException declared; propagated from the extractors
 * @throws SAXException declared; propagated from the XHTML handler
 * @throws TikaException declared; propagated from the extractors
 */
public void parse(
        InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    final TemporaryResources resources = new TemporaryResources();
    try {
        final TikaInputStream tikaStream = TikaInputStream.get(stream, resources);

        // The image extractor works on a file, the Jempbox extractor on the stream.
        final ImageMetadataExtractor imageExtractor = new ImageMetadataExtractor(metadata);
        imageExtractor.parseJpeg(tikaStream.getFile());

        final JempboxExtractor jempboxExtractor = new JempboxExtractor(metadata);
        jempboxExtractor.parse(tikaStream);
    } finally {
        // Always release the temporary resources, even when extraction fails.
        resources.dispose();
    }

    // No textual content is produced: just open and close the document.
    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.endDocument();
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:18,代码来源:JpegParser.java

示例4: extractOutput

import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Copies the characters of the given stream, decoded as UTF-8, into a single
 * {@code div} element of a new XHTML document. The stream is closed when
 * reading ends, whether normally or with an exception; if reading or a
 * handler call fails, the closing events are not emitted.
 *
 * @param stream stream holding the OCR result
 * @param xhtml XHTML content handler that receives the text
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws IOException if an input error occurred
 */
private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {
    xhtml.startDocument();
    xhtml.startElement("div");

    try (Reader in = new InputStreamReader(stream, UTF_8)) {
        final char[] chunk = new char[1024];
        int count;
        // read() returns -1 at end of stream; zero-length reads are skipped.
        while ((count = in.read(chunk)) != -1) {
            if (count > 0) {
                xhtml.characters(chunk, 0, count);
            }
        }
    }

    xhtml.endElement("div");
    xhtml.endDocument();
}
 
开发者ID:fiohol,项目名称:theSemProject,代码行数:25,代码来源:TesseractOCRParser.java

示例5: extractImageText

import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * When image OCR is enabled in the configuration, decodes every picture of
 * the slide show, adds it to a {@code TikaImageHelper} and lets the helper
 * write its text to the handler. Any exception raised while doing so is
 * printed to standard error and not rethrown; the helper is always closed.
 *
 * @param xhtml handler that receives the text produced by the helper
 * @param document slide show whose pictures are processed
 */
private void extractImageText(XHTMLContentHandler xhtml,
    HSLFSlideShow document) {
  if (!Config.inst().getProp(ConfigBool.ENABLE_IMAGE_OCR)) {
    return;
  }

  final TikaImageHelper helper = new TikaImageHelper(metadata);
  try {
    for (PictureData picture : document.getPictures()) {
      // NOTE(review): ImageIO.read returns null for undecodable data;
      // the result is passed on to the helper unchanged.
      helper.addImage(ImageIO.read(new ByteArrayInputStream(picture.getData())));
    }
    // TODO: find out page number
    helper.addTextToHandler(xhtml);
  } catch (Exception e) {
    e.printStackTrace();
  } finally {
    helper.close();
  }
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:23,代码来源:HSLFExtractor.java

示例6: extractMaster

import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Writes the text of the non-placeholder text shapes of a master sheet as
 * {@code p} elements inside a {@code div} of class
 * {@code slide-master-content}. Nothing is written when the master is
 * {@code null} or has no shapes.
 *
 * @param xhtml handler that receives the elements
 * @param master master sheet to read the shapes from, may be {@code null}
 * @throws SAXException if the handler rejects an event
 */
private void extractMaster(XHTMLContentHandler xhtml, MasterSheet master)
    throws SAXException {
  if (master == null) {
    return;
  }

  final Shape[] shapes = master.getShapes();
  if (shapes == null || shapes.length == 0) {
    return;
  }

  xhtml.startElement("div", "class", "slide-master-content");
  for (Shape shape : shapes) {
    // Only real (non-placeholder) text shapes contribute content.
    if (shape == null || MasterSheet.isPlaceholder(shape)
        || !(shape instanceof TextShape)) {
      continue;
    }

    final String text = ((TextShape) shape).getText();
    if (text != null) {
      xhtml.element("p", text);
    }
  }
  xhtml.endElement("div");
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:26,代码来源:HSLFExtractor.java

示例7: extractTableText

import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Renders a table shape as a {@code table} element with one {@code tr} per
 * row and one {@code td} per column.
 *
 * @param xhtml handler that receives the table elements
 * @param shape table to render
 * @throws SAXException if the handler rejects an event
 */
private void extractTableText(XHTMLContentHandler xhtml, Table shape)
    throws SAXException {
  xhtml.startElement("table");
  for (int r = 0; r < shape.getNumberOfRows(); r++) {
    xhtml.startElement("tr");
    for (int c = 0; c < shape.getNumberOfColumns(); c++) {
      final TableCell cell = shape.getCell(r, c);
      // A missing cell is rendered with an empty string as its text.
      xhtml.element("td", cell == null ? "" : cell.getText());
    }
    xhtml.endElement("tr");
  }
  xhtml.endElement("table");
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:19,代码来源:HSLFExtractor.java

示例8: textRunsToText

import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Writes the text of each non-null run followed by an empty {@code br}
 * element. Runs without text, and a {@code null} array, produce no output.
 *
 * @param xhtml handler that receives the text and line breaks
 * @param runs text runs to write, may be {@code null}
 * @throws SAXException if the handler rejects an event
 */
private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs)
    throws SAXException {
  if (runs != null) {
    for (TextRun run : runs) {
      if (run == null) {
        continue;
      }

      // Historical note (TIKA-712): boiler-plate text on the master slide
      // (run type 0 = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE)
      // was once filtered at this point; the filter is deliberately not
      // applied, and the note is kept to make a revert easy.
      final String txt = run.getText();
      if (txt == null) {
        continue;
      }

      xhtml.characters(txt);
      xhtml.startElement("br");
      xhtml.endElement("br");
    }
  }
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:24,代码来源:HSLFExtractor.java

示例9: handleEmbeddedResource

import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Builds a fresh metadata object for an embedded resource from the non-null
 * arguments and, if the extractor accepts it, asks the extractor to parse
 * the resource. The resource stream is closed in every case.
 *
 * @param resource stream of the embedded resource; closed by this method
 * @param filename name of the resource, or {@code null} if unknown
 * @param relationshipID relationship id of the resource, or {@code null}
 * @param mediaType content type of the resource, or {@code null}
 * @param xhtml handler passed on to the embedded-document extractor
 * @param outputHtml flag passed on to the embedded-document extractor
 * @throws IOException declared; propagated from parsing or closing
 * @throws SAXException declared; propagated from the extractor
 * @throws TikaException declared; propagated from the extractor
 */
protected void handleEmbeddedResource(TikaInputStream resource, String filename,
                                      String relationshipID, String mediaType, XHTMLContentHandler xhtml,
                                      boolean outputHtml)
throws IOException, SAXException, TikaException {
  try {
    final Metadata embeddedMetadata = new Metadata();

    // The file name is recorded under both the MIME-hint and resource-name keys.
    if (filename != null) {
      embeddedMetadata.set(Metadata.TIKA_MIME_FILE, filename);
      embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, filename);
    }

    if (relationshipID != null) {
      embeddedMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, relationshipID);
    }

    if (mediaType != null) {
      embeddedMetadata.set(Metadata.CONTENT_TYPE, mediaType);
    }

    final boolean wanted = extractor.shouldParseEmbedded(embeddedMetadata);
    if (wanted) {
      extractor.parseEmbedded(resource, xhtml, embeddedMetadata, outputHtml);
    }
  } finally {
    resource.close();
  }
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:25,代码来源:AbstractPOIFSExtractor.java

示例10: handleHeaderFooter

import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Writes the non-blank paragraphs of the given ranges inside a {@code div}
 * whose {@code class} attribute is {@code type}. Nothing is written when
 * {@code countParagraphs(ranges)} is not positive.
 *
 * <p>Note that {@code FieldsDocumentPart.HEADER} is passed to
 * {@code handleParagraph} regardless of {@code type}.
 *
 * @param ranges ranges to walk; {@code null} entries are skipped
 * @param type value of the {@code class} attribute of the wrapping div
 * @param document document passed on to {@code handleParagraph}
 * @param pictures pictures source passed on to {@code handleParagraph}
 * @param pictureTable pictures table passed on to {@code handleParagraph}
 * @param xhtml handler that receives the output
 * @throws SAXException declared; propagated from the handler or helper
 * @throws IOException declared; propagated from {@code handleParagraph}
 * @throws TikaException declared; propagated from {@code handleParagraph}
 */
private void handleHeaderFooter(Range[] ranges, String type,
    HWPFDocument document, PicturesSource pictures,
    PicturesTable pictureTable, XHTMLContentHandler xhtml)
    throws SAXException, IOException, TikaException {
  if (countParagraphs(ranges) <= 0) {
    return;
  }

  xhtml.startElement("div", "class", type);
  for (Range range : ranges) {
    if (range == null) {
      continue;
    }

    for (int i = 0; i < range.numParagraphs(); i++) {
      final Paragraph paragraph = range.getParagraph(i);

      // Paragraphs consisting only of whitespace / line breaks are skipped.
      final String stripped = paragraph.text().replaceAll("[\\r\\n\\s]+", "");
      if (!stripped.isEmpty()) {
        // The helper's return value advances the index; presumably it is the
        // number of additional paragraphs it consumed -- TODO confirm.
        i += handleParagraph(paragraph, 0, range, document,
            FieldsDocumentPart.HEADER, pictures, pictureTable, xhtml);
      }
    }
  }
  xhtml.endElement("div");
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:25,代码来源:WordExtractor.java

示例11: parseWord6

import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Reads the paragraphs of an old-format Word document and writes each one
 * as a {@code p} element.
 *
 * @param root directory node the old-format document is loaded from
 * @param xhtml handler that receives the paragraphs
 * @throws IOException declared; propagated from loading the document
 * @throws SAXException if the handler rejects an event
 * @throws TikaException declared by the signature
 */
protected void parseWord6(DirectoryNode root, XHTMLContentHandler xhtml)
    throws IOException, SAXException, TikaException {
  final Word6Extractor word6 = new Word6Extractor(new HWPFOldDocument(root));

  final String[] paragraphs = word6.getParagraphText();
  for (String paragraph : paragraphs) {
    xhtml.element("p", paragraph);
  }
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:10,代码来源:WordExtractor.java

示例12: extractIBodyText

import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Walks the body elements of {@code bodyElement} in order and dispatches
 * each paragraph, table or structured document tag to its extraction
 * method. Elements of any other type are ignored.
 *
 * @param bodyElement container whose body elements are processed
 * @param xhtml handler that receives the extracted content
 * @throws SAXException declared; propagated from the extraction methods
 * @throws XmlException declared; propagated from the extraction methods
 * @throws IOException declared; propagated from the extraction methods
 */
private void extractIBodyText(IBody bodyElement, XHTMLContentHandler xhtml)
    throws SAXException, XmlException, IOException {
  for (IBodyElement element : bodyElement.getBodyElements()) {
    // The three handled types are unrelated classes, so at most one matches.
    if (element instanceof XWPFParagraph) {
      extractParagraph((XWPFParagraph) element, xhtml);
    } else if (element instanceof XWPFTable) {
      extractTable((XWPFTable) element, xhtml);
    } else if (element instanceof XWPFSDT) {
      extractSDT((XWPFSDT) element, xhtml);
    }
  }
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:17,代码来源:XWPFWordExtractorDecorator.java

示例13: extractImageText

import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * When image OCR is enabled in the configuration, decodes every picture of
 * the document, adds it to a {@code TikaImageHelper} and lets the helper
 * write its text to the handler. Any exception raised while doing so is
 * printed to standard error and not rethrown; the helper is always closed.
 *
 * @param xhtml handler that receives the text produced by the helper
 */
private void extractImageText(XHTMLContentHandler xhtml) {
  if (!Config.inst().getProp(ConfigBool.ENABLE_IMAGE_OCR)) {
    return;
  }

  final TikaImageHelper helper = new TikaImageHelper(metadata);
  try {
    for (XWPFPictureData picture : document.getAllPictures()) {
      // NOTE(review): ImageIO.read returns null for undecodable data;
      // the result is passed on to the helper unchanged.
      helper.addImage(ImageIO.read(new ByteArrayInputStream(picture.getData())));
    }

    // TODO: find out page number
    helper.addTextToHandler(xhtml);
  } catch (Exception e) {
    e.printStackTrace();
  } finally {
    helper.close();
  }
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:25,代码来源:XWPFWordExtractorDecorator.java

示例14: extractTable

import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Renders a table as {@code table > tbody > tr > td} elements. The content
 * of each cell is produced by {@code extractIBodyText}, which receives the
 * cell itself as the body to walk.
 *
 * @param table table to render
 * @param xhtml handler that receives the table elements
 * @throws SAXException declared; propagated from the handler or cell extraction
 * @throws XmlException declared; propagated from cell extraction
 * @throws IOException declared; propagated from cell extraction
 */
private void extractTable(XWPFTable table, XHTMLContentHandler xhtml)
    throws SAXException, XmlException, IOException {
  xhtml.startElement("table");
  xhtml.startElement("tbody");

  for (XWPFTableRow tableRow : table.getRows()) {
    xhtml.startElement("tr");

    for (XWPFTableCell tableCell : tableRow.getTableCells()) {
      xhtml.startElement("td");
      // Cell content goes through the same body-element walk as the document.
      extractIBodyText(tableCell, xhtml);
      xhtml.endElement("td");
    }

    xhtml.endElement("tr");
  }

  xhtml.endElement("tbody");
  xhtml.endElement("table");
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:17,代码来源:XWPFWordExtractorDecorator.java

示例15: extractFooters

import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Hands the first-page, even-page and default footers of the policy, in
 * that order, to {@code extractHeaderText}. Footers that the policy does
 * not provide are skipped; a {@code null} policy results in no output,
 * matching the behaviour of {@code extractHeaders}.
 *
 * @param xhtml handler that receives the footer content
 * @param hfPolicy header/footer policy of the document, may be {@code null}
 * @throws SAXException declared; propagated from {@code extractHeaderText}
 * @throws XmlException declared; propagated from {@code extractHeaderText}
 * @throws IOException declared; propagated from {@code extractHeaderText}
 */
private void extractFooters(XHTMLContentHandler xhtml,
    XWPFHeaderFooterPolicy hfPolicy) throws SAXException, XmlException,
    IOException {
  // Same null guard as extractHeaders, so a document without a
  // header/footer policy cannot trigger a NullPointerException here.
  if (hfPolicy == null) {
    return;
  }

  // footers
  if (hfPolicy.getFirstPageFooter() != null) {
    extractHeaderText(xhtml, hfPolicy.getFirstPageFooter());
  }
  if (hfPolicy.getEvenPageFooter() != null) {
    extractHeaderText(xhtml, hfPolicy.getEvenPageFooter());
  }
  if (hfPolicy.getDefaultFooter() != null) {
    extractHeaderText(xhtml, hfPolicy.getDefaultFooter());
  }
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:15,代码来源:XWPFWordExtractorDecorator.java


注:本文中的org.apache.tika.sax.XHTMLContentHandler类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。