Java XHTMLContentHandler.startElement方法代码示例

本文整理汇总了Java中org.apache.tika.sax.XHTMLContentHandler.startElement方法的典型用法代码示例。如果您正苦于以下问题：Java XHTMLContentHandler.startElement方法的具体用法？Java XHTMLContentHandler.startElement怎么用？Java XHTMLContentHandler.startElement使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.tika.sax.XHTMLContentHandler的用法示例。

在下文中一共展示了XHTMLContentHandler.startElement方法的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: extractOutput

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Copies the OCR result text from {@code stream} into {@code xhtml} as the
 * character content of a single {@code <div>} element, framed by the
 * document start and end events. The stream is decoded with the
 * {@code UTF_8} charset constant and is closed when reading ends.
 *
 * @param stream stream holding the OCR result
 * @param xhtml XHTML content handler that receives the SAX events
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws IOException if reading from the stream fails
 */
private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {

    xhtml.startDocument();
    xhtml.startElement("div");
    try (Reader ocrText = new InputStreamReader(stream, UTF_8)) {
        final char[] chunk = new char[1024];
        int count;
        while ((count = ocrText.read(chunk)) != -1) {
            // A zero-length read carries no text, so no event is emitted for it.
            if (count > 0) {
                xhtml.characters(chunk, 0, count);
            }
        }
    }
    xhtml.endElement("div");
    xhtml.endDocument();
}

开发者ID:fiohol，项目名称:theSemProject，代码行数:25，代码来源:TesseractOCRParser.java

示例2: extractMaster

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Writes the text of every non-placeholder text shape on the master sheet
 * as {@code <p>} elements inside a {@code <div class="slide-master-content">}.
 * Nothing is written when the master is null or has no shapes.
 *
 * @param xhtml XHTML content handler that receives the SAX events
 * @param master master sheet to read shapes from; may be null
 * @throws SAXException if the XHTML SAX events could not be handled
 */
private void extractMaster(XHTMLContentHandler xhtml, MasterSheet master)
    throws SAXException {
  Shape[] masterShapes = (master == null) ? null : master.getShapes();
  if (masterShapes == null || masterShapes.length == 0) {
    return;
  }

  xhtml.startElement("div", "class", "slide-master-content");
  for (Shape candidate : masterShapes) {
    // Only real (non-placeholder) text shapes contribute content.
    if (candidate == null
        || MasterSheet.isPlaceholder(candidate)
        || !(candidate instanceof TextShape)) {
      continue;
    }
    String content = ((TextShape) candidate).getText();
    if (content != null) {
      xhtml.element("p", content);
    }
  }
  xhtml.endElement("div");
}

开发者ID:kolbasa，项目名称:OCRaptor，代码行数:26，代码来源:HSLFExtractor.java

示例3: extractTableText

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Emits the given table as a {@code <table>} with one {@code <tr>} per row
 * and one {@code <td>} per column, each holding the cell's text.
 *
 * @param xhtml XHTML content handler that receives the SAX events
 * @param shape table whose cells are written out
 * @throws SAXException if the XHTML SAX events could not be handled
 */
private void extractTableText(XHTMLContentHandler xhtml, Table shape)
    throws SAXException {
  xhtml.startElement("table");
  for (int r = 0; r < shape.getNumberOfRows(); r++) {
    xhtml.startElement("tr");
    for (int c = 0; c < shape.getNumberOfColumns(); c++) {
      TableCell current = shape.getCell(r, c);
      // A missing cell still produces a <td>, with an empty string as its text.
      String cellText = (current != null) ? current.getText() : "";
      xhtml.element("td", cellText);
    }
    xhtml.endElement("tr");
  }
  xhtml.endElement("table");
}

开发者ID:kolbasa，项目名称:OCRaptor，代码行数:19，代码来源:HSLFExtractor.java

示例4: textRunsToText

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Writes the text of each non-null run as character data followed by an
 * empty {@code <br>} element. Runs that are null or have null text are skipped.
 *
 * @param xhtml XHTML content handler that receives the SAX events
 * @param runs text runs to output; may be null, in which case nothing is written
 * @throws SAXException if the XHTML SAX events could not be handled
 */
private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs)
    throws SAXException {
  if (runs == null) {
    return;
  }

  for (TextRun current : runs) {
    if (current == null) {
      continue;
    }
    // A TIKA-712 era filter once skipped master-slide boilerplate here by
    // checking the run type (0 = title, 1 = body); it is intentionally not
    // applied, so every run is output regardless of its type.
    String content = current.getText();
    if (content == null) {
      continue;
    }
    xhtml.characters(content);
    xhtml.startElement("br");
    xhtml.endElement("br");
  }
}

开发者ID:kolbasa，项目名称:OCRaptor，代码行数:24，代码来源:HSLFExtractor.java

示例5: handleHeaderFooter

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Outputs the paragraphs of the given header/footer ranges inside a
 * {@code <div>} whose class attribute is {@code type}. Nothing is written
 * when {@code countParagraphs(ranges)} is not positive.
 *
 * @param ranges header or footer ranges; individual entries may be null
 * @param type value used for the wrapping div's class attribute
 * @param document document passed through to {@code handleParagraph}
 * @param pictures picture source passed through to {@code handleParagraph}
 * @param pictureTable picture table passed through to {@code handleParagraph}
 * @param xhtml XHTML content handler that receives the SAX events
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws IOException declared for the delegated paragraph handling
 * @throws TikaException declared for the delegated paragraph handling
 */
private void handleHeaderFooter(Range[] ranges, String type,
    HWPFDocument document, PicturesSource pictures,
    PicturesTable pictureTable, XHTMLContentHandler xhtml)
    throws SAXException, IOException, TikaException {
  if (countParagraphs(ranges) <= 0) {
    return;
  }

  xhtml.startElement("div", "class", type);
  for (Range range : ranges) {
    if (range == null) {
      continue;
    }
    int index = 0;
    while (index < range.numParagraphs()) {
      Paragraph paragraph = range.getParagraph(index);
      String stripped = paragraph.text().replaceAll("[\\r\\n\\s]+", "");
      // Paragraphs made up only of whitespace/line breaks are skipped.
      if (!stripped.isEmpty()) {
        // The returned value is added to the index; presumably the number of
        // extra paragraphs handleParagraph consumed - confirm against it.
        index += handleParagraph(paragraph, 0, range, document,
            FieldsDocumentPart.HEADER, pictures, pictureTable, xhtml);
      }
      index++;
    }
  }
  xhtml.endElement("div");
}

开发者ID:kolbasa，项目名称:OCRaptor，代码行数:25，代码来源:WordExtractor.java

示例6: extractTable

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Emits the given Word table as {@code <table><tbody>} with one
 * {@code <tr>} per row and one {@code <td>} per cell; each cell's body is
 * written by {@code extractIBodyText}.
 *
 * @param table table to convert
 * @param xhtml XHTML content handler that receives the SAX events
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws XmlException declared for the delegated cell body extraction
 * @throws IOException declared for the delegated cell body extraction
 */
private void extractTable(XWPFTable table, XHTMLContentHandler xhtml)
    throws SAXException, XmlException, IOException {
  xhtml.startElement("table");
  xhtml.startElement("tbody");
  for (final XWPFTableRow tableRow : table.getRows()) {
    xhtml.startElement("tr");
    for (final XWPFTableCell tableCell : tableRow.getTableCells()) {
      xhtml.startElement("td");
      extractIBodyText(tableCell, xhtml);
      xhtml.endElement("td");
    }
    xhtml.endElement("tr");
  }
  xhtml.endElement("tbody");
  xhtml.endElement("table");
}

开发者ID:kolbasa，项目名称:OCRaptor，代码行数:17，代码来源:XWPFWordExtractorDecorator.java

示例7: parse

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Parses the given stream as XML using the SAX parser obtained from
 * {@code context}, framing the output in an XHTML document that contains a
 * single {@code <p>} element.
 *
 * <p>If no content type is present in {@code metadata}, it is set to
 * {@code application/xml}. A {@link SAXException} raised during parsing is
 * passed to {@code tagged.throwIfCauseOf(e)}; if that call returns, the
 * error is logged at info level and the document is still closed normally.
 *
 * @param stream input to parse; wrapped in a {@code CloseShieldInputStream} before parsing
 * @param handler SAX handler that receives the output events
 * @param metadata document metadata; the content type is filled in when absent
 * @param context parse context supplying the SAX parser
 * @throws IOException declared for the underlying parse
 * @throws SAXException if a SAX event could not be handled
 * @throws TikaException declared for the parser lookup
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
    ParseContext context) throws IOException, SAXException, TikaException {
  // Only default the content type; a value already set by the caller wins.
  if (metadata.get(Metadata.CONTENT_TYPE) == null) {
    metadata.set(Metadata.CONTENT_TYPE, "application/xml");
  }

  final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
  xhtml.startDocument();
  xhtml.startElement("p");

  // Note: the tagged handler wraps the caller's handler, not xhtml.
  TaggedContentHandler tagged = new TaggedContentHandler(handler);
  try {
    context.getSAXParser().parse(
        new CloseShieldInputStream(stream),
        new OfflineContentHandler(new EmbeddedContentHandler(getContentHandler(tagged, metadata,
            context))));
  } catch (SAXException e) {
    // Presumably rethrows e when it originated from the tagged handler
    // (i.e. downstream of this parser) - confirm against TaggedContentHandler.
    tagged.throwIfCauseOf(e);
    // Otherwise the parse error is only logged and processing continues.
    LOG.info("XML parse error", e);
    // TODO: decide whether to propagate the error instead of logging it:
    // throw new TikaException("XML parse error", e);
  }

  xhtml.endElement("p");
  xhtml.endDocument();
}

开发者ID:kolbasa，项目名称:OCRaptor，代码行数:27，代码来源:XMLParser.java

示例8: extractAcroForm

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Writes the document's AcroForm fields as an ordered list inside a
 * {@code <div class="acroform">}. Each {@code PDField} entry is handed to
 * {@code processAcroField} at depth 0; other entries are ignored. Nothing
 * is written when the catalog, form, field list or its iterator is null.
 *
 * <p>Derived from Ben Litchfield's
 * {@code org.apache.pdfbox.examples.fdf.PrintFields} example.
 *
 * @param pdf document whose form fields are extracted
 * @param handler XHTML content handler that receives the SAX events
 * @throws IOException declared for the delegated field processing
 * @throws SAXException if the XHTML SAX events could not be handled
 */
private void extractAcroForm(PDDocument pdf, XHTMLContentHandler handler)
throws IOException, SAXException {
  PDDocumentCatalog docCatalog = pdf.getDocumentCatalog();
  if (docCatalog == null) {
    return;
  }

  PDAcroForm acroForm = docCatalog.getAcroForm();
  if (acroForm == null) {
    return;
  }

  List<?> topLevelFields = acroForm.getFields();
  if (topLevelFields == null) {
    return;
  }

  ListIterator<?> cursor = topLevelFields.listIterator();
  if (cursor == null) {
    return;
  }

  handler.startElement("div", "class", "acroform");
  handler.startElement("ol");
  while (cursor.hasNext()) {
    Object entry = cursor.next();
    // instanceof is false for null, so null entries are skipped as well.
    if (entry instanceof PDField) {
      processAcroField((PDField) entry, handler, 0);
    }
  }
  handler.endElement("ol");
  handler.endElement("div");
}

开发者ID:kolbasa，项目名称:OCRaptor，代码行数:38，代码来源:PDF2XHTML.java

示例9: extractOutput

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Copies everything readable from the given stream into the given XHTML
 * content handler as the text of a single {@code <p>} element, emitting
 * the document start and end events around it. The work is done on the
 * calling thread, and the stream is closed once processing ends, whether
 * it succeeded or failed.
 *
 * @param stream stream to drain (NOTE(review): presumably the standard
 *        output of an external process - confirm against the caller)
 * @param xhtml XHTML content handler
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws IOException if an input error occurred
 */
private void extractOutput(InputStream stream, XHTMLContentHandler xhtml)
        throws SAXException, IOException {
    // NOTE(review): no charset is given, so the bytes are decoded with the
    // platform default. Confirm which encoding the producer emits before
    // pinning an explicit one here.
    try (Reader reader = new InputStreamReader(stream)) {
        xhtml.startDocument();
        xhtml.startElement("p");
        char[] buffer = new char[1024];
        for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
            xhtml.characters(buffer, 0, n);
        }
        xhtml.endElement("p");
        xhtml.endDocument();
    }
}

开发者ID:AlfrescoLabs，项目名称:tika-ffmpeg，代码行数:27，代码来源:WaitingExternalParser.java

示例10: handleChildTasks

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Recursively renders the child tasks of {@code parentTask} as a nested
 * ordered list. Each task becomes an {@code <li>} whose id attribute is the
 * task ID, containing a {@code <div class="task">} filled by
 * {@code handleTask}, followed by the list of its own children. Nothing is
 * written when there are no child tasks.
 *
 * @param parentTask container whose child tasks are rendered
 * @param xhtml XHTML content handler that receives the SAX events
 * @param usedResources set passed through to {@code handleTask}
 * @throws SAXException if the XHTML SAX events could not be handled
 */
protected static void handleChildTasks(ChildTaskContainer parentTask, XHTMLContentHandler xhtml,
        Set<Integer> usedResources) throws SAXException {
    List<Task> children = parentTask.getChildTasks();
    if (children == null || children.isEmpty()) {
        return;
    }

    xhtml.startElement("ol");
    for (Task child : children) {
        xhtml.startElement("li", "id", child.getID().toString());

        // The task's own details come first...
        xhtml.startElement("div", "class", "task");
        handleTask(child, xhtml, usedResources);
        xhtml.endElement("div");

        // ...followed by its sub-tasks, if it has any.
        handleChildTasks(child, xhtml, usedResources);

        xhtml.endElement("li");
    }
    xhtml.endElement("ol");
}

开发者ID:Gagravarr，项目名称:MPXJ-Tika，代码行数:24，代码来源:ProjectFileProcessor.java

示例11: handleDates

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Renders a date range as
 * {@code <div class="fromTo <what>Dates">what from START to FINISH[ taking DURATION]</div>},
 * where {@code <what>} is {@code what} in lower case. Nothing is written
 * when both dates are null.
 *
 * @param what label for the range; also used (lower-cased) to build the CSS class
 * @param start start date, formatted by {@code buildDate}
 * @param finish finish date, formatted by {@code buildDate}
 * @param duration optional duration, formatted by {@code buildDuration}; skipped when null
 * @param xhtml XHTML content handler that receives the SAX events
 * @throws SAXException if the XHTML SAX events could not be handled
 */
protected static void handleDates(String what, Date start, Date finish,
        Duration duration, XHTMLContentHandler xhtml) throws SAXException {
    if (start == null && finish == null) {
        // Assume there's nothing there, and skip
        return;
    }

    // Lower-case with a fixed locale so the generated class name does not
    // change with the JVM's default locale (e.g. Turkish dotless i).
    String cls = what.toLowerCase(java.util.Locale.ROOT) + "Dates";

    xhtml.startElement("div", "class", "fromTo " + cls);
    xhtml.characters(what);
    xhtml.characters(" from ");
    xhtml.characters(buildDate(start));
    xhtml.characters(" to ");
    xhtml.characters(buildDate(finish));

    if (duration != null) {
        xhtml.characters(" taking ");
        xhtml.characters(buildDuration(duration));
    }

    xhtml.endElement("div");
}

开发者ID:Gagravarr，项目名称:MPXJ-Tika，代码行数:27，代码来源:ProjectFileProcessor.java

示例12: handlePictureCharacterRun

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Emits an {@code <img>} element referencing the given picture and, the
 * first time the picture is seen, hands its content to
 * {@code handleEmbeddedResource}. Nothing is written when the character run
 * is not rendered or no picture is supplied.
 *
 * @param cr character run that references the picture
 * @param picture picture to output; may be null
 * @param pictures tracks picture numbers and which pictures were already output
 * @param xhtml XHTML content handler that receives the SAX events
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws IOException declared for the embedded resource handling
 * @throws TikaException declared for the embedded resource handling
 */
private void handlePictureCharacterRun(CharacterRun cr, Picture picture,
    PicturesSource pictures, XHTMLContentHandler xhtml) throws SAXException,
    IOException, TikaException {
  if (!isRendered(cr)) {
    return;
  }
  if (picture == null) {
    // No picture left to pair with this run; probably several \u0008
    // image markers point at the same real image.
    return;
  }

  // The file holds no name for the picture, so one is synthesised from its
  // number and suggested extension; the img tag and the embedded resource
  // both refer to it.
  String suffix = picture.suggestFileExtension();
  int number = pictures.pictureNumber(picture);
  String filename = "image" + number;
  if (suffix.length() > 0) {
    filename = filename + "." + suffix;
  }

  String mimeType = picture.getMimeType();

  AttributesImpl imgAttributes = new AttributesImpl();
  imgAttributes.addAttribute("", "src", "src", "CDATA", "embedded:" + filename);
  imgAttributes.addAttribute("", "alt", "alt", "CDATA", filename);
  xhtml.startElement("img", imgAttributes);
  xhtml.endElement("img");

  // Each distinct image is exposed as an embedded resource only once.
  if (pictures.hasOutput(picture)) {
    return;
  }
  TikaInputStream content = TikaInputStream.get(picture.getContent());
  handleEmbeddedResource(content, filename, null, mimeType, xhtml, false);
  pictures.recordOutput(picture);
}

开发者ID:kolbasa，项目名称:OCRaptor，代码行数:39，代码来源:WordExtractor.java

示例13: extractSDT

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Writes the text of the given structured document tag's content as a
 * single {@code <p>} element.
 *
 * @param element structured document tag to output
 * @param xhtml XHTML content handler that receives the SAX events
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws XmlException declared by the signature; not raised directly here
 * @throws IOException declared by the signature; not raised directly here
 */
private void extractSDT(XWPFSDT element, XHTMLContentHandler xhtml)
    throws SAXException, XmlException, IOException {
  final XWPFSDTContent sdtContent = element.getContent();
  xhtml.startElement("p");
  xhtml.characters(sdtContent.getText());
  xhtml.endElement("p");
}

开发者ID:kolbasa，项目名称:OCRaptor，代码行数:9，代码来源:XWPFWordExtractorDecorator.java

示例14: parse

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Parses plain text: detects the character encoding, records the resulting
 * content type and encoding in {@code metadata}, and writes the whole text
 * as a single {@code <p>} element of an XHTML document.
 *
 * @param stream input to read; wrapped in a {@code CloseShieldInputStream}
 * @param handler SAX handler that receives the output events
 * @param metadata receives the detected content type and encoding
 * @param context parse context used to look up the {@code ServiceLoader}
 * @throws IOException if reading the text fails
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws TikaException declared for the encoding detection
 */
public void parse(InputStream stream, ContentHandler handler,
    Metadata metadata, ParseContext context) throws IOException,
    SAXException, TikaException {
  // The reader works out the character encoding by itself.
  AutoDetectReader text = new AutoDetectReader(new CloseShieldInputStream(
      stream), metadata, context.get(ServiceLoader.class, LOADER));
  try {
    Charset detected = text.getCharset();
    MediaType plainText = new MediaType(MediaType.TEXT_PLAIN, detected);
    metadata.set(Metadata.CONTENT_TYPE, plainText.toString());
    // Deprecated key, still populated; see TIKA-431.
    metadata.set(Metadata.CONTENT_ENCODING, detected.name());

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    xhtml.startElement("p");
    char[] chunk = new char[4096];
    for (int count = text.read(chunk); count != -1; count = text.read(chunk)) {
      xhtml.characters(chunk, 0, count);
    }
    xhtml.endElement("p");

    xhtml.endDocument();
  } finally {
    text.close();
  }
}

开发者ID:kolbasa，项目名称:OCRaptor，代码行数:31，代码来源:TXTParser.java

示例15: parseEntry

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Handles one archive entry. For a readable entry with a name, an empty
 * {@code <div class="embedded" id="NAME">} marker is written and the name
 * is recorded in the entry metadata; the entry is then passed to the
 * embedded document extractor if it agrees to parse it. For an unreadable
 * entry, only its name (if any) is written as a {@code <p>} element.
 *
 * @param archive archive stream positioned at the entry
 * @param entry entry to process
 * @param extractor embedded document extractor
 * @param xhtml XHTML content handler that receives the SAX events
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws IOException declared for the embedded parsing
 * @throws TikaException declared for the embedded parsing
 */
private void parseEntry(
        ArchiveInputStream archive, ArchiveEntry entry,
        EmbeddedDocumentExtractor extractor, XHTMLContentHandler xhtml)
        throws SAXException, IOException, TikaException {
    String name = entry.getName();
    boolean hasName = name != null && name.length() > 0;

    if (!archive.canReadEntryData(entry)) {
        // Unreadable entry: only its name can be reported.
        if (hasName) {
            xhtml.element("p", name);
        }
        return;
    }

    Metadata entryMetadata = new Metadata();
    if (hasName) {
        entryMetadata.set(Metadata.RESOURCE_NAME_KEY, name);
        AttributesImpl divAttributes = new AttributesImpl();
        divAttributes.addAttribute("", "class", "class", "CDATA", "embedded");
        divAttributes.addAttribute("", "id", "id", "CDATA", name);
        xhtml.startElement("div", divAttributes);
        xhtml.endElement("div");

        entryMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, name);
    }

    if (!extractor.shouldParseEmbedded(entryMetadata)) {
        return;
    }

    // Detectors need a stream that supports mark/reset, which
    // ArchiveInputStream does not, so the archive is wrapped first.
    TemporaryResources tempResources = new TemporaryResources();
    try {
        TikaInputStream wrapped = TikaInputStream.get(archive, tempResources);
        extractor.parseEmbedded(wrapped, xhtml, entryMetadata, true);
    } finally {
        tempResources.dispose();
    }
}

开发者ID:kolbasa，项目名称:OCRaptor，代码行数:33，代码来源:PackageParser.java

注：本文中的org.apache.tika.sax.XHTMLContentHandler.startElement方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。