当前位置: 首页>>代码示例>>Java>>正文


Java XHTMLContentHandler.endDocument方法代码示例

本文整理汇总了Java中org.apache.tika.sax.XHTMLContentHandler.endDocument方法的典型用法代码示例。如果您正苦于以下问题:Java XHTMLContentHandler.endDocument方法的具体用法?Java XHTMLContentHandler.endDocument怎么用?Java XHTMLContentHandler.endDocument使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在org.apache.tika.sax.XHTMLContentHandler的用法示例。


在下文中一共展示了XHTMLContentHandler.endDocument方法的11个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: parse

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Populates {@code metadata} from a JPEG stream and emits an empty XHTML
 * document to {@code handler}.
 *
 * <p>The stream is wrapped in a {@link TikaInputStream} bound to a
 * {@link TemporaryResources} instance, which is disposed before any XHTML
 * events are produced.
 *
 * @param stream   the JPEG data to read
 * @param handler  receiver of the (empty) XHTML document events
 * @param metadata metadata object filled in by the extractors
 * @param context  parse context (not used by this method)
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if the handler rejects the XHTML events
 * @throws TikaException if metadata extraction or resource disposal fails
 */
public void parse(
        InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    final TemporaryResources resources = new TemporaryResources();
    try {
        final TikaInputStream input = TikaInputStream.get(stream, resources);

        // Image-level metadata is read from the file view of the stream.
        final ImageMetadataExtractor imageExtractor = new ImageMetadataExtractor(metadata);
        imageExtractor.parseJpeg(input.getFile());

        // The Jempbox extractor then works on the stream itself.
        final JempboxExtractor jempboxExtractor = new JempboxExtractor(metadata);
        jempboxExtractor.parse(input);
    } finally {
        // Always release whatever the TikaInputStream registered.
        resources.dispose();
    }

    // No textual content is extracted: emit an empty document.
    final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();
    document.endDocument();
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:18,代码来源:JpegParser.java

示例2: extractOutput

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Copies the text of {@code stream}, decoded as UTF-8, into a single
 * {@code div} element of a complete XHTML document written to
 * {@code xhtml}. The stream is closed when reading finishes.
 *
 * @param stream stream to read (described by the original author as the
 *               stream holding the OCR result)
 * @param xhtml XHTML content handler that receives the document
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws IOException if an input error occurred
 */
private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {
    xhtml.startDocument();
    xhtml.startElement("div");

    try (Reader in = new InputStreamReader(stream, UTF_8)) {
        final char[] chunk = new char[1024];
        int count;
        while ((count = in.read(chunk)) != -1) {
            if (count == 0) {
                // Nothing was read this round; do not emit an empty event.
                continue;
            }
            xhtml.characters(chunk, 0, count);
        }
    }

    xhtml.endElement("div");
    xhtml.endDocument();
}
 
开发者ID:fiohol,项目名称:theSemProject,代码行数:25,代码来源:TesseractOCRParser.java

示例3: parse

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Parses an XML stream with the SAX parser supplied by {@code context} and
 * wraps the output in an XHTML document containing a single {@code p}
 * element.
 *
 * <p>If no content type is present in {@code metadata}, it is set to
 * {@code application/xml}. A {@link SAXException} raised by the SAX parse
 * is first handed to {@code TaggedContentHandler.throwIfCauseOf}
 * (presumably rethrown when it originated in the wrapped handler -- confirm
 * against the Tika documentation); otherwise it is only logged at INFO
 * level and parsing is treated as finished.
 *
 * @param stream   the XML data; shielded from being closed by the SAX parser
 * @param handler  receiver of the SAX events
 * @param metadata document metadata, possibly updated with a content type
 * @param context  parse context supplying the SAX parser
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if emitting XHTML events fails or the tagged handler rethrows
 * @throws TikaException if the SAX parser cannot be obtained
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
    ParseContext context) throws IOException, SAXException, TikaException {
  // Supply a default content type only when the caller has not set one.
  final String declaredType = metadata.get(Metadata.CONTENT_TYPE);
  if (declaredType == null) {
    metadata.set(Metadata.CONTENT_TYPE, "application/xml");
  }

  final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
  document.startDocument();
  document.startElement("p");

  final TaggedContentHandler taggedHandler = new TaggedContentHandler(handler);
  try {
    context.getSAXParser().parse(
        new CloseShieldInputStream(stream),
        new OfflineContentHandler(
            new EmbeddedContentHandler(
                getContentHandler(taggedHandler, metadata, context))));
  } catch (SAXException parseError) {
    taggedHandler.throwIfCauseOf(parseError);
    // Any other parse error is logged and otherwise ignored.
    LOG.info("XML parse error", parseError);
    // TODO:
    // throw new TikaException("XML parse error", e);
  }

  document.endElement("p");
  document.endDocument();
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:27,代码来源:XMLParser.java

示例4: parse

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Extracts the text of every {@code .html} / {@code .htm} entry of a CHM
 * file and writes it as character data into an XHTML document.
 *
 * <p>The content type in {@code metadata} is always set to
 * {@code application/vnd.ms-htmlhelp}.
 *
 * @param stream   the CHM data
 * @param handler  receiver of the XHTML events
 * @param metadata document metadata; its content type is overwritten
 * @param context  parse context (not used by this method)
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if the handler rejects the XHTML events
 * @throws TikaException if the CHM data cannot be processed
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
    ParseContext context) throws IOException, SAXException, TikaException {
  final ChmExtractor extractor = new ChmExtractor(stream);

  metadata.set(Metadata.CONTENT_TYPE, "application/vnd.ms-htmlhelp");

  final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
  document.startDocument();

  for (DirectoryListingEntry item : extractor.getChmDirList().getDirectoryListingEntryList()) {
    final String entryName = item.getName();
    final boolean isHtmlEntry = entryName.endsWith(".html") || entryName.endsWith(".htm");
    if (!isHtmlEntry) {
      // Only HTML entries contribute text.
      continue;
    }
    document.characters(extract(extractor.extractChmEntry(item)));
  }

  document.endDocument();
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:21,代码来源:ChmParser.java

示例5: parse

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Populates {@code metadata} from a TIFF stream and emits an empty XHTML
 * document to {@code handler}.
 *
 * <p>The stream is wrapped in a {@link TikaInputStream} bound to a
 * {@link TemporaryResources} instance, which is disposed before any XHTML
 * events are produced.
 *
 * @param stream   the TIFF data to read
 * @param handler  receiver of the (empty) XHTML document events
 * @param metadata metadata object filled in by the extractors
 * @param context  parse context (not used by this method)
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if the handler rejects the XHTML events
 * @throws TikaException if metadata extraction or resource disposal fails
 */
public void parse(
        InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    final TemporaryResources resources = new TemporaryResources();
    try {
        final TikaInputStream input = TikaInputStream.get(stream, resources);

        // Image-level metadata is read from the file view of the stream.
        final ImageMetadataExtractor imageExtractor = new ImageMetadataExtractor(metadata);
        imageExtractor.parseTiff(input.getFile());

        // The Jempbox extractor then works on the stream itself.
        final JempboxExtractor jempboxExtractor = new JempboxExtractor(metadata);
        jempboxExtractor.parse(input);
    } finally {
        // Always release whatever the TikaInputStream registered.
        resources.dispose();
    }

    // No textual content is extracted: emit an empty document.
    final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();
    document.endDocument();
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:18,代码来源:TiffParser.java

示例6: extractOutput

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Copies the character content of the given stream into the given XHTML
 * content handler as a complete document with a single {@code p} element.
 * The work is done synchronously on the calling thread, and the stream is
 * closed once processing ends, whether it succeeded or failed.
 *
 * <p>Bytes are decoded with the platform default charset.
 * NOTE(review): confirm this matches the encoding actually produced by the
 * source of {@code stream}; an explicit charset may be more appropriate.
 *
 * @param stream stream whose content is copied to the handler
 * @param xhtml XHTML content handler
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws IOException if an input error occurred
 */
private void extractOutput(InputStream stream, XHTMLContentHandler xhtml)
        throws SAXException, IOException {
    // try-with-resources keeps the primary exception if close() also fails.
    try (Reader reader = new InputStreamReader(stream)) {
        xhtml.startDocument();
        xhtml.startElement("p");
        char[] buffer = new char[1024];
        for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
            xhtml.characters(buffer, 0, n);
        }
        xhtml.endElement("p");
        xhtml.endDocument();
    }
}
 
开发者ID:AlfrescoLabs,项目名称:tika-ffmpeg,代码行数:27,代码来源:WaitingExternalParser.java

示例7: getXHTML

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Writes a complete XHTML document: the main body produced by
 * {@code buildXHTML}, followed by the embedded parts.
 *
 * <p>Embedded parts are passed the raw {@code handler}, not the XHTML
 * wrapper created here.
 *
 * @param handler  receiver of the SAX events
 * @param metadata document metadata handed to the XHTML wrapper
 * @param context  parse context (not used by this method)
 * @throws SAXException  if the handler rejects an event
 * @throws XmlException  if the body cannot be built
 * @throws IOException   if reading the document fails
 * @throws TikaException if processing the embedded parts fails
 * @see mj.ocraptor.extraction.tika.parser.microsoft.ooxml.OOXMLExtractor#getXHTML(org.xml.sax.ContentHandler,
 *      org.apache.tika.metadata.Metadata)
 */
public void getXHTML(
        ContentHandler handler, Metadata metadata, ParseContext context)
        throws SAXException, XmlException, IOException, TikaException {
    final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);

    document.startDocument();
    // Main content first ...
    buildXHTML(document);
    // ... then whatever is embedded in the package.
    handleEmbeddedParts(handler);
    document.endDocument();
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:18,代码来源:AbstractOOXMLExtractor.java

示例8: parse

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Parses plain text: detects the character encoding, records it in
 * {@code metadata}, and writes the whole text into a single {@code p}
 * element of an XHTML document.
 *
 * <p>The caller's stream is shielded, so closing the reader here does not
 * close it.
 *
 * @param stream   the text data
 * @param handler  receiver of the XHTML events
 * @param metadata document metadata; content type and encoding are set
 * @param context  parse context, consulted for a {@code ServiceLoader}
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if the handler rejects the XHTML events
 * @throws TikaException if the encoding cannot be detected
 */
public void parse(InputStream stream, ContentHandler handler,
    Metadata metadata, ParseContext context) throws IOException,
    SAXException, TikaException {
  // Automatically detect the character encoding
  final InputStream shielded = new CloseShieldInputStream(stream);
  final ServiceLoader serviceLoader = context.get(ServiceLoader.class, LOADER);
  final AutoDetectReader text = new AutoDetectReader(shielded, metadata, serviceLoader);
  try {
    final Charset detected = text.getCharset();
    final MediaType textType = new MediaType(MediaType.TEXT_PLAIN, detected);
    metadata.set(Metadata.CONTENT_TYPE, textType.toString());
    // deprecated, see TIKA-431
    metadata.set(Metadata.CONTENT_ENCODING, detected.name());

    final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();
    document.startElement("p");

    // Stream the text through in fixed-size chunks.
    final char[] chunk = new char[4096];
    for (int count = text.read(chunk); count != -1; count = text.read(chunk)) {
      document.characters(chunk, 0, count);
    }

    document.endElement("p");
    document.endDocument();
  } finally {
    text.close();
  }
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:31,代码来源:TXTParser.java

示例9: parse

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Parses an ENVI header: sets the content type to {@code ENVI_MIME_TYPE},
 * detects the character encoding, and writes the whole text into a single
 * {@code p} element of an XHTML document.
 *
 * <p>The caller's stream is shielded, so closing the reader here does not
 * close it.
 *
 * @param stream   the header data
 * @param handler  receiver of the XHTML events
 * @param metadata document metadata; content type and encoding are set
 * @param context  parse context (not used by this method)
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if the handler rejects the XHTML events
 * @throws TikaException if the encoding cannot be detected
 */
public void parse(InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context)
                throws IOException, SAXException, TikaException {

    // Only outputting the MIME type as metadata
    metadata.set(Metadata.CONTENT_TYPE, ENVI_MIME_TYPE);

    // The following code was taken from the TXTParser
    // Automatically detect the character encoding
    AutoDetectReader reader =
            new AutoDetectReader(new CloseShieldInputStream(stream), metadata);

    try {
        Charset charset = reader.getCharset();
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, charset.name());

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);

        xhtml.startDocument();

        // text contents of the xhtml
        xhtml.startElement("p");
        char[] buffer = new char[4096];
        int n = reader.read(buffer);
        while (n != -1) {
            xhtml.characters(buffer, 0, n);
            n = reader.read(buffer);
        }
        xhtml.endElement("p");

        xhtml.endDocument();
    } finally {
        reader.close();
    }

}
 
开发者ID:abburgess,项目名称:ENVIJava,代码行数:40,代码来源:EnviHeaderParser.java

示例10: parse

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Parses a Xournal document: writes each page's text snippets as
 * {@code p} elements inside a {@code div class="page"}, then passes each
 * page's images to a {@code TikaImageHelper}, which adds its text to the
 * same XHTML document.
 *
 * <p>Nothing is done when {@code metadata} carries no content type. Any
 * {@link Exception} raised during extraction is caught and its stack trace
 * printed instead of being propagated; the helper is closed in all cases.
 *
 * @param stream   the document data
 * @param handler  receiver of the XHTML events
 * @param metadata document metadata; must carry a content type for parsing to occur
 * @param context  parse context (not used by this method)
 * @throws IOException   declared by the signature
 * @throws SAXException  declared by the signature
 * @throws TikaException declared by the signature
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
    ParseContext context) throws IOException, SAXException, TikaException {
  final String contentType = metadata.get(Metadata.CONTENT_TYPE);
  if (contentType == null) {
    return;
  }

  TikaImageHelper imageHelper = null;
  try {
    imageHelper = new TikaImageHelper(metadata);

    final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();

    // TODO: temp files
    final Document xoj = new Loader().load(stream);
    final List<Page> pageList = new PageGenerator(xoj).paginate();

    // First pass: textual content, one div per page.
    for (Page current : pageList) {
      final List<String> texts = current.getTextSnippets();
      document.startElement("div", "class", "page");
      for (String text : texts) {
        document.startElement("p");
        document.characters(text);
        document.endElement("p");
      }
      document.endElement("div");
    }

    // Second pass: images, reported with a 1-based page number.
    int pageNumber = 0;
    for (Page current : pageList) {
      pageNumber++;
      for (BufferedImage picture : current.getImageFiles()) {
        imageHelper.addImage(picture);
      }
      imageHelper.addTextToHandler(document, pageNumber, pageList.size());
    }

    document.endDocument();
  } catch (Exception e) {
    e.printStackTrace();
  } finally {
    if (imageHelper != null) {
      imageHelper.close();
    }
  }
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:49,代码来源:XojParser.java

示例11: parse

import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Parses the header and image-resources section of a PSD/PSB file, records
 * what it finds in {@code metadata}, and emits an empty XHTML document.
 *
 * <p>Sets the image height and width, the bits per sample, and adds a
 * description for every caption resource block. The layer/mask and image
 * data sections are not read.
 *
 * @param stream   the PSD/PSB data, positioned at the start of the file
 * @param handler  receiver of the (empty) XHTML document events
 * @param metadata metadata object to populate
 * @param context  parse context (not used by this method)
 * @throws IOException   if reading fails or the stream ends inside the
 *                       color mode section
 * @throws SAXException  if the handler rejects the XHTML events
 * @throws TikaException if the magic signature or the version is invalid
 */
public void parse(
        InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    // Check for the magic header signature
    byte[] signature = new byte[4];
    IOUtils.readFully(stream, signature);
    if (signature[0] != (byte) '8' || signature[1] != (byte) 'B'
            || signature[2] != (byte) 'P' || signature[3] != (byte) 'S') {
        throw new TikaException("PSD/PSB magic signature invalid");
    }

    // Check the version: only 1 and 2 are supported
    int version = EndianUtils.readUShortBE(stream);
    if (version != 1 && version != 2) {
        throw new TikaException("Invalid PSD/PSB version " + version);
    }

    // Skip the reserved block
    IOUtils.readFully(stream, new byte[6]);

    // Number of channels in the image. The value is unused, but the read
    // is needed to advance the stream.
    // TODO Identify a suitable metadata key for this
    EndianUtils.readUShortBE(stream);

    // Width and Height
    int height = EndianUtils.readIntBE(stream);
    int width = EndianUtils.readIntBE(stream);
    metadata.set(TIFF.IMAGE_LENGTH, height);
    metadata.set(TIFF.IMAGE_WIDTH, width);

    // Depth (bits per channel)
    int depth = EndianUtils.readUShortBE(stream);
    metadata.set(TIFF.BITS_PER_SAMPLE, Integer.toString(depth));

    // Colour mode
    // Bitmap = 0; Grayscale = 1; Indexed = 2; RGB = 3; CMYK = 4; Multichannel = 7; Duotone = 8; Lab = 9.
    // The value is unused, but the read is needed to advance the stream.
    // TODO Identify a suitable metadata key for this
    EndianUtils.readUShortBE(stream);

    // Next is the Color Mode section
    // We don't care about this bit, but it must be skipped completely:
    // a single InputStream.skip call may skip fewer bytes than requested,
    // which would misalign every read that follows.
    long colorModeSectionSize = EndianUtils.readIntBE(stream);
    skipFully(stream, colorModeSectionSize);

    // Next is the Image Resources section
    // Check for certain interesting keys here
    long imageResourcesSectionSize = EndianUtils.readIntBE(stream);
    long read = 0;
    while (read < imageResourcesSectionSize) {
        ResourceBlock rb = new ResourceBlock(stream);
        read += rb.totalLength;

        // Is it one we can do something useful with?
        if (rb.id == ResourceBlock.ID_CAPTION) {
            metadata.add(TikaCoreProperties.DESCRIPTION, rb.getDataAsString());
        }
        // TODO Parse the EXIF info (ResourceBlock.ID_EXIF_1, ResourceBlock.ID_EXIF_3)
        // TODO Parse the XMP info (ResourceBlock.ID_XMP)
    }

    // Next is the Layer and Mask Info
    // Finally we have Image Data
    // We can't do anything with these parts

    // We don't have any helpful text, sorry...
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.endDocument();
}

/**
 * Skips exactly {@code count} bytes of {@code stream}; a non-positive
 * count skips nothing.
 *
 * @throws IOException if the stream ends before {@code count} bytes were skipped
 */
private static void skipFully(InputStream stream, long count) throws IOException {
    long remaining = count;
    while (remaining > 0) {
        long skipped = stream.skip(remaining);
        if (skipped > 0) {
            remaining -= skipped;
        } else if (stream.read() != -1) {
            // skip() made no progress; consuming one byte tells a stalled
            // skip apart from a real end of stream.
            remaining--;
        } else {
            throw new IOException(
                    "Unexpected end of stream while skipping PSD color mode section");
        }
    }
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:79,代码来源:PSDParser.java


注:本文中的org.apache.tika.sax.XHTMLContentHandler.endDocument方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。