本文整理汇总了Java中org.apache.tika.sax.XHTMLContentHandler.startDocument方法的典型用法代码示例。如果您正苦于以下问题：Java XHTMLContentHandler.startDocument方法的具体用法？Java XHTMLContentHandler.startDocument怎么用？Java XHTMLContentHandler.startDocument使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.tika.sax.XHTMLContentHandler的用法示例。
在下文中一共展示了XHTMLContentHandler.startDocument方法的11个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: parse
import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Populates {@code metadata} from the given stream and emits an XHTML
 * document that has no body content.
 *
 * <p>The stream is wrapped in a {@link TikaInputStream} bound to a
 * {@link TemporaryResources} instance, which is disposed before the XHTML
 * events are written, whether or not extraction succeeded.
 *
 * @param stream   input to read the image from
 * @param handler  receiver of the (empty) XHTML document events
 * @param metadata metadata object handed to both extractors
 * @param context  parse context (not used by this method)
 * @throws IOException   declared; may be raised while reading the stream
 * @throws SAXException  if the XHTML events cannot be handled
 * @throws TikaException declared; may be raised by the extractors
 */
public void parse(
        InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    final TemporaryResources resources = new TemporaryResources();
    try {
        final TikaInputStream input = TikaInputStream.get(stream, resources);

        // First pass works on the file backing the stream.
        final ImageMetadataExtractor imageExtractor = new ImageMetadataExtractor(metadata);
        imageExtractor.parseJpeg(input.getFile());

        // Second pass reads from the stream itself.
        final JempboxExtractor jempbox = new JempboxExtractor(metadata);
        jempbox.parse(input);
    } finally {
        resources.dispose();
    }

    // No textual content is produced: open and immediately close the document.
    final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();
    document.endDocument();
}
示例2: extractOutput
import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Writes a complete XHTML document whose single {@code div} element holds
 * the text decoded (as UTF-8) from the given stream. The stream is closed
 * once it has been fully read.
 *
 * @param stream stream to copy; described by the original author as holding
 *               the OCR result
 * @param xhtml  XHTML content handler that receives the document events
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws IOException  if an input error occurred
 */
private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {
    xhtml.startDocument();
    xhtml.startElement("div");

    try (Reader source = new InputStreamReader(stream, UTF_8)) {
        final char[] chunk = new char[1024];
        int count;
        while ((count = source.read(chunk)) != -1) {
            if (count <= 0) {
                // Nothing was read this round; do not emit an empty text event.
                continue;
            }
            xhtml.characters(chunk, 0, count);
        }
    }

    xhtml.endElement("div");
    xhtml.endDocument();
}
示例3: parse
import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Parses the stream with the SAX parser obtained from {@code context} and
 * wraps the output in a single {@code p} element of an XHTML document.
 *
 * <p>If {@code metadata} carries no content type, it is set to
 * {@code application/xml}. A {@link SAXException} raised during parsing is
 * rethrown only when the tagged handler identifies it as its own; any other
 * parse error is logged at info level and the document is still closed.
 *
 * @param stream   XML input; shielded so the SAX parser cannot close it
 * @param handler  receiver of the SAX events
 * @param metadata document metadata, possibly updated with a content type
 * @param context  parse context supplying the SAX parser
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if the output handler fails
 * @throws TikaException declared; may be raised while obtaining the parser
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
        ParseContext context) throws IOException, SAXException, TikaException {
    final boolean contentTypeMissing = metadata.get(Metadata.CONTENT_TYPE) == null;
    if (contentTypeMissing) {
        metadata.set(Metadata.CONTENT_TYPE, "application/xml");
    }

    final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();
    document.startElement("p");

    // Tagging lets us tell exceptions thrown by the output handler apart
    // from errors in the XML being parsed.
    final TaggedContentHandler taggedHandler = new TaggedContentHandler(handler);
    try {
        context.getSAXParser().parse(
                new CloseShieldInputStream(stream),
                new OfflineContentHandler(
                        new EmbeddedContentHandler(
                                getContentHandler(taggedHandler, metadata, context))));
    } catch (SAXException parseError) {
        taggedHandler.throwIfCauseOf(parseError);
        LOG.info("XML parse error", parseError);
        // TODO:
        // throw new TikaException("XML parse error", e);
    }

    document.endElement("p");
    document.endDocument();
}
示例4: parse
import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Sets the content type to {@code application/vnd.ms-htmlhelp} and writes
 * the extracted text of every directory entry whose name ends in
 * {@code .html} or {@code .htm} into one XHTML document.
 *
 * @param stream   input handed to the {@link ChmExtractor}
 * @param handler  receiver of the XHTML events
 * @param metadata document metadata; its content type is overwritten
 * @param context  parse context (not used by this method)
 * @throws IOException   declared by the parser contract
 * @throws SAXException  if the XHTML events cannot be handled
 * @throws TikaException declared; may be raised during extraction
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
        ParseContext context) throws IOException, SAXException, TikaException {
    final ChmExtractor extractor = new ChmExtractor(stream);

    // metadata
    metadata.set(Metadata.CONTENT_TYPE, "application/vnd.ms-htmlhelp");

    // content
    final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();
    for (DirectoryListingEntry item : extractor.getChmDirList().getDirectoryListingEntryList()) {
        final String entryName = item.getName();
        final boolean isHtmlEntry = entryName.endsWith(".html") || entryName.endsWith(".htm");
        if (!isHtmlEntry) {
            continue;
        }
        document.characters(extract(extractor.extractChmEntry(item)));
    }
    document.endDocument();
}
示例5: parse
import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Fills {@code metadata} using {@link ImageMetadataExtractor#parseTiff} and
 * {@link JempboxExtractor}, then emits an XHTML document without any body
 * content.
 *
 * <p>Temporary resources created while wrapping the stream are disposed
 * before the XHTML events are written, even if extraction fails.
 *
 * @param stream   input to read the image from
 * @param handler  receiver of the (empty) XHTML document events
 * @param metadata metadata object populated by the extractors
 * @param context  parse context (not used by this method)
 * @throws IOException   declared; may be raised while reading the stream
 * @throws SAXException  if the XHTML events cannot be handled
 * @throws TikaException declared; may be raised by the extractors
 */
public void parse(
        InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    final TemporaryResources scratch = new TemporaryResources();
    try {
        final TikaInputStream wrapped = TikaInputStream.get(stream, scratch);

        final ImageMetadataExtractor tiffExtractor = new ImageMetadataExtractor(metadata);
        tiffExtractor.parseTiff(wrapped.getFile());

        final JempboxExtractor jempbox = new JempboxExtractor(metadata);
        jempbox.parse(wrapped);
    } finally {
        scratch.dispose();
    }

    // Only metadata is extracted; the document itself stays empty.
    final XHTMLContentHandler output = new XHTMLContentHandler(handler, metadata);
    output.startDocument();
    output.endDocument();
}
示例6: extractOutput
import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Copies the character content of the given stream into a single
 * {@code p} element of a complete XHTML document written to the given
 * content handler. The copy happens synchronously on the calling thread,
 * and the stream is closed (through its wrapping reader) before this
 * method returns, whether or not the copy succeeded.
 *
 * @param stream stream whose content is copied
 * @param xhtml  XHTML content handler that receives the document events
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws IOException  if an input error occurred
 */
private void extractOutput(InputStream stream, XHTMLContentHandler xhtml)
        throws SAXException, IOException {
    // try-with-resources keeps the original failure as the thrown exception
    // if close() fails as well (the close failure is attached as suppressed).
    // NOTE(review): no charset is passed, so bytes are decoded with the
    // platform default charset - confirm that matches the producer of the stream.
    try (Reader reader = new InputStreamReader(stream)) {
        xhtml.startDocument();
        xhtml.startElement("p");
        char[] buffer = new char[1024];
        for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
            xhtml.characters(buffer, 0, n);
        }
        xhtml.endElement("p");
        xhtml.endDocument();
    }
}
示例7: getXHTML
import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Writes one XHTML document to {@code handler}: the main content produced
 * by {@code buildXHTML} first, followed by the embedded parts.
 *
 * @param handler  receiver of the SAX events; also passed on unchanged for
 *                 embedded-part handling
 * @param metadata metadata associated with the XHTML document
 * @param context  parse context (not used by this method)
 * @throws SAXException  if the XHTML events cannot be handled
 * @throws XmlException  declared; may be raised while building the content
 * @throws IOException   declared; may be raised while reading the document
 * @throws TikaException declared; may be raised by the content builders
 * @see mj.ocraptor.extraction.tika.parser.microsoft.ooxml.OOXMLExtractor#getXHTML(org.xml.sax.ContentHandler,
 *      org.apache.tika.metadata.Metadata)
 */
public void getXHTML(
        ContentHandler handler, Metadata metadata, ParseContext context)
        throws SAXException, XmlException, IOException, TikaException {
    final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);

    document.startDocument();

    // Main content goes through the XHTML wrapper ...
    buildXHTML(document);

    // ... while embedded parts are given the raw handler.
    handleEmbeddedParts(handler);

    document.endDocument();
}
示例8: parse
import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Detects the character encoding of the stream, records it in
 * {@code metadata}, and writes the decoded text as a single {@code p}
 * element of an XHTML document.
 *
 * <p>The caller's stream is shielded from being closed; only the decoding
 * reader is closed when this method finishes.
 *
 * @param stream   text input
 * @param handler  receiver of the XHTML events
 * @param metadata updated with the content type (including charset) and the
 *                 content encoding
 * @param context  parse context used to look up the {@code ServiceLoader}
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if the XHTML events cannot be handled
 * @throws TikaException declared; may be raised during encoding detection
 */
public void parse(InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context) throws IOException,
        SAXException, TikaException {
    // Automatically detect the character encoding
    final InputStream shielded = new CloseShieldInputStream(stream);
    final ServiceLoader serviceLoader = context.get(ServiceLoader.class, LOADER);
    final AutoDetectReader decoded = new AutoDetectReader(shielded, metadata, serviceLoader);
    try {
        final Charset detected = decoded.getCharset();
        final MediaType textType = new MediaType(MediaType.TEXT_PLAIN, detected);
        metadata.set(Metadata.CONTENT_TYPE, textType.toString());
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, detected.name());

        final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
        document.startDocument();
        document.startElement("p");

        final char[] chunk = new char[4096];
        for (int count = decoded.read(chunk); count != -1; count = decoded.read(chunk)) {
            document.characters(chunk, 0, count);
        }

        document.endElement("p");
        document.endDocument();
    } finally {
        decoded.close();
    }
}
示例9: parse
import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Sets the content type to {@code ENVI_MIME_TYPE}, records the detected
 * character encoding, and writes the decoded text of the stream as a single
 * {@code p} element of an XHTML document.
 *
 * <p>The caller's stream is shielded from being closed; only the decoding
 * reader is closed when this method finishes.
 *
 * @param stream   header text input
 * @param handler  receiver of the XHTML events
 * @param metadata updated with the content type and content encoding
 * @param context  parse context (not used by this method)
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if the XHTML events cannot be handled
 * @throws TikaException declared; may be raised during encoding detection
 */
public void parse(InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    // Only outputting the MIME type as metadata
    metadata.set(Metadata.CONTENT_TYPE, ENVI_MIME_TYPE);

    // The following code was taken from the TXTParser
    // Automatically detect the character encoding
    AutoDetectReader reader =
            new AutoDetectReader(new CloseShieldInputStream(stream), metadata);
    try {
        Charset charset = reader.getCharset();
        // The content type stays ENVI_MIME_TYPE, so no text/plain media type
        // is built here; only the encoding is recorded.
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, charset.name());

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        // text contents of the xhtml
        xhtml.startElement("p");
        char[] buffer = new char[4096];
        int n = reader.read(buffer);
        while (n != -1) {
            xhtml.characters(buffer, 0, n);
            n = reader.read(buffer);
        }
        xhtml.endElement("p");

        xhtml.endDocument();
    } finally {
        reader.close();
    }
}
示例10: parse
import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Loads the document from the stream, paginates it, and writes one
 * {@code div class="page"} per page containing a {@code p} element for
 * each text snippet. Afterwards the images of every page are handed to a
 * {@link TikaImageHelper}, which appends its text to the same document.
 *
 * <p>Nothing is emitted when {@code metadata} has no content type.
 *
 * <p>Failures are reported to the caller instead of being printed and
 * discarded: {@link IOException}, {@link SAXException} and
 * {@link TikaException} propagate unchanged, and any other exception is
 * wrapped in a {@link TikaException}. The image helper is closed in all
 * cases.
 *
 * @param stream   input to load the document from
 * @param handler  receiver of the XHTML events
 * @param metadata document metadata; must carry a content type for any
 *                 output to be produced
 * @param context  parse context (not used by this method)
 * @throws IOException   if reading the input fails
 * @throws SAXException  if the XHTML events cannot be handled
 * @throws TikaException if the document cannot be processed for any other
 *                       reason
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
        ParseContext context) throws IOException, SAXException, TikaException {
    String type = metadata.get(Metadata.CONTENT_TYPE);
    if (type == null) {
        return;
    }

    TikaImageHelper helper = null;
    try {
        helper = new TikaImageHelper(metadata);
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        Loader loader = new Loader();
        // TODO: temp files
        Document xojDocument = loader.load(stream);
        PageGenerator pageGen = new PageGenerator(xojDocument);
        List<Page> pages = pageGen.paginate();

        // Text of every page first ...
        for (Page page : pages) {
            final List<String> snippets = page.getTextSnippets();
            xhtml.startElement("div", "class", "page");
            for (String snippet : snippets) {
                xhtml.startElement("p");
                xhtml.characters(snippet);
                xhtml.endElement("p");
            }
            xhtml.endElement("div");
        }

        // ... then the images, page by page (page numbers are 1-based).
        for (int i = 0; i < pages.size(); i++) {
            final List<BufferedImage> images = pages.get(i).getImageFiles();
            for (BufferedImage image : images) {
                helper.addImage(image);
            }
            helper.addTextToHandler(xhtml, i + 1, pages.size());
        }

        xhtml.endDocument();
    } catch (Exception e) {
        // Pass the declared exception types through untouched so callers
        // see the real failure; wrap everything else with its cause.
        if (e instanceof IOException) {
            throw (IOException) e;
        }
        if (e instanceof SAXException) {
            throw (SAXException) e;
        }
        if (e instanceof TikaException) {
            throw (TikaException) e;
        }
        throw new TikaException("Unable to extract pages and images from document", e);
    } finally {
        if (helper != null) {
            helper.close();
        }
    }
}
示例11: parse
import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Reads the fixed header and the image resources section of a PSD/PSB
 * stream, records image dimensions, bit depth and any caption in
 * {@code metadata}, and emits an XHTML document without body content.
 *
 * <p>The colour mode section is skipped in full: {@code InputStream.skip}
 * may skip fewer bytes than requested, so skipping is repeated until the
 * whole section has been consumed, which keeps the following section
 * aligned.
 *
 * @param stream   PSD/PSB input, read sequentially
 * @param handler  receiver of the (empty) XHTML document events
 * @param metadata updated with image length, width, bits per sample and,
 *                 when present, the caption as description
 * @param context  parse context (not used by this method)
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if the XHTML events cannot be handled
 * @throws TikaException if the signature or version is invalid, or the
 *                       stream ends inside the colour mode section
 */
public void parse(
        InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    // Check for the magic header signature
    byte[] signature = new byte[4];
    IOUtils.readFully(stream, signature);
    boolean signatureFound =
            signature[0] == (byte) '8' && signature[1] == (byte) 'B'
            && signature[2] == (byte) 'P' && signature[3] == (byte) 'S';
    if (!signatureFound) {
        throw new TikaException("PSD/PSB magic signature invalid");
    }

    // Check the version; only 1 and 2 are supported
    int version = EndianUtils.readUShortBE(stream);
    if (version != 1 && version != 2) {
        throw new TikaException("Invalid PSD/PSB version " + version);
    }

    // Skip the reserved block
    IOUtils.readFully(stream, new byte[6]);

    // Number of channels in the image
    int numChannels = EndianUtils.readUShortBE(stream);
    // TODO Identify a suitable metadata key for this

    // Width and Height
    int height = EndianUtils.readIntBE(stream);
    int width = EndianUtils.readIntBE(stream);
    metadata.set(TIFF.IMAGE_LENGTH, height);
    metadata.set(TIFF.IMAGE_WIDTH, width);

    // Depth (bits per channel)
    int depth = EndianUtils.readUShortBE(stream);
    metadata.set(TIFF.BITS_PER_SAMPLE, Integer.toString(depth));

    // Colour mode
    // Bitmap = 0; Grayscale = 1; Indexed = 2; RGB = 3; CMYK = 4; Multichannel = 7; Duotone = 8; Lab = 9.
    int colorMode = EndianUtils.readUShortBE(stream);
    // TODO Identify a suitable metadata key for this

    // Next is the Color Mode section
    // We don't care about this bit, but every byte of it must be consumed
    long colorModeSectionSize = EndianUtils.readIntBE(stream);
    long remaining = colorModeSectionSize;
    while (remaining > 0) {
        long skipped = stream.skip(remaining);
        if (skipped <= 0) {
            // skip() may return 0 without being at end of stream; a
            // single-byte read tells the two situations apart.
            if (stream.read() == -1) {
                throw new TikaException("PSD/PSB colour mode section truncated");
            }
            skipped = 1;
        }
        remaining -= skipped;
    }

    // Next is the Image Resources section
    // Check for certain interesting keys here
    long imageResourcesSectionSize = EndianUtils.readIntBE(stream);
    long read = 0;
    while (read < imageResourcesSectionSize) {
        ResourceBlock rb = new ResourceBlock(stream);
        read += rb.totalLength;

        // Is it one we can do something useful with?
        if (rb.id == ResourceBlock.ID_CAPTION) {
            metadata.add(TikaCoreProperties.DESCRIPTION, rb.getDataAsString());
        } else if (rb.id == ResourceBlock.ID_EXIF_1) {
            // TODO Parse the EXIF info
        } else if (rb.id == ResourceBlock.ID_EXIF_3) {
            // TODO Parse the EXIF info
        } else if (rb.id == ResourceBlock.ID_XMP) {
            // TODO Parse the XMP info
        }
    }

    // Next is the Layer and Mask Info
    // Finally we have Image Data
    // We can't do anything with these parts

    // We don't have any helpful text, sorry...
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.endDocument();
}