当前位置: 首页>>代码示例>>Java>>正文


Java WriteOutContentHandler类代码示例

本文整理汇总了Java中org.apache.tika.sax.WriteOutContentHandler的典型用法代码示例。如果您正苦于以下问题:Java WriteOutContentHandler类的具体用法?Java WriteOutContentHandler怎么用?Java WriteOutContentHandler使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。


WriteOutContentHandler类属于org.apache.tika.sax包,在下文中一共展示了WriteOutContentHandler类的8个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: extractText

import org.apache.tika.sax.WriteOutContentHandler; //导入依赖的package包/类
/**
 * Parses {@code input} with an auto-detecting Tika parser and passes the resulting handler to
 * {@code appendText} together with {@code outputText}.
 * <p>
 * The handler is built with {@code maxSize} as its write limit. If parsing fails because that
 * limit was reached, whatever was collected so far is still appended; any other failure is
 * rethrown through {@code Throwables.propagate}.
 *
 * @param mimeType   not read by this method
 * @param input      document stream handed to the parser
 * @param outputText buffer passed on to {@code appendText}
 * @param maxSize    write limit for the handler, also passed on to {@code appendText}
 * @throws IOException declared by the overridden signature
 */
@Override
public void extractText(String mimeType, InputStream input, StringBuilder outputText, int maxSize)
	throws IOException
{
	final WriteOutContentHandler limited = new WriteOutContentHandler(maxSize);
	final ContentHandler bodyHandler = new BodyContentHandler(limited);
	try
	{
		final Metadata parseMetadata = new Metadata();
		final Parser autoParser = new AutoDetectParser(new TikaConfig(getClass().getClassLoader()));
		autoParser.parse(input, bodyHandler, parseMetadata, new ParseContext());

		appendText(bodyHandler, outputText, maxSize);
	}
	catch( Exception failure )
	{
		if( !limited.isWriteLimitReached(failure) )
		{
			// Anything other than hitting the write limit is a real error.
			throw Throwables.propagate(failure);
		}

		// Limit reached: index the truncated text that was collected.
		LOGGER.debug("PDF size limit reached.  Indexing truncated text");
		appendText(bodyHandler, outputText, maxSize);
	}
}
 
开发者ID:equella,项目名称:Equella,代码行数:27,代码来源:PdfExtracter.java

示例2: parseTXTToString

import org.apache.tika.sax.WriteOutContentHandler; //导入依赖的package包/类
/**
 * Runs {@code txtParser} over {@code stream} and returns the text collected by a
 * {@link WriteOutContentHandler} whose write limit is {@code maxStringLength}.
 * <p>
 * A {@link SAXException} caused by reaching the write limit is tolerated and the text gathered
 * up to that point is returned; any other {@link SAXException} is wrapped in a
 * {@link TikaException}. The stream is always closed before returning.
 *
 * @param stream   input to parse; closed by this method
 * @param metadata metadata object passed to the parser
 * @return the string form of the collecting handler
 * @throws IOException   if parsing or closing the stream fails
 * @throws TikaException if the parser fails, or on an unexpected SAX failure
 */
private String parseTXTToString(InputStream stream, Metadata metadata) throws IOException, TikaException {
	final WriteOutContentHandler textSink = new WriteOutContentHandler(maxStringLength);
	try {
		final ParseContext parseContext = new ParseContext();
		parseContext.set(Parser.class, txtParser);
		txtParser.parse(stream, new BodyContentHandler(textSink), metadata, parseContext);
	} catch (SAXException saxFailure) {
		final boolean truncated = textSink.isWriteLimitReached(saxFailure);
		if (!truncated) {
			// This should never happen with BodyContentHandler...
			throw new TikaException("Unexpected SAX processing failure", saxFailure);
		}
		// Otherwise the limit was hit; fall through and return the truncated text.
	} finally {
		stream.close();
	}
	return textSink.toString();
}
 
开发者ID:ziqizhang,项目名称:jate,代码行数:17,代码来源:ContentExtractor.java

示例3: testWriteToCache

import org.apache.tika.sax.WriteOutContentHandler; //导入依赖的package包/类
/**
 * Parses the same fixture twice with a {@code CachingTesseractOCRParser} whose cache callbacks
 * are overridden to count invocations. The first parse is expected to register one miss and
 * no hit; the second is expected to register one hit with the miss count unchanged. Both
 * parses must produce the same trimmed text.
 */
@Test
public void testWriteToCache() throws Throwable {
	final Path fixture = Paths.get(this.simple.toURI());

	final AtomicInteger hits = new AtomicInteger();
	final AtomicInteger misses = new AtomicInteger();

	final Parser cachingParser = new CachingTesseractOCRParser(tmpDir) {

		private static final long serialVersionUID = 6551690243986921730L;

		@Override
		public void cacheMiss() {
			misses.incrementAndGet();
		}

		@Override
		public void cacheHit() {
			hits.incrementAndGet();
		}
	};

	// First pass: nothing cached yet.
	final StringWriter firstPass = new StringWriter();
	try (final InputStream in = Files.newInputStream(fixture)) {
		cachingParser.parse(in, new WriteOutContentHandler(firstPass), new Metadata(), new ParseContext());
	}

	Assert.assertEquals("HEAVY\nMETAL", firstPass.toString().trim());
	Assert.assertEquals(0, hits.get());
	Assert.assertEquals(1, misses.get());

	// Second pass: try again from the cache.
	final StringWriter secondPass = new StringWriter();
	try (final InputStream in = Files.newInputStream(fixture)) {
		cachingParser.parse(in, new WriteOutContentHandler(secondPass), new Metadata(), new ParseContext());
	}

	Assert.assertEquals("HEAVY\nMETAL", secondPass.toString().trim());
	Assert.assertEquals(1, hits.get());
	Assert.assertEquals(1, misses.get());
}
 
开发者ID:ICIJ,项目名称:extract,代码行数:41,代码来源:CachingTesseractOCRParserTest.java

示例4: extractText

import org.apache.tika.sax.WriteOutContentHandler; //导入依赖的package包/类
/**
 * Takes in a file and returns the text contained in it.
 * <p>
 * The file is opened as a {@link TikaInputStream}, parsed with an {@link AutoDetectParser},
 * and the output written through a {@link WriteOutContentHandler} into an in-memory buffer.
 * The stream is closed before this method returns, whether or not parsing succeeded.
 *
 * @param file
 *  The file to extract from.
 * @return
 *  The text we were able to pull out, with leading and trailing whitespace trimmed
 * @throws ExtractionException
 *  if opening, parsing, or closing the stream fails; the original exception is the cause
 */
static String extractText(File file) throws ExtractionException {
    Parser parser = new AutoDetectParser();
    Metadata metadata = new Metadata();
    StringWriter writer = new StringWriter();
    // The parser consumes the stream but leaves closing it to the caller, so own it here.
    try (TikaInputStream stream = TikaInputStream.get(file)) {
        parser.parse(stream,
                new WriteOutContentHandler(writer),
                metadata,
                new ParseContext());
    } catch (IOException | SAXException | TikaException e) {
        throw new ExtractionException("Failed at extracting text from stream", e);
    }
    return writer.toString().trim();
}
 
开发者ID:Bachmann1234,项目名称:nlpExperiments,代码行数:22,代码来源:TextExtractor.java

示例5: parseToCache

import org.apache.tika.sax.WriteOutContentHandler; //导入依赖的package包/类
/**
 * Delegates parsing to the superclass using a {@link TeeContentHandler} built from the caller's
 * {@code handler} and a {@link WriteOutContentHandler} around {@code writer}.
 * <p>
 * When {@code inline} is set, the tee is wrapped in an {@link XHTMLContentHandler} and
 * {@code super.parseInline} is used; otherwise {@code super.parse} receives the tee directly.
 *
 * @param tis      input stream to parse
 * @param handler  caller-supplied handler, first member of the tee
 * @param metadata metadata passed to the parser (and to the XHTML wrapper when inline)
 * @param context  parse context passed to the superclass
 * @param config   OCR configuration, only used on the inline path
 * @param inline   selects {@code parseInline} over {@code parse}
 * @param writer   destination wrapped by the second member of the tee
 */
private void parseToCache(final TikaInputStream tis, final ContentHandler handler, final Metadata metadata,
                          final ParseContext context, final TesseractOCRConfig config, final boolean inline,
                          final Writer writer) throws SAXException, IOException, TikaException {
	final WriteOutContentHandler writerSink = new WriteOutContentHandler(writer);
	final ContentHandler fanOut = new TeeContentHandler(handler, writerSink);

	if (!inline) {
		super.parse(tis, fanOut, metadata, context);
		return;
	}

	super.parseInline(tis, new XHTMLContentHandler(fanOut, metadata), context, config);
}
 
开发者ID:ICIJ,项目名称:extract,代码行数:12,代码来源:CachingTesseractOCRParser.java

示例6: extract

import org.apache.tika.sax.WriteOutContentHandler; //导入依赖的package包/类
/**
 * Create a pull-parser from the given {@link TikaInputStream}.
 * <p>
 * Configures a {@link ParseContext} (OCR and PDF configuration, HTML mapper, embedded-document
 * handling) and returns a {@link ParsingReader} over the input. For HTML output the reader is
 * given a handler factory; for any other output format the reader is constructed without one.
 *
 * @param document file that is being extracted from; supplies the metadata object
 * @param input the stream to extract from
 * @return A pull-parsing reader.
 * @throws IOException declared by the signature
 */
protected Reader extract(final Document document, final TikaInputStream input) throws IOException {
	final Metadata docMetadata = document.getMetadata();
	final ParseContext parseContext = new ParseContext();
	final AutoDetectParser detectingParser = new AutoDetectParser(defaultParser);

	// Wrap the detecting parser in a digesting parser only when a digester is configured.
	final Parser effectiveParser = (null == digester)
			? detectingParser
			: new DigestingParser(detectingParser, digester);

	if (!ocrDisabled) {
		parseContext.set(TesseractOCRConfig.class, ocrConfig);
	}

	parseContext.set(PDFParserConfig.class, pdfConfig);

	// Set a fallback parser that outputs an empty document for empty files,
	// otherwise throws an exception.
	detectingParser.setFallback(FallbackParser.INSTANCE);

	// Only include "safe" tags in the HTML output from Tika's HTML parser.
	// This excludes script tags and objects.
	parseContext.set(HtmlMapper.class, DefaultHtmlMapper.INSTANCE);

	final boolean htmlOutput = (OutputFormat.HTML == outputFormat);

	final Function<Writer, ContentHandler> handlerFactory;
	if (htmlOutput) {
		handlerFactory = (writer) -> new ExpandedTitleContentHandler(new HTML5Serializer(writer));
	} else {
		// The default BodyContentHandler is used when constructing the ParsingReader for text output, but
		// because only the body of embeds is pushed to the content handler further down the line, we can't
		// expect a body tag.
		handlerFactory = WriteOutContentHandler::new;
	}

	// The parser is registered in the context before the embed extractor is constructed with it.
	if (embedHandling == EmbedHandling.SPAWN) {
		parseContext.set(Parser.class, effectiveParser);
		parseContext.set(EmbeddedDocumentExtractor.class,
				new EmbedSpawner(document, parseContext, embedOutput, handlerFactory));
	} else if (embedHandling == EmbedHandling.CONCATENATE) {
		parseContext.set(Parser.class, effectiveParser);
		parseContext.set(EmbeddedDocumentExtractor.class, new EmbedParser(document, parseContext));
	} else {
		parseContext.set(Parser.class, EmptyParser.INSTANCE);
		parseContext.set(EmbeddedDocumentExtractor.class, new EmbedBlocker());
	}

	return htmlOutput
			? new ParsingReader(effectiveParser, input, docMetadata, parseContext, handlerFactory)
			: new ParsingReader(effectiveParser, input, docMetadata, parseContext);
}
 
开发者ID:ICIJ,项目名称:extract,代码行数:66,代码来源:Extractor.java

示例7: extractText

import org.apache.tika.sax.WriteOutContentHandler; //导入依赖的package包/类
/**
 * Extracts the body text of the resource at {@code uri} and returns it as a string.
 * <p>
 * Recognised entries in {@code options} (all optional, read via {@code toString()}):
 * <ul>
 *   <li>{@code "outputEncoding"} - charset name for the output; {@code "UTF-8"} when absent</li>
 *   <li>{@code "contentType"} - passed to {@code fillMetadata}</li>
 *   <li>{@code "maxLength"} - parsed as a float and truncated to an int, used as the write
 *       limit of the content handler; {@code -1} when absent</li>
 * </ul>
 * If parsing stops because the write limit was reached, the writer is closed and the text
 * collected so far is returned; any other failure is rethrown. The input stream is always closed.
 *
 * @param uri     location of the document, passed to {@code fillMetadata} and {@code createInputStream}
 * @param options option map as described above; may be {@code null}
 * @return the collected output decoded with the output encoding
 * @throws Exception any failure other than reaching the write limit
 */
public static String extractText(String uri, Map<String, Object> options) throws Exception {
	final AutoDetectParser parser = createParser();
	final Metadata metadata = new Metadata();
	final ParseContext context = new ParseContext();

	String outputEncoding = null;
	String contentType = null;
	int maxLength = -1;

	if (options != null) {
		final Object encodingOption = options.get("outputEncoding");
		if (encodingOption != null) {
			outputEncoding = encodingOption.toString();
		}

		final Object contentTypeOption = options.get("contentType");
		if (contentTypeOption != null) {
			contentType = contentTypeOption.toString();
		}

		final Object maxLengthOption = options.get("maxLength");
		if (maxLengthOption != null) {
			maxLength = (int)Float.parseFloat(maxLengthOption.toString());
		}
	}

	if (outputEncoding == null) {
		outputEncoding = "UTF-8";
	}

	fillMetadata(parser, metadata, contentType, uri);
	fillParseContext(context, options);

	final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
	final OutputStreamWriter encodedWriter = new OutputStreamWriter(buffer, outputEncoding);
	final WriteOutContentHandler limitedHandler = new WriteOutContentHandler(encodedWriter, maxLength);

	final TikaInputStream source = createInputStream(uri, metadata);

	// Set up recursive parsing of archives.
	// See: http://wiki.apache.org/tika/RecursiveMetadata
	context.set(Parser.class, parser);
	context.set(EmbeddedDocumentExtractor.class, new ParsingEmbeddedDocumentExtractor(context));

	try {
		parser.parse(source, new BodyContentHandler(limitedHandler), metadata, context);
	} catch (Throwable failure) {
		if (limitedHandler.isWriteLimitReached(failure)) {
			// Truncated output is acceptable; close the writer so buffered text reaches the byte buffer.
			encodedWriter.close();
		} else {
			throw failure;
		}
	} finally {
		source.close();
	}

	return buffer.toString(outputEncoding);
}
 
开发者ID:ICIJ,项目名称:node-tika,代码行数:61,代码来源:NodeTika.java

示例8: BoilerpipeContentHandler

import org.apache.tika.sax.WriteOutContentHandler; //导入依赖的package包/类
/**
 * Creates a content handler that writes XHTML body character events to
 * the given writer.
 * <p>
 * Convenience constructor: the writer is wrapped in a new
 * {@link WriteOutContentHandler}, which is passed to the constructor of
 * this class that accepts that handler.
 *
 * @param writer writer that the wrapping {@link WriteOutContentHandler} is constructed around
 */
public BoilerpipeContentHandler(Writer writer) {
    this(new WriteOutContentHandler(writer));
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:10,代码来源:BoilerpipeContentHandler.java


注:本文中的org.apache.tika.sax.WriteOutContentHandler类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。