本文整理汇总了 Java 中 org.apache.tika.sax.BodyContentHandler 类的典型用法代码示例。如果您正苦于以下问题：Java BodyContentHandler 类的具体用法？Java BodyContentHandler 怎么用？Java BodyContentHandler 使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。
BodyContentHandler类属于org.apache.tika.sax包,在下文中一共展示了BodyContentHandler类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: extractText
import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Extracts the body text of {@code input} with an auto-detecting Tika parser and appends it to
 * {@code outputText}, keeping at most {@code maxSize} characters of the extracted text.
 *
 * <p>Every exception raised inside the method (parser configuration, parsing, appending,
 * logging) is rethrown wrapped in a {@link RuntimeException}.
 *
 * @param mimeType not consulted by this implementation
 * @param input stream the document is read from
 * @param outputText buffer that receives the (possibly truncated) text
 * @param maxSize maximum number of characters appended to {@code outputText}
 */
@Override
public void extractText(String mimeType, InputStream input, StringBuilder outputText, int maxSize)
    throws IOException
{
    try
    {
        final Metadata tikaMetadata = new Metadata();
        final ContentHandler bodyHandler = new BodyContentHandler();
        final Parser autoParser = new AutoDetectParser(new TikaConfig(getClass().getClassLoader()));
        autoParser.parse(input, bodyHandler, tikaMetadata, new ParseContext());

        // Cut the extracted text down to the caller's limit before handing it over.
        final String extracted = bodyHandler.toString();
        final String summary = extracted.length() > maxSize ? extracted.substring(0, maxSize) : extracted;
        outputText.append(summary);

        if( LOGGER.isDebugEnabled() )
        {
            LOGGER.debug("Word Summary:" + summary); //$NON-NLS-1$
        }
    }
    catch( Exception failure )
    {
        throw new RuntimeException(failure);
    }
}
示例2: getFullText
import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Parses the file at {@code filepath} with an auto-detecting Tika parser and returns the text
 * written by the body content handler.
 *
 * <p>The parser is also registered in the {@link ParseContext} under {@code Parser.class}
 * before parsing.
 *
 * @param filepath path of the file to parse
 * @return the text collected from the document body
 * @throws IOException if the file cannot be opened or read
 * @throws SAXException if the content handler fails while receiving parse events
 * @throws TikaException if Tika cannot parse the document
 */
private static String getFullText(final String filepath) throws IOException, SAXException, TikaException {
    final StringWriter writer = new StringWriter();
    // try-with-resources instead of try/finally: if both parse() and close() fail, the parse
    // failure is the one that propagates (close failure is attached as suppressed) instead of
    // being masked by the exception thrown from close().
    try (TikaInputStream inputStream = TikaInputStream.get(new File(filepath))) {
        final Detector detector = new DefaultDetector();
        final Parser parser = new AutoDetectParser(detector);
        final Metadata metadata = new Metadata();
        final ParseContext parseContext = new ParseContext();
        parseContext.set(Parser.class, parser);
        final ContentHandler contentHandler = new BodyContentHandler(writer);
        parser.parse(inputStream, contentHandler, metadata, parseContext);
    }
    return writer.toString();
}
示例3: extractText
import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Extracts body text from {@code input} through a size-limited content handler and passes the
 * result to {@code appendText}.
 *
 * <p>If parsing fails because the write limit of the wrapped handler was reached, the text
 * collected so far is still passed to {@code appendText} and the method returns normally. Any
 * other exception is rethrown via {@code Throwables.propagate}.
 *
 * @param mimeType not consulted by this implementation
 * @param input stream the document is read from
 * @param outputText buffer handed to {@code appendText}
 * @param maxSize write limit for the handler, also handed to {@code appendText}
 */
@Override
public void extractText(String mimeType, InputStream input, StringBuilder outputText, int maxSize)
    throws IOException
{
    final WriteOutContentHandler limitedSink = new WriteOutContentHandler(maxSize);
    final ContentHandler bodyHandler = new BodyContentHandler(limitedSink);
    try
    {
        final Metadata tikaMetadata = new Metadata();
        final Parser autoParser = new AutoDetectParser(new TikaConfig(getClass().getClassLoader()));
        autoParser.parse(input, bodyHandler, tikaMetadata, new ParseContext());
        appendText(bodyHandler, outputText, maxSize);
    }
    catch( Exception failure )
    {
        // Anything other than "limit reached" is a real failure.
        if( !limitedSink.isWriteLimitReached(failure) )
        {
            throw Throwables.propagate(failure);
        }
        // keep going with whatever was collected before the limit was hit
        LOGGER.debug("PDF size limit reached. Indexing truncated text");
        appendText(bodyHandler, outputText, maxSize);
    }
}
示例4: getMetadata
import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Returns a {@code Single} that parses {@code ins} with an auto-detecting Tika parser and
 * emits the resulting metadata as a name-to-value map.
 *
 * <p>For each metadata name the value returned by {@code metadata.get(name)} is stored. Any
 * exception thrown while parsing or building the map is delivered through {@code onError}.
 *
 * @param ins stream to read the document from; it is not closed by this method
 * @return a {@code Single} emitting the metadata map, or the failure
 */
@Override
public Single<Map<String, String>> getMetadata(InputStream ins) {
    return Single.create(emitter -> {
        final Parser autoParser = new AutoDetectParser();
        final BodyContentHandler bodyHandler = new BodyContentHandler();
        final Metadata tikaMetadata = new Metadata();
        final ParseContext parseContext = new ParseContext();
        try {
            autoParser.parse(ins, bodyHandler, tikaMetadata, parseContext);

            final Map<String, String> values = new HashMap<>();
            final String[] names = tikaMetadata.names();
            for (int i = 0; i < names.length; i++) {
                values.put(names[i], tikaMetadata.get(names[i]));
            }
            emitter.onSuccess(values);
        } catch (Exception failure) {
            emitter.onError(failure);
        }
        // NOTE(review): the stream is left open here (a close() call used to be commented out);
        // confirm that the caller is responsible for closing it.
    });
}
示例5: parseEmbedded
import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Processes one embedded resource.
 *
 * <p>Resources whose {@code EMBEDDED_RESOURCE_TYPE} metadata equals {@code INLINE} are parsed
 * in place into {@code handler} (optionally surrounded by {@code writeStart}/{@code writeEnd}
 * when {@code outputHtml} is set). All other resources are wrapped in a
 * {@code TikaInputStream} and handed to {@code spawnEmbedded}.
 *
 * @param input stream of the embedded resource
 * @param handler handler of the containing document
 * @param metadata metadata of the embedded resource
 * @param outputHtml whether start/end markup is written around inline content
 * @throws SAXException declared for failures while emitting content events
 * @throws IOException declared for failures while reading the resource
 */
@Override
public void parseEmbedded(final InputStream input, final ContentHandler handler, final Metadata metadata,
        final boolean outputHtml) throws SAXException, IOException {
    final String inlineType = TikaCoreProperties.EmbeddedResourceType.INLINE.toString();
    final boolean inline = inlineType.equals(metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));

    if (!inline) {
        try (final TikaInputStream tis = TikaInputStream.get(input)) {
            spawnEmbedded(tis, metadata);
        }
        return;
    }

    // Inline embeds (like images in PDFs) are not spawned; they are concatenated to the main
    // document as usual.
    final ContentHandler embedHandler = new EmbeddedContentHandler(new BodyContentHandler(handler));
    if (outputHtml) {
        writeStart(handler, metadata);
    }
    delegateParsing(input, embedHandler, metadata);
    if (outputHtml) {
        writeEnd(handler);
    }
}
示例6: testNulls
import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Parses an empty document with the geo parser and verifies that none of the
 * {@code Geographic_*} metadata entries checked below is set.
 */
@Test
public void testNulls() throws UnsupportedEncodingException, IOException,
        SAXException, TikaException {
    final GeoParserConfig geoConfig = new GeoParserConfig();
    geoConfig.setGazetterPath(gazetteer);
    geoConfig.setNERModelPath(nerPath);

    final ParseContext parseContext = new ParseContext();
    parseContext.set(GeoParserConfig.class, geoConfig);

    final Metadata tikaMetadata = new Metadata();
    final String emptyDocument = "";
    final ByteArrayInputStream emptyInput = new ByteArrayInputStream(emptyDocument.getBytes("UTF-8"));
    geoparser.parse(emptyInput, new BodyContentHandler(), tikaMetadata, parseContext);

    assertNull(tikaMetadata.get("Geographic_NAME"));
    assertNull(tikaMetadata.get("Geographic_LONGITUDE"));
    assertNull(tikaMetadata.get("Geographic_LATITUDE"));
}
示例7: Convert
import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Parses {@code Bytes} with an auto-detecting Tika parser, stores the metadata as
 * {@code key=value} lines in {@code FileMetadata}, stores the body text in {@code FullText},
 * and returns {@code FullText}.
 *
 * <p>{@code FileMetadata} and {@code FullText} are not declared in this block; presumably they
 * are fields of the enclosing class. Any exception is reported through
 * {@code PDException.GenPDException} with the exception's localized message.
 *
 * @param Bytes stream to read the document from
 * @return the value of {@code FullText} after the conversion attempt
 * @throws PDException declared by the signature; presumably raised by
 *         {@code PDException.GenPDException} (confirm against that method)
 */
protected String Convert(InputStream Bytes) throws PDException
{
    try {
        // -1 disables the write limit of the body handler.
        ContentHandler textHandler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        Parser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        parser.parse(Bytes, textHandler, metadata, context);

        // Build the metadata dump in a StringBuilder instead of repeated String
        // concatenation, which copied the growing string on every iteration.
        StringBuilder metadataDump = new StringBuilder();
        for (String key : metadata.names()) {
            metadataDump.append(key).append("=").append(metadata.get(key)).append("\n");
        }
        FileMetadata = metadataDump.toString();
        FullText = textHandler.toString();
    } catch (Exception ex) {
        PDException.GenPDException("Error_extracting_content_from_doc", ex.getLocalizedMessage());
    }
    return (FullText);
}
示例8: readXlsx
import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Parses the workbook at {@code xlsxFilePath} with Tika's {@code OOXMLParser} and wraps the
 * extracted body text in an {@code ExcelData}.
 *
 * <p>When {@code DEBUG_PRINT_META_DATA} is set, the parsed metadata is printed as well.
 *
 * @param xlsxFilePath path of the workbook to read
 * @return an {@code ExcelData} built from the extracted text
 * @throws IOException if the file cannot be opened or read
 * @throws InvalidFormatException declared by the signature
 * @throws XmlException declared by the signature
 * @throws TikaException if Tika cannot parse the document
 * @throws SAXException if the content handler fails while receiving parse events
 */
public static ExcelData readXlsx(String xlsxFilePath)
        throws IOException, InvalidFormatException, XmlException, TikaException, SAXException {
    BodyContentHandler bcHandler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    // The stream was previously never closed; try-with-resources releases the file handle
    // on both the success and the failure path.
    try (FileInputStream inputStream = new FileInputStream(new File(xlsxFilePath))) {
        ParseContext pcontext = new ParseContext();
        OOXMLParser parser = new OOXMLParser();
        parser.parse(inputStream, bcHandler, metadata, pcontext);
    }
    if (DEBUG_PRINT_META_DATA) {
        // NOTE(review): the header goes to stderr while the entries go to stdout; kept as-is.
        System.err.println("Metadata:");
        for (String name : metadata.names()) {
            System.out.println(name + "\t:\t" + metadata.get(name));
        }
    }
    ExcelData spreadsheet = new ExcelData(bcHandler.toString());
    return spreadsheet;
}
示例9: doProcessStream
import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Runs the superclass processing, then parses {@code stream} with an auto-detecting Tika
 * parser, sets the extracted body text as the document text of {@code jCas} and passes every
 * metadata entry to {@code addMetadata}.
 *
 * <p>On a {@code SAXException} or {@code TikaException} a warning is logged and, if the
 * document text is still null or empty, it is set to {@code CORRUPT_FILE_TEXT}.
 *
 * @param stream stream to read the document from
 * @param source name of the source, used in the warning message
 * @param jCas CAS receiving the text and metadata
 * @throws IOException declared by the signature; not caught here
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
    super.doProcessStream(stream, source, jCas);

    try {
        final BodyContentHandler bodyHandler = new BodyContentHandler(Integer.MAX_VALUE);
        final Metadata tikaMetadata = new Metadata();
        final ParseContext parseContext = new ParseContext();
        final AutoDetectParser tikaParser = new AutoDetectParser();

        tikaParser.parse(stream, bodyHandler, tikaMetadata, parseContext);
        jCas.setDocumentText(bodyHandler.toString());

        final String[] names = tikaMetadata.names();
        for (int i = 0; i < names.length; i++) {
            addMetadata(jCas, names[i], tikaMetadata.get(names[i]));
        }
    } catch (SAXException | TikaException parseFailure) {
        getMonitor().warn("Couldn't parse metadata from '{}'", source, parseFailure);
        final boolean noText = Strings.isNullOrEmpty(jCas.getDocumentText());
        if (noText) {
            jCas.setDocumentText(CORRUPT_FILE_TEXT);
        }
    }
}
示例10: main
import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Fetches each of the hard-coded URLs, parses the response with an auto-detecting Tika parser
 * (body text limited to 10 * 1024 * 1024 characters) and logs the parts of the detected
 * {@code Content-Type}, split on {@code ';'}.
 *
 * <p>A failure for one URL prints its stack trace and does not stop the remaining URLs.
 *
 * @param args not used
 */
public static void main(String[] args) {
    String[] urls = {"http://t.co/hP5PM6fm", "http://t.co/xSFteG23"};
    for (String url : urls) {
        // openStream() always yields an InputStream, unlike the former cast of getContent(),
        // which may return a non-stream object; try-with-resources also closes the
        // connection stream, which was previously leaked.
        try (InputStream content = new URL(url).openStream()) {
            Parser parser = new AutoDetectParser();
            Metadata metadata = new Metadata();
            ParseContext parseContext = new ParseContext();
            ContentHandler handler = new BodyContentHandler(10 * 1024 * 1024);
            parser.parse(content, handler, metadata, parseContext);

            // The Content-Type entry may be missing; avoid the NullPointerException on split().
            String contentType = metadata.get("Content-Type");
            String[] mimeDetails = contentType == null ? new String[0] : contentType.split(";");
            logger.info("execute: url = "+url+", mimeDetails = "+Arrays.asList(mimeDetails));
        } catch (Exception ex) {
            // Best effort per URL: report and continue with the next one.
            ex.printStackTrace();
        }
    }
}
示例11: parse
import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Parses {@code stream} and returns the extracted body text together with the metadata.
 *
 * <p>The body handler is limited to {@code MAX_CHARACTERS} and wrapped in a
 * {@code BoilerpipeContentHandler} using {@code KeepEverythingExtractor.INSTANCE}. The initial
 * metadata comes from {@code createMetadata(fileName, contentType)}.
 *
 * @param stream stream to read the document from
 * @param fileName passed to {@code createMetadata}
 * @param contentType passed to {@code createMetadata}
 * @return the parsed text and metadata map, or {@code null} if an {@code IOException},
 *         {@code SAXException} or {@code TikaException} occurred (the failure is logged)
 */
public ParsedData parse(InputStream stream, String fileName, String contentType) {
    final BodyContentHandler bodyHandler = new BodyContentHandler(MAX_CHARACTERS);
    final BoilerpipeContentHandler boilerpipeHandler =
            new BoilerpipeContentHandler(bodyHandler, KeepEverythingExtractor.INSTANCE);
    final Metadata tikaMetadata = createMetadata(fileName, contentType);
    final ParseContext parseContext = new ParseContext();

    try {
        parser.parse(stream, boilerpipeHandler, tikaMetadata, parseContext);

        final Map<String, String> properties = new HashMap<>();
        final String[] names = tikaMetadata.names();
        for (int i = 0; i < names.length; i++) {
            properties.put(names[i], tikaMetadata.get(names[i]));
        }
        final String bodyText = bodyHandler.toString();
        return new ParsedData(bodyText, properties);
    } catch (IOException | SAXException | TikaException failure) {
        logger.error("Failed to extract metadata using Tika.", failure);
        return null;
    }
}
示例12: init
import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Creates an {@code Instance} whose {@code select} extracts the body text of {@code payload}
 * with Tika and returns it, converted by the supplied converter, as a single-element list.
 *
 * @param data not consulted by this implementation
 * @param payload source of the document stream
 * @param isSegment not consulted by this implementation
 * @return the text-extracting instance
 */
@Override
public Instance<String> init(BlobMetadata data, Payload payload, boolean isSegment) {
    return new Instance<String>() {
        /**
         * Parses the payload stream and converts the extracted body text.
         *
         * <p>On any failure the payload is released and the exception is rethrown via
         * {@code Throwables.propagate}.
         *
         * @param value not consulted by this implementation
         * @param converter converts the extracted text to the result type
         * @return a fixed-size list holding the single converted value
         */
        @Override
        public <T> List<T> select(String value, DataConverter<String, T> converter) {
            TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
            org.apache.tika.metadata.Metadata metadata = new org.apache.tika.metadata.Metadata();
            AutoDetectParser parser = new AutoDetectParser(tikaConfig);
            ContentHandler handler = new BodyContentHandler();
            // The TikaInputStream was previously never closed; try-with-resources closes it
            // (and the stream it wraps) on both the success and the failure path.
            try (TikaInputStream stream = TikaInputStream.get(payload.openStream())) {
                parser.parse(stream, handler, metadata, new ParseContext());
            } catch (Exception e) {
                payload.release();
                throw Throwables.propagate(e);
            }
            return Arrays.asList(converter.convert(handler.toString()));
        }
    };
}
示例13: parseTXTToString
import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Parses {@code stream} with {@code txtParser} and returns the text collected by a handler
 * limited to {@code maxStringLength} characters.
 *
 * <p>A {@code SAXException} signalling that the write limit was reached is ignored, so the
 * truncated text is returned. The stream is always closed before returning.
 *
 * @param stream stream to read the text document from; closed by this method
 * @param metadata metadata passed to the parser
 * @return the text collected by the limited handler
 * @throws IOException if reading or closing the stream fails
 * @throws TikaException if parsing fails, including a {@code SAXException} that is not the
 *         write-limit signal
 */
private String parseTXTToString(InputStream stream, Metadata metadata) throws IOException, TikaException {
    final WriteOutContentHandler limitedSink = new WriteOutContentHandler(maxStringLength);
    try {
        final ParseContext parseContext = new ParseContext();
        parseContext.set(Parser.class, txtParser);
        final BodyContentHandler bodyHandler = new BodyContentHandler(limitedSink);
        txtParser.parse(stream, bodyHandler, metadata, parseContext);
    } catch (SAXException saxFailure) {
        final boolean truncated = limitedSink.isWriteLimitReached(saxFailure);
        if (!truncated) {
            // This should never happen with BodyContentHandler...
            throw new TikaException("Unexpected SAX processing failure", saxFailure);
        }
    } finally {
        stream.close();
    }
    return limitedSink.toString();
}
示例14: testWord
import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Parses {@code testWORD.docx} and checks the detected content type, the title, creator and
 * author metadata, and that the extracted plain text contains the document title.
 *
 * @throws Exception if the document cannot be loaded or parsed
 */
@Test
public void testWord() throws Exception {
    final Metadata tikaMetadata = new Metadata();
    final ContentHandler bodyHandler = new BodyContentHandler();
    final ParseContext parseContext = new ParseContext();
    final InputStream document = getTestDocument("testWORD.docx");

    try {
        parser.parse(document, bodyHandler, tikaMetadata, parseContext);

        final String expectedType =
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
        assertEquals(expectedType, tikaMetadata.get(Metadata.CONTENT_TYPE));
        assertEquals("Sample Word Document", tikaMetadata.get(TikaCoreProperties.TITLE));
        assertEquals("Keith Bennett", tikaMetadata.get(TikaCoreProperties.CREATOR));
        assertEquals("Keith Bennett", tikaMetadata.get(Metadata.AUTHOR));

        final String extractedText = bodyHandler.toString();
        assertTrue(extractedText.contains("Sample Word Document"));
    } finally {
        document.close();
    }
}
示例15: testEmbeddedWord
import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Parses {@code "Doc w Structure that wont extract.docx"}, prints the extracted plain text
 * and checks that it contains the letter {@code N}.
 *
 * <p>NOTE(review): earlier assertions on content type, title, creator and author were
 * commented out in this test; only the text check is active.
 *
 * @throws Exception if the document cannot be loaded or parsed
 */
@Test
public void testEmbeddedWord() throws Exception {
    final Metadata tikaMetadata = new Metadata();
    final ContentHandler bodyHandler = new BodyContentHandler();
    final ParseContext parseContext = new ParseContext();
    final InputStream document = getTestDocument("Doc w Structure that wont extract.docx");

    try {
        parser.parse(document, bodyHandler, tikaMetadata, parseContext);

        final String extractedText = bodyHandler.toString();
        System.out.println(extractedText);
        assertTrue(extractedText.contains("N"));
    } finally {
        document.close();
    }
}