当前位置: 首页>>代码示例>>Java>>正文


Java BodyContentHandler类代码示例

本文整理汇总了Java中org.apache.tika.sax.BodyContentHandler的典型用法代码示例。如果您正苦于以下问题:Java BodyContentHandler类的具体用法?Java BodyContentHandler怎么用?Java BodyContentHandler使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。


BodyContentHandler类属于org.apache.tika.sax包,在下文中一共展示了BodyContentHandler类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: extractText

import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Extracts the body text of a document with Tika and appends at most
 * {@code maxSize} characters of it to {@code outputText}.
 *
 * @param mimeType   MIME type of the input; not read by this implementation
 *                   (an auto-detecting parser is used)
 * @param input      stream containing the document to parse
 * @param outputText buffer that receives the extracted, possibly truncated, text
 * @param maxSize    maximum number of characters appended to {@code outputText}
 * @throws IOException declared by the signature; every failure raised inside
 *                     this implementation is rethrown wrapped in a {@link RuntimeException}
 */
@Override
public void extractText(String mimeType, InputStream input, StringBuilder outputText, int maxSize)
	throws IOException
{
	try
	{
		Metadata meta = new Metadata();
		// A write limit of -1 disables BodyContentHandler's default cap of
		// 100,000 characters. With the default, any larger document aborts the
		// parse with a SAXException instead of being truncated to maxSize below.
		ContentHandler handler = new BodyContentHandler(-1);
		Parser parser = new AutoDetectParser(new TikaConfig(getClass().getClassLoader()));
		parser.parse(input, handler, meta, new ParseContext());

		String content = handler.toString();

		// Truncate after extraction so the caller never receives more than maxSize characters.
		if( content.length() > maxSize )
		{
			content = content.substring(0, maxSize);
		}
		outputText.append(content);
		if( LOGGER.isDebugEnabled() )
		{
			LOGGER.debug("Word Summary:" + content); //$NON-NLS-1$
		}
	}
	catch( Exception e )
	{
		throw new RuntimeException(e);
	}
}
 
开发者ID:equella,项目名称:Equella,代码行数:29,代码来源:MsWordExtracter.java

示例2: getFullText

import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Parses the file at {@code filepath} with an auto-detecting Tika parser and
 * returns everything the body content handler wrote out.
 *
 * @param filepath path of the file to parse
 * @return the text collected from the document body
 * @throws IOException   if the file cannot be read or the stream cannot be closed
 * @throws SAXException  if the content handler fails while processing parser events
 * @throws TikaException if Tika cannot parse the document
 */
private static String getFullText(final String filepath) throws IOException, SAXException, TikaException {
    final StringWriter textSink = new StringWriter();
    final TikaInputStream source = TikaInputStream.get(new File(filepath));
    try {
        final Parser autoParser = new AutoDetectParser(new DefaultDetector());

        // Make the same parser available through the parse context.
        final ParseContext ctx = new ParseContext();
        ctx.set(Parser.class, autoParser);

        autoParser.parse(source, new BodyContentHandler(textSink), new Metadata(), ctx);
    }
    finally {
        source.close();
    }
    return textSink.toString();
}
 
开发者ID:CoEIA,项目名称:DEM,代码行数:23,代码来源:ItemFactory.java

示例3: extractText

import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Extracts body text with Tika, capped at {@code maxSize} characters by a
 * {@code WriteOutContentHandler}, and hands the result to {@code appendText}.
 * Hitting the write limit is not an error: the text collected so far is still appended.
 *
 * @param mimeType   MIME type of the input; not read by this implementation
 * @param input      stream containing the document to parse
 * @param outputText buffer passed on to {@code appendText}
 * @param maxSize    write limit for the handler, also passed on to {@code appendText}
 * @throws IOException declared by the signature; failures other than the write
 *                     limit are rethrown through {@code Throwables.propagate}
 */
@Override
public void extractText(String mimeType, InputStream input, StringBuilder outputText, int maxSize)
	throws IOException
{
	WriteOutContentHandler limited = new WriteOutContentHandler(maxSize);
	ContentHandler bodyHandler = new BodyContentHandler(limited);
	try
	{
		Metadata tikaMetadata = new Metadata();
		Parser autoParser = new AutoDetectParser(new TikaConfig(getClass().getClassLoader()));
		autoParser.parse(input, bodyHandler, tikaMetadata, new ParseContext());

		appendText(bodyHandler, outputText, maxSize);
	}
	catch( Exception failure )
	{
		// Anything other than "write limit reached" is a genuine failure.
		if( !limited.isWriteLimitReached(failure) )
		{
			throw Throwables.propagate(failure);
		}
		// Limit reached: keep going with the truncated text.
		LOGGER.debug("PDF size limit reached.  Indexing truncated text");
		appendText(bodyHandler, outputText, maxSize);
	}
}
 
开发者ID:equella,项目名称:Equella,代码行数:27,代码来源:PdfExtracter.java

示例4: getMetadata

import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Returns a {@code Single} that parses {@code ins} with an auto-detecting Tika
 * parser and emits every metadata name with the value {@code Metadata.get}
 * returns for it. Any exception raised while parsing is delivered through
 * {@code onError}.
 *
 * @param ins stream to parse; this method does not close it
 * @return a single emitting the metadata as a name-to-value map
 */
@Override
public Single<Map<String, String>> getMetadata(InputStream ins) {
	return Single.create(emitter -> {
		Parser autoParser = new AutoDetectParser();
		BodyContentHandler bodyHandler = new BodyContentHandler();
		Metadata tikaMetadata = new Metadata();
		ParseContext parseContext = new ParseContext();
		try {
			autoParser.parse(ins, bodyHandler, tikaMetadata, parseContext);

			// Flatten the Tika metadata into a plain map.
			Map<String, String> result = new HashMap<>();
			for (String key : tikaMetadata.names()) {
				result.put(key, tikaMetadata.get(key));
			}
			emitter.onSuccess(result);
		} catch (Exception e) {
			emitter.onError(e);
		}
		// The stream is intentionally not closed here (ins.close() was left disabled).
	});
}
 
开发者ID:gentics,项目名称:mesh,代码行数:25,代码来源:ImgscalrImageManipulator.java

示例5: parseEmbedded

import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Handles an embedded resource. Inline embeds are parsed into the parent
 * document's handler; every other embed is wrapped in a
 * {@code TikaInputStream} and passed to {@code spawnEmbedded}.
 *
 * @param input      stream of the embedded resource
 * @param handler    content handler of the parent document
 * @param metadata   metadata of the embedded resource
 * @param outputHtml whether start/end markup is written around inline embeds
 * @throws SAXException if writing to the handler fails
 * @throws IOException  if reading the embedded stream fails
 */
@Override
public void parseEmbedded(final InputStream input, final ContentHandler handler, final Metadata metadata,
                          final boolean outputHtml) throws SAXException, IOException {
	final String inlineType = TikaCoreProperties.EmbeddedResourceType.INLINE.toString();
	final boolean inline = inlineType.equals(metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));

	if (!inline) {
		try (final TikaInputStream tis = TikaInputStream.get(input)) {
			spawnEmbedded(tis, metadata);
		}
		return;
	}

	// There's no need to spawn inline embeds, like images in PDFs. These should be concatenated to the main
	// document as usual.
	final ContentHandler embedHandler = new EmbeddedContentHandler(new BodyContentHandler(handler));

	if (outputHtml) {
		writeStart(handler, metadata);
	}

	delegateParsing(input, embedHandler, metadata);

	if (outputHtml) {
		writeEnd(handler);
	}
}
 
开发者ID:ICIJ,项目名称:extract,代码行数:26,代码来源:EmbedSpawner.java

示例6: testNulls

import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Parsing an empty document must leave the geographic name, longitude and
 * latitude metadata entries unset.
 */
@Test
public void testNulls() throws UnsupportedEncodingException, IOException,
		SAXException, TikaException {
	GeoParserConfig geoConfig = new GeoParserConfig();
	geoConfig.setGazetterPath(gazetteer);
	geoConfig.setNERModelPath(nerPath);

	ParseContext parseContext = new ParseContext();
	parseContext.set(GeoParserConfig.class, geoConfig);

	Metadata parsed = new Metadata();
	ByteArrayInputStream emptyDocument = new ByteArrayInputStream("".getBytes("UTF-8"));
	geoparser.parse(emptyDocument, new BodyContentHandler(), parsed, parseContext);

	String[] geoKeys = {"Geographic_NAME", "Geographic_LONGITUDE", "Geographic_LATITUDE"};
	for (String key : geoKeys) {
		assertNull(parsed.get(key));
	}
}
 
开发者ID:anyayunli,项目名称:GeoParsingNSF,代码行数:19,代码来源:GeoParserTest.java

示例7: Convert

import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Extracts the plain text and the metadata of a document using Tika.
 * <p>
 * On success the extracted text is stored in {@code FullText} and the metadata
 * is stored in {@code FileMetadata} as one {@code name=value} line per entry.
 *
 * @param Bytes stream with the document content
 * @return the value of {@code FullText} after the extraction attempt
 * @throws PDException declared by the signature; presumably raised by
 *                     {@code PDException.GenPDException} when extraction fails
 *                     (TODO confirm against that helper)
 */
protected String Convert(InputStream Bytes) throws PDException
{
try {
    // -1 disables the handler's write limit so the whole text is kept.
    ContentHandler textHandler = new BodyContentHandler(-1);
    Metadata metadata = new Metadata();
    Parser parser = new AutoDetectParser();
    ParseContext context = new ParseContext();
    parser.parse(Bytes, textHandler, metadata, context);

    // Build the metadata text in a StringBuilder rather than with repeated
    // String concatenation, which copies the whole text on every iteration.
    StringBuilder metadataText = new StringBuilder();
    for (String key : metadata.names()) {
        metadataText.append(key).append('=').append(metadata.get(key)).append('\n');
    }
    FileMetadata = metadataText.toString();
    FullText = textHandler.toString();
} catch (Exception ex) {
    PDException.GenPDException("Error_extracting_content_from_doc", ex.getLocalizedMessage());
}

return FullText;
}
 
开发者ID:JHierrot,项目名称:openprodoc,代码行数:20,代码来源:FTConnector.java

示例8: readXlsx

import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Parses an .xlsx file with Tika's {@code OOXMLParser} and wraps the extracted
 * body text in an {@code ExcelData}.
 *
 * @param xlsxFilePath path of the spreadsheet file to read
 * @return an {@code ExcelData} built from the extracted text
 * @throws IOException   if the file cannot be opened, read or closed
 * @throws TikaException if the document cannot be parsed
 * @throws SAXException  if the content handler fails while processing parser events
 */
public static ExcelData readXlsx(String xlsxFilePath)
    throws IOException, InvalidFormatException, XmlException, TikaException, SAXException {
  BodyContentHandler bcHandler = new BodyContentHandler();
  Metadata metadata = new Metadata();
  ParseContext pcontext = new ParseContext();
  OOXMLParser parser = new OOXMLParser();
  // try-with-resources guarantees the file handle is released even when
  // parsing fails; previously the stream was never closed.
  try (FileInputStream inputStream = new FileInputStream(new File(xlsxFilePath))) {
    parser.parse(inputStream, bcHandler, metadata, pcontext);
  }
  if (DEBUG_PRINT_META_DATA) {
    System.err.println("Metadata:");
    for (String name : metadata.names())
      System.out.println(name + "\t:\t" + metadata.get(name));
  }
  return new ExcelData(bcHandler.toString());
}
 
开发者ID:mark-watson,项目名称:power-java,代码行数:17,代码来源:PoiMicrosoftFileReader.java

示例9: doProcessStream

import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Runs the superclass processing, then extracts the document text and metadata
 * with Tika and stores them on the {@code JCas}. If Tika fails with a
 * {@code SAXException} or {@code TikaException}, a warning is logged and, when
 * no document text has been set, the corrupt-file placeholder text is used.
 *
 * @param stream content to parse
 * @param source name of the source, used in the warning message
 * @param jCas   CAS receiving the document text and metadata
 * @throws IOException if reading the stream fails
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
	super.doProcessStream(stream, source, jCas);

	try {
		BodyContentHandler bodyText = new BodyContentHandler(Integer.MAX_VALUE);
		Metadata tikaMetadata = new Metadata();

		new AutoDetectParser().parse(stream, bodyText, tikaMetadata, new ParseContext());

		jCas.setDocumentText(bodyText.toString());

		String[] metadataNames = tikaMetadata.names();
		for (int i = 0; i < metadataNames.length; i++) {
			String key = metadataNames[i];
			addMetadata(jCas, key, tikaMetadata.get(key));
		}
	} catch (SAXException | TikaException e) {
		getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
		String existingText = jCas.getDocumentText();
		if (Strings.isNullOrEmpty(existingText)) {
			jCas.setDocumentText(CORRUPT_FILE_TEXT);
		}
	}
}
 
开发者ID:dstl,项目名称:baleen,代码行数:25,代码来源:TikaContentExtractor.java

示例10: main

import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Downloads each sample URL, parses it with an auto-detecting Tika parser and
 * logs the parts of the detected Content-Type. A failure on one URL is printed
 * and does not stop the remaining URLs from being processed.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {
	String[] urls = {"http://t.co/hP5PM6fm", "http://t.co/xSFteG23"};
	for (String url : urls)
	{
		// openStream() always yields an InputStream, whereas casting getContent()
		// can fail for content types that have a dedicated content handler.
		// try-with-resources closes the connection stream after every URL.
		try (InputStream content = new URL(url).openStream()) {
			Parser parser = new AutoDetectParser();
			Metadata metadata = new Metadata();
			ParseContext parseContext = new ParseContext();
			// Cap the extracted text at 10 * 1024 * 1024 characters.
			ContentHandler handler = new BodyContentHandler(10 *
			                         1024 * 1024);
			parser.parse(content, handler, metadata, parseContext);

			// The Content-Type entry may be absent; avoid a NullPointerException.
			String contentType = metadata.get("Content-Type");
			String[] mimeDetails = contentType == null
					? new String[0]
					: contentType.split(";");
			logger.info("execute: url = "+url+", mimeDetails = "+Arrays.asList(mimeDetails));
		} catch (Exception ex) {
			ex.printStackTrace();
		}
	}
}
 
开发者ID:BinitaBharati,项目名称:storm-trident-example,代码行数:25,代码来源:TikaParserTest.java

示例11: parse

import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Parses the stream with Tika through a Boilerpipe handler that keeps all
 * content, and returns the extracted text together with the metadata.
 *
 * @param stream      content to parse
 * @param fileName    file name passed to {@code createMetadata}
 * @param contentType content type passed to {@code createMetadata}
 * @return the parsed text and metadata, or {@code null} when parsing fails
 */
public ParsedData parse(InputStream stream, String fileName, String contentType) {
    BodyContentHandler bodyHandler = new BodyContentHandler(MAX_CHARACTERS);
    BoilerpipeContentHandler boilerpipeHandler =
            new BoilerpipeContentHandler(bodyHandler, KeepEverythingExtractor.INSTANCE);
    Metadata tikaMetadata = createMetadata(fileName, contentType);
    ParseContext parseContext = new ParseContext();
    try {
        parser.parse(stream, boilerpipeHandler, tikaMetadata, parseContext);

        // Copy the Tika metadata into a plain map.
        Map<String, String> properties = new HashMap<>();
        String[] propertyNames = tikaMetadata.names();
        for (int i = 0; i < propertyNames.length; i++) {
            properties.put(propertyNames[i], tikaMetadata.get(propertyNames[i]));
        }

        String extractedText = bodyHandler.toString();
        return new ParsedData(extractedText, properties);
    } catch (IOException | SAXException | TikaException e) {
        logger.error("Failed to extract metadata using Tika.", e);
        return null;
    }
}
 
开发者ID:ViDA-NYU,项目名称:ache,代码行数:21,代码来源:TikaExtractor.java

示例12: init

import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Creates a selector instance that extracts the payload's body text with Tika
 * and converts it with the supplied converter.
 *
 * @param data      blob metadata; not read by this implementation
 * @param payload   source of the content stream
 * @param isSegment not read by this implementation
 * @return an instance whose {@code select} returns a single-element list
 *         holding the converted body text
 */
@Override
public Instance<String> init(BlobMetadata data, Payload payload, boolean isSegment) {
	return new Instance<String>() {
		@Override
		public <T> List<T> select(String value, DataConverter<String, T> converter) {
			TikaConfig tikaConfig = TikaConfig.getDefaultConfig();

			org.apache.tika.metadata.Metadata metadata = new org.apache.tika.metadata.Metadata();
			AutoDetectParser parser = new AutoDetectParser(tikaConfig);
			ContentHandler handler = new BodyContentHandler();
			// try-with-resources closes the TikaInputStream (and the stream it
			// wraps) on both success and failure; previously it was never closed.
			try (TikaInputStream stream = TikaInputStream.get(payload.openStream())) {
				parser.parse(stream, handler, metadata, new ParseContext());
			} catch (Exception e) {
				payload.release();
				throw Throwables.propagate(e);
			}
			return Arrays.asList(converter.convert(handler.toString()));
		}
	};
}
 
开发者ID:Treydone,项目名称:mandrel,代码行数:22,代码来源:TikaSelector.java

示例13: parseTXTToString

import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Parses the stream with {@code txtParser} and returns the text collected by a
 * handler limited to {@code maxStringLength} characters. Reaching that limit is
 * tolerated and yields the truncated text. The stream is always closed.
 *
 * @param stream   content to parse; closed before this method returns
 * @param metadata metadata passed to the parser
 * @return the extracted, possibly truncated, text
 * @throws IOException   if reading or closing the stream fails
 * @throws TikaException if parsing fails, or on a SAX failure other than the write limit
 */
private String parseTXTToString(InputStream stream, Metadata metadata) throws IOException, TikaException {
	WriteOutContentHandler limitedSink = new WriteOutContentHandler(maxStringLength);
	try {
		ParseContext ctx = new ParseContext();
		ctx.set(Parser.class, txtParser);
		ContentHandler bodyHandler = new BodyContentHandler(limitedSink);
		txtParser.parse(stream, bodyHandler, metadata, ctx);
	} catch (SAXException e) {
		boolean truncated = limitedSink.isWriteLimitReached(e);
		if (!truncated) {
			// This should never happen with BodyContentHandler...
			throw new TikaException("Unexpected SAX processing failure", e);
		}
	} finally {
		stream.close();
	}
	return limitedSink.toString();
}
 
开发者ID:ziqizhang,项目名称:jate,代码行数:17,代码来源:ContentExtractor.java

示例14: testWord

import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
 * Parses {@code testWORD.docx} and checks the detected content type, the
 * title/creator/author metadata and the extracted plain text.
 * @throws Exception if the document cannot be loaded or parsed
 */
@Test
public void testWord() throws Exception {
    ContentHandler bodyHandler = new BodyContentHandler();
    Metadata docMetadata = new Metadata();

    InputStream docStream = getTestDocument("testWORD.docx");
    try {
        parser.parse(docStream, bodyHandler, docMetadata, new ParseContext());

        assertEquals(
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                docMetadata.get(Metadata.CONTENT_TYPE));
        assertEquals("Sample Word Document", docMetadata.get(TikaCoreProperties.TITLE));
        assertEquals("Keith Bennett", docMetadata.get(TikaCoreProperties.CREATOR));
        assertEquals("Keith Bennett", docMetadata.get(Metadata.AUTHOR));

        String extracted = bodyHandler.toString();
        assertTrue(extracted.contains("Sample Word Document"));
    } finally {
        docStream.close();
    }
}
 
开发者ID:kanrourou,项目名称:software-testing,代码行数:27,代码来源:OOXMLParserTest.java

示例15: testEmbeddedWord

import org.apache.tika.sax.BodyContentHandler; //导入依赖的package包/类
/**
     * Parses a Word document with embedded structure, prints the extracted
     * text and checks that it contains the letter "N".
     * @throws Exception if the document cannot be loaded or parsed
     */
    @Test
    public void testEmbeddedWord() throws Exception {
        ContentHandler bodyHandler = new BodyContentHandler();
        Metadata docMetadata = new Metadata();

        InputStream docStream = getTestDocument("Doc w Structure that wont extract.docx");
        try {
            parser.parse(docStream, bodyHandler, docMetadata, new ParseContext());

            // Dump the extracted text for manual inspection.
            String extracted = bodyHandler.toString();
            System.out.println(extracted);

            // The metadata assertions (content type, title, creator, author)
            // are disabled for this document; only the body text is checked.
            assertTrue(extracted.contains("N"));
        } finally {
            docStream.close();
        }
    }
 
开发者ID:kanrourou,项目名称:software-testing,代码行数:28,代码来源:OOXMLParserTest.java


注:本文中的org.apache.tika.sax.BodyContentHandler类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。