当前位置: 首页>>代码示例>>Java>>正文


Java AutoDetectParser类代码示例

本文整理汇总了Java中org.apache.tika.parser.AutoDetectParser的典型用法代码示例。如果您正苦于以下问题：Java AutoDetectParser类的具体用法？Java AutoDetectParser怎么用？Java AutoDetectParser使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。


AutoDetectParser类属于org.apache.tika.parser包，在下文中一共展示了AutoDetectParser类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: findMediaType

import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Detects the media type of a stream through the Apache Tika library, based on
 * the file name and magic numbers.
 * <p>
 * The supplied stream is wrapped in a {@link BufferedInputStream}, and that wrapper
 * is always closed before this method returns; a failure while closing is ignored.
 *
 * @param is       content to inspect
 * @param fileName resource name handed to the detector as metadata
 * @return the media type reported by the detector
 * @throws IOException if detection fails while reading the stream
 */
public static MediaType findMediaType(InputStream is, String fileName) throws IOException {
    BufferedInputStream buffered = new BufferedInputStream(is);
    try {
        Detector tikaDetector = new AutoDetectParser().getDetector();
        Metadata hints = new Metadata();
        hints.add(Metadata.RESOURCE_NAME_KEY, fileName);
        return tikaDetector.detect(buffered, hints);
    } finally {
        try {
            buffered.close();
        } catch (IOException ignored) {
            // closing is best-effort; the detection outcome takes precedence
        }
    }
}
 
开发者ID:ilscipio,项目名称:scipio-erp,代码行数:22,代码来源:TikaUtil.java

示例2: extractText

import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Parses {@code input} with an auto-detecting Tika parser and appends the extracted
 * body text, cut to at most {@code maxSize} characters, to {@code outputText}.
 * <p>
 * Every failure raised while parsing is rethrown wrapped in a {@link RuntimeException}.
 *
 * @param mimeType   not read by this implementation
 * @param input      content to parse
 * @param outputText buffer that receives the (possibly truncated) text
 * @param maxSize    maximum number of characters appended
 */
@Override
public void extractText(String mimeType, InputStream input, StringBuilder outputText, int maxSize)
	throws IOException
{
	try
	{
		Metadata parsedMeta = new Metadata();
		ContentHandler bodyHandler = new BodyContentHandler();
		TikaConfig tikaConfig = new TikaConfig(getClass().getClassLoader());
		Parser autoParser = new AutoDetectParser(tikaConfig);
		autoParser.parse(input, bodyHandler, parsedMeta, new ParseContext());

		String fullText = bodyHandler.toString();
		String summary = fullText.length() > maxSize ? fullText.substring(0, maxSize) : fullText;
		outputText.append(summary);

		if( LOGGER.isDebugEnabled() )
		{
			LOGGER.debug("Word Summary:" + summary); //$NON-NLS-1$
		}
	}
	catch( Exception e )
	{
		throw new RuntimeException(e);
	}
}
 
开发者ID:equella,项目名称:Equella,代码行数:29,代码来源:MsWordExtracter.java

示例3: getFullText

import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Extracts the body text of the file at {@code filepath} with an auto-detecting
 * Tika parser.
 * <p>
 * The parser is also registered in the parse context under {@code Parser.class}.
 * The input stream is closed whether or not parsing succeeds.
 *
 * @param filepath path of the file to read
 * @return the text written by the body content handler
 */
private static String getFullText(final String filepath) throws IOException, SAXException, TikaException {
    final StringWriter textSink = new StringWriter();
    final TikaInputStream source = TikaInputStream.get(new File(filepath));

    try {
        final Parser autoParser = new AutoDetectParser(new DefaultDetector());

        final Metadata parsedMeta = new Metadata();
        final ParseContext context = new ParseContext();
        context.set(Parser.class, autoParser);

        autoParser.parse(source, new BodyContentHandler(textSink), parsedMeta, context);
    }
    finally {
        source.close();
    }

    return textSink.toString();
}
 
开发者ID:CoEIA,项目名称:DEM,代码行数:23,代码来源:ItemFactory.java

示例4: TikaProperties

import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Parses {@code file} with an auto-detecting Tika parser and stores the resulting
 * metadata in the {@code metadata} field.
 * <p>
 * The stream opened on the file is closed after parsing; a failure while closing
 * is ignored.
 *
 * @param file the file to inspect
 */
public TikaProperties(File file) throws IOException, SAXException,
		TikaException {
	TikaInputStream source = TikaInputStream.get(file);
	metadata = new Metadata();
	ContentHandler saxHandler = new DefaultHandler();
	Parser autoParser = new AutoDetectParser();
	ParseContext parseContext = new ParseContext();
	try {
		autoParser.parse(source, saxHandler, metadata, parseContext);
	} finally {
		try {
			source.close();
		} catch (Exception ignored) {
			// a failed close must not replace the parse outcome
		}
	}
}
 
开发者ID:fmui,项目名称:ApacheChemistryInAction,代码行数:18,代码来源:TikaProperties.java

示例5: render

import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Checks that Tika has a parser registered for the source mime type and, if so,
 * hands the parser and the rendering context to {@code generateHTML}.
 *
 * @param context the rendering context supplying the content reader
 * @throws RenditionServiceException if no parser is registered for the source mime type
 */
@Override
protected void render(RenderingContext context)
{
    String sourceMimeType = context.makeContentReader().getMimetype();

    // Reject the request early when Tika has no parser for this media type
    AutoDetectParser tikaParser = new AutoDetectParser(tikaConfig);
    MediaType requestedType = MediaType.parse(sourceMimeType);
    boolean supported = tikaParser.getParsers().containsKey(requestedType);
    if(! supported)
    {
       throw new RenditionServiceException(
             "Source mime type of " + sourceMimeType +
             " is not supported by Tika for HTML conversions"
       );
    }

    // Produce the HTML version with Tika
    // (per the original author, this also extracts any images found)
    generateHTML(tikaParser, context);
}
 
开发者ID:Alfresco,项目名称:alfresco-repository,代码行数:22,代码来源:HTMLRenderingEngine.java

示例6: setTikaConfig

import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Stores the supplied TikaConfig and rebuilds the detector and parser from it.
 *
 * @param tikaConfig the Tika configuration to use
 */
public void setTikaConfig(TikaConfig tikaConfig)
{
    this.config = tikaConfig;

    // The detector is built on the config's mime repository,
    // and the parser is built on that detector
    this.detector = new DefaultDetector(this.config.getMimeRepository());
    this.parser = new AutoDetectParser(this.detector);
}
 
开发者ID:Alfresco,项目名称:alfresco-repository,代码行数:14,代码来源:TikaPoweredContainerExtractor.java

示例7: buildParseContext

import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Builds the parse context from the superclass and, when embedded contents are
 * to be included, registers an {@link AutoDetectParser} under {@code Parser.class}.
 * <p>
 * The {@code includeContents} field is the default; a non-null
 * {@code options.getIncludeEmbedded()} overrides it. {@code tikaConfig} is
 * initialised to the default configuration on first use when it is still null.
 */
@Override
protected ParseContext buildParseContext(Metadata metadata,
     String targetMimeType, TransformationOptions options) {
  ParseContext context = super.buildParseContext(metadata, targetMimeType, options);

  boolean recurse;
  if(options.getIncludeEmbedded() == null)
  {
     recurse = includeContents;
  }
  else
  {
     recurse = options.getIncludeEmbedded();
  }

  if(! recurse)
  {
     return context;
  }

  // Embedded contents are handled by an auto detect parser
  if(tikaConfig == null)
  {
      tikaConfig = TikaConfig.getDefaultConfig();
  }
  context.set(Parser.class, new AutoDetectParser(tikaConfig));
  return context;
}
 
开发者ID:Alfresco,项目名称:alfresco-repository,代码行数:24,代码来源:ArchiveContentTransformer.java

示例8: buildMimeTypes

import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Re-initialises the static {@code config}, {@code parser} and
 * {@code SUPPORTED_MIMETYPES} from the supplied configuration.
 * <p>
 * The list receives, for every media type the parser has a parser registered for,
 * the type's string form followed by the string form of each of its aliases from
 * the config's media type registry.
 *
 * @param tikaConfig the Tika configuration to read parsers and aliases from
 * @return the list now stored in {@code SUPPORTED_MIMETYPES}
 */
private static ArrayList<String> buildMimeTypes(TikaConfig tikaConfig)
{
   config = tikaConfig;
   parser = new AutoDetectParser(config);

   // The field is published first and filled afterwards
   ArrayList<String> mimeTypes = new ArrayList<String>();
   SUPPORTED_MIMETYPES = mimeTypes;

   for(MediaType canonical : parser.getParsers().keySet())
   {
      // The canonical mime type itself
      mimeTypes.add( canonical.toString() );

      // Plus every alias - Alfresco uses some non canonical forms
      //  of various mimetypes, so all of them are needed
      for(MediaType aliasType : config.getMediaTypeRegistry().getAliases(canonical))
      {
          mimeTypes.add( aliasType.toString() );
      }
   }
   return mimeTypes;
}
 
开发者ID:Alfresco,项目名称:alfresco-repository,代码行数:21,代码来源:TikaAutoMetadataExtracter.java

示例9: process

import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Parses every input stream of {@code source} with one shared auto-detecting Tika
 * parser, creating a document in {@code corpus} and its tag annotations for each.
 * <p>
 * Parsing failures ({@code IOException}, {@code SAXException}, {@code TikaException})
 * are passed to {@code rethrow}.
 */
@Override
public void process(ProcessingContext<Corpus> ctx, Corpus corpus) throws ModuleException {
	// Turn off the "org.apache.pdfbox" logger before any parsing starts
	Logger.getLogger("org.apache.pdfbox").setLevel(Level.OFF);
	AutoDetectParser tikaParser = new AutoDetectParser();
	ParseContext tikaContext = new ParseContext();
	try {
		for (InputStream stream : Iterators.loop(source.getInputStreams())) {
			TikaReaderHandler parsed = parse(tikaParser, tikaContext, stream);
			createTagAnnotations(createDocument(corpus, parsed), parsed);
		}
	}
	catch (IOException|SAXException|TikaException failure) {
		rethrow(failure);
	}
}
 
开发者ID:Bibliome,项目名称:alvisnlp,代码行数:17,代码来源:TikaReader.java

示例10: extractText

import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Parses {@code input} with an auto-detecting Tika parser through a handler limited
 * to {@code maxSize}, then passes the handler to {@code appendText}.
 * <p>
 * When parsing stops because the write limit was reached, the text collected so far
 * is still appended; any other failure is propagated.
 *
 * @param mimeType   not read by this implementation
 * @param input      content to parse
 * @param outputText buffer handed to {@code appendText}
 * @param maxSize    write limit of the handler, also handed to {@code appendText}
 */
@Override
public void extractText(String mimeType, InputStream input, StringBuilder outputText, int maxSize)
	throws IOException
{
	WriteOutContentHandler limited = new WriteOutContentHandler(maxSize);
	ContentHandler bodyHandler = new BodyContentHandler(limited);
	try
	{
		Metadata parsedMeta = new Metadata();
		TikaConfig tikaConfig = new TikaConfig(getClass().getClassLoader());
		Parser autoParser = new AutoDetectParser(tikaConfig);
		autoParser.parse(input, bodyHandler, parsedMeta, new ParseContext());

		appendText(bodyHandler, outputText, maxSize);
	}
	catch( Exception t )
	{
		if( !limited.isWriteLimitReached(t) )
		{
			throw Throwables.propagate(t);
		}

		// keep going with whatever was collected before the limit
		LOGGER.debug("PDF size limit reached.  Indexing truncated text");
		appendText(bodyHandler, outputText, maxSize);
	}
}
 
开发者ID:equella,项目名称:Equella,代码行数:27,代码来源:PdfExtracter.java

示例11: resolveContentType

import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Detects the content type of {@code data} with the detector of an
 * {@link AutoDetectParser} built around an {@code ImageParser}.
 *
 * @param data raw bytes to inspect
 * @return the detected media type as a string, or the string form of
 *         {@code MediaType.OCTET_STREAM} when an {@link IOException} occurs
 */
private String resolveContentType(byte[] data) {
    AutoDetectParser parser = new AutoDetectParser(new ImageParser());
    // The stream was previously never closed; try-with-resources releases it
    // once detection is done, on success and on failure alike.
    try (TikaInputStream stream = TikaInputStream.get(data)) {
        return parser.getDetector().detect(stream, new Metadata()).toString();
    } catch (IOException e) {
        return MediaType.OCTET_STREAM.toString();
    }
}
 
开发者ID:reportportal,项目名称:service-authorization,代码行数:9,代码来源:AbstractUserReplicator.java

示例12: PDFExtract

import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Creates the auto-detecting parser and a parse context carrying a default
 * {@code TesseractOCRConfig}, a {@code PDFParserConfig} with inline image
 * extraction switched on, and the parser itself.
 */
public PDFExtract(){
    parser = new AutoDetectParser();

    TesseractOCRConfig ocrSettings = new TesseractOCRConfig();
    PDFParserConfig pdfSettings = new PDFParserConfig();
    pdfSettings.setExtractInlineImages(true);

    parseContext = new ParseContext();
    parseContext.set(TesseractOCRConfig.class, ocrSettings);
    parseContext.set(PDFParserConfig.class, pdfSettings);
    // the parser must be registered in the context for recursive parsing to happen
    parseContext.set(Parser.class, parser);
}
 
开发者ID:thammegowda,项目名称:pdf-extractor,代码行数:13,代码来源:PDFExtract.java

示例13: getMetadata

import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Returns a {@code Single} that parses {@code ins} with an auto-detecting Tika
 * parser and emits the metadata as a name-to-value map.
 * <p>
 * Any exception raised while parsing or copying the metadata is delivered through
 * {@code onError}. The stream is not closed by this method.
 *
 * @param ins content to parse
 * @return a single emitting one value per metadata name
 */
@Override
public Single<Map<String, String>> getMetadata(InputStream ins) {
	return Single.create(sub -> {
		Parser autoParser = new AutoDetectParser();
		BodyContentHandler bodyHandler = new BodyContentHandler();
		Metadata parsedMeta = new Metadata();
		ParseContext parseContext = new ParseContext();
		try {
			autoParser.parse(ins, bodyHandler, parsedMeta, parseContext);

			Map<String, String> values = new HashMap<>();
			for (String key : parsedMeta.names()) {
				values.put(key, parsedMeta.get(key));
			}
			sub.onSuccess(values);
		} catch (Exception e) {
			sub.onError(e);
		}
		// NOTE(review): ins is left open here - confirm the caller closes it
	});
}
 
开发者ID:gentics,项目名称:mesh,代码行数:25,代码来源:ImgscalrImageManipulator.java

示例14: extractMetaData

import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Parses {@code input} with an auto-detecting Tika parser and returns its metadata
 * as a sorted map of name to value, each value passed through {@code stripWhiteSpace}.
 * <p>
 * Extraction is best-effort: a failure while parsing prints its stack trace and the
 * entries collected so far (possibly none) are returned. A non-null {@code input} is
 * always closed.
 *
 * @param input content to parse; closed by this method when non-null
 * @return the collected metadata, never {@code null}
 * @throws IOException if closing {@code input} fails
 */
public static TreeMap<String, String> extractMetaData(InputStream input) throws IOException {
	TreeMap<String, String> treeMap = new TreeMap<String, String>();
	try {
		ContentHandler handler = new DefaultHandler();
		Metadata metadata = new Metadata();
		Parser parser = new AutoDetectParser();
		parser.parse(input, handler, metadata, new ParseContext());
		// names() is called once here, not in the loop condition and body of every iteration
		for (String name : metadata.names()) {
			treeMap.put(name, stripWhiteSpace(metadata.get(name)));
		}
	} catch (Exception e) {
		// best-effort: report the failure and return what was collected
		e.printStackTrace();
	} finally {
		if (input != null) {
			input.close();
		}
	}
	return treeMap;
}
 
开发者ID:regestaexe,项目名称:bygle-ldp,代码行数:20,代码来源:FileInfoReader.java

示例15: extractStringMetaData

import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Parses {@code input} with an auto-detecting Tika parser and renders its metadata
 * as text, one {@code NAME : value} line per metadata name, with the name upper-cased
 * and the value passed through {@code stripWhiteSpace}.
 * <p>
 * Extraction is best-effort: a failure while parsing is ignored and the lines
 * collected so far (possibly none) are returned. A non-null {@code input} is always
 * closed.
 *
 * @param input content to parse; closed by this method when non-null
 * @return the rendered metadata, never {@code null}
 * @throws IOException if closing {@code input} fails
 */
public static String extractStringMetaData(InputStream input) throws IOException {
	// StringBuilder avoids re-copying the whole result on every appended line
	StringBuilder result = new StringBuilder();
	try {
		ContentHandler handler = new DefaultHandler();
		Metadata metadata = new Metadata();
		Parser parser = new AutoDetectParser();
		parser.parse(input, handler, metadata, new ParseContext());
		// names() is called once here, not in the loop condition and body of every iteration
		for (String name : metadata.names()) {
			// Locale.ROOT keeps the upper-casing independent of the JVM default locale
			result.append(name.toUpperCase(java.util.Locale.ROOT))
					.append(" : ")
					.append(stripWhiteSpace(metadata.get(name)))
					.append("\n");
		}
	} catch (Exception ignored) {
		// best-effort: a parse failure yields whatever was collected so far
	} finally {
		if (input != null) {
			input.close();
		}
	}
	return result.toString();
}
 
开发者ID:regestaexe,项目名称:bygle-ldp,代码行数:19,代码来源:FileInfoReader.java


注:本文中的org.apache.tika.parser.AutoDetectParser类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。