本文整理汇总了Java中org.apache.tika.parser.AutoDetectParser类的典型用法代码示例。如果您正苦于以下问题:Java AutoDetectParser类的具体用法?Java AutoDetectParser怎么用?Java AutoDetectParser使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。
AutoDetectParser类属于org.apache.tika.parser包,在下文中一共展示了AutoDetectParser类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: findMediaType
import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Detects the media type of a stream with the Apache Tika library, based on the
 * file name and the magic numbers at the start of the content.
 * <p>
 * The supplied stream is wrapped in a {@link BufferedInputStream}, and that wrapper
 * (and therefore the supplied stream) is closed before this method returns. A failure
 * while closing is ignored.
 *
 * @param is       content to inspect
 * @param fileName resource name passed to the detector as a hint
 * @return the media type reported by the detector
 * @throws IOException if detection fails while reading the stream
 */
public static MediaType findMediaType(InputStream is, String fileName) throws IOException {
    BufferedInputStream buffered = new BufferedInputStream(is);
    try {
        // The file name travels to the detector as metadata
        Metadata hints = new Metadata();
        hints.add(Metadata.RESOURCE_NAME_KEY, fileName);
        return new AutoDetectParser().getDetector().detect(buffered, hints);
    } finally {
        try {
            buffered.close();
        } catch (IOException ignored) {
            // closing is best-effort; the detection result (or failure) takes precedence
        }
    }
}
示例2: extractText
import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Parses {@code input} with an auto-detecting Tika parser and appends the extracted
 * text, cut to at most {@code maxSize} characters, to {@code outputText}.
 * <p>
 * The {@code mimeType} argument is not consulted. Any exception raised while parsing
 * or appending is rethrown wrapped in a {@link RuntimeException}.
 *
 * @param mimeType   unused by this implementation
 * @param input      content to parse
 * @param outputText buffer that receives the (possibly truncated) text
 * @param maxSize    maximum number of characters to append
 */
@Override
public void extractText(String mimeType, InputStream input, StringBuilder outputText, int maxSize)
    throws IOException
{
    try
    {
        ContentHandler handler = new BodyContentHandler();
        Parser parser = new AutoDetectParser(new TikaConfig(getClass().getClassLoader()));
        parser.parse(input, handler, new Metadata(), new ParseContext());

        String extracted = handler.toString();
        String content = extracted.length() > maxSize ? extracted.substring(0, maxSize) : extracted;
        outputText.append(content);

        if( LOGGER.isDebugEnabled() )
        {
            LOGGER.debug("Word Summary:" + content); //$NON-NLS-1$
        }
    }
    catch( Exception e )
    {
        throw new RuntimeException(e);
    }
}
示例3: getFullText
import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Extracts the body text of the file at {@code filepath} using an auto-detecting
 * Tika parser.
 *
 * @param filepath path of the file to parse
 * @return the text written by the body content handler
 * @throws IOException   if the file cannot be opened or read
 * @throws SAXException  if the content handler reports a failure
 * @throws TikaException if Tika cannot parse the document
 */
private static String getFullText(final String filepath) throws IOException, SAXException, TikaException {
    final StringWriter writer = new StringWriter();
    // try-with-resources: if both parse() and close() fail, the parse failure is the one
    // that propagates (the close failure is attached as suppressed) instead of being masked.
    try (TikaInputStream inputStream = TikaInputStream.get(new File(filepath))) {
        final Detector detector = new DefaultDetector();
        final Parser parser = new AutoDetectParser(detector);
        final Metadata metadata = new Metadata();
        final ParseContext parseContext = new ParseContext();
        // Make the same parser available through the context (used for embedded documents)
        parseContext.set(Parser.class, parser);
        final ContentHandler contentHandler = new BodyContentHandler(writer);
        parser.parse(inputStream, contentHandler, metadata, parseContext);
    }
    return writer.toString();
}
示例4: TikaProperties
import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Parses {@code file} with an auto-detecting Tika parser and stores the metadata
 * gathered during the parse in the {@code metadata} field. Content events are sent
 * to a plain {@code DefaultHandler}.
 *
 * @param file the file to read metadata from
 * @throws IOException   if the file cannot be opened or read
 * @throws SAXException  if the content handler reports a failure
 * @throws TikaException if Tika cannot parse the document
 */
public TikaProperties(File file) throws IOException, SAXException,
        TikaException {
    TikaInputStream in = TikaInputStream.get(file);
    metadata = new Metadata();
    Parser tika = new AutoDetectParser();
    try {
        tika.parse(in, new DefaultHandler(), metadata, new ParseContext());
    } finally {
        try {
            in.close();
        } catch (Exception ignored) {
            // a failed close must not hide the outcome of the parse
        }
    }
}
示例5: render
import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Renders the source content to HTML through Tika.
 * <p>
 * The source mime type is read from the context's content reader. If the Tika parser
 * built from {@code tikaConfig} has no parser registered for that type, a
 * {@code RenditionServiceException} is thrown; otherwise the work is handed to
 * {@code generateHTML}.
 *
 * @param context the rendering context supplying the source content
 */
@Override
protected void render(RenderingContext context)
{
    final String sourceMimeType = context.makeContentReader().getMimetype();

    // Ask Tika whether it has a parser registered for the supplied type
    final AutoDetectParser tikaParser = new AutoDetectParser(tikaConfig);
    final boolean supported = tikaParser.getParsers().containsKey(MediaType.parse(sourceMimeType));
    if (!supported)
    {
        throw new RenditionServiceException(
            "Source mime type of " + sourceMimeType +
            " is not supported by Tika for HTML conversions"
        );
    }

    // Build the HTML version with Tika; this also extracts any images found
    generateHTML(tikaParser, context);
}
示例6: setTikaConfig
import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Injects the TikaConfig to use and rebuilds the detector and parser from it.
 *
 * @param tikaConfig The Tika Config to use
 */
public void setTikaConfig(TikaConfig tikaConfig)
{
    this.config = tikaConfig;

    // The detector is built from the config's MIME repository, and the
    // auto-detecting parser is built on top of that same detector instance
    DefaultDetector mimeDetector = new DefaultDetector(config.getMimeRepository());
    detector = mimeDetector;
    parser = new AutoDetectParser(mimeDetector);
}
示例7: buildParseContext
import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Builds the parse context from the superclass and, when embedded content should be
 * included, registers an auto-detecting parser in it.
 * <p>
 * Whether to include embedded content comes from {@code includeContents}, unless
 * {@code options.getIncludeEmbedded()} is non-null, in which case that value is used.
 * {@code tikaConfig} is initialised to the default config on first need.
 */
@Override
protected ParseContext buildParseContext(Metadata metadata,
        String targetMimeType, TransformationOptions options) {
    ParseContext context = super.buildParseContext(metadata, targetMimeType, options);

    boolean recurse = includeContents;
    Boolean requested = options.getIncludeEmbedded();
    if (requested != null) {
        recurse = requested;
    }
    if (!recurse) {
        return context;
    }

    // Use an auto detect parser to handle the contents
    if (tikaConfig == null) {
        tikaConfig = TikaConfig.getDefaultConfig();
    }
    context.set(Parser.class, new AutoDetectParser(tikaConfig));
    return context;
}
示例8: buildMimeTypes
import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Stores the given config, builds an auto-detecting parser from it, and rebuilds
 * {@code SUPPORTED_MIMETYPES} with every media type the parser has a parser for,
 * followed by each of that type's aliases from the config's media type registry.
 *
 * @param tikaConfig the Tika configuration to build from
 * @return the freshly populated {@code SUPPORTED_MIMETYPES} list
 */
private static ArrayList<String> buildMimeTypes(TikaConfig tikaConfig)
{
    config = tikaConfig;
    parser = new AutoDetectParser(config);

    ArrayList<String> mimeTypes = new ArrayList<String>();
    SUPPORTED_MIMETYPES = mimeTypes;

    for (MediaType canonical : parser.getParsers().keySet())
    {
        // The canonical mime type first
        mimeTypes.add(canonical.toString());

        // Then every alias - Alfresco uses some non canonical forms of
        // various mimetypes, so all of them are needed
        for (MediaType alias : config.getMediaTypeRegistry().getAliases(canonical))
        {
            mimeTypes.add(alias.toString());
        }
    }
    return mimeTypes;
}
示例9: process
import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Parses every input stream supplied by {@code source} with a single auto-detecting
 * Tika parser; for each stream a document is created in {@code corpus} and tag
 * annotations are created on it from the parse handler.
 * <p>
 * The {@code org.apache.pdfbox} logger is switched off before parsing. An
 * {@code IOException}, {@code SAXException} or {@code TikaException} is passed to
 * {@code rethrow}.
 */
@Override
public void process(ProcessingContext<Corpus> ctx, Corpus corpus) throws ModuleException {
    // Turn off the PDFBox logger
    Logger.getLogger("org.apache.pdfbox").setLevel(Level.OFF);

    final AutoDetectParser tikaParser = new AutoDetectParser();
    final ParseContext tikaContext = new ParseContext();
    try {
        for (InputStream stream : Iterators.loop(source.getInputStreams())) {
            TikaReaderHandler parsed = parse(tikaParser, tikaContext, stream);
            Document doc = createDocument(corpus, parsed);
            createTagAnnotations(doc, parsed);
        }
    }
    catch (IOException|SAXException|TikaException e) {
        rethrow(e);
    }
}
示例10: extractText
import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Parses {@code input} with an auto-detecting Tika parser through a content handler
 * limited to {@code maxSize}, then passes the handler to {@code appendText} together
 * with {@code outputText}.
 * <p>
 * If parsing stops because the write limit was reached, the text collected so far is
 * still appended. Any other exception is propagated via {@code Throwables.propagate}.
 * The {@code mimeType} argument is not consulted.
 */
@Override
public void extractText(String mimeType, InputStream input, StringBuilder outputText, int maxSize)
    throws IOException
{
    WriteOutContentHandler limited = new WriteOutContentHandler(maxSize);
    ContentHandler handler = new BodyContentHandler(limited);
    try
    {
        Parser parser = new AutoDetectParser(new TikaConfig(getClass().getClassLoader()));
        parser.parse(input, handler, new Metadata(), new ParseContext());
        appendText(handler, outputText, maxSize);
    }
    catch( Exception t )
    {
        if( !limited.isWriteLimitReached(t) )
        {
            throw Throwables.propagate(t);
        }
        // Hitting the limit is expected: keep going with the truncated text
        LOGGER.debug("PDF size limit reached. Indexing truncated text");
        appendText(handler, outputText, maxSize);
    }
}
示例11: resolveContentType
import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Detects the content type of the given bytes using the detector of an
 * {@code AutoDetectParser} constructed with an {@code ImageParser}.
 *
 * @param data raw content to inspect
 * @return the detected media type as a string, or the string form of
 *         {@code MediaType.OCTET_STREAM} if an {@link IOException} occurs
 */
private String resolveContentType(byte[] data) {
    AutoDetectParser parser = new AutoDetectParser(new ImageParser());
    // try-with-resources so the TikaInputStream is always released after detection
    try (TikaInputStream stream = TikaInputStream.get(data)) {
        return parser.getDetector().detect(stream, new Metadata()).toString();
    } catch (IOException e) {
        // Detection could not read the content: fall back to the generic binary type
        return MediaType.OCTET_STREAM.toString();
    }
}
示例12: PDFExtract
import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Creates the extractor: an auto-detecting parser plus a parse context carrying a
 * default {@code TesseractOCRConfig}, a {@code PDFParserConfig} with inline image
 * extraction switched on, and the parser itself.
 */
public PDFExtract(){
    parser = new AutoDetectParser();

    TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
    PDFParserConfig pdfParserConfig = new PDFParserConfig();
    pdfParserConfig.setExtractInlineImages(true);

    ParseContext ctx = new ParseContext();
    ctx.set(TesseractOCRConfig.class, ocrConfig);
    ctx.set(PDFParserConfig.class, pdfParserConfig);
    // the parser must be registered in the context to make sure recursive parsing happens
    ctx.set(Parser.class, parser);
    parseContext = ctx;
}
示例13: getMetadata
import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Returns a {@code Single} created with {@code Single.create} whose callback parses
 * {@code ins} with an auto-detecting Tika parser and reports the metadata as a map
 * from metadata name to the value returned by {@code Metadata.get(name)}.
 * <p>
 * Any exception thrown while parsing or building the map is reported through
 * {@code onError}. The stream is not closed by this method.
 *
 * @param ins content to read metadata from
 * @return a Single emitting the metadata map
 */
@Override
public Single<Map<String, String>> getMetadata(InputStream ins) {
    return Single.create(emitter -> {
        Parser tika = new AutoDetectParser();
        BodyContentHandler body = new BodyContentHandler();
        Metadata extracted = new Metadata();
        ParseContext parseContext = new ParseContext();
        try {
            tika.parse(ins, body, extracted, parseContext);

            Map<String, String> result = new HashMap<>();
            for (String key : extracted.names()) {
                result.put(key, extracted.get(key));
            }
            emitter.onSuccess(result);
        } catch (Exception e) {
            emitter.onError(e);
        }
        // Note: ins is left open here.
    });
}
示例14: extractMetaData
import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Parses {@code input} with an auto-detecting Tika parser and returns its metadata
 * as a sorted map from metadata name to the value passed through
 * {@code stripWhiteSpace}.
 * <p>
 * Extraction is best-effort: if parsing or collecting fails, the stack trace is
 * printed and whatever was collected so far (possibly nothing) is returned. The
 * stream is always closed when it is non-null.
 *
 * @param input content to read metadata from; closed by this method
 * @return the collected metadata, never {@code null}
 * @throws IOException if closing {@code input} fails
 */
public static TreeMap<String, String> extractMetaData(InputStream input) throws IOException {
    TreeMap<String, String> treeMap = new TreeMap<String, String>();
    try {
        ContentHandler handler = new DefaultHandler();
        Metadata metadata = new Metadata();
        Parser parser = new AutoDetectParser();
        parser.parse(input, handler, metadata, new ParseContext());
        // Fetch the name array once instead of re-requesting it on every iteration
        for (String name : metadata.names()) {
            treeMap.put(name, stripWhiteSpace(metadata.get(name)));
        }
    } catch (Exception e) {
        // Best-effort: report the failure and return what was gathered
        e.printStackTrace();
    } finally {
        if (input != null) {
            input.close();
        }
    }
    return treeMap;
}
示例15: extractStringMetaData
import org.apache.tika.parser.AutoDetectParser; //导入依赖的package包/类
/**
 * Parses {@code input} with an auto-detecting Tika parser and renders its metadata
 * as text, one {@code NAME : value} line per metadata name (name upper-cased, value
 * passed through {@code stripWhiteSpace}, each line ending in {@code \n}).
 * <p>
 * Extraction is best-effort: if parsing or rendering fails, the lines completed so
 * far (possibly none) are returned. The stream is always closed when it is non-null.
 *
 * @param input content to read metadata from; closed by this method
 * @return the rendered metadata, empty if nothing could be extracted
 * @throws IOException if closing {@code input} fails
 */
public static String extractStringMetaData(InputStream input) throws IOException {
    // StringBuilder avoids re-copying the whole result on every appended line
    StringBuilder result = new StringBuilder();
    try {
        ContentHandler handler = new DefaultHandler();
        Metadata metadata = new Metadata();
        Parser parser = new AutoDetectParser();
        parser.parse(input, handler, metadata, new ParseContext());
        // Fetch the name array once instead of re-requesting it on every iteration
        for (String name : metadata.names()) {
            // Build the whole line before appending so a failure never leaves half a line
            String line = name.toUpperCase() + " : " + stripWhiteSpace(metadata.get(name)) + "\n";
            result.append(line);
        }
    } catch (Exception ignored) {
        // Best-effort: failures are ignored and the lines built so far are returned
    } finally {
        if (input != null) {
            input.close();
        }
    }
    return result.toString();
}