本文整理匯總了Java中org.apache.tika.parser.ParseContext.get方法的典型用法代碼示例。如果您正苦於以下問題:Java ParseContext.get方法的具體用法?Java ParseContext.get怎麽用?Java ParseContext.get使用的例子?那麽, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類org.apache.tika.parser.ParseContext
的用法示例。
在下文中一共展示了ParseContext.get方法的8個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Java代碼示例。
示例1: parse
import org.apache.tika.parser.ParseContext; //導入方法依賴的package包/類
/**
 * Extracts author-profile attributes from the input stream and records them
 * in the Tika metadata: an age range, a gender, and five personality-trait
 * scores (Big Five).
 *
 * @param inputStream    raw document content to profile
 * @param contentHandler SAX handler (unused by this implementation)
 * @param metadata       receives the Author_* entries produced here
 * @param parseContext   may carry a ProfileParserConfig overriding the default
 * @throws IOException   on stream failure
 * @throws SAXException  declared for the Parser contract
 * @throws TikaException declared for the Parser contract
 */
public void parse(InputStream inputStream, ContentHandler contentHandler,
Metadata metadata, ParseContext parseContext)
throws IOException, SAXException, TikaException {
// Caller-supplied config from the ParseContext wins; fall back to the
// parser's current config when none was registered.
this.config = parseContext.get(ProfileParserConfig.class, config);
initialize(this.config.getAgeProfilerModelUrl(),
this.config.getGenderProfilerModelUrl());
// Bail out silently when the profiler models could not be loaded.
if (!isAvailable()) {
return;
}
ProfileExtractor extractor = null;
try {
extractor = new ProfileExtractor(this.ageProfiler, this.genderProfiler);
} catch (Exception e) {
// Best-effort parser: log and skip profiling rather than failing the parse.
LOG.warning("Profiler setup failed: " + e);
return;
}
Profile profile = extractor.getProfileFromInput(inputStream);
metadata.add("Author_AGE", profile.getAgeRange());
metadata.add("Author_GENDER", profile.getGender());
// NOTE(review): assumes getTraits() returns at least five scores in the
// fixed order agreeable, conscientious, extrovert, open, stable — confirm
// against ProfileExtractor before relying on these indices.
metadata.add("Author_"+ TraitProfiler.TRAITS.TRAIT_AGREEABLE.name(),
Double.toString(profile.getTraits().get(0)));
metadata.add("Author_"+ TraitProfiler.TRAITS.TRAIT_CONSCIENTIOUS.name(),
Double.toString(profile.getTraits().get(1)));
metadata.add("Author_"+ TraitProfiler.TRAITS.TRAIT_EXTROVERT.name(),
Double.toString(profile.getTraits().get(2)));
metadata.add("Author_"+ TraitProfiler.TRAITS.TRAIT_OPEN.name(),
Double.toString(profile.getTraits().get(3)));
metadata.add("Author_"+ TraitProfiler.TRAITS.TRAIT_STABLE.name(),
Double.toString(profile.getTraits().get(4)));
}
示例2: getSupportedTypes
import org.apache.tika.parser.ParseContext; //導入方法依賴的package包/類
/**
 * Returns the media types this parser advertises.
 *
 * <p>Image types are offered only when a working Tesseract installation is
 * reachable under the effective configuration; otherwise an empty set is
 * returned so that other image parsers are selected instead.
 *
 * @param context parse context, may carry a TesseractOCRConfig (may be null)
 * @return the supported image types, or an empty set when OCR is unavailable
 */
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
    TesseractOCRConfig ocrConfig = DEFAULT_CONFIG;
    if (context != null) {
        ocrConfig = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
    }
    // Advertise nothing when the Tesseract binary cannot be found.
    return hasTesseract(ocrConfig) ? SUPPORTED_TYPES : Collections.emptySet();
}
示例3: AbstractPOIFSExtractor
import org.apache.tika.parser.ParseContext; //導入方法依賴的package包/類
/**
 * Initializes the extractor from the parse context: picks up a
 * caller-supplied EmbeddedDocumentExtractor (or falls back to the parsing
 * default) along with any TikaConfig, MimeTypes and Detector registered
 * there.
 *
 * @param context parse context providing optional collaborators
 */
protected AbstractPOIFSExtractor(ParseContext context) {
    EmbeddedDocumentExtractor configured =
            context.get(EmbeddedDocumentExtractor.class);
    // Fall back to the default parsing extractor when none was supplied.
    this.extractor = (configured != null)
            ? configured
            : new ParsingEmbeddedDocumentExtractor(context);
    tikaConfig = context.get(TikaConfig.class);
    mimeTypes = context.get(MimeTypes.class);
    detector = context.get(Detector.class);
}
示例4: AbstractOOXMLExtractor
import org.apache.tika.parser.ParseContext; //導入方法依賴的package包/類
/**
 * Wraps a POI text extractor and resolves the embedded-document extractor
 * from the parse context, defaulting to ParsingEmbeddedDocumentExtractor
 * when none is registered.
 *
 * @param context   parse context, may carry an EmbeddedDocumentExtractor
 * @param extractor the underlying POI OOXML text extractor
 */
public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor extractor) {
    this.extractor = extractor;
    EmbeddedDocumentExtractor configured =
            context.get(EmbeddedDocumentExtractor.class);
    // Use the context-supplied extractor when present, otherwise the default.
    embeddedExtractor = (configured != null)
            ? configured
            : new ParsingEmbeddedDocumentExtractor(context);
}
示例5: parse
import org.apache.tika.parser.ParseContext; //導入方法依賴的package包/類
/**
 * Parses plain text: sniffs the character encoding, records content type and
 * encoding in the metadata, and streams the decoded characters out as a
 * single XHTML paragraph.
 *
 * @param stream   raw text content (shielded from being closed by the reader)
 * @param handler  receives the XHTML SAX events
 * @param metadata updated with detected content type and encoding
 * @param context  may carry a ServiceLoader for encoding detection
 * @throws IOException   on stream failure
 * @throws SAXException  on handler failure
 * @throws TikaException on encoding-detection failure
 */
public void parse(InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context) throws IOException,
        SAXException, TikaException {
    // Sniff the character encoding from the raw bytes; the reader owns the
    // shielded stream and is closed by try-with-resources.
    try (AutoDetectReader reader = new AutoDetectReader(
            new CloseShieldInputStream(stream), metadata,
            context.get(ServiceLoader.class, LOADER))) {
        Charset charset = reader.getCharset();
        MediaType contentType = new MediaType(MediaType.TEXT_PLAIN, charset);
        metadata.set(Metadata.CONTENT_TYPE, contentType.toString());
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, charset.name());
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.startElement("p");
        char[] chunk = new char[4096];
        for (int read = reader.read(chunk); read != -1; read = reader.read(chunk)) {
            xhtml.characters(chunk, 0, read);
        }
        xhtml.endElement("p");
        xhtml.endDocument();
    }
}
示例6: parse
import org.apache.tika.parser.ParseContext; //導入方法依賴的package包/類
/**
 * Parses HTML: sniffs the character encoding, updates the content-type and
 * encoding metadata, then runs the document through TagSoup with the mapper
 * taken from the parse context, downgrading the result to XHTML 1.0 events.
 *
 * @param stream   raw HTML content (shielded from being closed by the reader)
 * @param handler  receives the resulting SAX events
 * @param metadata updated with detected content type and encoding
 * @param context  may carry a ServiceLoader, HtmlMapper and TagSoup Schema
 * @throws IOException   on stream failure
 * @throws SAXException  on handler failure
 * @throws TikaException on encoding-detection failure
 */
public void parse(InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context) throws IOException,
        SAXException, TikaException {
    // Sniff the character encoding; the reader is closed by try-with-resources.
    try (AutoDetectReader reader = new AutoDetectReader(
            new CloseShieldInputStream(stream), metadata,
            context.get(ServiceLoader.class, LOADER))) {
        Charset charset = reader.getCharset();
        // charset = Charset.forName("utf-8");
        // Only overwrite the content type when it is absent or already HTML.
        String previous = metadata.get(Metadata.CONTENT_TYPE);
        if (previous == null || previous.startsWith("text/html")) {
            MediaType htmlType = new MediaType(MediaType.TEXT_HTML, charset);
            metadata.set(Metadata.CONTENT_TYPE, htmlType.toString());
        }
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, charset.name());
        // Element mapping comes from the parse context, defaulting to ours.
        HtmlMapper mapper =
                context.get(HtmlMapper.class, new HtmlParserMapper());
        org.ccil.cowan.tagsoup.Parser tagsoup =
                new org.ccil.cowan.tagsoup.Parser();
        // TIKA-528: Reuse shared schema (from context or default) to avoid
        // heavy instantiation
        Schema schema = context.get(Schema.class, HTML_SCHEMA);
        tagsoup.setProperty(
                org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
        // TIKA-599: Shared schema is thread-safe only if bogons are ignored
        tagsoup.setFeature(
                org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
        tagsoup.setContentHandler(new XHTMLDowngradeHandler(
                new HtmlHandler(mapper, handler, metadata)));
        tagsoup.parse(reader.asInputSource());
    }
}
示例7: parse
import org.apache.tika.parser.ParseContext; //導入方法依賴的package包/類
/**
 * Runs Tesseract OCR over the supplied image stream and emits the recognized
 * text as XHTML, then forwards the image to the regular metadata parser.
 *
 * @param stream   image content to OCR
 * @param handler  receives the OCR text as XHTML SAX events
 * @param metadata receives image metadata from the follow-up parser
 * @param context  may carry a TesseractOCRConfig overriding the default
 * @throws IOException   on stream or temp-file failure
 * @throws SAXException  on handler failure
 * @throws TikaException on OCR failure
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
    TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
    // If Tesseract is not on the path with the current config, do not try to run OCR
    // getSupportedTypes shouldn't have listed us as handling it, so this should only
    // occur if someone directly calls this parser, not via DefaultParser or similar
    if (!hasTesseract(config)) {
        return;
    }
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    TemporaryResources tmp = new TemporaryResources();
    File output = null;
    try {
        TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
        File input = tikaStream.getFile();
        long size = tikaStream.getLength();
        // Skip OCR for files outside the configured size window.
        if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) {
            output = tmp.createTemporaryFile();
            doOCR(input, output, config);
            // Tesseract appends .txt to output file name
            output = new File(output.getAbsolutePath() + ".txt");
            if (output.exists()) {
                // FIX: close the OCR output stream deterministically instead of
                // leaking the file handle if extractOutput throws.
                try (FileInputStream ocrText = new FileInputStream(output)) {
                    extractOutput(ocrText, xhtml);
                }
            }
        }
        // Temporary workaround for TIKA-1445 - until we can specify
        // composite parsers with strategies (eg Composite, Try In Turn),
        // always send the image onwards to the regular parser to have
        // the metadata for them extracted as well
        _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, handler, metadata, context);
    } finally {
        tmp.dispose();
        if (output != null) {
            output.delete();
        }
    }
}
示例8: parse
import org.apache.tika.parser.ParseContext; //導入方法依賴的package包/類
/**
 * Extracts location name entities from the stream via an NER model, resolves
 * them against a gazetteer through a Lucene index, and records the best
 * geotag plus alternatives in the Tika metadata.
 *
 * @param stream   text content to geotag
 * @param handler  SAX handler (unused by this implementation)
 * @param metadata receives the Geographic_* and Optional_* entries
 * @param context  may carry a GeoParserConfig overriding the default
 * @throws IOException   on stream failure
 * @throws SAXException  declared for the Parser contract
 * @throws TikaException declared for the Parser contract
 */
@Override
public void parse(InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context) throws IOException,
SAXException, TikaException {
/*----------------configure this parser by ParseContext Object---------------------*/
GeoParserConfig localconfig = context.get(GeoParserConfig.class,
defaultconfig);
String nerModelPath = localconfig.getNERPath();
// NOTE(review): writes the instance field gazetteerPath on every call —
// not thread-safe if this parser instance is shared; verify callers.
gazetteerPath = localconfig.getGazetterPath();
/*----------------get locationNameEntities and best nameEntity for the input stream---------------------*/
NameEntityExtractor extractor = new NameEntityExtractor(nerModelPath);
extractor.getAllNameEntitiesfromInput(stream);
extractor.getBestNameEntity();
ArrayList<String> locationNameEntities = extractor.locationNameEntities;
String bestner = extractor.bestNameEntity;
/*----------------build lucene search engine for the gazetteer file,
*------------------------resolve geonames for each ner, store results in a hashmap---------------------*/
// NOTE(review): rebuilds the Lucene index on each parse — presumably
// expensive; confirm whether the index could be built once and cached.
GeoNameResolver resolver = new GeoNameResolver();
resolver.buildIndex(gazetteerPath);
HashMap<String, ArrayList<String>> resolvedGeonames = resolver
.searchGeoName(locationNameEntities);
/*----------------store locationNameEntities and their geonames in a geotag, each input has one geotag---------------------*/
GeoTag geotag = new GeoTag();
geotag.toGeoTag(resolvedGeonames, bestner);
/* add resolved entities in metadata */
metadata.add("Geographic_NAME", geotag.Geographic_NAME);
metadata.add("Geographic_LONGITUDE", geotag.Geographic_LONGTITUDE);
metadata.add("Geographic_LATITUDE", geotag.Geographic_LATITUDE);
// Alternative candidates are numbered from 1.
for (int i = 0; i < geotag.alternatives.size(); ++i) {
GeoTag alter = (GeoTag) geotag.alternatives.get(i);
metadata.add("Optional_NAME" + (i + 1), alter.Geographic_NAME);
metadata.add("Optional_LONGITUDE" + (i + 1),
alter.Geographic_LONGTITUDE);
metadata.add("Optional_LATITUDE" + (i + 1),
alter.Geographic_LATITUDE);
}
}