本文整理匯總了Java中org.apache.tika.parser.ParseContext類的典型用法代碼示例。如果您正苦於以下問題:Java ParseContext類的具體用法?Java ParseContext怎麼用?Java ParseContext使用的例子?那麼,這裡精選的類代碼示例或許可以為您提供幫助。
ParseContext類屬於org.apache.tika.parser包,在下文中一共展示了ParseContext類的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Java代碼示例。
示例1: extractText
import org.apache.tika.parser.ParseContext; //導入依賴的package包/類
/**
 * Parses {@code input} with an auto-detecting Tika parser and appends at most
 * {@code maxSize} characters of the extracted body text to {@code outputText}.
 *
 * @param mimeType   not consulted by this implementation
 * @param input      stream holding the document to parse
 * @param outputText buffer that receives the (possibly truncated) text
 * @param maxSize    maximum number of characters appended to {@code outputText}
 * @throws IOException declared by the signature; any exception raised inside the
 *                     method body is rethrown wrapped in a {@link RuntimeException}
 */
@Override
public void extractText(String mimeType, InputStream input, StringBuilder outputText, int maxSize)
    throws IOException
{
    try
    {
        ContentHandler bodyHandler = new BodyContentHandler();
        Parser autoParser = new AutoDetectParser(new TikaConfig(getClass().getClassLoader()));
        autoParser.parse(input, bodyHandler, new Metadata(), new ParseContext());

        // Truncate only when the extracted text is longer than the caller's limit.
        String full = bodyHandler.toString();
        String text = full.length() > maxSize ? full.substring(0, maxSize) : full;
        outputText.append(text);

        if( LOGGER.isDebugEnabled() )
        {
            LOGGER.debug("Word Summary:" + text); //$NON-NLS-1$
        }
    }
    catch( Exception failure )
    {
        throw new RuntimeException(failure);
    }
}
示例2: convertWordDocumentIntoHtml
import org.apache.tika.parser.ParseContext; //導入依賴的package包/類
/**
 * Converts a .docx document into HTML markup. This code
 * is based on <a href="http://stackoverflow.com/a/9053258/313554">this StackOverflow</a> answer.
 *
 * @param wordDocument the uploaded .docx document to convert
 * @return a DTO built from the document's original filename and the generated HTML markup
 * @throws DocumentConversionException if reading, parsing or serializing the document fails;
 *                                     the triggering exception is attached as the cause
 */
public ConvertedDocumentDTO convertWordDocumentIntoHtml(MultipartFile wordDocument) {
    LOGGER.info("Converting word document: {} into HTML", wordDocument.getOriginalFilename());

    // try-with-resources closes the upload stream on both the success and the failure path.
    try (InputStream input = wordDocument.getInputStream()) {
        Parser parser = new OOXMLParser();

        // The SAX events produced by the parser are serialized as HTML into this writer.
        StringWriter sw = new StringWriter();
        SAXTransformerFactory factory = (SAXTransformerFactory)
                SAXTransformerFactory.newInstance();
        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8");
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        handler.setResult(new StreamResult(sw));

        Metadata metadata = new Metadata();
        metadata.add(Metadata.CONTENT_TYPE, "text/html;charset=utf-8");

        parser.parse(input, handler, metadata, new ParseContext());

        return new ConvertedDocumentDTO(wordDocument.getOriginalFilename(), sw.toString());
    }
    catch (IOException | SAXException | TransformerException | TikaException ex) {
        LOGGER.error("Conversion failed because an exception was thrown", ex);
        throw new DocumentConversionException(ex.getMessage(), ex);
    }
}
示例3: parse
import org.apache.tika.parser.ParseContext; //導入依賴的package包/類
/**
 * Runs the JPEG metadata extractors over {@code stream}, storing what they find in
 * {@code metadata}, and then opens and immediately closes an XHTML document on
 * {@code handler}.
 *
 * @param stream   the image data to inspect
 * @param handler  receives the start/end document events
 * @param metadata populated by the extractors
 * @param context  not consulted by this implementation
 * @throws IOException   propagated from the calls made in the body
 * @throws SAXException  propagated from the calls made in the body
 * @throws TikaException propagated from the calls made in the body
 */
public void parse(
        InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    final TemporaryResources resources = new TemporaryResources();
    try {
        final TikaInputStream tikaStream = TikaInputStream.get(stream, resources);

        final ImageMetadataExtractor imageExtractor = new ImageMetadataExtractor(metadata);
        imageExtractor.parseJpeg(tikaStream.getFile());

        final JempboxExtractor jempbox = new JempboxExtractor(metadata);
        jempbox.parse(tikaStream);
    } finally {
        // Always release whatever was registered with the temporary-resource holder.
        resources.dispose();
    }

    final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();
    document.endDocument();
}
示例4: getFullText
import org.apache.tika.parser.ParseContext; //導入依賴的package包/類
/**
 * Extracts the body text of the file at {@code filepath} using an auto-detecting parser.
 *
 * @param filepath path of the file to parse
 * @return the text written by the body content handler
 * @throws IOException   if the file cannot be opened, read or closed
 * @throws SAXException  propagated from the parse call
 * @throws TikaException propagated from the parse call
 */
private static String getFullText(final String filepath) throws IOException, SAXException, TikaException {
    final StringWriter writer = new StringWriter();

    // try-with-resources instead of try/finally: if both parsing and close() fail, the
    // parse exception stays primary and the close failure is attached as suppressed,
    // rather than the close failure replacing the real cause.
    try (TikaInputStream inputStream = TikaInputStream.get(new File(filepath))) {
        final Detector detector = new DefaultDetector();
        final Parser parser = new AutoDetectParser(detector);
        final Metadata metadata = new Metadata();

        // The parser is also registered in the context under Parser.class
        // (Tika consults this entry when it handles embedded documents).
        final ParseContext parseContext = new ParseContext();
        parseContext.set(Parser.class, parser);

        final ContentHandler contentHandler = new BodyContentHandler(writer);
        parser.parse(inputStream, contentHandler, metadata, parseContext);
    }
    return writer.toString();
}
示例5: buildParseContext
import org.apache.tika.parser.ParseContext; //導入依賴的package包/類
/**
 * Extends the parent's parse context with an auto-detecting parser when embedded
 * content should be processed.
 *
 * <p>The {@code includeContents} field is the default; a non-null
 * {@code options.getIncludeEmbedded()} overrides it.
 *
 * @param metadata       passed through to the parent implementation
 * @param targetMimeType passed through to the parent implementation
 * @param options        transformation options, consulted for the embedded-content override
 * @return the context from the parent, with a {@code Parser} entry added when recursing
 */
@Override
protected ParseContext buildParseContext(Metadata metadata,
        String targetMimeType, TransformationOptions options) {
    final ParseContext context = super.buildParseContext(metadata, targetMimeType, options);

    boolean recurse = includeContents;
    if (options.getIncludeEmbedded() != null)
    {
        recurse = options.getIncludeEmbedded();
    }

    if (!recurse)
    {
        return context;
    }

    // Use an auto detect parser to handle the contents; the config is created on first use.
    if (tikaConfig == null)
    {
        tikaConfig = TikaConfig.getDefaultConfig();
    }
    context.set(Parser.class, new AutoDetectParser(tikaConfig));
    return context;
}
示例6: testSupports
import org.apache.tika.parser.ParseContext; //導入依賴的package包/類
/**
 * Checks that every media type reported by the Office, OpenDocument, MP3 and OOXML
 * parsers is accepted by {@code extracter.isSupported}.
 *
 * @throws Exception if collecting the types or querying the extracter fails
 */
public void testSupports() throws Exception
{
    final Parser[] parsers = {
        new OfficeParser(), new OpenDocumentParser(),
        new Mp3Parser(), new OOXMLParser()
    };

    // First collect every supported type name, then assert on the full list.
    final ArrayList<String> typeNames = new ArrayList<String>();
    for (Parser candidate : parsers)
    {
        for (MediaType type : candidate.getSupportedTypes(new ParseContext()))
        {
            typeNames.add(type.toString());
        }
    }

    for (String typeName : typeNames)
    {
        assertTrue("Mimetype should be supported: " + typeName, extracter.isSupported(typeName));
    }
}
示例7: process
import org.apache.tika.parser.ParseContext; //導入依賴的package包/類
/**
 * Parses every input stream provided by {@code source} and, for each one, creates a
 * document in {@code corpus} and its tag annotations from the parse result.
 *
 * @param ctx    processing context (not consulted in this method body)
 * @param corpus corpus that receives the created documents
 * @throws ModuleException declared by the signature; I/O, SAX and Tika failures are
 *                         handed to {@code rethrow}
 */
@Override
public void process(ProcessingContext<Corpus> ctx, Corpus corpus) throws ModuleException {
    // Switch off the "org.apache.pdfbox" logger before any parsing starts.
    Logger.getLogger("org.apache.pdfbox").setLevel(Level.OFF);

    // One parser and one context are shared by all streams.
    final AutoDetectParser tikaParser = new AutoDetectParser();
    final ParseContext sharedContext = new ParseContext();
    try {
        for (InputStream stream : Iterators.loop(source.getInputStreams())) {
            final TikaReaderHandler parsed = parse(tikaParser, sharedContext, stream);
            createTagAnnotations(createDocument(corpus, parsed), parsed);
        }
    }
    catch (IOException|SAXException|TikaException failure) {
        rethrow(failure);
    }
}
示例8: extractText
import org.apache.tika.parser.ParseContext; //導入依賴的package包/類
/**
 * Parses {@code input} with an auto-detecting Tika parser through a handler limited to
 * {@code maxSize}, then hands the handler to {@code appendText}.
 *
 * <p>When parsing fails because the write limit was reached, the text collected so far
 * is still appended; any other failure is propagated via {@code Throwables.propagate}.
 *
 * @param mimeType   not consulted by this implementation
 * @param input      stream holding the document to parse
 * @param outputText buffer passed to {@code appendText}
 * @param maxSize    write limit for the handler, also passed to {@code appendText}
 * @throws IOException declared by the signature
 */
@Override
public void extractText(String mimeType, InputStream input, StringBuilder outputText, int maxSize)
    throws IOException
{
    final WriteOutContentHandler limited = new WriteOutContentHandler(maxSize);
    final ContentHandler body = new BodyContentHandler(limited);
    try
    {
        final Metadata meta = new Metadata();
        final Parser autoParser = new AutoDetectParser(new TikaConfig(getClass().getClassLoader()));
        autoParser.parse(input, body, meta, new ParseContext());
        appendText(body, outputText, maxSize);
    }
    catch( Exception failure )
    {
        if( !limited.isWriteLimitReached(failure) )
        {
            throw Throwables.propagate(failure);
        }

        // keep going
        LOGGER.debug("PDF size limit reached. Indexing truncated text");
        appendText(body, outputText, maxSize);
    }
}
示例9: PDFExtract
import org.apache.tika.parser.ParseContext; //導入依賴的package包/類
/**
 * Creates the extractor: an auto-detecting parser plus a parse context carrying a
 * Tesseract OCR config, a PDF config with inline-image extraction enabled, and the
 * parser itself.
 */
public PDFExtract(){
    parser = new AutoDetectParser();
    parseContext = new ParseContext();

    parseContext.set(TesseractOCRConfig.class, new TesseractOCRConfig());

    PDFParserConfig inlineImageConfig = new PDFParserConfig();
    inlineImageConfig.setExtractInlineImages(true);
    parseContext.set(PDFParserConfig.class, inlineImageConfig);

    //need to add this to make sure recursive parsing happens!
    parseContext.set(Parser.class, parser);
}
示例10: getMetadata
import org.apache.tika.parser.ParseContext; //導入依賴的package包/類
/**
 * Returns a {@code Single} that parses {@code ins} and emits the metadata found as a
 * name-to-value map; any exception thrown while parsing or copying is emitted as the error.
 *
 * <p>For each metadata name, the value stored is the one returned by
 * {@code metadata.get(name)}.
 *
 * @param ins stream to parse; it is not closed by this method
 * @return a {@code Single} emitting the metadata map or the failure
 */
@Override
public Single<Map<String, String>> getMetadata(InputStream ins) {
    return Single.create(emitter -> {
        final Parser autoParser = new AutoDetectParser();
        final BodyContentHandler bodyHandler = new BodyContentHandler();
        final Metadata extracted = new Metadata();
        final ParseContext parseContext = new ParseContext();
        try {
            autoParser.parse(ins, bodyHandler, extracted, parseContext);

            final Map<String, String> result = new HashMap<>();
            for (String key : extracted.names()) {
                result.put(key, extracted.get(key));
            }
            emitter.onSuccess(result);
        } catch (Exception failure) {
            emitter.onError(failure);
        }
        // NOTE: ins is left open here; closing it is not done by this method.
    });
}
示例11: extractMetaData
import org.apache.tika.parser.ParseContext; //導入依賴的package包/類
/**
 * Parses {@code input} with an auto-detecting parser and returns the metadata found,
 * sorted by name, with each value passed through {@code stripWhiteSpace}.
 *
 * <p>Extraction is best-effort: a failure while parsing or copying is printed via
 * {@code printStackTrace()} and whatever was collected up to that point is returned.
 *
 * @param input stream to parse; closed by this method when non-null
 * @return the metadata map, possibly empty or partial when extraction failed
 * @throws IOException if closing {@code input} fails
 */
public static TreeMap<String, String> extractMetaData(InputStream input) throws IOException {
    TreeMap<String, String> treeMap = new TreeMap<String, String>();
    try {
        ContentHandler handler = new DefaultHandler();
        Metadata metadata = new Metadata();
        Parser parser = new AutoDetectParser();
        parser.parse(input, handler, metadata, new ParseContext());

        // Ask for the name array once instead of re-invoking names() in the loop
        // condition and again in the loop body on every iteration.
        for (String name : metadata.names()) {
            treeMap.put(name, stripWhiteSpace(metadata.get(name)));
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (input != null) {
            input.close();
        }
    }
    return treeMap;
}
示例12: extractStringMetaData
import org.apache.tika.parser.ParseContext; //導入依賴的package包/類
/**
 * Parses {@code input} with an auto-detecting parser and renders the metadata found as
 * text, one {@code NAME : value} line per entry, with the name upper-cased and the value
 * passed through {@code stripWhiteSpace}.
 *
 * <p>Extraction is best-effort: a failure while parsing or rendering is printed via
 * {@code printStackTrace()} instead of being silently discarded, and the lines built up
 * to that point are returned.
 *
 * @param input stream to parse; closed by this method when non-null
 * @return the rendered metadata, possibly empty or partial when extraction failed
 * @throws IOException if closing {@code input} fails
 */
public static String extractStringMetaData(InputStream input) throws IOException {
    // StringBuilder avoids re-copying the whole accumulated text on every entry.
    StringBuilder result = new StringBuilder();
    try {
        ContentHandler handler = new DefaultHandler();
        Metadata metadata = new Metadata();
        Parser parser = new AutoDetectParser();
        parser.parse(input, handler, metadata, new ParseContext());

        // names() is requested once rather than on every loop iteration.
        for (String name : metadata.names()) {
            // Resolve the value before appending so a failure never leaves a half-written line.
            String value = stripWhiteSpace(metadata.get(name));
            result.append(name.toUpperCase()).append(" : ").append(value).append("\n");
        }
    } catch (Exception e) {
        // Keep the best-effort contract, but make the failure visible.
        e.printStackTrace();
    } finally {
        if (input != null) {
            input.close();
        }
    }
    return result.toString();
}
示例13: testNulls
import org.apache.tika.parser.ParseContext; //導入依賴的package包/類
/**
 * Feeds an empty text to {@code geoparser} and verifies that none of the three
 * geographic metadata fields is set afterwards.
 */
@Test
public void testNulls() throws UnsupportedEncodingException, IOException,
        SAXException, TikaException {
    final GeoParserConfig geoConfig = new GeoParserConfig();
    geoConfig.setGazetterPath(gazetteer);
    geoConfig.setNERModelPath(nerPath);

    final ParseContext context = new ParseContext();
    context.set(GeoParserConfig.class, geoConfig);

    final String emptyText = "";
    final Metadata parsed = new Metadata();
    geoparser.parse(new ByteArrayInputStream(emptyText.getBytes("UTF-8")),
            new BodyContentHandler(), parsed, context);

    final String[] geoKeys = {
        "Geographic_NAME", "Geographic_LONGITUDE", "Geographic_LATITUDE"
    };
    for (String key : geoKeys) {
        assertNull(parsed.get(key));
    }
}
示例14: Convert
import org.apache.tika.parser.ParseContext; //導入依賴的package包/類
/**
 * Parses {@code Bytes} with an auto-detecting parser, storing the extracted text in the
 * {@code FullText} field and the metadata, rendered as {@code key=value} lines, in the
 * {@code FileMetadata} field.
 *
 * @param Bytes stream holding the document to parse; it is not closed by this method
 * @return the value of {@code FullText} after the attempt
 * @throws PDException declared by the signature; on any failure
 *                     {@code PDException.GenPDException} is invoked with the error key
 *                     and the localized message of the cause
 */
protected String Convert(InputStream Bytes) throws PDException
{
    try {
        // -1 is passed as the handler's write limit (documented by Tika as "no limit").
        ContentHandler textHandler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        Parser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        parser.parse(Bytes, textHandler, metadata, context);

        // Build the listing in a local buffer and assign the field once, instead of
        // re-concatenating the growing field value for every metadata key.
        StringBuilder listing = new StringBuilder();
        for (String key : metadata.names()) {
            listing.append(key).append("=").append(metadata.get(key)).append("\n");
        }
        FileMetadata = listing.toString();
        FullText = textHandler.toString();
    } catch (Exception ex) {
        PDException.GenPDException("Error_extracting_content_from_doc", ex.getLocalizedMessage());
    }
    return FullText;
}
示例15: readXlsx
import org.apache.tika.parser.ParseContext; //導入依賴的package包/類
/**
 * Parses the .xlsx file at {@code xlsxFilePath} with the OOXML parser and wraps the
 * extracted body text in an {@code ExcelData}.
 *
 * <p>When {@code DEBUG_PRINT_META_DATA} is set, the metadata entries are printed as well.
 *
 * @param xlsxFilePath path of the spreadsheet to read
 * @return an {@code ExcelData} built from the extracted text
 * @throws IOException   if the file cannot be opened, read or closed
 * @throws TikaException propagated from the parse call
 * @throws SAXException  propagated from the parse call
 */
public static ExcelData readXlsx(String xlsxFilePath)
        throws IOException, InvalidFormatException, XmlException, TikaException, SAXException {
    BodyContentHandler bcHandler = new BodyContentHandler();
    Metadata metadata = new Metadata();

    // try-with-resources: the file handle is released whether parsing succeeds or throws.
    try (FileInputStream inputStream = new FileInputStream(new File(xlsxFilePath))) {
        ParseContext pcontext = new ParseContext();
        OOXMLParser parser = new OOXMLParser();
        parser.parse(inputStream, bcHandler, metadata, pcontext);
    }

    if (DEBUG_PRINT_META_DATA) {
        // NOTE(review): the header goes to stderr while the entries go to stdout - confirm intended.
        System.err.println("Metadata:");
        for (String name : metadata.names()) {
            System.out.println(name + "\t:\t" + metadata.get(name));
        }
    }

    return new ExcelData(bcHandler.toString());
}