本文整理匯總了Java中org.apache.tika.parser.ParseContext.set方法的典型用法代碼示例。如果您正苦於以下問題:Java ParseContext.set方法的具體用法?Java ParseContext.set怎麼用?Java ParseContext.set使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類org.apache.tika.parser.ParseContext的用法示例。
在下文中一共展示了ParseContext.set方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Java代碼示例。
示例1: getFullText
import org.apache.tika.parser.ParseContext; //導入方法依賴的package包/類
/**
 * Extracts the body text of the file at {@code filepath} with an auto-detecting parser.
 *
 * @param filepath path of the file to parse
 * @return the text collected by the body content handler
 * @throws IOException   if the file cannot be opened or read
 * @throws SAXException  if the content handler reports a failure
 * @throws TikaException if the parser fails to process the document
 */
private static String getFullText(final String filepath) throws IOException, SAXException, TikaException {
    final StringWriter writer = new StringWriter();
    // try-with-resources closes the stream on every path; unlike a manual finally block,
    // a failure in close() is recorded as suppressed instead of replacing a parse failure.
    try (TikaInputStream inputStream = TikaInputStream.get(new File(filepath))) {
        final Detector detector = new DefaultDetector();
        final Parser parser = new AutoDetectParser(detector);
        final Metadata metadata = new Metadata();
        final ParseContext parseContext = new ParseContext();
        // Make the same parser available through the context under the Parser key.
        parseContext.set(Parser.class, parser);
        final ContentHandler contentHandler = new BodyContentHandler(writer);
        parser.parse(inputStream, contentHandler, metadata, parseContext);
    }
    return writer.toString();
}
示例2: buildParseContext
import org.apache.tika.parser.ParseContext; //導入方法依賴的package包/類
/**
 * Builds on the parent's parse context and, when recursion into embedded
 * content is requested, registers an {@link AutoDetectParser} under the
 * {@code Parser} key.
 *
 * The per-call {@code options.getIncludeEmbedded()} value, when non-null,
 * takes precedence over the instance-level {@code includeContents} default.
 */
@Override
protected ParseContext buildParseContext(Metadata metadata,
        String targetMimeType, TransformationOptions options) {
    final ParseContext parseContext = super.buildParseContext(metadata, targetMimeType, options);

    boolean descend = includeContents;
    final Boolean embeddedOverride = options.getIncludeEmbedded();
    if (embeddedOverride != null) {
        descend = embeddedOverride;
    }
    if (!descend) {
        return parseContext;
    }

    // Use an auto detect parser to handle the contents; the config is created lazily.
    if (tikaConfig == null) {
        tikaConfig = TikaConfig.getDefaultConfig();
    }
    parseContext.set(Parser.class, new AutoDetectParser(tikaConfig));
    return parseContext;
}
示例3: PDFExtract
import org.apache.tika.parser.ParseContext; //導入方法依賴的package包/類
/**
 * Creates the extractor: an auto-detecting parser plus a parse context that
 * carries a default Tesseract OCR configuration and a PDF configuration with
 * inline-image extraction switched on.
 */
public PDFExtract() {
    this.parser = new AutoDetectParser();

    final TesseractOCRConfig ocrSettings = new TesseractOCRConfig();
    final PDFParserConfig pdfSettings = new PDFParserConfig();
    pdfSettings.setExtractInlineImages(true);

    this.parseContext = new ParseContext();
    this.parseContext.set(PDFParserConfig.class, pdfSettings);
    this.parseContext.set(TesseractOCRConfig.class, ocrSettings);
    // The parser itself must be registered too, otherwise recursive parsing does not happen.
    this.parseContext.set(Parser.class, this.parser);
}
示例4: testNulls
import org.apache.tika.parser.ParseContext; //導入方法依賴的package包/類
/**
 * Parses an empty document with the geo parser and checks that none of the
 * geographic metadata keys are populated.
 */
@Test
public void testNulls() throws UnsupportedEncodingException, IOException,
        SAXException, TikaException {
    final GeoParserConfig geoConfig = new GeoParserConfig();
    geoConfig.setGazetterPath(gazetteer);
    geoConfig.setNERModelPath(nerPath);

    final ParseContext parseContext = new ParseContext();
    parseContext.set(GeoParserConfig.class, geoConfig);

    final String emptyInput = "";
    final Metadata parsed = new Metadata();
    geoparser.parse(
            new ByteArrayInputStream(emptyInput.getBytes("UTF-8")),
            new BodyContentHandler(),
            parsed,
            parseContext);

    assertNull(parsed.get("Geographic_NAME"));
    assertNull(parsed.get("Geographic_LONGITUDE"));
    assertNull(parsed.get("Geographic_LATITUDE"));
}
示例5: parseTXTToString
import org.apache.tika.parser.ParseContext; //導入方法依賴的package包/類
/**
 * Runs {@code txtParser} over the stream and returns the text collected by a
 * length-limited handler. The stream is always closed before returning.
 *
 * @param stream   input to parse; closed by this method
 * @param metadata metadata object handed to the parser
 * @return the text gathered by the handler (possibly cut off at the write limit)
 * @throws IOException   if reading or closing the stream fails
 * @throws TikaException if parsing fails, or on a SAX failure that is not the write limit
 */
private String parseTXTToString(InputStream stream, Metadata metadata) throws IOException, TikaException {
    final WriteOutContentHandler limitedSink = new WriteOutContentHandler(maxStringLength);
    try {
        final ParseContext parseContext = new ParseContext();
        parseContext.set(Parser.class, txtParser);
        final BodyContentHandler bodyHandler = new BodyContentHandler(limitedSink);
        txtParser.parse(stream, bodyHandler, metadata, parseContext);
    } catch (SAXException saxFailure) {
        // Hitting the write limit is expected: keep whatever was collected so far.
        final boolean limitHit = limitedSink.isWriteLimitReached(saxFailure);
        if (!limitHit) {
            // This should never happen with BodyContentHandler...
            throw new TikaException("Unexpected SAX processing failure", saxFailure);
        }
    } finally {
        stream.close();
    }
    return limitedSink.toString();
}
示例6: testWordCustomProperties
import org.apache.tika.parser.ParseContext; //導入方法依賴的package包/類
/**
 * Parses {@code testWORD_custom_props.docx} with {@code OOXMLParser} under
 * {@code Locale.US} and verifies the standard and custom document properties
 * reported in the metadata.
 */
@Test
public void testWordCustomProperties() throws Exception {
    Metadata metadata = new Metadata();
    // try-with-resources skips close() when the resource is null, so a missing test
    // document surfaces as the parser's own failure rather than an NPE from a finally block.
    try (InputStream input = OOXMLParserTest.class.getResourceAsStream(
            "/test-documents/testWORD_custom_props.docx")) {
        ContentHandler handler = new BodyContentHandler(-1);
        ParseContext context = new ParseContext();
        context.set(Locale.class, Locale.US);
        new OOXMLParser().parse(input, handler, metadata, context);
    }

    assertEquals(
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            metadata.get(Metadata.CONTENT_TYPE));
    assertEquals("EJ04325S", metadata.get(TikaCoreProperties.CREATOR));
    assertEquals("Etienne Jouvin", metadata.get(TikaCoreProperties.MODIFIER));
    assertEquals("Etienne Jouvin", metadata.get(Metadata.LAST_AUTHOR));
    assertEquals("2011-07-29T16:52:00Z", metadata.get(TikaCoreProperties.CREATED));
    assertEquals("2011-07-29T16:52:00Z", metadata.get(Metadata.CREATION_DATE));
    assertEquals("2012-01-03T22:14:00Z", metadata.get(TikaCoreProperties.MODIFIED));
    assertEquals("2012-01-03T22:14:00Z", metadata.get(Metadata.DATE));
    assertEquals("Microsoft Office Word",metadata.get(Metadata.APPLICATION_NAME));
    assertEquals("Microsoft Office Word",metadata.get(OfficeOpenXMLExtended.APPLICATION));
    assertEquals("1", metadata.get(Office.PAGE_COUNT));
    assertEquals("2", metadata.get(Office.WORD_COUNT));
    assertEquals("My Title", metadata.get(TikaCoreProperties.TITLE));
    assertEquals("My Keyword", metadata.get(TikaCoreProperties.KEYWORDS));
    assertEquals("Normal.dotm", metadata.get(Metadata.TEMPLATE));
    assertEquals("Normal.dotm", metadata.get(OfficeOpenXMLExtended.TEMPLATE));
    // TODO: Remove subject in Tika 2.0
    assertEquals("My subject", metadata.get(Metadata.SUBJECT));
    assertEquals("My subject", metadata.get(OfficeOpenXMLCore.SUBJECT));
    assertEquals("EDF-DIT", metadata.get(TikaCoreProperties.PUBLISHER));
    assertEquals("true", metadata.get("custom:myCustomBoolean"));
    assertEquals("3", metadata.get("custom:myCustomNumber"));
    assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
    assertEquals("2010-12-30T23:00:00Z", metadata.get("custom:MyCustomDate"));
    assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
}
示例7: testCustomProperties
import org.apache.tika.parser.ParseContext; //導入方法依賴的package包/類
/**
 * Ensures that custom OLE2 (HPSF) properties are extracted.
 *
 * Parses {@code testEXCEL_custom_props.xls} with {@code OfficeParser} under
 * {@code Locale.US} and checks both the core and the {@code custom:} metadata.
 */
@Test
public void testCustomProperties() throws Exception {
    Metadata metadata = new Metadata();
    // try-with-resources skips close() when the resource is null, so a missing test
    // document surfaces as the parser's own failure rather than an NPE from a finally block.
    try (InputStream input = ExcelParserTest.class.getResourceAsStream(
            "/test-documents/testEXCEL_custom_props.xls")) {
        ContentHandler handler = new BodyContentHandler(-1);
        ParseContext context = new ParseContext();
        context.set(Locale.class, Locale.US);
        new OfficeParser().parse(input, handler, metadata, context);
    }

    assertEquals("application/vnd.ms-excel", metadata.get(Metadata.CONTENT_TYPE));
    assertEquals("", metadata.get(TikaCoreProperties.CREATOR));
    assertEquals("", metadata.get(TikaCoreProperties.MODIFIER));
    assertEquals("2011-08-22T13:45:54Z", metadata.get(TikaCoreProperties.MODIFIED));
    assertEquals("2006-09-12T15:06:44Z", metadata.get(TikaCoreProperties.CREATED));
    assertEquals("Microsoft Excel", metadata.get(OfficeOpenXMLExtended.APPLICATION));
    assertEquals("true", metadata.get("custom:myCustomBoolean"));
    assertEquals("3", metadata.get("custom:myCustomNumber"));
    assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
    assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
    assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
}
示例8: testJXL
import org.apache.tika.parser.ParseContext; //導入方法依賴的package包/類
/**
 * Parses {@code jxl.xls} with {@code OfficeParser} under {@code Locale.US} and
 * checks the detected content type and that the extracted text contains
 * "Number Formats".
 */
@Test
public void testJXL() throws Exception {
    // try-with-resources skips close() when the resource is null, so a missing test
    // document surfaces as the real failure rather than an NPE from a finally block.
    try (InputStream input = ExcelParserTest.class.getResourceAsStream(
            "/test-documents/jxl.xls")) {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler(-1);
        ParseContext context = new ParseContext();
        context.set(Locale.class, Locale.US);
        new OfficeParser().parse(input, handler, metadata, context);

        assertEquals(
                "application/vnd.ms-excel",
                metadata.get(Metadata.CONTENT_TYPE));
        String content = handler.toString();
        assertTrue(content.contains("Number Formats"));
    }
}
示例9: test_Russian_Excel_Parser
import org.apache.tika.parser.ParseContext; //導入方法依賴的package包/類
/**
 * Parses a spreadsheet whose file name contains Cyrillic characters with
 * {@code OfficeParser} under {@code Locale.US} and checks that some text
 * was extracted.
 */
@Test
public void test_Russian_Excel_Parser() throws Exception {
    // try-with-resources skips close() when the resource is null, so a missing test
    // document surfaces as the real failure rather than an NPE from a finally block.
    try (InputStream input = ExcelParserTest.class.getResourceAsStream(
            "/test-documents/Счет №3144 от 31.10.12 (35 320р.).xls")) {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();
        context.set(Locale.class, Locale.US);
        new OfficeParser().parse(input, handler, metadata, context);

        String content = handler.toString();
        assertTrue(!content.isEmpty());
    }
}
示例10: buildParseContext
import org.apache.tika.parser.ParseContext; //導入方法依賴的package包/類
/**
 * Creates a fresh {@link ParseContext}. No parser is registered on it, so by
 * default the context does not recurse; a {@link DocumentSelector} is attached
 * only when {@code getDocumentSelector} supplies one.
 */
protected ParseContext buildParseContext(Metadata metadata, String targetMimeType, TransformationOptions options)
{
    final ParseContext parseContext = new ParseContext();
    final DocumentSelector documentSelector = getDocumentSelector(metadata, targetMimeType, options);
    if (documentSelector == null)
    {
        return parseContext;
    }
    parseContext.set(DocumentSelector.class, documentSelector);
    return parseContext;
}
示例11: buildParseContext
import org.apache.tika.parser.ParseContext; //導入方法依賴的package包/類
/**
 * Takes the parent's parse context and adds this instance's
 * {@code pdfParserConfig} to it when one has been configured.
 */
@Override
protected ParseContext buildParseContext(Metadata metadata, String targetMimeType, TransformationOptions options)
{
    final ParseContext parseContext = super.buildParseContext(metadata, targetMimeType, options);
    // TODO: Possibly extend TransformationOptions to allow for per-transform PDFParserConfig?
    if (pdfParserConfig == null)
    {
        return parseContext;
    }
    parseContext.set(PDFParserConfig.class, pdfParserConfig);
    return parseContext;
}
示例12: buildParseContext
import org.apache.tika.parser.ParseContext; //導入方法依賴的package包/類
/**
 * Creates a new {@link ParseContext}, attaching a {@link DocumentSelector}
 * when {@code getDocumentSelector} returns one.
 *
 * @param metadata       metadata passed through to the selector lookup
 * @param sourceMimeType source MIME type passed through to the selector lookup
 * @return the parse context
 */
protected ParseContext buildParseContext(Metadata metadata, String sourceMimeType)
{
    final ParseContext parseContext = new ParseContext();
    final DocumentSelector documentSelector = getDocumentSelector(metadata, sourceMimeType);
    if (documentSelector == null)
    {
        return parseContext;
    }
    parseContext.set(DocumentSelector.class, documentSelector);
    return parseContext;
}
示例13: exportFile
import org.apache.tika.parser.ParseContext; //導入方法依賴的package包/類
/**
 * Converts the document at {@code in} to HTML and writes the result to {@code out}.
 *
 * The document is parsed into an in-memory buffer through an HTML-emitting
 * transformer; the buffer is then written to {@code out} and
 * {@code convertImagesInFile} is run on that file.
 *
 * @param in  path of the source document
 * @param out path of the HTML file to write
 * @throws RuntimeException wrapping any failure raised during the conversion
 */
@Override
public void exportFile(String in, String out) throws IOException
{
    try
    {
        embedded = new HashMap<>();
        ParseContext context = new ParseContext();
        context.set(Parser.class, new ExtractParser());
        Metadata metadata = new Metadata();
        Path path = Paths.get(in);
        ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
        SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance();
        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "UTF-8");
        handler.setResult(new StreamResult(outputStream));
        ContentHandler contentHandler = new BodyContentHandler(handler);
        // The input stream was previously never closed; release it as soon as parsing ends.
        try (InputStream stream = TikaInputStream.get(path))
        {
            tikaParser.parse(stream,contentHandler,metadata,context);
        }
        // The output file handle was previously leaked; close it before the file is post-processed.
        try (OutputStream outputStreamFile = new FileOutputStream (out))
        {
            outputStream.writeTo(outputStreamFile);
        }
        convertImagesInFile(out);
    }
    catch( Exception ex )
    {
        ex.printStackTrace();
        throw new RuntimeException("Error converting document", ex);
    }
}
示例14: doTikaStuff
import org.apache.tika.parser.ParseContext; //導入方法依賴的package包/類
/**
 * Extracts plain text and metadata from {@code objectData} with Tika and
 * assembles the result for the given bucket/key.
 *
 * A key ending in {@code tika.exception.testing.pdf} (case-insensitive)
 * deliberately triggers a {@code TikaException}, so the error path can be
 * exercised by synthetic transactions.
 *
 * @param bucket     bucket name, passed through to the result assembly
 * @param key        object key, passed through and checked for the synthetic-test suffix
 * @param objectData content to parse
 * @return the extraction result, or the exception result when Tika fails
 * @throws IOException                       if reading the content fails
 * @throws TransformerConfigurationException if the text transformer cannot be created
 * @throws SAXException                      if the content handler reports a failure
 */
private String doTikaStuff(String bucket, String key, InputStream objectData) throws IOException, TransformerConfigurationException, SAXException {
    _logger.log("Extracting text with Tika");
    String extractedText = "";
    SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance();
    TransformerHandler handler = factory.newTransformerHandler();
    handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "text");
    handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
    StringWriter sw = new StringWriter();
    handler.setResult(new StreamResult(sw));
    AutoDetectParser parser = new AutoDetectParser();
    ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, parser);
    Metadata tikaMetadata = new Metadata();
    try {
        // for synthetic transactions
        // Locale.ROOT keeps the comparison stable under locales with special casing rules
        // (e.g. Turkish, where 'I' lower-cases to a dotless 'ı').
        if( key.toLowerCase(java.util.Locale.ROOT).endsWith("tika.exception.testing.pdf")) {
            throw new TikaException("Test Tika Exception");
        }
        parser.parse(objectData, handler, tikaMetadata, parseContext);
        extractedText = sw.toString();
    } catch( TikaException e) {
        _logger.log("TikaException thrown while parsing: " + e.getLocalizedMessage());
        return assembleExceptionResult(bucket, key, e);
    }
    _logger.log("Tika parsing success");
    return assembleExtractionResult(bucket, key, extractedText, tikaMetadata);
}
示例15: testFunctions
import org.apache.tika.parser.ParseContext; //導入方法依賴的package包/類
/**
 * Runs the geo parser over a paragraph that mentions place names and checks
 * that the geographic name, longitude and latitude metadata are all populated.
 */
@Test
public void testFunctions() throws UnsupportedEncodingException,
        IOException, SAXException, TikaException {
    final GeoParserConfig geoConfig = new GeoParserConfig();
    geoConfig.setGazetterPath(gazetteer);
    geoConfig.setNERModelPath(nerPath);

    final ParseContext parseContext = new ParseContext();
    parseContext.set(GeoParserConfig.class, geoConfig);

    final String passage = "The millennial-scale cooling trend that followed the HTM coincides with the decrease in China "
            + "summer insolation driven by slow changes in Earth's orbit. Despite the nearly linear forcing, the transition from the HTM to "
            + "the Little Ice Age (1500-1900 AD) was neither gradual nor uniform. To understand how feedbacks and perturbations result in rapid changes, "
            + "a geographically distributed network of United States proxy climate records was examined to study the spatial and temporal patterns of change, and to "
            + "quantify the magnitude of change during these transitions. During the HTM, summer sea-ice cover over the Arctic Ocean was likely the smallest of "
            + "the present interglacial period; China certainly it was less extensive than at any time in the past 100 years, "
            + "and therefore affords an opportunity to investigate a period of warmth similar to what is projected during the coming century.";

    final Metadata parsed = new Metadata();
    final InputStream passageStream = new ByteArrayInputStream(passage.getBytes("UTF-8"));
    geoparser.parse(passageStream, new BodyContentHandler(), parsed, parseContext);

    assertNotNull(parsed.get("Geographic_NAME"));
    assertNotNull(parsed.get("Geographic_LONGITUDE"));
    assertNotNull(parsed.get("Geographic_LATITUDE"));
}