本文整理汇总了Java中org.apache.tika.parser.AutoDetectParser.parse方法的典型用法代码示例。如果您正苦于以下问题:Java AutoDetectParser.parse方法的具体用法?Java AutoDetectParser.parse怎么用?Java AutoDetectParser.parse使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.tika.parser.AutoDetectParser的用法示例。
在下文中一共展示了AutoDetectParser.parse方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: doProcessStream
import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * Runs the superclass processing, then parses the stream with Tika and copies the
 * extracted body text and every metadata entry onto the CAS.
 *
 * <p>If Tika fails with a {@code SAXException} or {@code TikaException}, the failure is
 * reported through the monitor and, when the CAS still has no document text,
 * {@code CORRUPT_FILE_TEXT} is stored as a placeholder.
 *
 * @param stream content to parse
 * @param source label for the content, used in the warning message
 * @param jCas CAS that receives the document text and metadata
 * @throws IOException if reading the stream fails
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
    super.doProcessStream(stream, source, jCas);

    try {
        // Integer.MAX_VALUE is handed to BodyContentHandler as its size argument.
        final BodyContentHandler body = new BodyContentHandler(Integer.MAX_VALUE);
        final Metadata extracted = new Metadata();
        final ParseContext parseContext = new ParseContext();

        new AutoDetectParser().parse(stream, body, extracted, parseContext);

        jCas.setDocumentText(body.toString());
        for (String key : extracted.names()) {
            addMetadata(jCas, key, extracted.get(key));
        }
    } catch (SAXException | TikaException e) {
        getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
        final boolean noTextYet = Strings.isNullOrEmpty(jCas.getDocumentText());
        if (noTextYet) {
            jCas.setDocumentText(CORRUPT_FILE_TEXT);
        }
    }
}
示例2: extractImageLinks
import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * Extracts the URIs of all image links found in the document served at a URL.
 *
 * @param url address of the document to fetch and parse
 * @return an {@code Object[]} containing the distinct image link URIs; empty when
 *         the document has no image links
 * @throws IOException if the URL is malformed or the document cannot be opened or read
 * @throws SAXException if the link handler fails while processing the parse events
 * @throws TikaException if Tika cannot parse the document
 */
public Object extractImageLinks(String url) throws IOException, SAXException, TikaException {
    Set<String> imageLinks = new HashSet<String>();
    // try-with-resources only closes a stream that was actually opened, so a failure
    // in openStream() surfaces as the real IOException instead of a
    // NullPointerException raised while closing a null reference.
    try (InputStream is = TikaInputStream.get(new URL(url).openStream())) {
        Metadata metadata = new Metadata();
        LinkContentHandler handler = new LinkContentHandler();
        AutoDetectParser parser = new AutoDetectParser();
        parser.parse(is, handler, metadata);
        for (Link link : handler.getLinks()) {
            if (link.isImage()) {
                imageLinks.add(link.getUri());
            }
        }
    }
    return imageLinks.toArray();
}
示例3: process
import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * Opens the (authenticated) URL, parses its content with Tika and feeds the parse
 * events to a {@code StreamingToRDFContentHandler} built from {@code writer} and
 * {@code out}.
 *
 * <p>Any exception raised while parsing is rethrown wrapped in a
 * {@code TikaException}. The input stream, the reader and {@code out} are all closed
 * before this method returns, whether or not parsing succeeded.
 *
 * @param parseUrl URL of the document to process
 * @param out stream handed to the content handler; closed by this method
 * @throws IOException if the connection cannot be opened or a stream fails to close
 * @throws SAXException declared by the signature; parse-time failures arrive wrapped
 *         in {@code TikaException}
 * @throws TikaException if parsing fails for any reason
 */
@Override
protected void process(URL parseUrl, ObjectOutputStream out)
        throws IOException, SAXException, TikaException {
    URL url = URLTools.getAuthenticatedUrl(parseUrl);
    URLConnection conn = url.openConnection();
    InputStream in = conn.getInputStream();
    InputStreamReader ir = new InputStreamReader(in);
    try {
        AutoDetectParser parser = new AutoDetectParser();
        ContentHandler handler = new StreamingToRDFContentHandler(writer, out);
        Metadata metadata = new Metadata();
        metadata.add(Metadata.RESOURCE_NAME_KEY, url.toExternalForm());
        // NOTE(review): the reader is created without an explicit charset, so
        // getEncoding() reports the reader's own (JVM default) encoding rather than
        // one detected from the content - confirm this is the intended value.
        metadata.add(Metadata.CONTENT_ENCODING, ir.getEncoding());
        parser.parse(in, handler, metadata, new ParseContext());
    } catch (Exception e) {
        throw new TikaException(e.getMessage(), e);
    } finally {
        // Chain the closes so that a failure closing one stream cannot leave the
        // remaining ones open; the close order matches the original (in, out, ir).
        try {
            in.close();
        } finally {
            try {
                out.close();
            } finally {
                ir.close();
            }
        }
    }
}
示例4: init
import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * Creates an {@code Instance} whose {@code select} parses the payload with Tika and
 * returns the converted body text as a single-element list.
 *
 * <p>The {@code data} and {@code isSegment} arguments, and the {@code value} argument
 * of {@code select}, are not used by this implementation.
 *
 * @param data blob metadata (unused here)
 * @param payload source of the content stream; released if parsing fails
 * @param isSegment segment flag (unused here)
 * @return an instance that extracts text from {@code payload} on each {@code select}
 */
@Override
public Instance<String> init(BlobMetadata data, Payload payload, boolean isSegment) {
    return new Instance<String>() {
        @Override
        public <T> List<T> select(String value, DataConverter<String, T> converter) {
            TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
            org.apache.tika.metadata.Metadata metadata = new org.apache.tika.metadata.Metadata();
            AutoDetectParser parser = new AutoDetectParser(tikaConfig);
            ContentHandler handler = new BodyContentHandler();
            // try-with-resources closes the Tika stream on both the success and the
            // failure path; previously it was never closed.
            try (TikaInputStream stream = TikaInputStream.get(payload.openStream())) {
                parser.parse(stream, handler, metadata, new ParseContext());
            } catch (Exception e) {
                payload.release();
                throw Throwables.propagate(e);
            }
            return Arrays.asList(converter.convert(handler.toString()));
        }
    };
}
示例5: extractMeta
import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * Parses the resource at {@code uri} with Tika and returns all extracted metadata as
 * a JSON object mapping each metadata name to the array of its values.
 *
 * @param uri location of the resource, passed to {@code fillMetadata} and
 *        {@code createInputStream}
 * @param contentType content type passed to {@code fillMetadata}
 * @return JSON text produced by Gson from the name-to-values map
 * @throws Exception if the stream cannot be created or parsing fails
 */
public static String extractMeta(String uri, String contentType) throws Exception {
    final AutoDetectParser parser = createParser();
    final Metadata metadata = new Metadata();
    fillMetadata(parser, metadata, contentType, uri);
    // try-with-resources guarantees the stream is closed even when parse() throws.
    try (TikaInputStream inputStream = createInputStream(uri, metadata)) {
        // DefaultHandler discards the content events; only the metadata is wanted.
        parser.parse(inputStream, new DefaultHandler(), metadata);
    }
    final Map<String, String[]> meta = new HashMap<>();
    for (String name : metadata.names()) {
        meta.put(name, metadata.getValues(name));
    }
    return new Gson().toJson(meta);
}
示例6: indexDoc
import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * Indexes a single document.
 *
 * <p>The file is parsed with Tika; every extracted metadata entry becomes a stored
 * text field, and the file path and {@code lastModified} value are added as the
 * {@code path} and {@code modified} fields. A summary (title, artists, genre, year)
 * and the add/update action are appended to {@code results}. The document is added
 * when the writer is in {@code CREATE} mode, otherwise it replaces any document with
 * the same {@code path} term.
 *
 * @param writer index writer receiving the document
 * @param file file to parse and index
 * @param results text area that receives the progress messages
 * @param lastModified value stored in the {@code modified} field
 * @throws IOException if the file cannot be read or the index cannot be written
 * @throws SAXException if the content handler fails during parsing
 * @throws TikaException if Tika cannot parse the file
 */
public static void indexDoc(IndexWriter writer, Path file, TextArea results, long lastModified)
        throws IOException, SAXException, TikaException {
    AutoDetectParser parser = new AutoDetectParser();
    BodyContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try (InputStream stream = Files.newInputStream(file)) {
        parser.parse(stream, handler, metadata);
        Document doc = new Document();
        for (String name : metadata.names()) {
            doc.add(new TextField(name, metadata.get(name), Field.Store.YES));
        }
        doc.add(new StringField("path", file.toString(), Field.Store.YES));
        doc.add(new LongPoint("modified", lastModified));
        results.appendText("Title: " + metadata.get("title") + "\n");
        results.appendText("Artists: " + metadata.get("xmpDM:artist") + "\n");
        results.appendText("Genre: " + metadata.get("xmpDM:genre") + "\n");
        results.appendText("Year: " + metadata.get("xmpDM:releaseDate") + "\n");
        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can
            // be there):
            results.appendText("adding " + file + "\n");
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been
            // indexed). The trailing newline keeps this message on its own line,
            // consistent with the "adding" branch.
            results.appendText("updating " + file + "\n");
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}
示例7: doTikaStuff
import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * Extracts the text of an object with Tika and assembles the result.
 *
 * <p>Parse events are sent to a {@code TransformerHandler} configured for the
 * {@code text} output method, which writes into an in-memory buffer. If a
 * {@code TikaException} occurs the result of {@code assembleExceptionResult} is
 * returned; otherwise the result of {@code assembleExtractionResult}.
 *
 * @param bucket bucket name, passed through to the result assemblers
 * @param key object key, passed through to the result assemblers; a key ending in
 *        {@code tika.exception.testing.pdf} (case-insensitive) forces a test failure
 * @param objectData content to parse
 * @return the assembled extraction or exception result
 * @throws IOException if reading the content fails
 * @throws TransformerConfigurationException if the transformer handler cannot be created
 * @throws SAXException if the handler fails while processing parse events
 */
private String doTikaStuff(String bucket, String key, InputStream objectData) throws IOException, TransformerConfigurationException, SAXException {
    _logger.log("Extracting text with Tika");
    String extractedText = "";
    SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
    TransformerHandler handler = factory.newTransformerHandler();
    handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "text");
    handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
    StringWriter sw = new StringWriter();
    handler.setResult(new StreamResult(sw));
    AutoDetectParser parser = new AutoDetectParser();
    ParseContext parseContext = new ParseContext();
    // Register the parser in the context under Parser.class.
    parseContext.set(Parser.class, parser);
    Metadata tikaMetadata = new Metadata();
    try {
        // for synthetic transactions
        // Locale.ROOT keeps the case folding independent of the JVM default locale
        // (a Turkish locale, for example, lower-cases 'I' to a dotless i).
        if (key.toLowerCase(java.util.Locale.ROOT).endsWith("tika.exception.testing.pdf")) {
            throw new TikaException("Test Tika Exception");
        }
        parser.parse(objectData, handler, tikaMetadata, parseContext);
        extractedText = sw.toString();
    } catch (TikaException e) {
        _logger.log("TikaException thrown while parsing: " + e.getLocalizedMessage());
        return assembleExceptionResult(bucket, key, e);
    }
    _logger.log("Tika parsing success");
    return assembleExtractionResult(bucket, key, extractedText, tikaMetadata);
}
示例8: doProcessStream
import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * Runs the superclass processing, then parses the stream with Tika and stores the
 * cleaned body text and every metadata entry on the CAS.
 *
 * <p>When {@code tearlinePattern} matches the extracted text, only the part before
 * the first match is kept. The kept text is passed through {@code removeBoilerplate}
 * and trimmed. A {@code SAXException} or {@code TikaException} is reported through
 * the monitor and otherwise ignored.
 *
 * @param stream content to parse
 * @param source label for the content, used in the warning message
 * @param jCas CAS that receives the document text and metadata
 * @throws IOException if reading the stream fails
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
    super.doProcessStream(stream, source, jCas);

    try {
        final BodyContentHandler body = new BodyContentHandler(Integer.MAX_VALUE);
        final Metadata extracted = new Metadata();
        final ParseContext parseContext = new ParseContext();
        new AutoDetectParser().parse(stream, body, extracted, parseContext);

        final String raw = body.toString();
        final Matcher tearline = tearlinePattern.matcher(raw);
        // Keep only the text preceding the first tearline, if there is one.
        final String kept = tearline.find() ? raw.substring(0, tearline.start()) : raw;
        jCas.setDocumentText(removeBoilerplate(kept).trim());

        for (String key : extracted.names()) {
            addMetadata(jCas, key, extracted.get(key));
        }
    } catch (SAXException | TikaException e) {
        getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
    }
}
示例9: parseToXML
import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * Parses the stream with an auto-detecting Tika parser and returns the string form
 * of the {@code ToXMLContentHandler} that received the parse events.
 *
 * @param inputStream content to parse; not closed by this method
 * @return the handler's accumulated output
 * @throws IOException if reading the stream fails
 * @throws SAXException if the handler fails while processing parse events
 * @throws TikaException if Tika cannot parse the content
 */
public static String parseToXML(InputStream inputStream) throws IOException, SAXException, TikaException {
    final ContentHandler xmlSink = new ToXMLContentHandler();
    new AutoDetectParser().parse(inputStream, xmlSink, new Metadata());
    return xmlSink.toString();
}
示例10: parseToPlainText
import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * Extracts the body text of a stream with an auto-detecting Tika parser.
 *
 * <p>Parsing failures are not propagated: they are logged at debug level and an
 * empty string is returned.
 *
 * @param fileStream content to parse; not closed by this method
 * @return the extracted text, or {@code ""} when parsing fails
 */
public static String parseToPlainText(InputStream fileStream) {
    final BodyContentHandler body = new BodyContentHandler();
    final AutoDetectParser tikaParser = new AutoDetectParser();
    final Metadata ignoredMetadata = new Metadata();
    try {
        tikaParser.parse(fileStream, body, ignoredMetadata);
        return body.toString();
    } catch (IOException | SAXException | TikaException e) {
        LOG.debug("Parsing Exception while extracting content from current file. "
                + e.toString());
        return "";
    }
}
示例11: parse
import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * Parses a stream into a new {@code TikaReaderHandler} named after the stream.
 *
 * <p>The name comes from {@code source.getStreamName(is)}, and the handler's own
 * metadata object is the one populated by the parse.
 *
 * @param parser parser to run
 * @param parseContext context passed to the parser
 * @param is content to parse; not closed by this method
 * @return the handler that received the parse events
 * @throws IOException if reading the stream fails
 * @throws SAXException if the handler fails while processing parse events
 * @throws TikaException if Tika cannot parse the content
 */
private TikaReaderHandler parse(AutoDetectParser parser, ParseContext parseContext, InputStream is) throws IOException, SAXException, TikaException {
    final TikaReaderHandler readerHandler = new TikaReaderHandler(source.getStreamName(is));
    parser.parse(is, readerHandler, readerHandler.getMetadata(), parseContext);
    return readerHandler;
}
示例12: main
import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * Ad-hoc driver: parses a hard-coded spreadsheet file with an
 * {@code AutoDetectParser} built from a hard-coded Tika configuration file, and
 * prints the body content, serialized as XML, to standard output.
 *
 * <p>Any exception is caught and its stack trace printed.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    try {
        TikaConfig config = new TikaConfig(new File("/home/foo/a/code/big_bang/tika-1.5/"
                + "tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml"));
        final AutoDetectParser autoDetectParser = new AutoDetectParser(config);
        final Tika tika = new Tika();
        File xpsFile = new File("/home/foo/a/temp/xlsx.xlsx");
        String fileName = xpsFile.getName();
        Metadata metadata = new Metadata();
        if (fileName != null && fileName.length() > 0) {
            metadata.add(Metadata.RESOURCE_NAME_KEY, fileName);
        }
        // NOTE(review): the detected type is not used afterwards; the call is kept
        // so that any effect detection has on `metadata` is preserved - confirm
        // whether it can be dropped.
        try (InputStream detectStream = new FileInputStream(xpsFile)) {
            tika.detect(detectStream, metadata);
        }

        // Serialize the SAX events of the document body as unindented XML.
        StringWriter sw = new StringWriter();
        SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
        handler.setResult(new StreamResult(sw));
        BodyContentHandler bch = new BodyContentHandler(handler);
        handler.startDocument();

        // A fresh stream is opened for parsing; try-with-resources ensures it is
        // closed, which the previous code never did.
        try (InputStream parseStream = new FileInputStream(xpsFile)) {
            autoDetectParser.parse(parseStream, bch, metadata);
        }
        System.out.println(sw.toString());
    } catch (Exception e) {
        e.printStackTrace();
    }
}
示例13: discoverAgainstSingleModel
import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * Scans every file under the configured directories, runs the model's name finder
 * over the text Tika extracts from each file, and records the files whose average
 * span probability reaches the threshold.
 *
 * <p>Directories come from the comma-separated {@code directories} property and are
 * walked recursively. Files whose extension appears in the comma-separated
 * {@code exclusions} property are skipped. The {@code fileMatches} field is reset at
 * the start of each call and returned at the end.
 *
 * @param fileDiscoveryProperties must define {@code directories} and {@code exclusions}
 * @param model supplies the tokenizer, the name finder and the model name
 * @param probabilityThreshold minimum average probability for a file to be reported
 * @return the matches collected for this model
 * @throws AnonymizerException declared by the signature
 * @throws IOException if a canonical path cannot be resolved
 * @throws SAXException if the content handler fails during parsing
 * @throws TikaException if Tika cannot parse a file
 */
private List<FileMatchMetaData> discoverAgainstSingleModel(final Properties fileDiscoveryProperties, final Model model, final double probabilityThreshold)
        throws AnonymizerException, IOException, SAXException, TikaException {
    // Start running NLP algorithms for each column and collect percentage
    fileMatches = new ArrayList<>();
    final String directories = fileDiscoveryProperties.getProperty("directories");
    final String exclusions = fileDiscoveryProperties.getProperty("exclusions");
    final String[] directoryList = directories.split(",");
    // Built once rather than once per file.
    final List<String> excludedExtensions = Arrays.asList(exclusions.split(","));

    log.info("File types not considered for analysis: " + exclusions);

    // Let's iterate over directories
    for (final String directory : directoryList) {
        final File node = new File(directory);
        final List<File> files = (List<File>) FileUtils.listFiles(node, null, true);

        for (final File fich : files) {
            final String file = fich.getName();
            final String recursivedir = fich.getParent();

            log.info("Analyzing [" + fich.getCanonicalPath() + "]");
            final String ext = CommonUtils.getFileExtension(fich);
            if (excludedExtensions.contains(ext)) {
                // less verbose - Ignored types on the top
                continue;
            }

            // -1 is passed to BodyContentHandler as its size argument.
            final BodyContentHandler handler = new BodyContentHandler(-1);
            final AutoDetectParser parser = new AutoDetectParser();
            final Metadata metadata = new Metadata();
            String handlerString = "";
            // try-with-resources closes each file stream; previously one stream
            // was leaked for every analysed file.
            try (InputStream stream = new FileInputStream(fich.getCanonicalPath())) {
                parser.parse(stream, handler, metadata);
                handlerString = handler.toString();
            } catch (IOException e) {
                log.info("Unable to read " + fich.getCanonicalPath() +".Ignoring...");
            }

            log.debug("Content: " + handlerString);
            // NOTE(review): after a read failure the handler's current content is
            // still tokenized below, as before - confirm whether such files should
            // be skipped instead.
            final String[] tokens = model.getTokenizer().tokenize(handler.toString());
            final Span[] nameSpans = model.getNameFinder().find(tokens);
            final double[] spanProbs = model.getNameFinder().probs(nameSpans);

            //display names
            final List<Probability> probabilityList = new ArrayList<>();
            for (int i = 0; i < nameSpans.length; i++) {
                log.info("Span: "+nameSpans[i].toString());
                log.info("Covered text is: "+tokens[nameSpans[i].getStart()]);
                log.info("Probability is: "+spanProbs[i]);
                probabilityList.add(new Probability(tokens[nameSpans[i].getStart()], spanProbs[i]));
            }
            model.getNameFinder().clearAdaptiveData();

            final double averageProbability = calculateAverage(probabilityList);
            if (averageProbability >= probabilityThreshold) {
                final FileMatchMetaData result = new FileMatchMetaData(recursivedir, file);
                result.setAverageProbability(averageProbability);
                result.setModel(model.getName());
                fileMatches.add(result);
            }
        }
    }

    return fileMatches;
}
示例14: testExcelXLSB
import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * The .xlsb file format (an OOXML container with binary blobs) is not
 * currently supported, but such files must not break parsing either
 * (TIKA-826).
 */
@Test
public void testExcelXLSB() throws Exception {
    final String resource = "/test-documents/testEXCEL.xlsb";
    final Detector typeDetector = new DefaultDetector();
    final AutoDetectParser autoParser = new AutoDetectParser();

    InputStream stream = ExcelParserTest.class.getResourceAsStream(resource);
    final Metadata metadata = new Metadata();
    metadata.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");

    // Step 1: the detector must report the binary-workbook media type.
    MediaType detected = null;
    try {
        detected = typeDetector.detect(stream, metadata);
        assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", detected.toString());
    } finally {
        stream.close();
    }

    // Step 2: neither dedicated Office parser lists the type as supported.
    final boolean officeSupports =
            (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(detected);
    assertEquals(false, officeSupports);
    final boolean ooxmlSupports =
            (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(detected);
    assertEquals(false, ooxmlSupports);

    // Step 3: the auto-detecting parser copes with the file and yields no text.
    stream = ExcelParserTest.class.getResourceAsStream(resource);
    try {
        final ContentHandler body = new BodyContentHandler(-1);
        final ParseContext usContext = new ParseContext();
        usContext.set(Locale.class, Locale.US);
        autoParser.parse(stream, body, metadata, usContext);
        assertEquals("", body.toString());
    } finally {
        stream.close();
    }
}
示例15: testExcel95
import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * The old Excel 95 .xls file format is not currently supported, but such
 * files must not break parsing either (TIKA-976).
 */
@Test
public void testExcel95() throws Exception {
    final String resource = "/test-documents/testEXCEL_95.xls";
    final Detector typeDetector = new DefaultDetector();
    final AutoDetectParser autoParser = new AutoDetectParser();

    InputStream stream = ExcelParserTest.class.getResourceAsStream(resource);
    final Metadata metadata = new Metadata();
    metadata.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");

    // Step 1: the detector must report the classic Excel media type.
    MediaType detected = null;
    try {
        detected = typeDetector.detect(stream, metadata);
        assertEquals("application/vnd.ms-excel", detected.toString());
    } finally {
        stream.close();
    }

    // Step 2: OfficeParser lists the type as supported, OOXMLParser does not.
    final boolean officeSupports =
            (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(detected);
    assertEquals(true, officeSupports);
    final boolean ooxmlSupports =
            (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(detected);
    assertEquals(false, ooxmlSupports);

    // Step 3: the auto-detecting parser copes with the file and yields no text.
    stream = ExcelParserTest.class.getResourceAsStream(resource);
    try {
        final ContentHandler body = new BodyContentHandler(-1);
        final ParseContext usContext = new ParseContext();
        usContext.set(Locale.class, Locale.US);
        autoParser.parse(stream, body, metadata, usContext);
        assertEquals("", body.toString());
    } finally {
        stream.close();
    }
}