当前位置: 首页>>代码示例>>Java>>正文


Java AutoDetectParser.parse方法代码示例

本文整理汇总了Java中org.apache.tika.parser.AutoDetectParser.parse方法的典型用法代码示例。如果您正苦于以下问题:Java AutoDetectParser.parse方法的具体用法?Java AutoDetectParser.parse怎么用?Java AutoDetectParser.parse使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在org.apache.tika.parser.AutoDetectParser的用法示例。


在下文中一共展示了AutoDetectParser.parse方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: doProcessStream

import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * Extracts text and metadata from the stream with Tika and stores them on the JCas.
 *
 * <p>The parent implementation runs first. On success the body text becomes the document
 * text and every metadata name reported by Tika is handed to {@code addMetadata}. When
 * parsing fails with a SAX or Tika exception, a warning is logged and, if the JCas still
 * has no document text, {@code CORRUPT_FILE_TEXT} is used instead.
 *
 * @param stream the content to parse
 * @param source name of the source, used in the warning message
 * @param jCas the JCas that receives the text and metadata
 * @throws IOException propagated from the parent implementation or from the parser
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
	super.doProcessStream(stream, source, jCas);

	final Metadata tikaMetadata = new Metadata();
	// Integer.MAX_VALUE as write limit, as in the handler's constructor argument
	final BodyContentHandler bodyHandler = new BodyContentHandler(Integer.MAX_VALUE);

	try {
		new AutoDetectParser().parse(stream, bodyHandler, tikaMetadata, new ParseContext());
		jCas.setDocumentText(bodyHandler.toString());

		final String[] names = tikaMetadata.names();
		for (int i = 0; i < names.length; i++) {
			addMetadata(jCas, names[i], tikaMetadata.get(names[i]));
		}
	} catch (SAXException | TikaException e) {
		getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
		final boolean noTextYet = Strings.isNullOrEmpty(jCas.getDocumentText());
		if (noTextYet) {
			jCas.setDocumentText(CORRUPT_FILE_TEXT);
		}
	}
}
 
开发者ID:dstl,项目名称:baleen,代码行数:25,代码来源:TikaContentExtractor.java

示例2: extractImageLinks

import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * Extracts the URIs of all image links found in the document at the given URL.
 *
 * <p>The document is fetched and parsed with an {@code AutoDetectParser}; every link collected
 * by the {@code LinkContentHandler} for which {@code isImage()} is true contributes its URI.
 * Duplicate URIs are collapsed because the results are gathered in a set.
 *
 * @param url address of the document to scan
 * @return an {@code Object[]} holding the distinct image URIs, in no particular order
 * @throws IOException if the URL cannot be opened or read
 * @throws SAXException if the SAX events cannot be processed
 * @throws TikaException if Tika fails to parse the document
 */
public Object extractImageLinks(String url) throws IOException, SAXException, TikaException {
	Set<String> imageLinks = new HashSet<String>();
	// try-with-resources closes the stream on every path. The previous
	// "finally { is.close(); }" threw a NullPointerException when the stream
	// could not be opened, which hid the real I/O error from the caller.
	try (InputStream is = TikaInputStream.get(new URL(url).openStream())) {
		Metadata metadata = new Metadata();
		LinkContentHandler handler = new LinkContentHandler();
		AutoDetectParser parser = new AutoDetectParser();
		parser.parse(is, handler, metadata);
		for (Link link : handler.getLinks()) {
			if (link.isImage()) {
				imageLinks.add(link.getUri());
			}
		}
	}
	return imageLinks.toArray();
}
 
开发者ID:karanjeets,项目名称:SolrMerge,代码行数:31,代码来源:RoutineParser.java

示例3: process

import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * Fetches the (authenticated) URL, parses it with Tika and streams the result to {@code out}
 * through a {@code StreamingToRDFContentHandler}.
 *
 * <p>The input stream, the reader and {@code out} are always closed before this method
 * returns. Any exception raised while parsing is rethrown wrapped in a
 * {@code TikaException}; failures while opening or closing the streams surface as
 * {@code IOException}.
 *
 * @param parseUrl the URL to parse
 * @param out destination stream; closed by this method
 * @throws IOException if the connection cannot be opened or a stream cannot be closed
 * @throws SAXException declared for interface compatibility
 * @throws TikaException if anything fails while parsing
 */
@Override
protected void process(URL parseUrl, ObjectOutputStream out)
		throws IOException, SAXException, TikaException {
	URL url = URLTools.getAuthenticatedUrl(parseUrl);
	URLConnection conn = url.openConnection();
	// try-with-resources guarantees that all three resources are closed even if one
	// close() throws; the old sequential close() calls in finally skipped the rest.
	try (InputStream in = conn.getInputStream();
			InputStreamReader ir = new InputStreamReader(in);
			ObjectOutputStream closingOut = out) {
		try {
			AutoDetectParser parser = new AutoDetectParser();
			ContentHandler handler = new StreamingToRDFContentHandler(writer, out);

			Metadata metadata = new Metadata();
			metadata.add(Metadata.RESOURCE_NAME_KEY, url.toExternalForm());
			// NOTE(review): ir.getEncoding() is the charset the reader was created with
			// (the platform default here), not an encoding declared by the server - confirm
			// that this is the intended value.
			metadata.add(Metadata.CONTENT_ENCODING, ir.getEncoding());

			parser.parse(in, handler, metadata, new ParseContext());
		} catch (Exception e) {
			throw new TikaException(e.getMessage(), e);
		}
	}
}
 
开发者ID:erfgoed-en-locatie,项目名称:artsholland-platform,代码行数:25,代码来源:StreamingTikaParserPipe.java

示例4: init

import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * Creates a selector instance that extracts the payload's body text with Tika.
 *
 * <p>The returned instance ignores the {@code value} argument of {@code select}: it parses
 * the payload stream, converts the extracted text with the supplied converter and wraps the
 * result in a list. If parsing fails the payload is released and the exception is
 * propagated.
 */
@Override
public Instance<String> init(BlobMetadata data, Payload payload, boolean isSegment) {
	return new Instance<String>() {
		@Override
		public <T> List<T> select(String value, DataConverter<String, T> converter) {
			final AutoDetectParser autoParser = new AutoDetectParser(TikaConfig.getDefaultConfig());
			final ContentHandler bodyText = new BodyContentHandler();
			final org.apache.tika.metadata.Metadata tikaMeta = new org.apache.tika.metadata.Metadata();

			try {
				// NOTE(review): the stream opened here is never closed on the success
				// path - confirm whether the payload owns and closes it.
				autoParser.parse(TikaInputStream.get(payload.openStream()), bodyText, tikaMeta,
						new ParseContext());
			} catch (Exception e) {
				payload.release();
				throw Throwables.propagate(e);
			}

			return Arrays.asList(converter.convert(bodyText.toString()));
		}
	};
}
 
开发者ID:Treydone,项目名称:mandrel,代码行数:22,代码来源:TikaSelector.java

示例5: extractMeta

import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * Parses the resource at {@code uri} and returns all of its Tika metadata as a JSON object.
 *
 * <p>Each metadata name maps to the array of values reported for it. The document content
 * itself is discarded (a no-op {@code DefaultHandler} receives the SAX events).
 *
 * @param uri location of the resource to parse
 * @param contentType content type hint passed to {@code fillMetadata}
 * @return JSON object mapping each metadata name to its array of values
 * @throws Exception if the stream cannot be created or parsing fails
 */
public static String extractMeta(String uri, String contentType) throws Exception {
	final AutoDetectParser parser = createParser();
	final Metadata metadata = new Metadata();

	fillMetadata(parser, metadata, contentType, uri);

	// try-with-resources so the stream is also closed when parse() throws;
	// previously it was only closed on the success path.
	try (TikaInputStream inputStream = createInputStream(uri, metadata)) {
		parser.parse(inputStream, new DefaultHandler(), metadata);
	}

	final Map<String, String[]> meta = new HashMap<>();
	for (String name : metadata.names()) {
		meta.put(name, metadata.getValues(name));
	}

	return new Gson().toJson(meta);
}
 
开发者ID:ICIJ,项目名称:node-tika,代码行数:21,代码来源:NodeTika.java

示例6: indexDoc

import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * Indexes a single document.
 *
 * <p>The file is parsed with Tika; every metadata entry becomes a stored text field, and the
 * file path and last-modified time are added as well. A short summary (title, artists,
 * genre, year) is appended to {@code results}. Depending on the writer's open mode the
 * document is either added or replaces the existing document with the same path.
 *
 * @param writer index writer that receives the document
 * @param file file to parse and index
 * @param results text area that receives progress output
 * @param lastModified last-modified value stored in the {@code modified} field
 * @throws IOException if the file cannot be read or the index cannot be written
 * @throws SAXException if the SAX events cannot be processed
 * @throws TikaException if Tika fails to parse the file
 */
public static void indexDoc(IndexWriter writer, Path file, TextArea results, long lastModified)
		throws IOException, SAXException, TikaException {
	AutoDetectParser parser = new AutoDetectParser();
	BodyContentHandler handler = new BodyContentHandler();
	Metadata metadata = new Metadata();
	try (InputStream stream = Files.newInputStream(file)) {
		parser.parse(stream, handler, metadata);
		Document doc = new Document();
		for (String name : metadata.names()) {
			doc.add(new TextField(name, metadata.get(name), Field.Store.YES));
		}
		doc.add(new StringField("path", file.toString(), Field.Store.YES));
		doc.add(new LongPoint("modified", lastModified));
		results.appendText("Title: " + metadata.get("title") + "\n");
		results.appendText("Artists: " + metadata.get("xmpDM:artist") + "\n");
		results.appendText("Genre: " + metadata.get("xmpDM:genre") + "\n");
		results.appendText("Year: " + metadata.get("xmpDM:releaseDate") + "\n");
		if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
			// New index, so we just add the document (no old document can
			// be there):
			results.appendText("adding " + file + "\n");
			writer.addDocument(doc);
		} else {
			// Existing index (an old copy of this document may have been
			// indexed). The trailing newline was missing here, so the next
			// entry's "Title:" line ran onto this one.
			results.appendText("updating " + file + "\n");
			writer.updateDocument(new Term("path", file.toString()), doc);
		}
	}
}
 
开发者ID:Tregz,项目名称:mediaPlayerApp,代码行数:37,代码来源:mediaIndexer.java

示例7: doTikaStuff

import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * Extracts plain text and metadata from an object's content with Tika.
 *
 * <p>SAX events from the parser are serialized through a text-mode transformer into a
 * string. Keys ending in {@code tika.exception.testing.pdf} (case-insensitive) deliberately
 * raise a {@code TikaException} so the failure path can be exercised by synthetic
 * transactions.
 *
 * @param bucket passed through to the result-assembly helpers
 * @param key object key; also checked for the synthetic-failure suffix
 * @param objectData content to parse
 * @return the value of {@code assembleExtractionResult} on success, or of
 *         {@code assembleExceptionResult} when a {@code TikaException} occurs
 * @throws IOException if the content cannot be read
 * @throws TransformerConfigurationException if the transformer handler cannot be created
 * @throws SAXException if the SAX events cannot be processed
 */
private String doTikaStuff(String bucket, String key, InputStream objectData) throws IOException, TransformerConfigurationException, SAXException {
  _logger.log("Extracting text with Tika");
  String extractedText = "";

  SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance();
  TransformerHandler handler = factory.newTransformerHandler();
  handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "text");
  handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
  StringWriter sw = new StringWriter();
  handler.setResult(new StreamResult(sw));
  AutoDetectParser parser = new AutoDetectParser();
  ParseContext parseContext = new ParseContext();
  // Register the parser in the context under Parser.class
  parseContext.set(Parser.class, parser);

  Metadata tikaMetadata = new Metadata();
  try {
    // for synthetic transactions; Locale.ROOT keeps the suffix check independent of the
    // JVM's default locale (e.g. Turkish lower-casing of 'I')
    if( key.toLowerCase(java.util.Locale.ROOT).endsWith("tika.exception.testing.pdf")) {
      throw new TikaException("Test Tika Exception");
    }
    parser.parse(objectData, handler, tikaMetadata, parseContext);
    extractedText = sw.toString();
  } catch( TikaException e) {
    _logger.log("TikaException thrown while parsing: " + e.getLocalizedMessage());
    return assembleExceptionResult(bucket, key, e);
  }
  _logger.log("Tika parsing success");
  return assembleExtractionResult(bucket, key, extractedText, tikaMetadata);
}
 
开发者ID:DovetailSoftware,项目名称:tika-lambda,代码行数:31,代码来源:TikaLambdaHandler.java

示例8: doProcessStream

import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * Extracts text and metadata with Tika, keeping only the text that precedes the tearline.
 *
 * <p>The parent implementation runs first. If {@code tearlinePattern} matches the extracted
 * content, only the part before the first match is kept; otherwise the whole content is
 * used. The kept text is passed through {@code removeBoilerplate}, trimmed and set as the
 * document text. All Tika metadata entries are then added to the JCas. SAX and Tika
 * failures are logged as warnings and otherwise ignored.
 *
 * @param stream the content to parse
 * @param source name of the source, used in the warning message
 * @param jCas the JCas that receives the text and metadata
 * @throws IOException propagated from the parent implementation or from the parser
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
	super.doProcessStream(stream, source, jCas);

	try {
		final BodyContentHandler bodyHandler = new BodyContentHandler(Integer.MAX_VALUE);
		final Metadata tikaMetadata = new Metadata();
		new AutoDetectParser().parse(stream, bodyHandler, tikaMetadata, new ParseContext());

		final String extracted = bodyHandler.toString();
		final Matcher tearline = tearlinePattern.matcher(extracted);
		// Everything from the first tearline match onwards is dropped.
		final String kept = tearline.find() ? extracted.substring(0, tearline.start()) : extracted;
		jCas.setDocumentText(removeBoilerplate(kept).trim());

		final String[] names = tikaMetadata.names();
		for (int i = 0; i < names.length; i++) {
			addMetadata(jCas, names[i], tikaMetadata.get(names[i]));
		}
	} catch (SAXException | TikaException e) {
		getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
	}
}
 
开发者ID:dstl,项目名称:baleen,代码行数:28,代码来源:TearlineContentExtractor.java

示例9: parseToXML

import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * Parses the stream with an auto-detecting Tika parser and returns the string form of the
 * {@code ToXMLContentHandler} that received the SAX events.
 *
 * @param inputStream content to parse; not closed by this method
 * @return the handler's string representation after parsing
 * @throws IOException if the stream cannot be read
 * @throws SAXException if the SAX events cannot be processed
 * @throws TikaException if Tika fails to parse the content
 */
public static String parseToXML(InputStream inputStream) throws IOException, SAXException, TikaException {
    final ContentHandler xmlSink = new ToXMLContentHandler();
    new AutoDetectParser().parse(inputStream, xmlSink, new Metadata());
    return xmlSink.toString();
}
 
开发者ID:marklogic-community,项目名称:marklogic-spring-batch,代码行数:8,代码来源:TikaParser.java

示例10: parseToPlainText

import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * Extracts the body text of a document on a best-effort basis.
 *
 * @param fileStream content to parse; not closed by this method
 * @return the extracted text, or an empty string if parsing failed (the failure is only
 *         logged at debug level)
 */
public static String parseToPlainText(InputStream fileStream) {
    final BodyContentHandler bodyText = new BodyContentHandler();
    final AutoDetectParser autoParser = new AutoDetectParser();
    final Metadata ignoredMetadata = new Metadata();

    try {
        autoParser.parse(fileStream, bodyText, ignoredMetadata);
        return bodyText.toString();
    } catch (IOException | SAXException | TikaException e) {
        final String detail = e.toString();
        LOG.debug("Parsing Exception while extracting content from current file. " + detail);
        return "";
    }
}
 
开发者ID:ziqizhang,项目名称:jate,代码行数:16,代码来源:JATEUtil.java

示例11: parse

import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * Parses one stream into a fresh {@code TikaReaderHandler}.
 *
 * <p>The handler is named after the stream (as reported by {@code source.getStreamName})
 * and also supplies the metadata object that the parser fills.
 *
 * @param parser parser to run
 * @param parseContext context passed to the parser
 * @param is stream to parse; not closed by this method
 * @return the handler that received the parse events
 */
private TikaReaderHandler parse(AutoDetectParser parser, ParseContext parseContext, InputStream is) throws IOException, SAXException, TikaException {
	final TikaReaderHandler handler = new TikaReaderHandler(source.getStreamName(is));
	parser.parse(is, handler, handler.getMetadata(), parseContext);
	return handler;
}
 
开发者ID:Bibliome,项目名称:alvisnlp,代码行数:7,代码来源:TikaReader.java

示例12: main

import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * Ad-hoc driver: parses a hard-coded spreadsheet with Tika and prints the body as XML.
 *
 * <p>A {@code TikaConfig} is loaded from a hard-coded mime-types file, the document body is
 * routed through an XML-mode transformer into a string, and that string is written to
 * standard output. Any failure is reported with a stack trace.
 *
 * @param args ignored
 */
public static void main(String[] args) {

        try {
                TikaConfig config = new TikaConfig(new File("/home/foo/a/code/big_bang/tika-1.5/"
                                + "tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml"));

                final AutoDetectParser autoDetectParser = new AutoDetectParser(config);

                File xpsFile = new File("/home/foo/a/temp/xlsx.xlsx");
                String fileName = xpsFile.getName();
                Metadata metadata = new Metadata();
                if (fileName != null && fileName.length() > 0) {
                        metadata.add(Metadata.RESOURCE_NAME_KEY, fileName);
                }

                // The earlier mime-type detection pass was removed: its result was never
                // used and it opened an extra stream that leaked on failure.

                StringWriter sw = new StringWriter();

                SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
                TransformerHandler handler = factory.newTransformerHandler();
                handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
                handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
                handler.setResult(new StreamResult(sw));
                BodyContentHandler bch = new BodyContentHandler(handler);
                handler.startDocument();

                // try-with-resources: the stream used for parsing was never closed before.
                try (InputStream inputStream = new FileInputStream(xpsFile)) {
                        autoDetectParser.parse(inputStream, bch, metadata);
                }

                String x = sw.toString();
                System.out.println(x);

        } catch (Exception e) {
                e.printStackTrace();
        }
}
 
开发者ID:kolbasa,项目名称:OCRaptor,代码行数:79,代码来源:OOXMLParser.java

示例13: discoverAgainstSingleModel

import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * Scans the configured directories and scores every file against a single model.
 *
 * <p>The {@code directories} and {@code exclusions} properties are comma-separated lists.
 * Each file whose extension is not excluded is parsed with Tika, tokenized, and run through
 * the model's name finder. Files whose average span probability is at least
 * {@code probabilityThreshold} are recorded in {@code fileMatches}, which is reset at the
 * start of every call. Files that cannot be read are logged and still evaluated with
 * whatever text was extracted before the failure.
 *
 * @param fileDiscoveryProperties must contain {@code directories} and {@code exclusions}
 * @param model model supplying the tokenizer and name finder
 * @param probabilityThreshold minimum average probability for a file to be reported
 * @return the matches collected during this call
 */
private List<FileMatchMetaData> discoverAgainstSingleModel(final Properties fileDiscoveryProperties, final Model model, final double probabilityThreshold)
throws AnonymizerException, IOException, SAXException, TikaException {
    // Start running NLP algorithms for each file and collect percentage
    fileMatches = new ArrayList<>();
    final String directories = fileDiscoveryProperties.getProperty("directories");
    final String exclusions = fileDiscoveryProperties.getProperty("exclusions");
    final String[] directoryList = directories.split(",");
    // Built once: the exclusion list does not change while the files are walked.
    final List<String> excludedExtensions = Arrays.asList(exclusions.split(","));

    log.info("File types not considered for analysis: " + exclusions);

    // Let's iterate over directories
    for (final String directory : directoryList) {

        final File node = new File(directory);
        final List<File> files = (List<File>) FileUtils.listFiles(node, null, true);

        for (final File fich : files) {
            final String file = fich.getName().toString();
            final String recursivedir = fich.getParent().toString();

            log.info("Analyzing [" + fich.getCanonicalPath() + "]");

            final String ext = CommonUtils.getFileExtension(fich);
            if (excludedExtensions.contains(ext)) {
                // less verbose - ignored types are logged once at the top
                continue;
            }

            final BodyContentHandler handler = new BodyContentHandler(-1);
            final AutoDetectParser parser = new AutoDetectParser();
            final Metadata metadata = new Metadata();
            String handlerString = "";
            // try-with-resources: the stream was previously never closed, leaking one
            // file handle per analyzed file.
            try (InputStream stream = new FileInputStream(fich.getCanonicalPath())) {
                parser.parse(stream, handler, metadata);
                handlerString = handler.toString();
            } catch (IOException e) {
                log.info("Unable to read " + fich.getCanonicalPath() +".Ignoring...");
            }

            log.debug("Content: " + handlerString);
            final String tokens[] = model.getTokenizer().tokenize(handler.toString());
            final Span nameSpans[] = model.getNameFinder().find(tokens);
            final double[] spanProbs = model.getNameFinder().probs(nameSpans);
            //display names
            final List<Probability> probabilityList = new ArrayList<>();
            for (int i = 0; i < nameSpans.length; i++) {
                log.info("Span: "+nameSpans[i].toString());
                log.info("Covered text is: "+tokens[nameSpans[i].getStart()]);
                log.info("Probability is: "+spanProbs[i]);
                probabilityList.add(new Probability(tokens[nameSpans[i].getStart()], spanProbs[i]));
            }
            model.getNameFinder().clearAdaptiveData();

            final double averageProbability = calculateAverage(probabilityList);
            if (averageProbability >= probabilityThreshold) {
                final FileMatchMetaData result = new FileMatchMetaData(recursivedir, file);
                result.setAverageProbability(averageProbability);
                result.setModel(model.getName());
                fileMatches.add(result);
            }
        }
    }

    return fileMatches;
}
 
开发者ID:armenak,项目名称:DataDefender,代码行数:80,代码来源:FileDiscoverer.java

示例14: testExcelXLSB

import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * The .xlsb file format (an OOXML container with binary blobs) is not
 *  currently supported, but parsing such a file must not fail either
 *  (TIKA-826). The test expects correct type detection, no claim of support
 *  from OfficeParser or OOXMLParser, and empty text from AutoDetectParser.
 */
@Test
public void testExcelXLSB() throws Exception {
   Detector typeDetector = new DefaultDetector();
   AutoDetectParser autoParser = new AutoDetectParser();

   InputStream in = ExcelParserTest.class.getResourceAsStream(
         "/test-documents/testEXCEL.xlsb");
   Metadata meta = new Metadata();
   meta.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");

   // Detection must yield the macro-enabled binary sheet type
   MediaType detected = null;
   try {
      detected = typeDetector.detect(in, meta);
      assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", detected.toString());
   } finally {
      in.close();
   }

   // Neither OfficeParser nor OOXMLParser lists the detected type
   boolean officeSupports = (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(detected);
   assertEquals(false, officeSupports);
   boolean ooxmlSupports = (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(detected);
   assertEquals(false, ooxmlSupports);

   // AutoDetectParser copes with the file and produces no text
   in = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb");

   try {
      ContentHandler bodyText = new BodyContentHandler(-1);
      ParseContext usContext = new ParseContext();
      usContext.set(Locale.class, Locale.US);
      autoParser.parse(in, bodyText, meta, usContext);

      assertEquals("", bodyText.toString());
   } finally {
      in.close();
   }
}
 
开发者ID:kanrourou,项目名称:software-testing,代码行数:46,代码来源:ExcelParserTest.java

示例15: testExcel95

import org.apache.tika.parser.AutoDetectParser; //导入方法依赖的package包/类
/**
 * The old Excel 95 .xls file format is not currently supported, but
 *  parsing such a file must not fail either (TIKA-976). The test expects
 *  detection as application/vnd.ms-excel, a support claim from OfficeParser
 *  but not from OOXMLParser, and empty text from AutoDetectParser.
 */
@Test
public void testExcel95() throws Exception {
   Detector typeDetector = new DefaultDetector();
   AutoDetectParser autoParser = new AutoDetectParser();

   InputStream in = ExcelParserTest.class.getResourceAsStream(
         "/test-documents/testEXCEL_95.xls");
   Metadata meta = new Metadata();
   meta.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");

   // Detection must yield the generic Excel type
   MediaType detected = null;
   try {
      detected = typeDetector.detect(in, meta);
      assertEquals("application/vnd.ms-excel", detected.toString());
   } finally {
      in.close();
   }

   // OfficeParser lists the detected type, OOXMLParser does not
   boolean officeSupports = (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(detected);
   assertEquals(true, officeSupports);
   boolean ooxmlSupports = (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(detected);
   assertEquals(false, ooxmlSupports);

   // AutoDetectParser copes with the file and produces no text
   in = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls");

   try {
      ContentHandler bodyText = new BodyContentHandler(-1);
      ParseContext usContext = new ParseContext();
      usContext.set(Locale.class, Locale.US);
      autoParser.parse(in, bodyText, meta, usContext);

      assertEquals("", bodyText.toString());
   } finally {
      in.close();
   }
}
 
开发者ID:kanrourou,项目名称:software-testing,代码行数:45,代码来源:ExcelParserTest.java


注:本文中的org.apache.tika.parser.AutoDetectParser.parse方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。