This page collects typical usage examples of the Java method org.apache.tika.sax.BodyContentHandler.toString. If you are unsure what BodyContentHandler.toString does, how to call it, or need working samples, the curated code examples below may help. You can also read more about the enclosing class, org.apache.tika.sax.BodyContentHandler.
Below are 10 code examples of BodyContentHandler.toString, sorted by popularity by default.
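Before the project-specific examples, here is a minimal, self-contained sketch of the pattern they all share: a Tika parser writes the document body into a BodyContentHandler, and toString() returns the buffered plain text. (The class name and the use of args[0] as the input path are illustrative, not taken from any project below.)

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;

public class BodyContentHandlerToStringDemo {
    public static void main(String[] args) throws Exception {
        // The no-arg constructor buffers up to 100,000 characters;
        // pass -1 to remove the limit, or another int for a custom cap.
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        try (InputStream stream = Files.newInputStream(Paths.get(args[0]))) {
            // AutoDetectParser picks a concrete parser from the detected media type
            new AutoDetectParser().parse(stream, handler, metadata, new ParseContext());
        }
        // toString() returns the accumulated plain-text body
        System.out.println(handler.toString());
    }
}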
Example 1: readXlsx
import org.apache.tika.sax.BodyContentHandler; // import the package/class the method depends on
public static ExcelData readXlsx(String xlsxFilePath)
        throws IOException, InvalidFormatException, XmlException, TikaException, SAXException {
    BodyContentHandler bcHandler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    ParseContext pcontext = new ParseContext();
    OOXMLParser parser = new OOXMLParser();
    // try-with-resources closes the stream even if parsing fails
    try (FileInputStream inputStream = new FileInputStream(new File(xlsxFilePath))) {
        parser.parse(inputStream, bcHandler, metadata, pcontext);
    }
    if (DEBUG_PRINT_META_DATA) {
        System.err.println("Metadata:");
        for (String name : metadata.names())
            System.out.println(name + "\t:\t" + metadata.get(name));
    }
    ExcelData spreadsheet = new ExcelData(bcHandler.toString());
    return spreadsheet;
}
Example 2: parse
import org.apache.tika.sax.BodyContentHandler; // import the package/class the method depends on
public ParsedData parse(InputStream stream, String fileName, String contentType) {
    BodyContentHandler handler = new BodyContentHandler(MAX_CHARACTERS);
    BoilerpipeContentHandler textHandler = new BoilerpipeContentHandler(handler, KeepEverythingExtractor.INSTANCE);
    Metadata metadata = createMetadata(fileName, contentType);
    ParseContext context = new ParseContext();
    try {
        parser.parse(stream, textHandler, metadata, context);
        Map<String, String> metadataMap = new HashMap<>();
        for (String propertyName : metadata.names()) {
            metadataMap.put(propertyName, metadata.get(propertyName));
        }
        return new ParsedData(handler.toString(), metadataMap);
    } catch (IOException | SAXException | TikaException e) {
        logger.error("Failed to extract metadata using Tika.", e);
        return null;
    }
}
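A note on the design: the extracted text is read from handler, not from textHandler. BoilerpipeContentHandler buffers the document and, at the end of parsing, writes the (optionally boilerplate-filtered) text through to the wrapped BodyContentHandler; with KeepEverythingExtractor.INSTANCE nothing is filtered out, so handler.toString() holds the full body.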
Example 3: fromFile
import org.apache.tika.sax.BodyContentHandler; // import the package/class the method depends on
@Override
public String fromFile(File file) {
    String resultText = "";
    // try-with-resources closes the stream; -1 disables the default character limit
    try (FileInputStream inputstream = new FileInputStream(file)) {
        BodyContentHandler handler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        ParseContext pcontext = new ParseContext();
        PDFParserConfig config = new PDFParserConfig();
        config.setSortByPosition(true);
        PDFParser pdfparser = new PDFParser();
        pdfparser.setPDFParserConfig(config);
        System.out.println("Parsing PDF to TEXT...");
        pdfparser.parse(inputstream, handler, metadata, pcontext);
        resultText = handler.toString();
        System.out.println("Parsing complete");
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    }
    return resultText;
}
Example 4: extract
import org.apache.tika.sax.BodyContentHandler; // import the package/class the method depends on
public String extract(String path) throws Exception {
    BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);
    try (FileInputStream stream = new FileInputStream(path)) {
        Metadata metadata = new Metadata();
        parser.parse(stream, handler, metadata, parseContext);
        //System.out.println(metadata);
        return handler.toString();
    }
}
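Examples 3 and 4 lift the default write limit in two different ways: BodyContentHandler(-1) disables the limit entirely, while BodyContentHandler(Integer.MAX_VALUE) keeps the limit but raises it to 2^31 - 1 characters. With the no-arg constructor, extraction stops after 100,000 characters and the parse fails with a SAXException.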
Example 5: parsePdfFileToPlainText
import org.apache.tika.sax.BodyContentHandler; // import the package/class the method depends on
/**
 * Parse a PDF file into plain text.
 * @param file relative or absolute path to a local PDF file
 * @return the extracted text
 */
public static String parsePdfFileToPlainText(String file) {
    try (InputStream stream = new FileInputStream(file)) {
        BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);
        Metadata metadata = new Metadata();
        PARSER.parse(stream, handler, metadata);
        return handler.toString();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return "";
}
Example 6: doProcessStream
import org.apache.tika.sax.BodyContentHandler; // import the package/class the method depends on
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
    super.doProcessStream(stream, source, jCas);
    try {
        BodyContentHandler textHandler = new BodyContentHandler(Integer.MAX_VALUE);
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        AutoDetectParser autoParser = new AutoDetectParser();
        autoParser.parse(stream, textHandler, metadata, context);
        String fullContent = textHandler.toString();
        // If a tearline is present, keep only the text before it
        Matcher m = tearlinePattern.matcher(fullContent);
        if (m.find()) {
            jCas.setDocumentText(removeBoilerplate(fullContent.substring(0, m.start())).trim());
        } else {
            jCas.setDocumentText(removeBoilerplate(fullContent).trim());
        }
        for (String name : metadata.names()) {
            addMetadata(jCas, name, metadata.get(name));
        }
    } catch (SAXException | TikaException e) {
        getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
    }
}
Example 7: parseToPlainText
import org.apache.tika.sax.BodyContentHandler; // import the package/class the method depends on
public static String parseToPlainText(InputStream fileStream) {
    BodyContentHandler handler = new BodyContentHandler();
    AutoDetectParser parser = new AutoDetectParser();
    Metadata metadata = new Metadata();
    String rawContent = "";
    try {
        parser.parse(fileStream, handler, metadata);
        rawContent = handler.toString();
    } catch (IOException | SAXException | TikaException e) {
        LOG.debug("Parsing exception while extracting content from current file. " + e.toString());
    }
    return rawContent;
}
Example 8: htmlParserShouldReturnIndentedSentenceWithTwoLineBreaks
import org.apache.tika.sax.BodyContentHandler; // import the package/class the method depends on
@Test
public void htmlParserShouldReturnIndentedSentenceWithTwoLineBreaks() throws Exception {
    final String html = "<html><head><title>Title</title></head>" +
            "<body><ul><li>one</li></ul></body></html>";
    BodyContentHandler handler = new BodyContentHandler();
    new HtmlParser().parse(
            new ByteArrayInputStream(html.getBytes("UTF-8")),
            handler, new Metadata(), new ParseContext());
    // Make sure we get <tab>, "one", newline, newline
    String result = handler.toString();
    assertTrue(Pattern.matches("\tone\n\n", result));
}
Example 9: conversionImplementation
import org.apache.tika.sax.BodyContentHandler; // import the package/class the method depends on
/**
 * Common implementation: take an input stream and return a ConvertedDocument.
 *
 * @param input stream for the raw file
 * @param doc   the raw file
 * @return converted doc
 * @throws IOException if the underlying Tika parser/writer had an I/O or parser
 *                     problem, or MAX_TEXT_SIZE was reached
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream input, java.io.File doc)
        throws IOException {
    Metadata metadata = new Metadata();
    BodyContentHandler handler = new BodyContentHandler(maxBuffer);
    try {
        parser.parse(input, handler, metadata, ctx);
    } catch (NoClassDefFoundError classErr) {
        throw new IOException("Unable to parse content due to Tika misconfiguration", classErr);
    } catch (Exception xerr) {
        throw new IOException("Unable to parse content", xerr);
    } finally {
        input.close();
    }
    ConvertedDocument textdoc = new ConvertedDocument(doc);
    textdoc.addTitle(metadata.get(TikaCoreProperties.TITLE));
    textdoc.setEncoding(metadata.get(Metadata.CONTENT_ENCODING));
    textdoc.addCreateDate(metadata.getDate(TikaCoreProperties.CREATED));
    textdoc.addAuthor(metadata.get(TikaCoreProperties.CREATOR));
    // v1.5: until this version a blank-line reducer was applied here.
    // Under Java 6 it appeared to cause a StackOverflow on documents with hundreds of "\n" in a row,
    // e.g. a spreadsheet converted to text may have thousands of empty lines after the last data row.
    // TextUtils.reduce_line_breaks(txt)
    String t = handler.toString();
    if (t != null) {
        if (textdoc.filename != null && FileUtility.isSpreadsheet(textdoc.filename)) {
            textdoc.setText(t.trim());
        } else {
            textdoc.setText(TextUtils.reduce_line_breaks(t));
        }
    }
    return textdoc;
}
Example 10: discoverAgainstSingleModel
import org.apache.tika.sax.BodyContentHandler; // import the package/class the method depends on
private List<FileMatchMetaData> discoverAgainstSingleModel(final Properties fileDiscoveryProperties,
        final Model model, final double probabilityThreshold)
        throws AnonymizerException, IOException, SAXException, TikaException {
    // Run the NLP model over each file and collect per-file probabilities
    fileMatches = new ArrayList<>();
    final String directories = fileDiscoveryProperties.getProperty("directories");
    final String exclusions = fileDiscoveryProperties.getProperty("exclusions");
    final String[] directoryList = directories.split(",");
    final String[] exclusionList = exclusions.split(",");
    // Iterate over the configured directories
    File node;
    Metadata metadata;
    List<Probability> probabilityList;
    log.info("File types not considered for analysis: " + exclusions);
    for (final String directory : directoryList) {
        node = new File(directory);
        final List<File> files = (List<File>) FileUtils.listFiles(node, null, true);
        for (final File fich : files) {
            final String file = fich.getName();
            final String recursivedir = fich.getParent();
            log.info("Analyzing [" + fich.getCanonicalPath() + "]");
            final String ext = CommonUtils.getFileExtension(fich);
            if (Arrays.asList(exclusionList).contains(ext)) {
                // less verbose - excluded types were already listed above
                continue;
            }
            final BodyContentHandler handler = new BodyContentHandler(-1);
            final AutoDetectParser parser = new AutoDetectParser();
            metadata = new Metadata();
            String handlerString = "";
            // try-with-resources closes the stream; FileInputStream is never null
            try (InputStream stream = new FileInputStream(fich.getCanonicalPath())) {
                parser.parse(stream, handler, metadata);
                handlerString = handler.toString();
            } catch (IOException e) {
                log.info("Unable to read " + fich.getCanonicalPath() + ". Ignoring...");
            }
            log.debug("Content: " + handlerString);
            final String[] tokens = model.getTokenizer().tokenize(handlerString);
            final Span[] nameSpans = model.getNameFinder().find(tokens);
            final double[] spanProbs = model.getNameFinder().probs(nameSpans);
            // display names
            probabilityList = new ArrayList<>();
            for (int i = 0; i < nameSpans.length; i++) {
                log.info("Span: " + nameSpans[i].toString());
                log.info("Covered text is: " + tokens[nameSpans[i].getStart()]);
                log.info("Probability is: " + spanProbs[i]);
                probabilityList.add(new Probability(tokens[nameSpans[i].getStart()], spanProbs[i]));
            }
            model.getNameFinder().clearAdaptiveData();
            final double averageProbability = calculateAverage(probabilityList);
            if (averageProbability >= probabilityThreshold) {
                final FileMatchMetaData result = new FileMatchMetaData(recursivedir, file);
                result.setAverageProbability(averageProbability);
                result.setModel(model.getName());
                fileMatches.add(result);
            }
        }
    }
    return fileMatches;
}