本文整理汇总了Java中org.apache.tika.parser.Parser.parse方法的典型用法代码示例。如果您正苦于以下问题：Java Parser.parse方法的具体用法？Java Parser.parse怎么用？Java Parser.parse使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.tika.parser.Parser的用法示例。
在下文中一共展示了Parser.parse方法的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: extractText
import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * Extracts the text of a document and appends at most {@code maxSize}
 * characters of it to {@code outputText}.
 *
 * @param mimeType   MIME type of the document; not used by this implementation
 * @param input      stream holding the document; this method does not close it
 * @param outputText buffer that receives the extracted text
 * @param maxSize    maximum number of characters appended to {@code outputText}
 * @throws IOException      if reading the document fails
 * @throws RuntimeException wrapping any other failure raised while parsing
 */
@Override
public void extractText(String mimeType, InputStream input, StringBuilder outputText, int maxSize)
    throws IOException
{
    try
    {
        Metadata meta = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        Parser parser = new AutoDetectParser(new TikaConfig(getClass().getClassLoader()));
        parser.parse(input, handler, meta, new ParseContext());

        String content = handler.toString();
        if( content.length() > maxSize )
        {
            content = content.substring(0, maxSize);
        }
        outputText.append(content);

        if( LOGGER.isDebugEnabled() )
        {
            LOGGER.debug("Word Summary:" + content); //$NON-NLS-1$
        }
    }
    catch( IOException e )
    {
        // The signature already declares IOException, so let I/O failures
        // reach the caller as such instead of hiding them in a RuntimeException.
        throw e;
    }
    catch( Exception e )
    {
        throw new RuntimeException(e);
    }
}
示例2: convertWordDocumentIntoHtml
import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * Converts a .docx document into HTML markup. This code
 * is based on <a href="http://stackoverflow.com/a/9053258/313554">this StackOverflow</a> answer.
 *
 * @param wordDocument The uploaded .docx document that is converted.
 * @return The original filename of the document together with the generated HTML markup.
 * @throws DocumentConversionException if the document cannot be read, parsed or transformed.
 */
public ConvertedDocumentDTO convertWordDocumentIntoHtml(MultipartFile wordDocument) {
    LOGGER.info("Converting word document: {} into HTML", wordDocument.getOriginalFilename());
    // try-with-resources guarantees the upload stream is released even when parsing fails.
    try (InputStream input = wordDocument.getInputStream()) {
        Parser parser = new OOXMLParser();

        // The SAX events emitted by the parser are serialized as HTML into this writer.
        StringWriter sw = new StringWriter();
        SAXTransformerFactory factory = (SAXTransformerFactory)
                SAXTransformerFactory.newInstance();
        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8");
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        handler.setResult(new StreamResult(sw));

        Metadata metadata = new Metadata();
        metadata.add(Metadata.CONTENT_TYPE, "text/html;charset=utf-8");
        parser.parse(input, handler, metadata, new ParseContext());

        return new ConvertedDocumentDTO(wordDocument.getOriginalFilename(), sw.toString());
    }
    catch (IOException | SAXException | TransformerException | TikaException ex) {
        LOGGER.error("Conversion failed because an exception was thrown", ex);
        throw new DocumentConversionException(ex.getMessage(), ex);
    }
}
示例3: getFullText
import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * Parses the file at {@code filepath} with an auto-detecting parser and
 * returns the text collected from the document body.
 *
 * @param filepath path of the file to parse
 * @return the text written by the body content handler
 * @throws IOException   if the file cannot be opened, read or closed
 * @throws SAXException  if the content handler reports a failure
 * @throws TikaException if the document cannot be parsed
 */
private static String getFullText(final String filepath) throws IOException, SAXException, TikaException {
    final StringWriter textSink = new StringWriter();
    final TikaInputStream source = TikaInputStream.get(new File(filepath));
    try {
        final Detector typeDetector = new DefaultDetector();
        final Parser autoParser = new AutoDetectParser(typeDetector);

        // Register the parser in the context it is invoked with
        // (presumably so nested documents are handled by the same parser - TODO confirm).
        final ParseContext context = new ParseContext();
        context.set(Parser.class, autoParser);

        final Metadata docMetadata = new Metadata();
        final ContentHandler bodyHandler = new BodyContentHandler(textSink);
        autoParser.parse(source, bodyHandler, docMetadata, context);
    }
    finally {
        source.close();
    }
    return textSink.toString();
}
示例4: extractText
import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * Parses {@code input} through a size-limited content handler and hands the
 * collected text to {@code appendText}. When parsing stops because the write
 * limit was reached, the text gathered so far is still appended.
 *
 * @param mimeType   MIME type of the document; not used by this implementation
 * @param input      stream holding the document
 * @param outputText buffer passed on to {@code appendText}
 * @param maxSize    write limit given to the content handler and to {@code appendText}
 * @throws IOException declared by the overridden method
 */
@Override
public void extractText(String mimeType, InputStream input, StringBuilder outputText, int maxSize)
    throws IOException
{
    final WriteOutContentHandler limiter = new WriteOutContentHandler(maxSize);
    final ContentHandler body = new BodyContentHandler(limiter);
    try
    {
        final Metadata docMetadata = new Metadata();
        final Parser autoParser = new AutoDetectParser(new TikaConfig(getClass().getClassLoader()));
        autoParser.parse(input, body, docMetadata, new ParseContext());
        appendText(body, outputText, maxSize);
    }
    catch( Exception failure )
    {
        // Anything other than "write limit reached" is a genuine failure.
        if( !limiter.isWriteLimitReached(failure) )
        {
            throw Throwables.propagate(failure);
        }

        // Limit reached: index whatever text was collected before the cut-off.
        LOGGER.debug("PDF size limit reached. Indexing truncated text");
        appendText(body, outputText, maxSize);
    }
}
示例5: extractMetaData
import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * Parses {@code input} and returns its metadata as a sorted map of
 * metadata name to value, each value passed through {@code stripWhiteSpace}.
 * <p>
 * Extraction is best effort: a parsing failure is printed to standard error
 * and the entries collected so far (possibly none) are returned.
 *
 * @param input stream holding the document; closed before this method returns
 *              (a {@code null} stream is tolerated and yields an empty map)
 * @return the metadata entries that could be extracted, never {@code null}
 * @throws IOException if closing {@code input} fails
 */
public static TreeMap<String, String> extractMetaData(InputStream input) throws IOException {
    TreeMap<String, String> treeMap = new TreeMap<String, String>();
    try {
        ContentHandler handler = new DefaultHandler();
        Metadata metadata = new Metadata();
        Parser parser = new AutoDetectParser();
        parser.parse(input, handler, metadata, new ParseContext());
        // for-each evaluates names() once instead of twice per iteration.
        for (String name : metadata.names()) {
            treeMap.put(name, stripWhiteSpace(metadata.get(name)));
        }
    } catch (Exception e) {
        // Best effort by design: report the problem and return what we have.
        e.printStackTrace();
    } finally {
        if (input != null) {
            input.close();
        }
    }
    return treeMap;
}
示例6: extractStringMetaData
import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * Parses {@code input} and renders its metadata as text, one
 * {@code "NAME : value\n"} line per metadata entry. Names are upper-cased and
 * values are passed through {@code stripWhiteSpace}.
 * <p>
 * Extraction is best effort: a parsing failure is printed to standard error
 * and the lines collected so far (possibly none) are returned.
 *
 * @param input stream holding the document; closed before this method returns
 *              (a {@code null} stream is tolerated and yields an empty string)
 * @return the rendered metadata, never {@code null}
 * @throws IOException if closing {@code input} fails
 */
public static String extractStringMetaData(InputStream input) throws IOException {
    StringBuilder result = new StringBuilder();
    try {
        ContentHandler handler = new DefaultHandler();
        Metadata metadata = new Metadata();
        Parser parser = new AutoDetectParser();
        parser.parse(input, handler, metadata, new ParseContext());
        // for-each evaluates names() once instead of twice per iteration.
        for (String name : metadata.names()) {
            // Locale.ROOT keeps the upper-casing independent of the JVM default locale.
            result.append(name.toUpperCase(java.util.Locale.ROOT))
                  .append(" : ")
                  .append(stripWhiteSpace(metadata.get(name)))
                  .append("\n");
        }
    } catch (Exception e) {
        // Best effort by design, but do not hide the failure completely.
        e.printStackTrace();
    } finally {
        if (input != null) {
            input.close();
        }
    }
    return result.toString();
}
示例7: Convert
import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * Parses the document in {@code Bytes}, stores its metadata as
 * {@code key=value} lines in {@code FileMetadata} and its body text in
 * {@code FullText}, and returns the body text.
 *
 * @param Bytes stream holding the document; this method does not close it
 * @return the value of {@code FullText} after parsing
 * @throws PDException presumably raised by {@code PDException.GenPDException}
 *                     when extraction fails - TODO confirm that it always throws;
 *                     if it returns normally, the previous {@code FullText} is returned
 */
protected String Convert(InputStream Bytes) throws PDException
{
    try {
        // -1 is passed as the handler's write limit.
        ContentHandler textHandler=new BodyContentHandler(-1);
        Metadata metadata=new Metadata();
        Parser parser=new AutoDetectParser();
        ParseContext context=new ParseContext();
        parser.parse(Bytes, textHandler, metadata, context);

        // Build the metadata listing in a buffer instead of re-copying the
        // field's string on every iteration.
        StringBuilder metadataLines=new StringBuilder();
        for (String key : metadata.names()) {
            metadataLines.append(key).append("=").append(metadata.get(key)).append("\n");
        }
        FileMetadata=metadataLines.toString();
        FullText=textHandler.toString();
    } catch (Exception ex)
    {
        PDException.GenPDException("Error_extracting_content_from_doc", ex.getLocalizedMessage());
    }
    return(FullText);
}
示例8: initSize
import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * Reads the pixel dimensions of an image file and stores them on {@code f}.
 *
 * @param f    file item whose width and height are set on success
 * @param img  image file to inspect
 * @param mime content type placed into the metadata before parsing
 * @return a result with exit code {@code ZERO} on success, or {@code -1}
 *         together with an error description when anything fails
 */
private static ProcessResult initSize(BaseFileItem f, File img, String mime) {
    final ProcessResult result = new ProcessResult();
    result.setProcess("get image dimensions :: " + f.getId());

    final Parser imageParser = new ImageParser();
    try (InputStream imageStream = new FileInputStream(img)) {
        final Metadata imageMeta = new Metadata();
        imageMeta.set(CONTENT_TYPE, mime);
        imageParser.parse(imageStream, new DefaultHandler(), imageMeta, new ParseContext());

        // A missing or non-numeric dimension raises an exception that is handled below.
        final String width = imageMeta.get(TIFF.IMAGE_WIDTH);
        f.setWidth(Integer.valueOf(width));
        final String height = imageMeta.get(TIFF.IMAGE_LENGTH);
        f.setHeight(Integer.valueOf(height));

        result.setExitCode(ZERO);
    } catch (Exception failure) {
        log.error("Error while getting dimensions", failure);
        result.setError("Error while getting dimensions");
        result.setException(failure.getMessage());
        result.setExitCode(-1);
    }
    return result;
}
示例9: main
import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * Fetches each of a fixed list of URLs, parses the response and logs the
 * parts of the resulting {@code Content-Type} metadata value.
 * A failure for one URL is printed and does not stop the remaining URLs.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    String[] urls = {"http://t.co/hP5PM6fm", "http://t.co/xSFteG23"};
    for (String url : urls)
    {
        try {
            Parser parser = new AutoDetectParser();
            Metadata metadata = new Metadata();
            ParseContext parseContext = new ParseContext();
            URL urlObject = new URL(url);
            ContentHandler handler = new BodyContentHandler(10 *
                    1024 * 1024);

            // openStream() always yields an InputStream (getContent() may return
            // other object types), and try-with-resources closes the connection.
            try (InputStream stream = urlObject.openStream()) {
                parser.parse(stream, handler, metadata, parseContext);
            }

            // The content type may be absent; log an empty list rather than failing.
            String contentType = metadata.get("Content-Type");
            String[] mimeDetails = (contentType == null)
                    ? new String[0]
                    : contentType.split(";");
            logger.info("execute: url = "+url+", mimeDetails = "+Arrays.asList(mimeDetails));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
}
示例10: imageParserShouldReturnMarkerInformationOfImage
import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * Parses the JPEG test image with {@code ImageParser} and checks the
 * {@code markerSequence ...} entries reported in the metadata.
 */
@Test
public void imageParserShouldReturnMarkerInformationOfImage() throws Exception {
    Parser parser = new ImageParser();
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
    InputStream stream =getClass().getResourceAsStream("/test-documents/testJPEG.jpg");
    try {
        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
    } finally {
        // Release the test resource even when parsing fails.
        stream.close();
    }
    assertEquals("class=0, htableId=0", metadata.get("markerSequence dht dhtable"));
    assertEquals("225", metadata.get("markerSequence unknown"));
    assertEquals("componentSelector=1, dcHuffTable=0, acHuffTable=0", metadata.get("markerSequence sos scanComponentSpec"));
    assertEquals("elementPrecision=0, qtableId=0", metadata.get("markerSequence dqt dqtable"));
    assertEquals("numScanComponents=3, startSpectralSelection=0, endSpectralSelection=63, approxHigh=0, approxLow=0", metadata.get("markerSequence sos"));
    assertEquals("componentId=1, HsamplingFactor=1, VsamplingFactor=1, QtableSelector=0", metadata.get("markerSequence sof componentSpec"));
    assertEquals("process=0, samplePrecision=8, numLines=75, samplesPerLine=100, numFrameComponents=3", metadata.get("markerSequence sof"));
}
示例11: testProtectedExcelSheets
import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * A workbook in which only some of the sheets are protected must still be
 * detected as an OOXML spreadsheet and flagged as protected.
 * See TIKA-364.
 */
@Test
public void testProtectedExcelSheets() throws Exception {
    final InputStream workbook = OOXMLParserTest.class
            .getResourceAsStream("/test-documents/protectedSheets.xlsx");

    final Parser autoParser = new AutoDetectParser();
    final Metadata meta = new Metadata();
    final ContentHandler body = new BodyContentHandler();
    final ParseContext parseContext = new ParseContext();

    try {
        autoParser.parse(workbook, body, meta, parseContext);

        final String detectedType = meta.get(Metadata.CONTENT_TYPE);
        assertEquals(
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                detectedType);

        final String protectedFlag = meta.get(TikaMetadataKeys.PROTECTED);
        assertEquals("true", protectedFlag);
    } finally {
        workbook.close();
    }
}
示例12: testNullHeaders
import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * A .docx file without headers must still produce body text.
 * See TIKA-633.
 */
@Test
public void testNullHeaders() throws Exception {
    final Parser autoParser = new AutoDetectParser();
    final Metadata meta = new Metadata();
    final ContentHandler body = new BodyContentHandler();
    final ParseContext parseContext = new ParseContext();

    final InputStream document = getTestDocument("NullHeader.docx");
    try {
        autoParser.parse(document, body, meta, parseContext);

        // The extracted text must not be empty.
        final String extracted = body.toString();
        assertFalse(0 == extracted.length());
    } finally {
        document.close();
    }
}
示例13: extractText
import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * Extracts the text of a document and appends at most {@code maxSize}
 * characters of it to {@code outputText}.
 * <p>
 * Extraction is best effort: if parsing fails, the failure is logged as a
 * warning and {@code outputText} is left as it was at the point of failure.
 *
 * @param mimeType   MIME type of the document; not used by this implementation
 * @param input      stream holding the document; this method does not close it
 * @param outputText buffer that receives the extracted text
 * @param maxSize    maximum number of characters appended to {@code outputText}
 * @throws IOException declared by the overridden method; this implementation
 *                     handles all failures internally
 */
@Override
public void extractText(String mimeType, InputStream input, StringBuilder outputText, int maxSize)
    throws IOException
{
    try
    {
        Metadata meta = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        Parser parser = new AutoDetectParser(new TikaConfig(getClass().getClassLoader()));
        parser.parse(input, handler, meta, new ParseContext());

        String content = handler.toString();
        if( content.length() > maxSize )
        {
            content = content.substring(0, maxSize);
        }
        outputText.append(content);

        if( LOGGER.isDebugEnabled() )
        {
            LOGGER.debug("Excel Summary:" + content); //$NON-NLS-1$
        }
    }
    catch( Exception e )
    {
        // Stay best effort (no rethrow), but leave a trace so that documents
        // which silently produce no text can be diagnosed.
        LOGGER.warn("Unable to extract text from document", e);
    }
}
示例14: parse
import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * Runs {@code input} through a {@code PDFParser} and returns the text
 * collected by the body content handler.
 *
 * @param input stream holding the document; this method does not close it
 * @return the text written to the body content handler
 * @throws TikaException if the document cannot be parsed
 * @throws SAXException  if the content handler reports a failure
 * @throws IOException   if the stream cannot be read
 */
private static String parse(final InputStream input) throws TikaException, SAXException, IOException {
    final Parser pdfParser = new PDFParser();
    final ContentHandler textCollector = new BodyContentHandler();
    final Metadata docMetadata = new Metadata();
    pdfParser.parse(input, textCollector, docMetadata, new ParseContext());
    return textCollector.toString();
}
示例15: main
import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * Parses a document and prints a selection of its metadata values
 * (title, author, creator, content type, encryption) to standard output.
 *
 * @param args optional; {@code args[0]} is the path of the file to parse,
 *             otherwise a built-in default path is used
 * @throws IOException   if the file cannot be opened or read
 * @throws TikaException if the document cannot be parsed
 * @throws SAXException  if the content handler reports a failure
 */
public static void main(final String[] args) throws IOException,
        TikaException, SAXException {
    final String path = (args != null && args.length > 0)
            ? args[0]
            : "/home/aditya/dataset/oca.pdf";
    File file = new File(path);
    Parser parser = new AutoDetectParser();
    // The parser needs a real handler to write content events to; passing
    // null is expected to fail inside the parser. Only metadata is used
    // here, so -1 is passed as the write limit
    // (NOTE(review): per the Tika docs this disables the limit - confirm).
    BodyContentHandler handler = new BodyContentHandler(-1);
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();

    // Close the file even when parsing fails.
    try (FileInputStream inputstream = new FileInputStream(file)) {
        parser.parse(inputstream, handler, metadata, context);
    }

    // Get specific metadata
    System.out.println(metadata.get(MetadataProperties.TITLE));
    System.out.println(metadata.get(MetadataProperties.AUTHOR));
    System.out.println(metadata.get(MetadataProperties.CREATOR));
    System.out.println(metadata.get(MetadataProperties.CONTENT_TYPE));
    System.out.println(metadata.get(MetadataProperties.ENCRYPTION));
}