当前位置: 首页>>代码示例>>Java>>正文


Java Parser.parse方法代码示例

本文整理汇总了Java中org.apache.tika.parser.Parser.parse方法的典型用法代码示例。如果您正苦于以下问题：Java Parser.parse方法的具体用法？Java Parser.parse怎么用？Java Parser.parse使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.tika.parser.Parser的用法示例。


在下文中一共展示了Parser.parse方法的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: extractText

import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * Parses {@code input} with an auto-detecting Tika parser and appends the
 * extracted body text, cut to at most {@code maxSize} characters, to
 * {@code outputText}.
 *
 * @param mimeType   not consulted by this implementation
 * @param input      document stream handed to the parser; not closed here
 * @param outputText buffer that receives the (possibly truncated) text
 * @param maxSize    maximum number of characters appended
 * @throws IOException declared by the signature; every failure raised inside
 *                     the body is rethrown wrapped in a {@link RuntimeException}
 */
@Override
public void extractText(String mimeType, InputStream input, StringBuilder outputText, int maxSize)
	throws IOException
{
	try
	{
		final ContentHandler bodyHandler = new BodyContentHandler();
		final Parser tika = new AutoDetectParser(new TikaConfig(getClass().getClassLoader()));
		tika.parse(input, bodyHandler, new Metadata(), new ParseContext());

		// Truncate after extraction: the handler collects the text first and
		// only the leading maxSize characters are kept.
		final String extracted = bodyHandler.toString();
		final String summary = extracted.length() > maxSize ? extracted.substring(0, maxSize) : extracted;

		outputText.append(summary);
		if( LOGGER.isDebugEnabled() )
		{
			LOGGER.debug("Word Summary:" + summary); //$NON-NLS-1$
		}
	}
	catch( Exception e )
	{
		throw new RuntimeException(e);
	}
}
 
开发者ID:equella,项目名称:Equella,代码行数:29,代码来源:MsWordExtracter.java

示例2: convertWordDocumentIntoHtml

import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * Converts an uploaded .docx document into HTML markup. This code
 * is based on <a href="http://stackoverflow.com/a/9053258/313554">this StackOverflow</a> answer.
 *
 * <p>The document is parsed with {@code OOXMLParser}; the SAX events it emits are
 * serialized as indented UTF-8 HTML by a JAXP {@code TransformerHandler} writing
 * into an in-memory buffer.
 *
 * @param wordDocument  The uploaded .docx document to convert.
 * @return a DTO built from the upload's original filename and the generated HTML.
 * @throws DocumentConversionException if reading, parsing or serializing the document fails.
 */
public ConvertedDocumentDTO convertWordDocumentIntoHtml(MultipartFile wordDocument) {
    LOGGER.info("Converting word document: {} into HTML", wordDocument.getOriginalFilename());
    // try-with-resources: the upload stream is released on success and on failure.
    try (InputStream input = wordDocument.getInputStream()) {
        Parser parser = new OOXMLParser();

        StringWriter sw = new StringWriter();
        SAXTransformerFactory factory = (SAXTransformerFactory)
                SAXTransformerFactory.newInstance();
        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8");
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        handler.setResult(new StreamResult(sw));

        Metadata metadata = new Metadata();
        metadata.add(Metadata.CONTENT_TYPE, "text/html;charset=utf-8");
        parser.parse(input, handler, metadata, new ParseContext());
        return new ConvertedDocumentDTO(wordDocument.getOriginalFilename(), sw.toString());
    }
    catch (IOException | SAXException | TransformerException | TikaException ex) {
        LOGGER.error("Conversion failed because an exception was thrown", ex);
        throw new DocumentConversionException(ex.getMessage(), ex);
    }
}
 
开发者ID:Vincit,项目名称:spring-boot-word-to-html-example,代码行数:33,代码来源:WordToHtmlConverter.java

示例3: getFullText

import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * Parses the file at {@code filepath} with an auto-detecting Tika parser and
 * returns the text written by the body content handler.
 *
 * @param filepath path of the file to read
 * @return the text collected while parsing
 * @throws IOException   if the file cannot be opened, read or closed
 * @throws SAXException  if the content handler reports a failure
 * @throws TikaException if the document cannot be parsed
 */
private static String getFullText(final String filepath) throws IOException, SAXException, TikaException {
    final StringWriter textSink = new StringWriter();

    final TikaInputStream source = TikaInputStream.get(new File(filepath));
    try {
        final Detector typeDetector = new DefaultDetector();
        final Parser autoParser = new AutoDetectParser(typeDetector);

        // The parser registers itself in the context it is invoked with.
        // NOTE(review): presumably so embedded documents are parsed by the same parser - confirm.
        final ParseContext ctx = new ParseContext();
        ctx.set(Parser.class, autoParser);

        final Metadata docMetadata = new Metadata();
        final ContentHandler bodyHandler = new BodyContentHandler(textSink);
        autoParser.parse(source, bodyHandler, docMetadata, ctx);
    }
    finally {
        // Always release the underlying file, even when parsing fails.
        source.close();
    }

    return textSink.toString();
}
 
开发者ID:CoEIA,项目名称:DEM,代码行数:23,代码来源:ItemFactory.java

示例4: extractText

import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * Parses {@code input} with an auto-detecting Tika parser through a content
 * handler limited to {@code maxSize}, then hands the handler to
 * {@code appendText} to fill {@code outputText}.
 *
 * <p>When parsing aborts because the write limit was reached, the text
 * gathered so far is still appended; any other failure is propagated.
 *
 * @param mimeType   not consulted by this implementation
 * @param input      document stream handed to the parser; not closed here
 * @param outputText buffer passed on to {@code appendText}
 * @param maxSize    write limit for the handler, also passed to {@code appendText}
 * @throws IOException declared by the signature; failures are rethrown via
 *                     {@code Throwables.propagate}
 */
@Override
public void extractText(String mimeType, InputStream input, StringBuilder outputText, int maxSize)
	throws IOException
{
	final WriteOutContentHandler limited = new WriteOutContentHandler(maxSize);
	final ContentHandler body = new BodyContentHandler(limited);
	try
	{
		final Parser tika = new AutoDetectParser(new TikaConfig(getClass().getClassLoader()));
		tika.parse(input, body, new Metadata(), new ParseContext());

		appendText(body, outputText, maxSize);
	}
	catch( Exception failure )
	{
		// Anything other than "limit reached" is a genuine error.
		if( !limited.isWriteLimitReached(failure) )
		{
			throw Throwables.propagate(failure);
		}

		// Hitting the limit is expected for large documents: keep what was read.
		LOGGER.debug("PDF size limit reached.  Indexing truncated text");
		appendText(body, outputText, maxSize);
	}
}
 
开发者ID:equella,项目名称:Equella,代码行数:27,代码来源:PdfExtracter.java

示例5: extractMetaData

import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * Parses {@code input} with an auto-detecting Tika parser and returns the
 * metadata it reports, keyed and sorted by metadata name.
 *
 * <p>Extraction is best effort: a parsing failure is printed to standard
 * error and whatever was collected up to that point (possibly nothing) is
 * returned. The stream is always closed before returning.
 *
 * @param input document stream; closed by this method when non-null
 * @return metadata name to value, each value passed through {@code stripWhiteSpace};
 *         never {@code null}
 * @throws IOException if closing {@code input} fails
 */
public static TreeMap<String, String> extractMetaData(InputStream input) throws IOException {
	TreeMap<String, String> treeMap = new TreeMap<String, String>();
	try {
		// Only metadata is wanted, so document content events are discarded.
		ContentHandler handler = new DefaultHandler();
		Metadata metadata = new Metadata();
		Parser parser = new AutoDetectParser();
		parser.parse(input, handler, metadata, new ParseContext());
		// Fetch the name array once instead of re-requesting it on every iteration.
		for (String name : metadata.names()) {
			treeMap.put(name, stripWhiteSpace(metadata.get(name)));
		}
	} catch (Exception e) {
		// Deliberately non-fatal: callers receive the entries gathered so far.
		e.printStackTrace();
	} finally {
		if (input != null) {
			input.close();
		}
	}
	return treeMap;
}
 
开发者ID:regestaexe,项目名称:bygle-ldp,代码行数:20,代码来源:FileInfoReader.java

示例6: extractStringMetaData

import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * Parses {@code input} with an auto-detecting Tika parser and renders the
 * reported metadata as text, one {@code NAME : value} line per entry with the
 * name upper-cased.
 *
 * <p>Extraction is best effort: a parsing failure is printed to standard
 * error and the lines built up to that point (possibly none) are returned.
 * The stream is always closed before returning.
 *
 * @param input document stream; closed by this method when non-null
 * @return the formatted metadata, or an empty string when nothing was extracted
 * @throws IOException if closing {@code input} fails
 */
public static String extractStringMetaData(InputStream input) throws IOException {
	StringBuilder result = new StringBuilder();
	try {
		// Only metadata is wanted, so document content events are discarded.
		ContentHandler handler = new DefaultHandler();
		Metadata metadata = new Metadata();
		Parser parser = new AutoDetectParser();
		parser.parse(input, handler, metadata, new ParseContext());
		// Fetch the name array once instead of re-requesting it on every iteration.
		for (String name : metadata.names()) {
			result.append(name.toUpperCase())
					.append(" : ")
					.append(stripWhiteSpace(metadata.get(name)))
					.append("\n");
		}
	} catch (Exception e) {
		// Still non-fatal, but no longer silent: report the failure instead of hiding it.
		e.printStackTrace();
	} finally {
		if (input != null) {
			input.close();
		}
	}
	return result.toString();
}
 
开发者ID:regestaexe,项目名称:bygle-ldp,代码行数:19,代码来源:FileInfoReader.java

示例7: Convert

import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * Parses the supplied stream with an auto-detecting Tika parser, stores the
 * reported metadata as {@code key=value} lines in {@code FileMetadata} and the
 * body text in {@code FullText}.
 *
 * @param Bytes document stream handed to the parser; not closed here
 * @return the current value of {@code FullText}
 * @throws PDException declared by the signature; failures are routed through
 *                     {@code PDException.GenPDException}
 */
protected String Convert(InputStream Bytes) throws PDException
{
    try {
        // NOTE(review): -1 should disable BodyContentHandler's write limit per Tika's docs - confirm.
        final ContentHandler bodyText = new BodyContentHandler(-1);
        final Metadata docMetadata = new Metadata();
        final Parser autoParser = new AutoDetectParser();
        final ParseContext parseContext = new ParseContext();
        autoParser.parse(Bytes, bodyText, docMetadata, parseContext);

        FileMetadata = "";
        final String[] keys = docMetadata.names();
        for (int i = 0; i < keys.length; i++) {
            FileMetadata += keys[i] + "=" + docMetadata.get(keys[i]) + "\n";
        }
        FullText = bodyText.toString();
    } catch (Exception ex) {
        // NOTE(review): GenPDException presumably throws; if it returns, the previous FullText is returned.
        PDException.GenPDException("Error_extracting_content_from_doc", ex.getLocalizedMessage());
    }

    return FullText;
}
 
开发者ID:JHierrot,项目名称:openprodoc,代码行数:20,代码来源:FTConnector.java

示例8: initSize

import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * Reads the pixel dimensions of {@code img} with Tika's image parser and
 * stores them on {@code f}.
 *
 * @param f    file item whose width and height are set on success
 * @param img  image file to inspect
 * @param mime content type placed in the metadata before parsing
 * @return a result with exit code {@code ZERO} on success; on any failure the
 *         exit code is {@code -1} and the error and exception message are filled in
 */
private static ProcessResult initSize(BaseFileItem f, File img, String mime) {
	final ProcessResult result = new ProcessResult();
	result.setProcess("get image dimensions :: " + f.getId());
	final Parser imageParser = new ImageParser();
	try (InputStream imageStream = new FileInputStream(img)) {
		final Metadata imageMeta = new Metadata();
		imageMeta.set(CONTENT_TYPE, mime);
		// Content events are irrelevant here; only the metadata is read.
		imageParser.parse(imageStream, new DefaultHandler(), imageMeta, new ParseContext());

		// A missing or non-numeric value makes Integer.valueOf throw, which
		// lands in the catch block below.
		final String width = imageMeta.get(TIFF.IMAGE_WIDTH);
		f.setWidth(Integer.valueOf(width));
		final String height = imageMeta.get(TIFF.IMAGE_LENGTH);
		f.setHeight(Integer.valueOf(height));

		result.setExitCode(ZERO);
	} catch (Exception e) {
		log.error("Error while getting dimensions", e);
		result.setError("Error while getting dimensions");
		result.setException(e.getMessage());
		result.setExitCode(-1);
	}
	return result;
}
 
开发者ID:apache,项目名称:openmeetings,代码行数:20,代码来源:ImageConverter.java

示例9: main

import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * Fetches each hard-coded URL, parses the response with an auto-detecting
 * Tika parser and logs the detected Content-Type split on {@code ';'}.
 *
 * <p>A failure for one URL is printed and does not stop the remaining URLs.
 *
 * @param args unused
 */
public static void main(String[] args) {
	String[] urls = {"http://t.co/hP5PM6fm", "http://t.co/xSFteG23"};
	for (String url : urls)
	{
		// openStream() always returns an InputStream, whereas getContent() may
		// return another object type depending on the response; try-with-resources
		// closes the connection stream on every path.
		try (InputStream content = new URL(url).openStream()) {
			Parser parser = new AutoDetectParser();
			Metadata metadata = new Metadata();
			ParseContext parseContext = new ParseContext();
			// Cap the extracted body text at 10 * 1024 * 1024 characters.
			ContentHandler handler = new BodyContentHandler(10 *
					1024 * 1024);
			parser.parse(content, handler, metadata, parseContext);

			String contentType = metadata.get("Content-Type");
			if (contentType == null) {
				// No type detected: report it rather than fail with a NullPointerException.
				logger.info("execute: url = "+url+", no Content-Type detected");
				continue;
			}
			String[] mimeDetails = contentType.split(";");
			logger.info("execute: url = "+url+", mimeDetails = "+Arrays.asList(mimeDetails));
		} catch (Exception ex) {
			ex.printStackTrace();
		}
	}
}
 
开发者ID:BinitaBharati,项目名称:storm-trident-example,代码行数:25,代码来源:TikaParserTest.java

示例10: imageParserShouldReturnMarkerInformationOfImage

import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * Parses the bundled {@code testJPEG.jpg} with {@code ImageParser} and checks
 * the JPEG marker-sequence entries it places in the metadata.
 */
@Test
public void imageParserShouldReturnMarkerInformationOfImage() throws Exception {
	Parser parser = new ImageParser();
	Metadata metadata = new Metadata();
	metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
	// Close the resource stream once parsing is done, whether or not it succeeds.
	try (InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG.jpg")) {
		parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
	}

	assertEquals("class=0, htableId=0", metadata.get("markerSequence dht dhtable"));
	assertEquals("225", metadata.get("markerSequence unknown"));
	assertEquals("componentSelector=1, dcHuffTable=0, acHuffTable=0", metadata.get("markerSequence sos scanComponentSpec"));
	assertEquals("elementPrecision=0, qtableId=0", metadata.get("markerSequence dqt dqtable"));
	assertEquals("numScanComponents=3, startSpectralSelection=0, endSpectralSelection=63, approxHigh=0, approxLow=0", metadata.get("markerSequence sos"));
	assertEquals("componentId=1, HsamplingFactor=1, VsamplingFactor=1, QtableSelector=0", metadata.get("markerSequence sof componentSpec"));
	assertEquals("process=0, samplePrecision=8, numLines=75, samplesPerLine=100, numFrameComponents=3", metadata.get("markerSequence sof"));
}
 
开发者ID:kanrourou,项目名称:software-testing,代码行数:17,代码来源:ImageParserBDDTest3.java

示例11: testProtectedExcelSheets

import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * A workbook in which only some of the sheets are protected must still be
 * detected as a spreadsheet and flagged as protected.
 * See TIKA-364.
 */
@Test
public void testProtectedExcelSheets() throws Exception {
    final Parser autoParser = new AutoDetectParser();
    final ParseContext parseContext = new ParseContext();
    final ContentHandler bodyHandler = new BodyContentHandler();
    final Metadata docMetadata = new Metadata();

    final InputStream workbook = OOXMLParserTest.class
            .getResourceAsStream("/test-documents/protectedSheets.xlsx");
    try {
        autoParser.parse(workbook, bodyHandler, docMetadata, parseContext);

        final String detectedType = docMetadata.get(Metadata.CONTENT_TYPE);
        assertEquals(
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                detectedType);

        final String protectedFlag = docMetadata.get(TikaMetadataKeys.PROTECTED);
        assertEquals("true", protectedFlag);
    } finally {
        workbook.close();
    }
}
 
开发者ID:kanrourou,项目名称:software-testing,代码行数:27,代码来源:OOXMLParserTest.java

示例12: testNullHeaders

import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * A docx file without headers must still yield non-empty body text.
 * TIKA-633
 */
@Test
public void testNullHeaders() throws Exception {
    final ContentHandler bodyHandler = new BodyContentHandler();
    final Metadata docMetadata = new Metadata();
    final ParseContext parseContext = new ParseContext();
    final Parser autoParser = new AutoDetectParser();

    final InputStream docx = getTestDocument("NullHeader.docx");
    try {
        autoParser.parse(docx, bodyHandler, docMetadata, parseContext);

        final String extracted = bodyHandler.toString();
        assertFalse(extracted.isEmpty());
    } finally {
        docx.close();
    }
}
 
开发者ID:kanrourou,项目名称:software-testing,代码行数:20,代码来源:OOXMLParserTest.java

示例13: extractText

import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * Parses {@code input} with an auto-detecting Tika parser and appends the
 * extracted body text, cut to at most {@code maxSize} characters, to
 * {@code outputText}.
 *
 * <p>Extraction is best effort: when parsing fails nothing is appended and the
 * failure is logged as a warning instead of being thrown.
 *
 * @param mimeType   not consulted by this implementation
 * @param input      document stream handed to the parser; not closed here
 * @param outputText buffer that receives the (possibly truncated) text
 * @param maxSize    maximum number of characters appended
 * @throws IOException declared by the signature; this implementation does not
 *                     rethrow failures raised while parsing
 */
@Override
public void extractText(String mimeType, InputStream input, StringBuilder outputText, int maxSize)
	throws IOException
{

	try
	{
		Metadata meta = new Metadata();
		ContentHandler handler = new BodyContentHandler();
		Parser parser = new AutoDetectParser(new TikaConfig(getClass().getClassLoader()));
		parser.parse(input, handler, meta, new ParseContext());

		String content = handler.toString();

		if( content.length() > maxSize )
		{
			content = content.substring(0, maxSize);
		}

		outputText.append(content);
		if( LOGGER.isDebugEnabled() )
		{
			LOGGER.debug("Excel Summary:" + content); //$NON-NLS-1$
		}
	}
	catch( Exception e )
	{
		// Remain non-fatal so one unreadable file does not abort the caller,
		// but leave a trace: a silent catch made missing text undiagnosable.
		// NOTE(review): the no-arg BodyContentHandler has its own default write
		// limit; very large sheets may end up here - confirm whether truncated
		// text should be appended in that case.
		LOGGER.warn("Excel text extraction failed; no text appended", e); //$NON-NLS-1$
	}
}
 
开发者ID:equella,项目名称:Equella,代码行数:31,代码来源:MsExcelExtracter.java

示例14: parse

import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * Runs Tika's {@code PDFParser} over {@code input} and returns the text
 * collected by the body content handler.
 *
 * @param input PDF stream handed to the parser; not closed here
 * @return the extracted body text
 * @throws TikaException if the document cannot be parsed
 * @throws SAXException  if the content handler reports a failure
 * @throws IOException   if the stream cannot be read
 */
private static String parse(final InputStream input) throws TikaException, SAXException, IOException {
    final ContentHandler bodyText = new BodyContentHandler();
    final Parser pdfParser = new PDFParser();

    pdfParser.parse(input, bodyText, new Metadata(), new ParseContext());

    return bodyText.toString();
}
 
开发者ID:tnovo,项目名称:which-food-uptec-cli,代码行数:11,代码来源:App.java

示例15: main

import org.apache.tika.parser.Parser; //导入方法依赖的package包/类
/**
 * Parses a hard-coded PDF with an auto-detecting Tika parser and prints a
 * selection of its metadata values to standard output.
 *
 * @param args unused
 * @throws IOException   if the file cannot be opened, read or closed
 * @throws TikaException if the document cannot be parsed
 * @throws SAXException  if the content handler reports a failure
 */
public static void main(final String[] args) throws IOException,
        TikaException, SAXException {

    File file = new File("/home/aditya/dataset/oca.pdf");

    Parser parser = new AutoDetectParser();
    // The parser needs a real handler to receive content events; passing null fails.
    // NOTE(review): -1 should disable the handler's write limit per Tika's docs, so a
    // large body cannot abort the metadata extraction - confirm.
    BodyContentHandler handler = new BodyContentHandler(-1);
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();

    // try-with-resources: the file handle is released even when parsing fails.
    try (FileInputStream inputstream = new FileInputStream(file)) {
        parser.parse(inputstream, handler, metadata, context);
    }

    // Get specific metadata
    System.out.println(metadata.get(MetadataProperties.TITLE));
    System.out.println(metadata.get(MetadataProperties.AUTHOR));
    System.out.println(metadata.get(MetadataProperties.CREATOR));
    System.out.println(metadata.get(MetadataProperties.CONTENT_TYPE));
    System.out.println(metadata.get(MetadataProperties.ENCRYPTION));
}
 
开发者ID:arks-api,项目名称:arks-api,代码行数:29,代码来源:MetadataExtraction.java


注:本文中的org.apache.tika.parser.Parser.parse方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。