当前位置: 首页>>代码示例>>Java>>正文


Java WordExtractor类代码示例

本文整理汇总了Java中org.apache.poi.hwpf.extractor.WordExtractor的典型用法代码示例。如果您正苦于以下问题:Java WordExtractor类的具体用法?Java WordExtractor怎么用?Java WordExtractor使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。


WordExtractor类属于org.apache.poi.hwpf.extractor包,在下文中一共展示了WordExtractor类的13个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: readDoc

import org.apache.poi.hwpf.extractor.WordExtractor; //导入依赖的package包/类
/**
 * Extracts the plain text of a Word document supplied as a stream, handling
 * both the binary OLE2 container and the OOXML container.
 *
 * <p>The (possibly re-wrapped) stream is always closed before this method
 * returns, and each extractor is closed even when text extraction fails.
 *
 * @param filePath path of the document; used only as the log message when an
 *                 {@code OfficeXmlFileException} is caught
 * @param is       stream positioned at the start of the document
 * @return the extracted text, or an empty string when the content is neither
 *         OLE2 nor OOXML or when an {@code OfficeXmlFileException} was logged
 * @throws Exception on any other failure while probing or reading the stream
 */
private static String readDoc (String filePath, InputStream is) throws Exception {
    String text = "";
    is = FileMagic.prepareToCheckMagic(is);
    try {
        // Probe the header once and reuse the result for both branches.
        FileMagic magic = FileMagic.valueOf(is);
        if (magic == FileMagic.OLE2) {
            try (WordExtractor ex = new WordExtractor(is)) {
                text = ex.getText();
            }
        } else if (magic == FileMagic.OOXML) {
            try (XWPFWordExtractor extractor = new XWPFWordExtractor(new XWPFDocument(is))) {
                text = extractor.getText();
            }
        }
    } catch (OfficeXmlFileException e) {
        logger.error(filePath, e);
    } finally {
        if (is != null) {
            is.close();
        }
    }
    return text;
}
 
开发者ID:neal1991,项目名称:everywhere,代码行数:24,代码来源:FileBeanParser.java

示例2: microsoftWordDocumentToString

import org.apache.poi.hwpf.extractor.WordExtractor; //导入依赖的package包/类
/**
 * Reads a Word document from the given stream and returns its text.
 *
 * <p>The stream is buffered so its header can be inspected: content with a
 * POIFS header is read with {@code WordExtractor}, anything else is handed to
 * {@code XWPFWordExtractor}. The extractor and the stream are closed on every
 * path, including when extraction throws.
 *
 * @param inputStream stream positioned at the start of the document; closed
 *                    by this method
 * @return the text of the document
 * @throws IOException if the stream cannot be read or parsed
 */
private static String microsoftWordDocumentToString(InputStream inputStream) throws IOException {
    try (InputStream wordStream = new BufferedInputStream(inputStream)) {
        if (POIFSFileSystem.hasPOIFSHeader(wordStream)) {
            try (WordExtractor wordExtractor = new WordExtractor(wordStream)) {
                return wordExtractor.getText();
            }
        }
        try (XWPFWordExtractor wordXExtractor = new XWPFWordExtractor(new XWPFDocument(wordStream))) {
            return wordXExtractor.getText();
        }
    }
}
 
开发者ID:polarsys,项目名称:eplmp,代码行数:18,代码来源:IndexerUtils.java

示例3: readDoc

import org.apache.poi.hwpf.extractor.WordExtractor; //导入依赖的package包/类
/**
 * Reads a binary Word document from disk and returns the concatenation of
 * all its paragraph texts.
 *
 * <p>Failures are not propagated: the stack trace is printed and whatever
 * text was collected so far (normally the empty string) is returned.
 *
 * @param path location of the document on disk
 * @return the concatenated paragraph text, or an empty string when the file
 *         could not be read
 */
private String readDoc(String path) {
    StringBuilder content = new StringBuilder();
    // try-with-resources closes the file even when parsing fails part-way.
    try (FileInputStream fis = new FileInputStream(new File(path).getAbsolutePath())) {
        HWPFDocument doc = new HWPFDocument(fis);
        WordExtractor we = new WordExtractor(doc);
        for (String para : we.getParagraphText()) {
            content.append(para);
        }
    } catch (Exception e) {
        // Best-effort by design: report the problem and fall through.
        e.printStackTrace();
    }
    return content.toString();
}
 
开发者ID:jatanrathod,项目名称:Idea-Plagiarism,代码行数:21,代码来源:checkPlagiarism.java

示例4: conversionImplementation

import org.apache.poi.hwpf.extractor.WordExtractor; //导入依赖的package包/类
/**
 * Converts a binary Word document into a text-only {@code ConvertedDocument}.
 *
 * <p>Each paragraph has its field codes stripped and is trimmed, then written
 * on its own line terminated by {@code '\n'}. The extractor and the input
 * stream are closed whether or not extraction succeeds.
 *
 * @param input stream containing the Word document; closed by this method
 * @param doc   file the stream was opened from, passed to the resulting
 *              {@code ConvertedDocument}
 * @return the converted document carrying the extracted text
 * @throws IOException if the document cannot be read
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream input, java.io.File doc) throws IOException {
    StringBuilder sb = new StringBuilder();
    try (WordExtractor ex = new WordExtractor(input)) {
        for (String paragraph : ex.getParagraphText()) {
            sb.append(WordExtractor.stripFields(paragraph).trim());
            sb.append('\n');
        }
    } finally {
        input.close();
    }

    ConvertedDocument textdoc = new ConvertedDocument(doc);
    textdoc.setText(sb.toString());
    return textdoc;
}
 
开发者ID:OpenSextant,项目名称:Xponents,代码行数:22,代码来源:MSDocConverter.java

示例5: extractText

import org.apache.poi.hwpf.extractor.WordExtractor; //导入依赖的package包/类
/**
 * {@inheritDoc}
 *
 * <p>Extracts the text of a binary Word document. Any failure is logged at
 * warn level and rethrown as an {@link IOException} carrying the original
 * exception as its cause. The stream is closed on every path.
 *
 * @param stream   stream containing the Word document; closed by this method
 * @param type     not used by this implementation
 * @param encoding not used by this implementation
 * @return the text of the document
 * @throws IOException if the text cannot be extracted
 */
public String extractText(InputStream stream, String type, String encoding) throws IOException {
	// try-with-resources keeps a failure in close() from replacing the
	// extraction failure, and tolerates a null stream when closing.
	try (InputStream in = stream) {
		return new WordExtractor(in).getText();
	} catch (Exception e) {
		logger.warn("Failed to extract Word text content", e);
		throw new IOException(e.getMessage(), e);
	}
}
 
开发者ID:openkm,项目名称:document-management-system,代码行数:15,代码来源:MsWordTextExtractor.java

示例6: readContent

import org.apache.poi.hwpf.extractor.WordExtractor; //导入依赖的package包/类
/**
 * Extracts the text of the Word document held in {@code cc} and reports it
 * to {@code handler} as a single region named "document".
 *
 * Any exception raised while extracting or reporting is rethrown wrapped in
 * an OntopiaRuntimeException.
 */
@Override
public void readContent(ClassifiableContentIF cc, TextHandlerIF handler) {
  try {
    ByteArrayInputStream rawBytes = new ByteArrayInputStream(cc.getContent());
    BufferedInputStream buffered = new BufferedInputStream(rawBytes);
    char[] documentChars = new WordExtractor(buffered).getText().toCharArray();

    handler.startRegion("document");
    handler.text(documentChars, 0, documentChars.length);
    handler.endRegion();
  } catch (Exception failure) {
    throw new OntopiaRuntimeException(failure);
  }
}
 
开发者ID:ontopia,项目名称:ontopia,代码行数:14,代码来源:WordFormatModule.java

示例7: getUnfilteredTextContent

import org.apache.poi.hwpf.extractor.WordExtractor; //导入依赖的package包/类
/**
 * {@inheritDoc}
 *
 * <p>
 * The paragraphs of the Word document are joined with
 * {@code StringUtils.CR}. Depending on {@code wrapAtDot} and
 * {@code wrapAtWhitespace}, additional breaks are inserted after every dot
 * and in place of every whitespace run. Line breaks are finally replaced by
 * {@code "\n"}.
 */
@Override
public String getUnfilteredTextContent() throws ConQATException {
	InputStream wordStream = new ByteArrayInputStream(getContent());
	String[] paragraphTexts;
	try {
		paragraphTexts = new WordExtractor(wordStream).getParagraphText();
	} catch (IOException ioFailure) {
		String message = "Had an error while reading word document: "
				+ ioFailure.getMessage();
		throw new ConQATException(message, ioFailure);
	} finally {
		FileSystemUtils.close(wordStream);
	}

	String joined = StringUtils.concat(paragraphTexts, StringUtils.CR);
	if (wrapAtDot) {
		joined = joined.replaceAll("[.]", "." + StringUtils.CR);
	}
	if (wrapAtWhitespace) {
		joined = joined.replaceAll("\\s+", StringUtils.CR);
	}

	// the method's contract asks for normalized line breaks
	return StringUtils.replaceLineBreaks(joined, "\n");
}
 
开发者ID:vimaier,项目名称:conqat,代码行数:30,代码来源:MSWordTextElement.java

示例8: getIndexedDocument

import org.apache.poi.hwpf.extractor.WordExtractor; //导入依赖的package包/类
/**
 * Builds an index document from the bytes of a binary Word file.
 *
 * @param fileData carries the raw file bytes ({@code data}) and the path
 *                 ({@code path}) under which the document is indexed
 * @return an index document holding the path and the extracted text
 * @throws SolrException with {@code SERVER_ERROR} when the bytes cannot be
 *                       read as a Word document; the underlying
 *                       {@link IOException} is attached as the cause
 */
public IndexDocument getIndexedDocument(File2Index fileData)
		throws SolrException {
	try {
		POIFSFileSystem fs = new POIFSFileSystem(new ByteArrayInputStream(fileData.data));
		WordExtractor extractor = new WordExtractor(fs);
		String wordText = extractor.getText();

		return new IndexDocument(fileData.path, wordText, null);
	} catch (IOException e) {
		String msg = "Failed to write to the index";
		log.error(msg, e);
		// Keep the root cause attached so callers can see what went wrong.
		throw new SolrException(ErrorCode.SERVER_ERROR, msg, e);
	}
}
 
开发者ID:wso2,项目名称:carbon-registry,代码行数:15,代码来源:MSWordIndexer.java

示例9: writePDFFromDoc

import org.apache.poi.hwpf.extractor.WordExtractor; //导入依赖的package包/类
/**
 * Writes the paragraphs of a binary Word document into a new PDF file, one
 * PDF paragraph per Word paragraph.
 *
 * Each paragraph has its trailing line-break sequence removed, and its
 * length and text are printed to standard output before it is added. Any
 * exception is handed to SSServErrReg.regErrThrow.
 *
 * @param docFilePath path of the Word document to read
 * @param pdfFilePath path of the PDF file to write
 */
public static void writePDFFromDoc(
  final String docFilePath,
  final String pdfFilePath) throws SSErr{
  
  try{
    final Document      pdfDocument  = new Document();
    final HWPFDocument  wordDocument = new HWPFDocument(new POIFSFileSystem(openFileForRead(docFilePath)));
    final WordExtractor extractor    = new WordExtractor(wordDocument);
    final OutputStream  pdfStream    = openOrCreateFileWithPathForWrite(pdfFilePath);
    final PdfWriter     pdfWriter    = PdfWriter.getInstance(pdfDocument, pdfStream);
    final Range         wordRange    = wordDocument.getRange();
    
    pdfDocument.open();
    pdfWriter.setPageEmpty(true);
    pdfDocument.newPage();
    pdfWriter.setPageEmpty(true);
    
    int index = 0;
    
    for (final String rawParagraph : extractor.getParagraphText()) {
      
      // The result is unused; the lookup is kept so behaviour is unchanged.
      wordRange.getParagraph(index);
      
      final String cleaned = rawParagraph.replaceAll("\\cM?\r?\n", "");
      
      System.out.println("Length:" + cleaned.length());
      System.out.println("Paragraph" + index + ": " + cleaned);
      
      // add the paragraph to the document
      pdfDocument.add(new Paragraph(cleaned));
      index++;
    }
    
    pdfDocument.close();
  }catch(Exception error){
    SSServErrReg.regErrThrow(error);
  }
}
 
开发者ID:learning-layers,项目名称:SocialSemanticServer,代码行数:41,代码来源:SSFileU.java

示例10: officeExtractor

import org.apache.poi.hwpf.extractor.WordExtractor; //导入依赖的package包/类
/**
 * Extracts summary metadata from a Word, Excel or PowerPoint document.
 *
 * <p>The MIME type selects the extractor. For any other MIME type an empty
 * {@code OfficeMetadata} is returned. Date properties that are absent from
 * the document (for example "last printed" on a document that was never
 * printed) are left unset instead of causing a {@code NullPointerException}.
 *
 * @param is       stream containing the OLE2 document
 * @param mimeType MIME type used to choose the extractor
 * @return the populated metadata; never {@code null}
 * @throws IOException if the stream cannot be read as an OLE2 file system
 */
public static OfficeMetadata officeExtractor(InputStream is, String mimeType) throws IOException {
	POIFSFileSystem fs = new POIFSFileSystem(is);
	OfficeMetadata md = new OfficeMetadata();
	SummaryInformation si = null;

	if (MimeTypeConfig.MIME_MS_WORD.equals(mimeType)) {
		si = new WordExtractor(fs).getSummaryInformation();
	} else if (MimeTypeConfig.MIME_MS_EXCEL.equals(mimeType)) {
		si = new ExcelExtractor(fs).getSummaryInformation();
	} else if (MimeTypeConfig.MIME_MS_POWERPOINT.equals(mimeType)) {
		si = new PowerPointExtractor(fs).getSummaryInformation();
	}

	if (si != null) {
		md.setTitle(si.getTitle());
		md.setSubject(si.getSubject());
		md.setAuthor(si.getAuthor());
		md.setLastAuthor(si.getLastAuthor());
		md.setKeywords(si.getKeywords());
		md.setComments(si.getComments());
		md.setTemplate(si.getTemplate());
		md.setRevNumber(si.getRevNumber());
		md.setApplicationName(si.getApplicationName());
		md.setEditTime(si.getEditTime());
		md.setPageCount(si.getPageCount());
		md.setWordCount(si.getWordCount());
		md.setCharCount(si.getCharCount());
		md.setSecurity(si.getSecurity());

		// Each date may be missing; Calendar.setTime(null) would throw.
		java.util.Date created = si.getCreateDateTime();
		if (created != null) {
			Calendar createDateTime = Calendar.getInstance();
			createDateTime.setTime(created);
			md.setCreateDateTime(createDateTime);
		}

		java.util.Date lastSaved = si.getLastSaveDateTime();
		if (lastSaved != null) {
			Calendar lastSaveDateTime = Calendar.getInstance();
			lastSaveDateTime.setTime(lastSaved);
			md.setLastSaveDateTime(lastSaveDateTime);
		}

		java.util.Date printed = si.getLastPrinted();
		if (printed != null) {
			Calendar lastPrinted = Calendar.getInstance();
			lastPrinted.setTime(printed);
			md.setLastPrinted(lastPrinted);
		}
	}

	log.info("officeExtractor: {}", md);
	return md;
}
 
开发者ID:openkm,项目名称:document-management-system,代码行数:49,代码来源:MetadataExtractor.java

示例11: wordCountNew

import org.apache.poi.hwpf.extractor.WordExtractor; //导入依赖的package包/类
/**
 * Counts the characters of a binary Word document after removing control
 * characters and field codes from every paragraph.
 *
 * <p>In debug mode carriage returns and line feeds are left in the paragraph
 * text (and therefore counted), and the cleaned text, the count and the
 * elapsed time are printed to standard output.
 *
 * @param doc     path of the Word document
 * @param isDebug whether to keep line breaks and print diagnostics
 * @return a two-element array: the character count and the elapsed time in
 *         milliseconds
 * @throws Exception if the file cannot be opened or parsed
 */
static int[] wordCountNew(String doc, boolean isDebug) throws Exception {
        long time = System.currentTimeMillis();
        int cnt = 0;
        StringBuilder builder = new StringBuilder();

        // try-with-resources: the file handle is released even when parsing fails.
        try (InputStream is = new FileInputStream(new File(doc))) {
            WordExtractor ex = new WordExtractor(is);
            for (String text : ex.getParagraphText()) {
                // trimAllChars is defined elsewhere; presumably it removes every
                // occurrence of the listed characters - confirm against the helper.
                if (isDebug) {
                    text = trimAllChars(text, new char[] { '\u0007', '\f', '\b', '\u0015' });
                } else {
                    text = trimAllChars(text, new char[] { '\u0007', '\f', '\b', '\u0015', '\r', '\n' });
                }

                String prefix = " TOC \\o \\u \u0014";
                if (text.startsWith(prefix)) {
                    text = text.substring(prefix.length());
                }

                // Embedded-object fields look like
                //   "\u0013 EMBED Visio.Drawing.11 \u0014\u0001"
                //   "\u0013 EMBED Word.Document.12 \\s \u0014\u0001"
                int start = text.indexOf("\u0013");
                int end = text.indexOf("\u0014\u0001");
                if (start >= 0 && end > start) {
                    text = text.replaceAll("\u0013[^\u0014\u0001]+\u0014\u0001", "");
                }
                text = text.replaceAll("\u0013[^\u0014\u0013]+\u0014", "");

                String flag = "\u0013 HYPERLINK";
                int pos = text.indexOf(flag);
                if (pos >= 0) {
                    String[] arr = text.split(" \u0014");
                    // A hyperlink field without a separator yields a single
                    // element; leave the text untouched instead of failing.
                    if (arr.length > 1) {
                        text = text.substring(0, pos) + arr[1];
                    }
                }

                if (text.length() >= 767) {
                    // Original author's note: for the .doc format, when the
                    // number of consecutive characters exceeds 767 (i.e. is at
                    // least 768), they are not included in the count. The code
                    // applies this by dropping runs of 767 or more spaces.
                    text = text.replaceAll(" {767,}", "");
                }

                if (isDebug) {
                    builder.append(text);
                }
                cnt += text.length();
            }
        }

        int t = (int) (System.currentTimeMillis() - time);

        if (isDebug) {
            System.out.println(builder.toString());
            System.out.println(cnt);
            System.out.println(t + " ms");
        }
        return new int[] { cnt, t };
    }
 
开发者ID:wangshichun,项目名称:office_word_wordCount,代码行数:57,代码来源:CountDoc.java

示例12: doc2text

import org.apache.poi.hwpf.extractor.WordExtractor; //导入依赖的package包/类
/**
 * Returns the text of the binary Word document read from the given stream.
 *
 * <p>The extractor is closed on every path, including when text extraction
 * throws.
 *
 * @param is stream containing the Word document
 * @return the text of the document
 * @throws IOException if the document cannot be read
 */
public String doc2text(InputStream is) throws IOException {
    try (WordExtractor wd = new WordExtractor(is)) {
        return wd.getText();
    }
}
 
开发者ID:mariosotil,项目名称:text-extractor,代码行数:7,代码来源:TextExtractor.java

示例13: getText

import org.apache.poi.hwpf.extractor.WordExtractor; //导入依赖的package包/类
/**
 * Extract text from a word 97-2003 document.
 *
 * @param f the Word file to read
 * @return the document text, or {@code null} when the file is too large,
 *         has no bytes, or contains only whitespace
 * @throws Exception when the document cannot be read; the failure is
 *         logged before it is rethrown
 *
 * @see edu.ur.ir.index.FileTextExtractor#getText(java.io.File)
 */
public String getText(File f) throws Exception {

	// Nothing to extract from oversized or empty files.
	if (isFileTooLarge(f) || f.length() <= 0L) {
		return null;
	}

	FileInputStream wordStream = null;
	try {
		wordStream = new FileInputStream(f);
		String extracted = new WordExtractor(new HWPFDocument(wordStream)).getText();

		boolean hasContent = extracted != null && !extracted.trim().isEmpty();
		return hasContent ? extracted : null;
	} catch (OutOfMemoryError oome) {
		log.error("could not extract text", oome);
		throw oome;
	} catch (Exception e) {
		log.error("could not get text for word document " + f.getAbsolutePath(), e);
		throw e;
	} finally {
		closeInputStream(wordStream);
	}
}
 
开发者ID:nate-rcl,项目名称:irplus,代码行数:48,代码来源:DefaultWordTextExtractor.java


注:本文中的org.apache.poi.hwpf.extractor.WordExtractor类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。