本文整理汇总了 Java 中 org.apache.poi.hwpf.extractor.WordExtractor 类的典型用法代码示例。如果您正苦于以下问题:Java WordExtractor 类的具体用法?Java WordExtractor 怎么用?Java WordExtractor 使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。
WordExtractor 类属于 org.apache.poi.hwpf.extractor 包,在下文中一共展示了 WordExtractor 类的 13 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Java 代码示例。
示例1: readDoc
import org.apache.poi.hwpf.extractor.WordExtractor; //导入依赖的package包/类
/**
 * Reads the text of a Word document from a stream, choosing the extractor by
 * the stream's magic bytes.
 *
 * <p>OLE2 content is read with {@link WordExtractor}, OOXML content with
 * {@code XWPFWordExtractor}. Any other format yields an empty string.
 *
 * @param filePath used only as the log message when an
 *                 {@code OfficeXmlFileException} is caught
 * @param is       the document stream; it is closed before this method returns
 * @return the extracted text, or {@code ""} when the format is not handled or
 *         an {@code OfficeXmlFileException} was logged
 * @throws Exception any other failure raised while reading or closing
 */
private static String readDoc (String filePath, InputStream is) throws Exception {
    String text = "";
    // The (possibly wrapped) stream and both extractors are managed by
    // try-with-resources so they are released even when getText() throws.
    try (InputStream in = FileMagic.prepareToCheckMagic(is)) {
        // Sniff the magic bytes once instead of once per branch.
        FileMagic magic = FileMagic.valueOf(in);
        if (magic == FileMagic.OLE2) {
            try (WordExtractor ex = new WordExtractor(in)) {
                text = ex.getText();
            }
        } else if (magic == FileMagic.OOXML) {
            XWPFDocument doc = new XWPFDocument(in);
            try (XWPFWordExtractor extractor = new XWPFWordExtractor(doc)) {
                text = extractor.getText();
            }
        }
    } catch (OfficeXmlFileException e) {
        logger.error(filePath, e);
    }
    return text;
}
示例2: microsoftWordDocumentToString
import org.apache.poi.hwpf.extractor.WordExtractor; //导入依赖的package包/类
/**
 * Extracts the text of a Word document supplied as a stream.
 *
 * <p>The stream is buffered, then checked for a POIFS header: if present the
 * content is read with {@link WordExtractor}, otherwise it is read with
 * {@code XWPFWordExtractor}.
 *
 * @param inputStream the document stream; it is closed (through its buffering
 *                    wrapper) before this method returns
 * @return the extracted text
 * @throws IOException if reading the document fails
 */
private static String microsoftWordDocumentToString(InputStream inputStream) throws IOException {
    try (InputStream wordStream = new BufferedInputStream(inputStream)) {
        if (POIFSFileSystem.hasPOIFSHeader(wordStream)) {
            // Nested try-with-resources: the extractor is closed even if getText() throws.
            try (WordExtractor wordExtractor = new WordExtractor(wordStream)) {
                return wordExtractor.getText();
            }
        }
        try (XWPFWordExtractor wordXExtractor = new XWPFWordExtractor(new XWPFDocument(wordStream))) {
            return wordXExtractor.getText();
        }
    }
}
示例3: readDoc
import org.apache.poi.hwpf.extractor.WordExtractor; //导入依赖的package包/类
/**
 * Reads a Word document from disk and returns its paragraphs concatenated
 * in order, with no separator added between them.
 *
 * <p>This is best-effort: any exception is printed to standard error and
 * whatever text was collected up to that point (possibly the empty string)
 * is returned.
 *
 * @param path file system path of the document
 * @return the concatenated paragraph text, or a partial/empty string on failure
 */
private String readDoc(String path) {
    // StringBuilder avoids re-copying the accumulated text on every paragraph.
    StringBuilder content = new StringBuilder();
    // try-with-resources closes the file even when parsing fails part-way.
    try (FileInputStream fis = new FileInputStream(new File(path))) {
        HWPFDocument doc = new HWPFDocument(fis);
        WordExtractor we = new WordExtractor(doc);
        for (String para : we.getParagraphText()) {
            content.append(para);
        }
    } catch (Exception e) {
        // Deliberately best-effort; no logger is visible in this snippet.
        e.printStackTrace();
    }
    return content.toString();
}
示例4: conversionImplementation
import org.apache.poi.hwpf.extractor.WordExtractor; //导入依赖的package包/类
/**
 * Converts a Word document stream to plain text.
 *
 * <p>Each paragraph has its fields stripped via
 * {@link WordExtractor#stripFields(String)}, is trimmed, and is followed by a
 * single {@code '\n'} in the output.
 *
 * @param input the Word document stream; it is closed by this method
 * @param doc   the file the resulting {@code ConvertedDocument} is created for
 * @return a {@code ConvertedDocument} whose text is the joined paragraphs
 * @throws IOException if the document cannot be read
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream input, java.io.File doc) throws IOException {
    StringBuilder sb = new StringBuilder();
    // The extractor and the input stream are both released even when extraction fails.
    try (WordExtractor ex = new WordExtractor(input)) {
        for (String paragraph : ex.getParagraphText()) {
            sb.append(WordExtractor.stripFields(paragraph).trim());
            sb.append('\n');
        }
    } finally {
        input.close();
    }
    ConvertedDocument textdoc = new ConvertedDocument(doc);
    textdoc.setText(sb.toString());
    return textdoc;
}
示例5: extractText
import org.apache.poi.hwpf.extractor.WordExtractor; //导入依赖的package包/类
/**
 * {@inheritDoc} Extracts the text of the Word document read from the given
 * stream. Any failure is logged as a warning and rethrown as an
 * {@link IOException} carrying the original exception as its cause; the
 * stream is closed in all cases.
 */
public String extractText(InputStream stream, String type, String encoding) throws IOException {
    try {
        WordExtractor extractor = new WordExtractor(stream);
        String extracted = extractor.getText();
        return extracted;
    } catch (Exception failure) {
        logger.warn("Failed to extract Word text content", failure);
        throw new IOException(failure.getMessage(), failure);
    } finally {
        stream.close();
    }
}
示例6: readContent
import org.apache.poi.hwpf.extractor.WordExtractor; //导入依赖的package包/类
/**
 * Extracts the text of the Word document held in {@code cc} and passes it to
 * {@code handler} as a single region named "document".
 *
 * Any exception raised along the way is wrapped in an
 * {@code OntopiaRuntimeException}.
 */
@Override
public void readContent(ClassifiableContentIF cc, TextHandlerIF handler) {
    try {
        ByteArrayInputStream rawBytes = new ByteArrayInputStream(cc.getContent());
        BufferedInputStream buffered = new BufferedInputStream(rawBytes);
        char[] chars = new WordExtractor(buffered).getText().toCharArray();

        handler.startRegion("document");
        handler.text(chars, 0, chars.length);
        handler.endRegion();
    } catch (Exception failure) {
        throw new OntopiaRuntimeException(failure);
    }
}
示例7: getUnfilteredTextContent
import org.apache.poi.hwpf.extractor.WordExtractor; //导入依赖的package包/类
/**
 * {@inheritDoc}
 * <p>
 * The paragraphs of the Word content are joined with {@code StringUtils.CR}.
 * Depending on the {@code wrapAtDot} and {@code wrapAtWhitespace} flags, extra
 * breaks are inserted after every dot and/or in place of every whitespace
 * run. Finally all line breaks are replaced by {@code "\n"}.
 */
@Override
public String getUnfilteredTextContent() throws ConQATException {
    InputStream contentStream = new ByteArrayInputStream(getContent());
    String[] paragraphs;
    try {
        paragraphs = new WordExtractor(contentStream).getParagraphText();
    } catch (IOException ioFailure) {
        String message = "Had an error while reading word document: " + ioFailure.getMessage();
        throw new ConQATException(message, ioFailure);
    } finally {
        FileSystemUtils.close(contentStream);
    }

    String joined = StringUtils.concat(paragraphs, StringUtils.CR);
    if (wrapAtDot) {
        joined = joined.replaceAll("[.]", "." + StringUtils.CR);
    }
    if (wrapAtWhitespace) {
        joined = joined.replaceAll("\\s+", StringUtils.CR);
    }

    // The method's contract asks for normalized line breaks.
    return StringUtils.replaceLineBreaks(joined, "\n");
}
示例8: getIndexedDocument
import org.apache.poi.hwpf.extractor.WordExtractor; //导入依赖的package包/类
/**
 * Builds an index document from the raw bytes of a Word file.
 *
 * @param fileData carries the document bytes ({@code data}) and the path
 *                 ({@code path}) the index document is created for
 * @return an {@code IndexDocument} holding the path and the extracted text
 * @throws SolrException with {@code ErrorCode.SERVER_ERROR} when the bytes
 *                       cannot be read as a Word document; the underlying
 *                       {@link IOException} is attached as the cause
 */
public IndexDocument getIndexedDocument(File2Index fileData)
        throws SolrException {
    try {
        POIFSFileSystem fs = new POIFSFileSystem(new ByteArrayInputStream(fileData.data));
        WordExtractor extractor = new WordExtractor(fs);
        String wordText = extractor.getText();
        return new IndexDocument(fileData.path, wordText, null);
    } catch (IOException e) {
        String msg = "Failed to write to the index";
        log.error(msg, e);
        // Keep the cause so callers can see why extraction failed.
        throw new SolrException(ErrorCode.SERVER_ERROR, msg, e);
    }
}
示例9: writePDFFromDoc
import org.apache.poi.hwpf.extractor.WordExtractor; //导入依赖的package包/类
/**
 * Writes the paragraph text of a Word document into a new PDF file.
 *
 * Each paragraph has its trailing line-break sequence removed, is echoed
 * (length and content) to standard output, and is added to the PDF as one
 * paragraph. Any exception is handed to {@code SSServErrReg.regErrThrow}.
 *
 * @param docFilePath path of the Word document to read
 * @param pdfFilePath path of the PDF file to write
 */
public static void writePDFFromDoc(
        final String docFilePath,
        final String pdfFilePath) throws SSErr{

    try {
        final Document pdf = new Document();
        final POIFSFileSystem poifs = new POIFSFileSystem(openFileForRead(docFilePath));
        final HWPFDocument wordDoc = new HWPFDocument(poifs);
        final WordExtractor extractor = new WordExtractor(wordDoc);
        final OutputStream pdfStream = openOrCreateFileWithPathForWrite(pdfFilePath);
        final PdfWriter pdfWriter = PdfWriter.getInstance(pdf, pdfStream);
        final Range wordRange = wordDoc.getRange();

        pdf.open();
        pdfWriter.setPageEmpty(true);
        pdf.newPage();
        pdfWriter.setPageEmpty(true);

        int index = 0;
        for (final String rawParagraph : extractor.getParagraphText()) {
            // The looked-up paragraph is not used, but the lookup is kept
            // because it may fail for an index outside the range.
            wordRange.getParagraph(index);

            final String cleaned = rawParagraph.replaceAll("\\cM?\r?\n", "");
            System.out.println("Length:" + cleaned.length());
            System.out.println("Paragraph" + index + ": " + cleaned);

            // add the paragraph to the document
            pdf.add(new Paragraph(cleaned));
            index++;
        }

        pdf.close();
    } catch (Exception error) {
        SSServErrReg.regErrThrow(error);
    }
}
示例10: officeExtractor
import org.apache.poi.hwpf.extractor.WordExtractor; //导入依赖的package包/类
/**
 * Extract metadata from an Office document (Word, Excel or PowerPoint).
 *
 * <p>The summary information is read with the extractor matching
 * {@code mimeType}. For any other MIME type no summary is read and an
 * {@code OfficeMetadata} with no fields set by this method is returned.
 * Date fields missing from the summary are left unset instead of causing a
 * {@link NullPointerException}.
 *
 * @param is       stream positioned at the start of the OLE2 document
 * @param mimeType MIME type used to choose the extractor
 * @return the populated metadata object (never {@code null})
 * @throws IOException if the stream cannot be read as a POIFS file system
 */
public static OfficeMetadata officeExtractor(InputStream is, String mimeType) throws IOException {
    POIFSFileSystem fs = new POIFSFileSystem(is);
    OfficeMetadata md = new OfficeMetadata();
    SummaryInformation si = null;

    if (MimeTypeConfig.MIME_MS_WORD.equals(mimeType)) {
        si = new WordExtractor(fs).getSummaryInformation();
    } else if (MimeTypeConfig.MIME_MS_EXCEL.equals(mimeType)) {
        si = new ExcelExtractor(fs).getSummaryInformation();
    } else if (MimeTypeConfig.MIME_MS_POWERPOINT.equals(mimeType)) {
        si = new PowerPointExtractor(fs).getSummaryInformation();
    }

    if (si != null) {
        md.setTitle(si.getTitle());
        md.setSubject(si.getSubject());
        md.setAuthor(si.getAuthor());
        md.setLastAuthor(si.getLastAuthor());
        md.setKeywords(si.getKeywords());
        md.setComments(si.getComments());
        md.setTemplate(si.getTemplate());
        md.setRevNumber(si.getRevNumber());
        md.setApplicationName(si.getApplicationName());
        md.setEditTime(si.getEditTime());
        md.setPageCount(si.getPageCount());
        md.setWordCount(si.getWordCount());
        md.setCharCount(si.getCharCount());
        md.setSecurity(si.getSecurity());

        // A document may lack any of these dates; Calendar.setTime(null)
        // would throw, so each one is only set when it is present.
        Calendar createDateTime = toCalendarOrNull(si.getCreateDateTime());
        if (createDateTime != null) {
            md.setCreateDateTime(createDateTime);
        }

        Calendar lastSaveDateTime = toCalendarOrNull(si.getLastSaveDateTime());
        if (lastSaveDateTime != null) {
            md.setLastSaveDateTime(lastSaveDateTime);
        }

        Calendar lastPrinted = toCalendarOrNull(si.getLastPrinted());
        if (lastPrinted != null) {
            md.setLastPrinted(lastPrinted);
        }
    }

    log.info("officeExtractor: {}", md);
    return md;
}

/** Converts a date to a {@link Calendar}, or returns {@code null} for a {@code null} date. */
private static Calendar toCalendarOrNull(java.util.Date date) {
    if (date == null) {
        return null;
    }
    Calendar calendar = Calendar.getInstance();
    calendar.setTime(date);
    return calendar;
}
示例11: wordCountNew
import org.apache.poi.hwpf.extractor.WordExtractor; //导入依赖的package包/类
/**
 * Counts the characters of a Word document after stripping control
 * characters and field markup from every paragraph.
 *
 * @param doc     path of the Word document to read
 * @param isDebug when {@code true}, CR/LF characters are kept in the text and
 *                the cleaned text, the count and the elapsed time are printed
 *                to standard output
 * @return a two-element array: {@code [characterCount, elapsedMillis]}
 * @throws Exception if the file cannot be opened or parsed
 */
static int[] wordCountNew(String doc, boolean isDebug) throws Exception {
    long time = System.currentTimeMillis();
    int cnt = 0;
    StringBuilder builder = new StringBuilder();

    // try-with-resources: the file handle is released even if parsing fails.
    try (InputStream is = new FileInputStream(new File(doc))) {
        WordExtractor ex = new WordExtractor(is);
        for (String text : ex.getParagraphText()) {
            if (isDebug) {
                text = trimAllChars(text, new char[] { '\u0007', '\f', '\b', '\u0015' });
            } else {
                text = trimAllChars(text, new char[] { '\u0007', '\f', '\b', '\u0015', '\r', '\n' });
            }

            // Drop a leading table-of-contents field instruction.
            String prefix = " TOC \\o \\u \u0014";
            if (text.startsWith(prefix)) {
                text = text.substring(prefix.length());
            }

            // Remove embedded-object fields: U+0013 ... U+0014 U+0001
            // (e.g. an "EMBED Visio.Drawing.11" or "EMBED Word.Document.12" field).
            int start = text.indexOf("\u0013");
            int end = text.indexOf("\u0014\u0001");
            if (start >= 0 && end > start) {
                text = text.replaceAll("\u0013[^\u0014\u0001]+\u0014\u0001", "");
            }

            // Remove the instruction part (U+0013 ... U+0014) of remaining fields.
            text = text.replaceAll("\u0013[^\u0014\u0013]+\u0014", "");

            String flag = "\u0013 HYPERLINK";
            int pos = text.indexOf(flag);
            if (pos >= 0) {
                String[] arr = text.split(" \u0014");
                // Guard: without a separator there is no second piece, and
                // indexing arr[1] would throw ArrayIndexOutOfBoundsException.
                if (arr.length > 1) {
                    text = text.substring(0, pos) + arr[1];
                }
            }

            if (text.length() >= 767) {
                // Original note (translated): for the Word .doc format, a run of
                // more than 767 consecutive characters is not counted. The code
                // removes every run of 767 or more spaces.
                text = text.replaceAll(" {767,}", "");
            }

            if (isDebug) {
                builder.append(text);
            }
            cnt += text.length();
        }
    }

    int t = (int) (System.currentTimeMillis() - time);
    if (isDebug) {
        System.out.println(builder.toString());
        System.out.println(cnt);
        System.out.println(t + " ms");
    }
    return new int[] { cnt, t };
}
示例12: doc2text
import org.apache.poi.hwpf.extractor.WordExtractor; //导入依赖的package包/类
/**
 * Extracts the full text of a Word document read from a stream.
 *
 * @param is stream containing the Word document
 * @return the text reported by {@link WordExtractor#getText()}
 * @throws IOException if the document cannot be read
 */
public String doc2text(InputStream is) throws IOException {
    // try-with-resources closes the extractor even when getText() throws.
    try (WordExtractor wd = new WordExtractor(is)) {
        return wd.getText();
    }
}
示例13: getText
import org.apache.poi.hwpf.extractor.WordExtractor; //导入依赖的package包/类
/**
 * Extract text from a word 97-2003 document.
 *
 * Returns {@code null} when the file is rejected by {@code isFileTooLarge},
 * has a length of zero or less, or yields only blank text. Errors are logged
 * and rethrown; the input stream is handed to {@code closeInputStream} in
 * all cases.
 *
 * @throws Exception
 *
 * @see edu.ur.ir.index.FileTextExtractor#getText(java.io.File)
 */
public String getText(File f) throws Exception {
    if (isFileTooLarge(f) || f.length() <= 0L) {
        return null;
    }

    String text = null;
    FileInputStream inputStream = null;
    try {
        inputStream = new FileInputStream(f);
        WordExtractor wordExtractor = new WordExtractor(new HWPFDocument(inputStream));
        String extracted = wordExtractor.getText();

        boolean blank = extracted == null || extracted.trim().equals("");
        if (!blank) {
            text = extracted;
        }
    } catch (OutOfMemoryError oome) {
        log.error("could not extract text", oome);
        throw oome;
    } catch (Exception e) {
        log.error("could not get text for word document " + f.getAbsolutePath(), e);
        throw e;
    } finally {
        closeInputStream(inputStream);
    }
    return text;
}