本文整理汇总了Java中org.apache.poi.hwpf.extractor.WordExtractor.getParagraphText方法的典型用法代码示例。如果您正苦于以下问题：Java WordExtractor.getParagraphText方法的具体用法？Java WordExtractor.getParagraphText怎么用？Java WordExtractor.getParagraphText使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.poi.hwpf.extractor.WordExtractor的用法示例。
在下文中一共展示了WordExtractor.getParagraphText方法的4个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: readDoc
import org.apache.poi.hwpf.extractor.WordExtractor; //导入方法依赖的package包/类
/**
 * Reads a Word (.doc) file and returns the text of all its paragraphs, concatenated
 * in document order with no extra separator.
 *
 * <p>This is a best-effort read: any exception is printed to standard error and the
 * text collected so far (possibly the empty string) is returned instead of propagating
 * the failure.
 *
 * @param path location of the Word document on disk
 * @return the concatenated paragraph text, or whatever was read before a failure
 */
private String readDoc(String path) {
    // StringBuilder avoids re-copying the whole text on every paragraph.
    StringBuilder content = new StringBuilder();
    FileInputStream fis = null;
    try {
        File file = new File(path);
        fis = new FileInputStream(file.getAbsolutePath());
        HWPFDocument doc = new HWPFDocument(fis);
        WordExtractor we = new WordExtractor(doc);
        for (String para : we.getParagraphText()) {
            content.append(para);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Close in finally so the file handle is released even when parsing fails.
        if (fis != null) {
            try {
                fis.close();
            } catch (Exception closeError) {
                closeError.printStackTrace();
            }
        }
    }
    return content.toString();
}
示例2: getUnfilteredTextContent
import org.apache.poi.hwpf.extractor.WordExtractor; //导入方法依赖的package包/类
/**
 * {@inheritDoc}
 * <p>
 * Parses the bytes returned by {@code getContent()} as a Word document, joins the
 * extracted paragraphs with {@code StringUtils.CR}, optionally inserts additional
 * breaks after dots and/or in place of whitespace runs, and finally rewrites all
 * line breaks to {@code "\n"}.
 *
 * @throws ConQATException if the Word document cannot be read
 */
@Override
public String getUnfilteredTextContent() throws ConQATException {
    InputStream contentStream = new ByteArrayInputStream(getContent());
    String[] paragraphTexts;
    try {
        paragraphTexts = new WordExtractor(contentStream).getParagraphText();
    } catch (IOException ioError) {
        String message = "Had an error while reading word document: "
                + ioError.getMessage();
        throw new ConQATException(message, ioError);
    } finally {
        FileSystemUtils.close(contentStream);
    }

    String joined = StringUtils.concat(paragraphTexts, StringUtils.CR);

    // Dot wrapping runs first so that the whitespace pass also sees the breaks it added.
    String afterDots = wrapAtDot
            ? joined.replaceAll("[.]", "." + StringUtils.CR)
            : joined;
    String afterWhitespace = wrapAtWhitespace
            ? afterDots.replaceAll("\\s+", StringUtils.CR)
            : afterDots;

    // The method's contract asks for normalized line breaks.
    return StringUtils.replaceLineBreaks(afterWhitespace, "\n");
}
示例3: writePDFFromDoc
import org.apache.poi.hwpf.extractor.WordExtractor; //导入方法依赖的package包/类
/**
 * Writes the text of a Word (.doc) file into a PDF file, adding one PDF paragraph
 * per extracted Word paragraph. Formatting is not carried over; only the text is.
 *
 * <p>Each paragraph's length and text are also printed to standard output.
 *
 * <p>Any exception is handed to {@code SSServErrReg.regErrThrow}. The PDF output
 * stream is closed on every path so a failed conversion does not leak the handle.
 *
 * @param docFilePath path of the Word document to read
 * @param pdfFilePath path of the PDF file to create or overwrite
 * @throws SSErr as raised by the error registry when the conversion fails
 */
public static void writePDFFromDoc(
    final String docFilePath,
    final String pdfFilePath) throws SSErr {

    try {
        final Document document = new Document();
        final POIFSFileSystem fs = new POIFSFileSystem(openFileForRead(docFilePath));
        final HWPFDocument word = new HWPFDocument(fs);
        final WordExtractor we = new WordExtractor(word);
        final OutputStream out = openOrCreateFileWithPathForWrite(pdfFilePath);

        try {
            final PdfWriter writer = PdfWriter.getInstance(document, out);

            document.open();
            writer.setPageEmpty(true);
            document.newPage();
            writer.setPageEmpty(true);

            final String[] paragraphs = we.getParagraphText();
            for (int i = 0; i < paragraphs.length; i++) {
                // Strip the paragraph terminator: optional control-M, optional CR, then LF.
                paragraphs[i] = paragraphs[i].replaceAll("\\cM?\r?\n", "");

                System.out.println("Length:" + paragraphs[i].length());
                System.out.println("Paragraph" + i + ": " + paragraphs[i]);

                // add the paragraph to the document
                document.add(new Paragraph(paragraphs[i]));
            }

            document.close();
        } finally {
            // Release the file handle even when PDF generation fails part-way.
            if (out != null) {
                out.close();
            }
        }
    } catch (Exception error) {
        SSServErrReg.regErrThrow(error);
    }
}
示例4: wordCountNew
import org.apache.poi.hwpf.extractor.WordExtractor; //导入方法依赖的package包/类
/**
 * Counts the characters of a Word (.doc) file after stripping control characters
 * and field markup from each extracted paragraph.
 *
 * <p>Per paragraph the method removes, in this order: bell, form-feed, backspace
 * and 0x15 characters (plus CR and LF unless {@code isDebug} is set), a leading
 * table-of-contents field prefix, field instructions delimited by the 0x13 / 0x14
 * (optionally followed by 0x01) markers, the instruction part of a HYPERLINK field,
 * and runs of 767 or more spaces. The remaining lengths are summed.
 *
 * <p>Note that in debug mode CR and LF are kept and therefore contribute to the count.
 *
 * @param doc     path of the Word document
 * @param isDebug when true, keeps line breaks and prints the cleaned text, the count
 *                and the elapsed time to standard output
 * @return a two-element array: {@code [0]} the character count, {@code [1]} the
 *         elapsed time in milliseconds
 * @throws Exception if the file cannot be opened or parsed
 */
static int[] wordCountNew(String doc, boolean isDebug) throws Exception {
    long time = System.currentTimeMillis();
    final String tocPrefix = " TOC \\o \\u \u0014";
    final String hyperlinkFlag = "\u0013 HYPERLINK";

    InputStream is = new FileInputStream(new File(doc));
    try {
        WordExtractor ex = new WordExtractor(is);
        int cnt = 0;
        StringBuilder builder = new StringBuilder();

        for (String text : ex.getParagraphText()) {
            if (isDebug) {
                text = trimAllChars(text, new char[] { '\u0007', '\f', '\b', '\u0015' });
            } else {
                text = trimAllChars(text, new char[] { '\u0007', '\f', '\b', '\u0015', '\r', '\n' });
            }

            if (text.startsWith(tocPrefix)) {
                text = text.substring(tocPrefix.length());
            }

            // Embedded-object fields look like: 0x13 " EMBED Visio.Drawing.11 " 0x14 0x01
            int start = text.indexOf("\u0013");
            int end = text.indexOf("\u0014\u0001");
            if (start >= 0 && end > start) {
                text = text.replaceAll("\u0013[^\u0014\u0001]+\u0014\u0001", "");
            }

            // Remaining simple fields: drop everything from field-begin to field-separator.
            text = text.replaceAll("\u0013[^\u0014\u0013]+\u0014", "");

            int pos = text.indexOf(hyperlinkFlag);
            if (pos >= 0) {
                String[] arr = text.split(" \u0014");
                // Without a separator there is no second piece; leave the text untouched
                // instead of failing with ArrayIndexOutOfBoundsException.
                if (arr.length > 1) {
                    text = text.substring(0, pos) + arr[1];
                }
            }

            if (text.length() >= 767) {
                // Original note: in .doc format, when a run of consecutive characters is
                // longer than 767 (i.e. 768 or more), it is not counted. The code applies
                // this to runs of spaces only.
                text = text.replaceAll(" {767,}", "");
            }

            if (isDebug) {
                builder.append(text);
            }
            cnt += text.length();
        }

        int t = (int) (System.currentTimeMillis() - time);
        if (isDebug) {
            System.out.println(builder.toString());
            System.out.println(cnt);
            System.out.println(t + " ms");
        }
        return new int[] { cnt, t };
    } finally {
        // Release the file handle on both success and failure.
        is.close();
    }
}