本文整理汇总了Java中org.apache.tika.sax.XHTMLContentHandler类的典型用法代码示例。如果您正苦于以下问题:Java XHTMLContentHandler类的具体用法?Java XHTMLContentHandler怎么用?Java XHTMLContentHandler使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。
XHTMLContentHandler类属于org.apache.tika.sax包,在下文中一共展示了XHTMLContentHandler类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: buildXHTML
import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Writes the document to the given handler in a fixed sequence: headers
 * (when a header/footer policy exists), the body elements, footers (again
 * only when a policy exists) and finally, if the
 * {@code ENABLE_IMAGE_OCR} configuration flag is set, the output of
 * {@code extractImageText}.
 *
 * @param xhtml handler that receives the generated content
 * @throws SAXException propagated from the extraction helpers
 * @throws XmlException propagated from the extraction helpers
 * @throws IOException propagated from the extraction helpers
 * @see org.apache.poi.xwpf.extractor.XWPFWordExtractor#getText()
 */
@Override
protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
        XmlException, IOException {
    final XWPFHeaderFooterPolicy policy = document.getHeaderFooterPolicy();
    final boolean hasPolicy = policy != null;

    // Headers come first, but only when the document defines a policy.
    if (hasPolicy) {
        extractHeaders(xhtml, policy);
    }

    // Body elements are processed in the order in which they occur.
    extractIBodyText(document, xhtml);

    // Footers follow the body, under the same policy condition as headers.
    if (hasPolicy) {
        extractFooters(xhtml, policy);
    }

    // Image text extraction is opt-in through the configuration.
    if (Config.inst().getProp(ConfigBool.ENABLE_IMAGE_OCR)) {
        extractImageText(xhtml);
    }
}
示例2: extractHeaders
import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Forwards every header exposed by the policy to {@code extractHeaderText},
 * in the order first-page, even-page, default. Headers that the policy
 * reports as {@code null} are skipped, and a {@code null} policy results in
 * no output at all.
 *
 * @param xhtml handler that receives the header content
 * @param hfPolicy header/footer policy of the document, may be {@code null}
 * @throws SAXException propagated from {@code extractHeaderText}
 * @throws XmlException propagated from {@code extractHeaderText}
 * @throws IOException propagated from {@code extractHeaderText}
 */
private void extractHeaders(XHTMLContentHandler xhtml,
        XWPFHeaderFooterPolicy hfPolicy) throws SAXException, XmlException,
        IOException {
    if (hfPolicy != null) {
        if (hfPolicy.getFirstPageHeader() != null) {
            extractHeaderText(xhtml, hfPolicy.getFirstPageHeader());
        }

        if (hfPolicy.getEvenPageHeader() != null) {
            extractHeaderText(xhtml, hfPolicy.getEvenPageHeader());
        }

        if (hfPolicy.getDefaultHeader() != null) {
            extractHeaderText(xhtml, hfPolicy.getDefaultHeader());
        }
    }
}
示例3: parse
import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Runs the image and Jempbox metadata extractors over the stream, filling
 * {@code metadata}, and then emits a document through the handler that
 * consists only of the start/end document events.
 *
 * @param stream input to read; wrapped in a {@code TikaInputStream}
 * @param handler receiver of the (content-free) XHTML document events
 * @param metadata metadata object handed to both extractors
 * @param context parse context; not used by this method
 * @throws IOException propagated from stream handling or the extractors
 * @throws SAXException propagated from the XHTML handler
 * @throws TikaException propagated from the extractors
 */
public void parse(
        InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    final TemporaryResources temporaryResources = new TemporaryResources();
    try {
        final TikaInputStream tikaStream = TikaInputStream.get(stream, temporaryResources);

        // The JPEG extractor works on a File, the Jempbox extractor on the stream.
        final ImageMetadataExtractor imageExtractor = new ImageMetadataExtractor(metadata);
        imageExtractor.parseJpeg(tikaStream.getFile());

        final JempboxExtractor jempboxExtractor = new JempboxExtractor(metadata);
        jempboxExtractor.parse(tikaStream);
    } finally {
        // Always release whatever was registered with the temporary resources.
        temporaryResources.dispose();
    }

    final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();
    document.endDocument();
}
示例4: extractOutput
import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Copies the characters of the given stream, decoded as UTF-8, into a
 * single {@code div} element of a new document on the XHTML content
 * handler. The stream is closed when the copy loop finishes or fails.
 *
 * @param stream stream holding the text to copy (the OCR result,
 *               according to the original documentation)
 * @param xhtml XHTML content handler that receives the text
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws IOException if an input error occurred
 */
private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {
    xhtml.startDocument();
    xhtml.startElement("div");

    try (Reader in = new InputStreamReader(stream, UTF_8)) {
        final char[] chunk = new char[1024];
        int count;
        while ((count = in.read(chunk)) != -1) {
            // A zero-length read carries no data; only forward real content.
            if (count > 0) {
                xhtml.characters(chunk, 0, count);
            }
        }
    }

    xhtml.endElement("div");
    xhtml.endDocument();
}
示例5: extractImageText
import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Feeds the pictures of the slide show to a {@code TikaImageHelper} and
 * then asks the helper to write its text to the handler. Does nothing
 * unless the {@code ENABLE_IMAGE_OCR} configuration flag is set.
 *
 * <p>This is a best-effort step: any exception is reported on standard
 * error and otherwise ignored, so a failure here never aborts the
 * surrounding parse. The helper is always closed.
 *
 * @param xhtml handler that receives the text produced by the helper
 * @param document slide show whose pictures are processed
 */
private void extractImageText(XHTMLContentHandler xhtml,
        HSLFSlideShow document) {
    if (!Config.inst().getProp(ConfigBool.ENABLE_IMAGE_OCR)) {
        return;
    }

    TikaImageHelper helper = new TikaImageHelper(metadata);
    try {
        for (PictureData picture : document.getPictures()) {
            java.awt.image.BufferedImage image =
                    ImageIO.read(new ByteArrayInputStream(picture.getData()));
            // ImageIO.read returns null when no registered reader can decode
            // the data; skip such pictures instead of passing null along.
            if (image != null) {
                helper.addImage(image);
            }
        }
        // TODO: find out page number
        helper.addTextToHandler(xhtml);
    } catch (Exception e) {
        // Deliberately best-effort: report the problem and carry on.
        e.printStackTrace();
    } finally {
        // helper is assigned from "new" above, so it can never be null here.
        helper.close();
    }
}
示例6: extractMaster
import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Writes the text of the non-placeholder text shapes of a master sheet as
 * {@code p} elements inside a {@code div} with class
 * {@code slide-master-content}. Nothing is written when the master is
 * {@code null} or has no shapes.
 *
 * @param xhtml handler that receives the generated elements
 * @param master master sheet to read, may be {@code null}
 * @throws SAXException propagated from the XHTML handler
 */
private void extractMaster(XHTMLContentHandler xhtml, MasterSheet master)
        throws SAXException {
    if (master == null) {
        return;
    }

    final Shape[] masterShapes = master.getShapes();
    if (masterShapes == null || masterShapes.length == 0) {
        return;
    }

    xhtml.startElement("div", "class", "slide-master-content");
    for (Shape shape : masterShapes) {
        // Only non-null, non-placeholder text shapes contribute output.
        if (shape == null || MasterSheet.isPlaceholder(shape)
                || !(shape instanceof TextShape)) {
            continue;
        }

        final String content = ((TextShape) shape).getText();
        if (content != null) {
            xhtml.element("p", content);
        }
    }
    xhtml.endElement("div");
}
示例7: extractTableText
import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Renders the given table shape as a {@code table} element with one
 * {@code tr} per row and one {@code td} per column. A missing
 * ({@code null}) cell is written as an empty string.
 *
 * @param xhtml handler that receives the table markup
 * @param shape table whose cells are written
 * @throws SAXException propagated from the XHTML handler
 */
private void extractTableText(XHTMLContentHandler xhtml, Table shape)
        throws SAXException {
    xhtml.startElement("table");
    for (int rowIndex = 0; rowIndex < shape.getNumberOfRows(); rowIndex++) {
        xhtml.startElement("tr");
        for (int colIndex = 0; colIndex < shape.getNumberOfColumns(); colIndex++) {
            final TableCell cell = shape.getCell(rowIndex, colIndex);
            // Keep the column layout intact by emitting "" for absent cells.
            final String cellText = (cell == null) ? "" : cell.getText();
            xhtml.element("td", cellText);
        }
        xhtml.endElement("tr");
    }
    xhtml.endElement("table");
}
示例8: textRunsToText
import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Writes the text of each text run to the handler, following every run
 * with an empty {@code br} element. {@code null} arrays, {@code null}
 * runs and runs whose text is {@code null} are skipped.
 *
 * @param xhtml handler that receives the text and line breaks
 * @param runs text runs to write, may be {@code null}
 * @throws SAXException propagated from the XHTML handler
 */
private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs)
        throws SAXException {
    if (runs == null) {
        return;
    }

    for (TextRun textRun : runs) {
        if (textRun == null) {
            continue;
        }

        // Leaving in wisdom from TIKA-712 for easy revert.
        // Avoid boiler-plate text on the master slide (0
        // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE):
        // if (!isMaster || (run.getRunType() != 0 && run.getRunType()
        // != 1)) {
        final String content = textRun.getText();
        if (content == null) {
            continue;
        }

        xhtml.characters(content);
        xhtml.startElement("br");
        xhtml.endElement("br");
    }
}
示例9: handleEmbeddedResource
import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Builds a fresh {@code Metadata} object from the non-null arguments and,
 * if the embedded-document extractor accepts it, has the extractor parse
 * the resource. The resource stream is closed in every case.
 *
 * @param resource stream of the embedded resource; closed by this method
 * @param filename name of the resource, may be {@code null}
 * @param relationshipID relationship id of the resource, may be {@code null}
 * @param mediaType content type of the resource, may be {@code null}
 * @param xhtml handler passed on to the extractor
 * @param outputHtml flag passed on to the extractor unchanged
 * @throws IOException propagated from the extractor or from closing the stream
 * @throws SAXException propagated from the extractor
 * @throws TikaException declared by the original signature
 */
protected void handleEmbeddedResource(TikaInputStream resource, String filename,
        String relationshipID, String mediaType, XHTMLContentHandler xhtml,
        boolean outputHtml)
        throws IOException, SAXException, TikaException {
    try {
        final Metadata embeddedMetadata = new Metadata();

        // The file name is recorded under two metadata keys.
        if (filename != null) {
            embeddedMetadata.set(Metadata.TIKA_MIME_FILE, filename);
            embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, filename);
        }
        if (relationshipID != null) {
            embeddedMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, relationshipID);
        }
        if (mediaType != null) {
            embeddedMetadata.set(Metadata.CONTENT_TYPE, mediaType);
        }

        final boolean wanted = extractor.shouldParseEmbedded(embeddedMetadata);
        if (wanted) {
            extractor.parseEmbedded(resource, xhtml, embeddedMetadata, outputHtml);
        }
    } finally {
        resource.close();
    }
}
示例10: handleHeaderFooter
import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Writes the non-blank paragraphs of the given ranges inside a {@code div}
 * whose class attribute is {@code type}. Nothing is written when
 * {@code countParagraphs} reports no paragraphs for the ranges.
 *
 * @param ranges ranges to scan; {@code null} entries are skipped
 * @param type value of the {@code class} attribute of the wrapping div
 * @param document document passed on to {@code handleParagraph}
 * @param pictures pictures source passed on to {@code handleParagraph}
 * @param pictureTable picture table passed on to {@code handleParagraph}
 * @param xhtml handler that receives the output
 * @throws SAXException propagated from the handler or {@code handleParagraph}
 * @throws IOException propagated from {@code handleParagraph}
 * @throws TikaException propagated from {@code handleParagraph}
 */
private void handleHeaderFooter(Range[] ranges, String type,
        HWPFDocument document, PicturesSource pictures,
        PicturesTable pictureTable, XHTMLContentHandler xhtml)
        throws SAXException, IOException, TikaException {
    if (countParagraphs(ranges) <= 0) {
        return;
    }

    xhtml.startElement("div", "class", type);
    for (Range range : ranges) {
        if (range == null) {
            continue;
        }
        for (int index = 0; index < range.numParagraphs(); index++) {
            final Paragraph paragraph = range.getParagraph(index);
            final String stripped = paragraph.text().replaceAll("[\\r\\n\\s]+", "");
            // Empty header or footer paragraphs are skipped.
            if (!stripped.isEmpty()) {
                // The value returned by handleParagraph advances the index
                // in addition to the loop's own increment.
                index += handleParagraph(paragraph, 0, range, document,
                        FieldsDocumentPart.HEADER, pictures, pictureTable, xhtml);
            }
        }
    }
    xhtml.endElement("div");
}
示例11: parseWord6
import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Opens the given directory node as an old-format Word document and writes
 * every paragraph text returned by a {@code Word6Extractor} as a {@code p}
 * element.
 *
 * @param root directory node holding the document
 * @param xhtml handler that receives the paragraphs
 * @throws IOException propagated from opening the document
 * @throws SAXException propagated from the XHTML handler
 * @throws TikaException declared by the original signature
 */
protected void parseWord6(DirectoryNode root, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {
    final HWPFOldDocument oldDocument = new HWPFOldDocument(root);
    final Word6Extractor word6Extractor = new Word6Extractor(oldDocument);

    for (String paragraphText : word6Extractor.getParagraphText()) {
        xhtml.element("p", paragraphText);
    }
}
示例12: extractIBodyText
import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Walks the body elements of the given container in order and dispatches
 * each one by type: paragraphs to {@code extractParagraph}, tables to
 * {@code extractTable} and structured document tags to {@code extractSDT}.
 * Elements of any other type are ignored.
 *
 * @param bodyElement container whose body elements are processed
 * @param xhtml handler that receives the output
 * @throws SAXException propagated from the extraction helpers
 * @throws XmlException propagated from the extraction helpers
 * @throws IOException propagated from the extraction helpers
 */
private void extractIBodyText(IBody bodyElement, XHTMLContentHandler xhtml)
        throws SAXException, XmlException, IOException {
    for (IBodyElement child : bodyElement.getBodyElements()) {
        if (child instanceof XWPFParagraph) {
            extractParagraph((XWPFParagraph) child, xhtml);
        } else if (child instanceof XWPFTable) {
            extractTable((XWPFTable) child, xhtml);
        } else if (child instanceof XWPFSDT) {
            extractSDT((XWPFSDT) child, xhtml);
        }
    }
}
示例13: extractImageText
import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Feeds all pictures of the document to a {@code TikaImageHelper} and then
 * asks the helper to write its text to the handler. Does nothing unless
 * the {@code ENABLE_IMAGE_OCR} configuration flag is set.
 *
 * <p>This is a best-effort step: any exception is reported on standard
 * error and otherwise ignored, so a failure here never aborts the
 * surrounding parse. The helper is always closed.
 *
 * @param xhtml handler that receives the text produced by the helper
 */
private void extractImageText(XHTMLContentHandler xhtml) {
    if (!Config.inst().getProp(ConfigBool.ENABLE_IMAGE_OCR)) {
        return;
    }

    TikaImageHelper helper = new TikaImageHelper(metadata);
    try {
        List<XWPFPictureData> pictures = document.getAllPictures();
        for (XWPFPictureData picture : pictures) {
            java.awt.image.BufferedImage image =
                    ImageIO.read(new ByteArrayInputStream(picture.getData()));
            // ImageIO.read returns null when no registered reader can decode
            // the data; skip such pictures instead of passing null along.
            if (image != null) {
                helper.addImage(image);
            }
        }
        // TODO: find out page number
        helper.addTextToHandler(xhtml);
    } catch (Exception e) {
        // Deliberately best-effort: report the problem and carry on.
        e.printStackTrace();
    } finally {
        // helper is assigned from "new" above, so it can never be null here.
        helper.close();
    }
}
示例14: extractTable
import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Writes the given table as {@code table}/{@code tbody}/{@code tr}/{@code td}
 * markup. The content of each cell is produced by calling
 * {@code extractIBodyText} on the cell.
 *
 * @param table table to write
 * @param xhtml handler that receives the markup
 * @throws SAXException propagated from the handler or the cell extraction
 * @throws XmlException propagated from the cell extraction
 * @throws IOException propagated from the cell extraction
 */
private void extractTable(XWPFTable table, XHTMLContentHandler xhtml)
        throws SAXException, XmlException, IOException {
    xhtml.startElement("table");
    xhtml.startElement("tbody");

    for (XWPFTableRow tableRow : table.getRows()) {
        xhtml.startElement("tr");
        for (XWPFTableCell tableCell : tableRow.getTableCells()) {
            xhtml.startElement("td");
            // The cell is passed to extractIBodyText to render its content.
            extractIBodyText(tableCell, xhtml);
            xhtml.endElement("td");
        }
        xhtml.endElement("tr");
    }

    xhtml.endElement("tbody");
    xhtml.endElement("table");
}
示例15: extractFooters
import org.apache.tika.sax.XHTMLContentHandler; //导入依赖的package包/类
/**
 * Forwards every footer exposed by the policy to {@code extractHeaderText}
 * (the same helper is used for headers and footers), in the order
 * first-page, even-page, default. Footers that the policy reports as
 * {@code null} are skipped, and a {@code null} policy results in no output,
 * matching the behaviour of {@code extractHeaders}.
 *
 * @param xhtml handler that receives the footer content
 * @param hfPolicy header/footer policy of the document, may be {@code null}
 * @throws SAXException propagated from {@code extractHeaderText}
 * @throws XmlException propagated from {@code extractHeaderText}
 * @throws IOException propagated from {@code extractHeaderText}
 */
private void extractFooters(XHTMLContentHandler xhtml,
        XWPFHeaderFooterPolicy hfPolicy) throws SAXException, XmlException,
        IOException {
    // Guard against a missing policy, as extractHeaders does, so this method
    // is safe to call without a prior null check.
    if (hfPolicy == null) {
        return;
    }
    // footers
    if (hfPolicy.getFirstPageFooter() != null) {
        extractHeaderText(xhtml, hfPolicy.getFirstPageFooter());
    }
    if (hfPolicy.getEvenPageFooter() != null) {
        extractHeaderText(xhtml, hfPolicy.getEvenPageFooter());
    }
    if (hfPolicy.getDefaultFooter() != null) {
        extractHeaderText(xhtml, hfPolicy.getDefaultFooter());
    }
}