本文整理汇总了Java中org.apache.tika.sax.XHTMLContentHandler.startElement方法的典型用法代码示例。如果您正苦于以下问题：Java XHTMLContentHandler.startElement方法的具体用法？Java XHTMLContentHandler.startElement怎么用？Java XHTMLContentHandler.startElement使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.tika.sax.XHTMLContentHandler的用法示例。
在下文中一共展示了XHTMLContentHandler.startElement方法的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: extractOutput
import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Copies the text read from {@code stream} into {@code xhtml} as the content
 * of a single {@code div} element wrapped in its own document. The bytes are
 * decoded as UTF-8, and the reader wrapping the stream is closed as soon as
 * the copy loop finishes or fails.
 *
 * @param stream stream holding the OCR result
 * @param xhtml XHTML content handler that receives the SAX events
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws IOException if reading from the stream fails
 */
private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {
    xhtml.startDocument();
    xhtml.startElement("div");
    try (Reader in = new InputStreamReader(stream, UTF_8)) {
        final char[] chunk = new char[1024];
        int count;
        while ((count = in.read(chunk)) != -1) {
            // Nothing to forward when the read produced no characters.
            if (count > 0) {
                xhtml.characters(chunk, 0, count);
            }
        }
    }
    xhtml.endElement("div");
    xhtml.endDocument();
}
示例2: extractMaster
import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Writes the text of every non-placeholder {@link TextShape} found on the
 * given master sheet as {@code p} elements inside a
 * {@code <div class="slide-master-content">}. Nothing is emitted when the
 * master is {@code null} or has no shapes.
 *
 * @param xhtml XHTML content handler that receives the SAX events
 * @param master master sheet to read shapes from; may be {@code null}
 * @throws SAXException if the XHTML SAX events could not be handled
 */
private void extractMaster(XHTMLContentHandler xhtml, MasterSheet master)
        throws SAXException {
    if (master == null) {
        return;
    }
    final Shape[] masterShapes = master.getShapes();
    if (masterShapes == null || masterShapes.length == 0) {
        return;
    }

    xhtml.startElement("div", "class", "slide-master-content");
    for (Shape candidate : masterShapes) {
        // Skip missing entries and placeholder shapes.
        if (candidate == null || MasterSheet.isPlaceholder(candidate)) {
            continue;
        }
        // Only text shapes carry text to output.
        if (!(candidate instanceof TextShape)) {
            continue;
        }
        String content = ((TextShape) candidate).getText();
        if (content != null) {
            xhtml.element("p", content);
        }
    }
    xhtml.endElement("div");
}
示例3: extractTableText
import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Renders a table shape as an XHTML {@code table}: one {@code tr} per row
 * and one {@code td} per column, containing the text of the cell.
 *
 * @param xhtml XHTML content handler that receives the SAX events
 * @param shape table whose cells are written out
 * @throws SAXException if the XHTML SAX events could not be handled
 */
private void extractTableText(XHTMLContentHandler xhtml, Table shape)
        throws SAXException {
    xhtml.startElement("table");
    int rowIndex = 0;
    while (rowIndex < shape.getNumberOfRows()) {
        xhtml.startElement("tr");
        int colIndex = 0;
        while (colIndex < shape.getNumberOfColumns()) {
            TableCell current = shape.getCell(rowIndex, colIndex);
            // A missing cell still produces a td, with an empty string as its text.
            xhtml.element("td", current == null ? "" : current.getText());
            colIndex++;
        }
        xhtml.endElement("tr");
        rowIndex++;
    }
    xhtml.endElement("table");
}
示例4: textRunsToText
import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Writes the text of each non-null text run as character data followed by
 * an empty {@code br} element. Runs that are {@code null} or have no text
 * are skipped.
 *
 * @param xhtml XHTML content handler that receives the SAX events
 * @param runs text runs to output; may be {@code null}
 * @throws SAXException if the XHTML SAX events could not be handled
 */
private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs)
        throws SAXException {
    if (runs == null) {
        return;
    }
    for (int index = 0; index < runs.length; index++) {
        TextRun current = runs[index];
        if (current == null) {
            continue;
        }
        // Leaving in wisdom from TIKA-712 for easy revert.
        // Avoid boiler-plate text on the master slide (0
        // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE):
        // if (!isMaster || (run.getRunType() != 0 && run.getRunType()
        // != 1)) {
        String content = current.getText();
        if (content == null) {
            continue;
        }
        xhtml.characters(content);
        // Separate consecutive runs with a line break.
        xhtml.startElement("br");
        xhtml.endElement("br");
    }
}
示例5: handleHeaderFooter
import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Writes the non-blank paragraphs of the given header/footer ranges into a
 * {@code div} whose {@code class} attribute is {@code type}. Nothing is
 * emitted when {@code countParagraphs(ranges)} is not positive.
 *
 * <p>Every paragraph is handed to {@code handleParagraph} with
 * {@code FieldsDocumentPart.HEADER}, regardless of {@code type}.
 *
 * @param ranges header or footer ranges; individual entries may be {@code null}
 * @param type value used for the {@code class} attribute of the wrapping div
 * @param document document passed through to {@code handleParagraph}
 * @param pictures pictures source passed through to {@code handleParagraph}
 * @param pictureTable pictures table passed through to {@code handleParagraph}
 * @param xhtml XHTML content handler that receives the SAX events
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws IOException declared for the delegated paragraph handling
 * @throws TikaException declared for the delegated paragraph handling
 */
private void handleHeaderFooter(Range[] ranges, String type,
        HWPFDocument document, PicturesSource pictures,
        PicturesTable pictureTable, XHTMLContentHandler xhtml)
        throws SAXException, IOException, TikaException {
    if (!(countParagraphs(ranges) > 0)) {
        return;
    }

    xhtml.startElement("div", "class", type);
    for (Range range : ranges) {
        if (range == null) {
            continue;
        }
        int index = 0;
        while (index < range.numParagraphs()) {
            Paragraph paragraph = range.getParagraph(index);
            String raw = paragraph.text();
            // Paragraphs consisting only of whitespace / line breaks are skipped.
            boolean blank = raw.replaceAll("[\\r\\n\\s]+", "").isEmpty();
            if (!blank) {
                // The value returned by handleParagraph is added to the index;
                // presumably the number of extra paragraphs it consumed - verify.
                index += handleParagraph(paragraph, 0, range, document,
                        FieldsDocumentPart.HEADER, pictures, pictureTable, xhtml);
            }
            index++;
        }
    }
    xhtml.endElement("div");
}
示例6: extractTable
import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Renders a Word table as {@code table > tbody > tr > td}, delegating the
 * content of each cell to {@code extractIBodyText}.
 *
 * @param table table to render
 * @param xhtml XHTML content handler that receives the SAX events
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws XmlException declared for the delegated cell extraction
 * @throws IOException declared for the delegated cell extraction
 */
private void extractTable(XWPFTable table, XHTMLContentHandler xhtml)
        throws SAXException, XmlException, IOException {
    xhtml.startElement("table");
    xhtml.startElement("tbody");
    for (XWPFTableRow tableRow : table.getRows()) {
        extractTableRow(tableRow, xhtml);
    }
    xhtml.endElement("tbody");
    xhtml.endElement("table");
}

/** Writes one table row as a {@code tr} holding one {@code td} per cell. */
private void extractTableRow(XWPFTableRow tableRow, XHTMLContentHandler xhtml)
        throws SAXException, XmlException, IOException {
    xhtml.startElement("tr");
    for (XWPFTableCell tableCell : tableRow.getTableCells()) {
        xhtml.startElement("td");
        extractIBodyText(tableCell, xhtml);
        xhtml.endElement("td");
    }
    xhtml.endElement("tr");
}
示例7: parse
import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
* Parses the given stream with the SAX parser obtained from the parse context
* and emits the result inside a single {@code p} element of an XHTML document.
* A SAXException raised during parsing is logged at info level and swallowed,
* unless {@code tagged.throwIfCauseOf(e)} rethrows it.
*
* @param stream input to parse; handed to the parser wrapped in a CloseShieldInputStream
* @param handler content handler that receives the output events
* @param metadata metadata; its content type is set to "application/xml" if absent
* @param context parse context supplying the SAX parser
* @throws IOException declared for the underlying parse call
* @throws SAXException if the XHTML SAX events could not be handled
* @throws TikaException declared for obtaining the parser / parsing
*/
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
ParseContext context) throws IOException, SAXException, TikaException {
// Only default the content type when none has been set.
if (metadata.get(Metadata.CONTENT_TYPE) == null) {
metadata.set(Metadata.CONTENT_TYPE, "application/xml");
}
final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
xhtml.startElement("p");
// Note: tagged wraps the raw handler, not the xhtml wrapper created above.
TaggedContentHandler tagged = new TaggedContentHandler(handler);
try {
// The handler chain is built by getContentHandler(...) and then wrapped in
// EmbeddedContentHandler and OfflineContentHandler before being given to the parser.
// NOTE(review): the CloseShieldInputStream presumably keeps the parser from
// closing the caller's stream - confirm.
context.getSAXParser().parse(
new CloseShieldInputStream(stream),
new OfflineContentHandler(new EmbeddedContentHandler(getContentHandler(tagged, metadata,
context))));
} catch (SAXException e) {
// Presumably rethrows when the exception came from the tagged handler
// (i.e. from the downstream consumer) - confirm against TaggedContentHandler.
tagged.throwIfCauseOf(e);
// Any other SAX failure is treated as best-effort: log it and continue so the
// "p" element and the document are still closed below.
LOG.info("XML parse error", e);
// TODO:
// throw new TikaException("XML parse error", e);
}
xhtml.endElement("p");
xhtml.endDocument();
}
示例8: extractAcroForm
import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Writes the AcroForm fields of a PDF as an ordered list inside a
 * {@code <div class="acroform">}. Nothing is emitted when the document has
 * no catalog, no AcroForm, or a {@code null} field list.
 *
 * @param pdf document whose form fields are extracted
 * @param handler XHTML content handler that receives the SAX events
 * @throws IOException declared for reading the form fields
 * @throws SAXException if the XHTML SAX events could not be handled
 */
private void extractAcroForm(PDDocument pdf, XHTMLContentHandler handler)
        throws IOException, SAXException {
    // Thank you, Ben Litchfield, for
    // org.apache.pdfbox.examples.fdf.PrintFields
    // this code derives from Ben's code
    PDDocumentCatalog catalog = pdf.getDocumentCatalog();
    if (catalog == null) {
        return;
    }
    PDAcroForm form = catalog.getAcroForm();
    if (form == null) {
        return;
    }
    // Wildcard type instead of a raw List: elements are only read as Object.
    List<?> fields = form.getFields();
    if (fields == null) {
        return;
    }
    handler.startElement("div", "class", "acroform");
    handler.startElement("ol");
    for (Object field : fields) {
        // instanceof is false for null, so no separate null check is needed.
        if (field instanceof PDField) {
            processAcroField((PDField) field, handler, 0);
        }
    }
    handler.endElement("ol");
    handler.endElement("div");
}
示例9: extractOutput
import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Reads the given stream to its end and writes the decoded characters to the
 * given XHTML content handler as a single {@code p} element inside its own
 * document. The work is done synchronously on the calling thread, and the
 * reader wrapping the stream is closed once processing finishes or fails.
 *
 * @param stream stream whose content is copied to the handler
 * @param xhtml XHTML content handler
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws IOException if an input error occurred
 */
private void extractOutput(InputStream stream, XHTMLContentHandler xhtml)
        throws SAXException, IOException {
    // NOTE(review): no charset is given, so the platform default is used for
    // decoding - confirm that this is intended for the producer of the stream.
    // try-with-resources keeps a failure in close() from masking an exception
    // thrown while copying (it is recorded as suppressed instead).
    try (Reader reader = new InputStreamReader(stream)) {
        xhtml.startDocument();
        xhtml.startElement("p");
        char[] buffer = new char[1024];
        for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
            xhtml.characters(buffer, 0, n);
        }
        xhtml.endElement("p");
        xhtml.endDocument();
    }
}
示例10: handleChildTasks
import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Recursively renders the child tasks of {@code parentTask} as a nested
 * ordered list. Each child becomes an {@code li} whose {@code id} attribute
 * is the task ID, containing a {@code <div class="task">} with the task
 * details followed by the list of its own children. Nothing is emitted when
 * there are no child tasks.
 *
 * @param parentTask container whose child tasks are rendered
 * @param xhtml XHTML content handler that receives the SAX events
 * @param usedResources resource-id set passed through to {@code handleTask}
 * @throws SAXException if the XHTML SAX events could not be handled
 */
protected static void handleChildTasks(ChildTaskContainer parentTask, XHTMLContentHandler xhtml,
        Set<Integer> usedResources) throws SAXException {
    final List<Task> children = parentTask.getChildTasks();
    if (children == null || children.isEmpty()) {
        return;
    }

    xhtml.startElement("ol");
    for (Task child : children) {
        String childId = child.getID().toString();
        xhtml.startElement("li", "id", childId);

        // Task details come first ...
        xhtml.startElement("div", "class", "task");
        handleTask(child, xhtml, usedResources);
        xhtml.endElement("div");

        // ... followed by the task's own children, if it has any.
        handleChildTasks(child, xhtml, usedResources);

        xhtml.endElement("li");
    }
    xhtml.endElement("ol");
}
示例11: handleDates
import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Renders a date range as
 * {@code <div class="fromTo <what>Dates">what from START to FINISH[ taking DURATION]</div>},
 * where {@code <what>} is {@code what} in lower case. Nothing is emitted
 * when both dates are {@code null}.
 *
 * @param what label of the range; also used (lower-cased) in the CSS class
 * @param start start date, formatted via {@code buildDate}
 * @param finish finish date, formatted via {@code buildDate}
 * @param duration optional duration, formatted via {@code buildDuration}; may be {@code null}
 * @param xhtml XHTML content handler that receives the SAX events
 * @throws SAXException if the XHTML SAX events could not be handled
 */
protected static void handleDates(String what, Date start, Date finish,
        Duration duration, XHTMLContentHandler xhtml) throws SAXException {
    if (start == null && finish == null) {
        // Assume there's nothing there, and skip
        return;
    }
    // Locale.ROOT keeps the generated class name independent of the JVM's
    // default locale (e.g. Turkish maps "I" to a dotless "ı"). The name is
    // fully qualified so that no additional import is required.
    String cls = what.toLowerCase(java.util.Locale.ROOT) + "Dates";
    xhtml.startElement("div", "class", "fromTo " + cls);
    xhtml.characters(what);
    xhtml.characters(" from ");
    xhtml.characters(buildDate(start));
    xhtml.characters(" to ");
    xhtml.characters(buildDate(finish));
    if (duration != null) {
        xhtml.characters(" taking ");
        xhtml.characters(buildDuration(duration));
    }
    xhtml.endElement("div");
}
示例12: handlePictureCharacterRun
import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Emits an {@code img} element referencing the given picture and, the first
 * time a picture is seen, hands its content to {@code handleEmbeddedResource}.
 * Nothing happens when the character run is not rendered or no picture is
 * supplied.
 *
 * @param cr character run the picture belongs to
 * @param picture picture to output; may be {@code null}
 * @param pictures source used to number pictures and track which were output
 * @param xhtml XHTML content handler that receives the SAX events
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws IOException declared for the embedded resource handling
 * @throws TikaException declared for the embedded resource handling
 */
private void handlePictureCharacterRun(CharacterRun cr, Picture picture,
        PicturesSource pictures, XHTMLContentHandler xhtml) throws SAXException,
        IOException, TikaException {
    if (!isRendered(cr) || picture == null) {
        // Oh dear, we've run out...
        // Probably caused by multiple \u0008 images referencing
        // the same real image
        return;
    }

    // Identify the picture.
    String suffix = picture.suggestFileExtension();
    int index = pictures.pictureNumber(picture);

    // The file carries no name for the picture, so synthesize one that both
    // the img tag and the embedded resource can refer to.
    String filename = "image" + index;
    if (!suffix.isEmpty()) {
        filename = filename + "." + suffix;
    }

    // Mime type of the picture, passed along with the embedded resource.
    String mimeType = picture.getMimeType();

    // Write the img tag.
    AttributesImpl imgAttributes = new AttributesImpl();
    imgAttributes.addAttribute("", "src", "src", "CDATA", "embedded:" + filename);
    imgAttributes.addAttribute("", "alt", "alt", "CDATA", filename);
    xhtml.startElement("img", imgAttributes);
    xhtml.endElement("img");

    // Each individual image is exposed as an embedded resource only once.
    if (pictures.hasOutput(picture)) {
        return;
    }
    TikaInputStream stream = TikaInputStream.get(picture.getContent());
    handleEmbeddedResource(stream, filename, null, mimeType, xhtml, false);
    pictures.recordOutput(picture);
}
示例13: extractSDT
import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Writes the text of a structured document tag (SDT) as a single {@code p}
 * element.
 *
 * @param element SDT whose content text is written
 * @param xhtml XHTML content handler that receives the SAX events
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws XmlException declared by the signature; not raised directly here
 * @throws IOException declared by the signature; not raised directly here
 */
private void extractSDT(XWPFSDT element, XHTMLContentHandler xhtml)
        throws SAXException, XmlException, IOException {
    final XWPFSDTContent sdtContent = element.getContent();
    final String paragraphTag = "p";

    xhtml.startElement(paragraphTag);
    xhtml.characters(sdtContent.getText());
    xhtml.endElement(paragraphTag);
}
示例14: parse
import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Parses a plain-text stream: the character encoding is detected by an
 * {@code AutoDetectReader}, recorded in the metadata, and the decoded text
 * is emitted as a single {@code p} element of an XHTML document.
 *
 * @param stream input to parse; handed to the reader wrapped in a CloseShieldInputStream
 * @param handler content handler that receives the output events
 * @param metadata metadata; content type and content encoding are set from the detected charset
 * @param context parse context used to look up the {@code ServiceLoader}
 * @throws IOException if reading from the stream fails
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws TikaException declared for the encoding detection
 */
public void parse(InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context) throws IOException,
        SAXException, TikaException {
    // The reader detects the character encoding automatically.
    AutoDetectReader reader = new AutoDetectReader(
            new CloseShieldInputStream(stream),
            metadata,
            context.get(ServiceLoader.class, LOADER));
    try {
        Charset detected = reader.getCharset();
        MediaType textType = new MediaType(MediaType.TEXT_PLAIN, detected);
        metadata.set(Metadata.CONTENT_TYPE, textType.toString());
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, detected.name());

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.startElement("p");
        char[] chunk = new char[4096];
        for (int count = reader.read(chunk); count != -1; count = reader.read(chunk)) {
            xhtml.characters(chunk, 0, count);
        }
        xhtml.endElement("p");
        xhtml.endDocument();
    } finally {
        reader.close();
    }
}
示例15: parseEntry
import org.apache.tika.sax.XHTMLContentHandler; //导入方法依赖的package包/类
/**
 * Handles a single archive entry. When the entry data is readable, a marker
 * {@code <div class="embedded" id="NAME">} is written for named entries and
 * the entry is passed to the embedded document extractor if the extractor
 * accepts it. When the entry data is not readable, only the entry name (if
 * any) is written as a {@code p} element.
 *
 * @param archive archive stream positioned at the entry
 * @param entry entry to process
 * @param extractor extractor deciding on and performing the embedded parse
 * @param xhtml XHTML content handler that receives the SAX events
 * @throws SAXException if the XHTML SAX events could not be handled
 * @throws IOException declared for reading the entry
 * @throws TikaException declared for the embedded parse
 */
private void parseEntry(
        ArchiveInputStream archive, ArchiveEntry entry,
        EmbeddedDocumentExtractor extractor, XHTMLContentHandler xhtml)
        throws SAXException, IOException, TikaException {
    final String entryName = entry.getName();
    final boolean named = entryName != null && entryName.length() > 0;

    if (!archive.canReadEntryData(entry)) {
        // Unreadable entry: just list its name, if it has one.
        if (named) {
            xhtml.element("p", entryName);
        }
        return;
    }

    Metadata entryMetadata = new Metadata();
    if (named) {
        entryMetadata.set(Metadata.RESOURCE_NAME_KEY, entryName);
        AttributesImpl divAttributes = new AttributesImpl();
        divAttributes.addAttribute("", "class", "class", "CDATA", "embedded");
        divAttributes.addAttribute("", "id", "id", "CDATA", entryName);
        xhtml.startElement("div", divAttributes);
        xhtml.endElement("div");
        entryMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, entryName);
    }

    if (!extractor.shouldParseEmbedded(entryMetadata)) {
        return;
    }

    // For detectors to work, we need a mark/reset supporting
    // InputStream, which ArchiveInputStream isn't, so wrap
    TemporaryResources tmp = new TemporaryResources();
    try {
        TikaInputStream wrapped = TikaInputStream.get(archive, tmp);
        extractor.parseEmbedded(wrapped, xhtml, entryMetadata, true);
    } finally {
        tmp.dispose();
    }
}