本文整理汇总了Java中org.apache.tika.parser.html.HtmlParser类的典型用法代码示例。如果您正苦于以下问题：Java HtmlParser类的具体用法？Java HtmlParser怎么用？Java HtmlParser使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。
HtmlParser类属于org.apache.tika.parser.html包，在下文中一共展示了HtmlParser类的6个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: parse
import org.apache.tika.parser.html.HtmlParser; //导入依赖的package包/类
/**
 * Builds a schema from the HTML content of a request.
 * <p>
 * The request content is run through an {@link HtmlParser} that feeds a
 * {@code SimpleHeadersContentHandler}; every header value the handler collected becomes
 * one {@code STRING} column whose id is its position in the list. The columns are wrapped
 * in a single sheet content of a non-draft schema.
 *
 * @param request the request whose content is parsed
 * @return the schema holding one sheet content with the detected columns
 * @throws TDPException wrapping any exception raised while parsing
 * @see SchemaParser#parse(Request)
 */
@Override
public Schema parse(Request request) {
    try {
        SimpleHeadersContentHandler headerCollector = new SimpleHeadersContentHandler();
        new HtmlParser().parse(request.getContent(), headerCollector, new Metadata(), new ParseContext());

        List<ColumnMetadata> columns = new ArrayList<>(headerCollector.getHeaderValues().size());
        int nextId = 0; // always equals the number of columns added so far
        for (String header : headerCollector.getHeaderValues()) {
            ColumnMetadata column = ColumnMetadata.Builder.column()
                    .type(Type.STRING) // ATM not doing any complicated type calculation
                    .name(header)
                    .id(nextId)
                    .build();
            columns.add(column);
            nextId++;
        }

        Schema.SheetContent sheet = new Schema.SheetContent();
        sheet.setColumnMetadatas(columns);

        Schema schema = Schema.Builder.parserResult()
                .sheetContents(Collections.singletonList(sheet))
                .draft(false)
                .build();
        return schema;
    } catch (Exception e) {
        LOGGER.debug("Exception during parsing html request :" + e.getMessage(), e);
        throw new TDPException(CommonErrorCodes.UNEXPECTED_EXCEPTION, e);
    }
}
示例2: collectPaths
import org.apache.tika.parser.html.HtmlParser; //导入依赖的package包/类
/**
 * Collect references from a JCR property.
 * A property can be one of:
 * <ol>
 * <li>A string containing a reference, e.g. {@code fileReference=/content/dam/image.png}.</li>
 * <li>An array of strings, e.g.
 * {@code fileReference=[/content/dam/image1.png, /content/dam/image2.png]}.</li>
 * <li>An html fragment containing links, e.g.
 * <pre>{@code
 * <p>
 *   <a href="/content/site/page.html">hello</a>
 *   <img src="/content/dam/image1.png">
 * </p>
 * }</pre>
 * </li>
 * </ol>
 * A value that is {@code null} or of any other type yields an empty stream.
 * <p>
 * When the property name is listed in {@code htmlFields}, each string value is parsed as
 * html and replaced by the URIs of the links found in it. A value that cannot be parsed
 * contributes no references (best effort).
 *
 * @param property an entry from a ValueMap
 * @param htmlFields names of the properties containing html
 * @return stream containing extracted references
 */
static Stream<String> collectPaths(Map.Entry<String, Object> property, Set<String> htmlFields) {
    Object value = property.getValue();
    Stream<String> stream;
    // instanceof is null-safe, unlike getClass(): a null value falls through to the empty stream.
    if (value instanceof String[]) {
        stream = Arrays.stream((String[]) value);
    } else if (value instanceof String) {
        stream = Stream.of((String) value);
    } else {
        stream = Stream.empty();
    }
    if (htmlFields.contains(property.getKey())) {
        stream = stream.flatMap(val -> {
            try {
                // parse html and extract links via underlying tagsoup library
                LinkContentHandler linkHandler = new LinkContentHandler();
                HtmlParser parser = new HtmlParser();
                byte[] html = val.getBytes(java.nio.charset.StandardCharsets.UTF_8);
                parser.parse(new ByteArrayInputStream(html), linkHandler, new Metadata(), new ParseContext());
                return linkHandler.getLinks().stream().map(Link::getUri);
            } catch (Exception ignored) {
                // Deliberate best effort: a value that fails to parse (including a null
                // array element) simply yields no references instead of aborting the scan.
                return Stream.empty();
            }
        });
    }
    return stream;
}
示例3: init
import org.apache.tika.parser.html.HtmlParser; //导入依赖的package包/类
/**
 * Initialize the index writer.
 * <p>
 * Resolves the index directory path and, from it, sets up the index tracker and the
 * Lucene directory. Also creates the {@code Tika} instance around an {@link HtmlParser}
 * (passing {@code null} as its first constructor argument) and opens the
 * {@code IndexWriter} configured with a {@code StandardAnalyzer}.
 *
 * @throws IOException if one of the underlying steps fails with an I/O error
 */
public void init() throws IOException {
    final File indexPath = resolveIndexDirectoryPath();
    indexTracker = new IndexTracker(indexPath);
    indexDirectory = FSDirectory.open(indexPath);
    tika = new Tika(null, new HtmlParser());
    writer = new IndexWriter(
            indexDirectory,
            new IndexWriterConfig(Version.LATEST, new StandardAnalyzer()));
}
示例4: Parser
import org.apache.tika.parser.html.HtmlParser; //导入依赖的package包/类
/**
 * Creates a parser for the given crawl configuration.
 * <p>
 * The configuration is handed to the superclass constructor; a fresh {@link HtmlParser}
 * and {@code ParseContext} are then created and stored on this instance.
 *
 * @param config the crawl configuration passed on to the superclass
 */
public Parser(CrawlConfig config) {
    super(config);
    this.htmlParser = new HtmlParser();
    this.parseContext = new ParseContext();
}
示例5: parseHTML
import org.apache.tika.parser.html.HtmlParser; //导入依赖的package包/类
/**
 * Runs the given parser over {@code text}, declared as {@code text/html}, and returns
 * whatever the {@code BodyContentHandler} wrote to its backing writer.
 *
 * @param text the html text to parse
 * @param htmlParser the parser to use
 * @return the content collected by the body content handler
 * @throws IOException propagated from the parser
 * @throws SAXException propagated from the parser
 * @throws TikaException propagated from the parser
 */
private String parseHTML(String text, HtmlParser htmlParser) throws IOException, SAXException, TikaException {
    Metadata htmlMetadata = new Metadata();
    htmlMetadata.set(Metadata.CONTENT_TYPE, "text/html");

    StringWriter extracted = new StringWriter();
    htmlParser.parse(
            StringUtils.getInputStream(text),
            new BodyContentHandler(extracted),
            htmlMetadata,
            new ParseContext());
    return extracted.toString();
}
示例6: FlowIndexer
import org.apache.tika.parser.html.HtmlParser; //导入依赖的package包/类
/**
 * Builds the search index for the given flows.
 * <p>
 * One document is added per flow, holding its id, name, short and long description
 * (both passed through {@code parseHTML} first), tags and category name. Tags and
 * category are only added when present. A flow that fails to index is logged and
 * skipped; the remaining flows are still processed. After the loop the writer is
 * committed and the reader and searcher are opened on the index.
 *
 * @param flows the flows to index
 * @param maxHitCount value stored in the {@code maxHitCount} field
 * @throws IOException if creating the writer, committing or opening the reader fails
 */
public FlowIndexer(Collection<Flow> flows, int maxHitCount) throws IOException{
    this.maxHitCount = maxHitCount;

    HtmlParser descriptionParser = new HtmlParser();
    this.indexWriter = new IndexWriter(index, new IndexWriterConfig(Version.LUCENE_44, analyzer));

    for (Flow flow : flows) {
        try {
            Document flowDocument = new Document();

            flowDocument.add(new IntField(ID_FIELD, flow.getFlowID(), Field.Store.YES));
            flowDocument.add(new TextField(NAME_FIELD, flow.getName(), Field.Store.NO));
            flowDocument.add(new TextField(SHORT_DESCRIPTION_FIELD,
                    parseHTML(flow.getShortDescription(), descriptionParser), Field.Store.NO));
            flowDocument.add(new TextField(LONG_DESCRIPTION_FIELD,
                    parseHTML(flow.getLongDescription(), descriptionParser), Field.Store.NO));

            if (flow.getTags() != null) {
                for (String tagName : flow.getTags()) {
                    flowDocument.add(new TextField(TAGS_FIELD, tagName, Field.Store.NO));
                }
            }

            if (flow.getCategory() != null) {
                flowDocument.add(new TextField(CATEGORY_FIELD, flow.getCategory().getName(), Field.Store.NO));
            }

            this.indexWriter.addDocument(flowDocument);
        } catch (Exception e) {
            // One broken flow must not abort indexing of the others.
            log.error("Error indexing flow " + flow, e);
        }
    }

    this.indexWriter.commit();
    this.indexReader = DirectoryReader.open(index);
    this.searcher = new IndexSearcher(this.indexReader);
}