本文整理汇总了Java中org.apache.tika.sax.Link类的典型用法代码示例。如果您正苦于以下问题:Java Link类的具体用法?Java Link怎么用?Java Link使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
Link类属于org.apache.tika.sax包,在下文中一共展示了Link类的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: getAbsoluteLinks
import org.apache.tika.sax.Link; //导入依赖的package包/类
/**
 * Resolves the URI of each link against {@code parent_url} and returns the
 * resulting URLs in their external string form.
 *
 * <p>Links for which {@code StringUtils.isBlank} reports a blank URI are skipped.
 * When a URI cannot be combined with {@code parent_url} into a URL, the exception
 * message is printed to standard error and that link is skipped.
 *
 * @param parent_url context URL that every link URI is resolved against
 * @param links links whose URIs should be made absolute
 * @return the resolved URLs, in the iteration order of {@code links}
 */
private List<String> getAbsoluteLinks(URL parent_url, List<Link> links) {
    List<String> absoluteUris = new ArrayList<String>();
    for (Link current : links) {
        String rawUri = current.getUri();
        if (!StringUtils.isBlank(rawUri)) {
            try {
                // The two-argument URL constructor resolves rawUri relative to parent_url.
                absoluteUris.add(new URL(parent_url, rawUri).toExternalForm());
            } catch (MalformedURLException malformed) {
                System.err.println("MalformedURLException:\n" + malformed.getMessage());
            }
        }
    }
    return absoluteUris;
}
示例2: extractImageLinks
import org.apache.tika.sax.Link; //导入依赖的package包/类
/**
 * Extracts the URIs of the image links found in the document served at a URL.
 *
 * <p>The content is opened from {@code url}, parsed with an {@code AutoDetectParser}
 * into a {@code LinkContentHandler}, and every collected link whose {@code isImage()}
 * is true contributes its URI to the result. Duplicate URIs are collapsed because the
 * URIs are gathered in a set.
 *
 * @param url address of the document to fetch and parse
 * @return an {@code Object[]} (declared as {@code Object}) holding the distinct image
 *         link URIs, in no guaranteed order
 * @throws IOException if the URL is malformed or its content cannot be opened or read
 * @throws SAXException propagated from the parse call
 * @throws TikaException propagated from the parse call
 */
public Object extractImageLinks(String url) throws IOException, SAXException, TikaException {
    Set<String> imageLinks = new HashSet<String>();
    // try-with-resources only closes the stream if it was actually opened, so a failure
    // while opening the URL surfaces as the real IOException rather than as a
    // NullPointerException thrown from a finally block.
    try (InputStream is = TikaInputStream.get(new URL(url).openStream())) {
        Metadata metadata = new Metadata();
        LinkContentHandler handler = new LinkContentHandler();
        AutoDetectParser parser = new AutoDetectParser();
        parser.parse(is, handler, metadata);
        for (Link link : handler.getLinks()) {
            if (link.isImage()) {
                imageLinks.add(link.getUri());
            }
        }
    }
    return imageLinks.toArray();
}
示例3: getOutlinks
import org.apache.tika.sax.Link; //导入依赖的package包/类
/**
 * Appends an {@code Outlink} to {@code outlinks} for each eligible link extracted by Tika.
 *
 * <p>A link is skipped when its type is contained in {@code ignoredTags}, when its URI
 * is {@code null}, or when its {@code rel} value, lower-cased, equals {@code "nofollow"}.
 * For the remaining links the URI is resolved against {@code base} and the anchor text is
 * normalised by collapsing each run of whitespace into a single space and trimming it.
 * Links that raise a {@code MalformedURLException} during this step are silently dropped.
 *
 * @param base URL that relative link targets are resolved against
 * @param outlinks list that receives the created outlinks
 * @param tikaExtractedOutlinks links to convert
 */
public void getOutlinks(URL base, ArrayList<Outlink> outlinks, List<Link> tikaExtractedOutlinks) {
    for (Link candidate : tikaExtractedOutlinks) {
        String href = candidate.getUri();
        boolean relIsNoFollow = candidate.getRel().toLowerCase().equals("nofollow");
        String label = candidate.getText();

        boolean ignoredType = ignoredTags.contains(candidate.getType());
        if (ignoredType || href == null || relIsNoFollow) {
            continue;
        }

        try {
            URL resolved = URLUtil.resolveURL(base, href);
            // Clean the anchor: squeeze whitespace runs, then strip both ends.
            String cleanedLabel = label.replaceAll("\\s+", " ").trim();
            outlinks.add(new Outlink(resolved.toString(), cleanedLabel));
        } catch (MalformedURLException ignored) {
            // Deliberately ignored: a target that cannot be resolved yields no outlink.
        }
    }
}
示例4: collectPaths
import org.apache.tika.sax.Link; //导入依赖的package包/类
/**
 * Collects references from a JCR property.
 *
 * <p>A property value can be one of:
 * <ol>
 *   <li>A string containing a reference, e.g. {@code fileReference=/content/dam/image.png}.</li>
 *   <li>An array of strings, e.g.
 *       {@code fileReference=[/content/dam/image1.png, /content/dam/image2.png]}.</li>
 *   <li>An HTML fragment containing links, e.g.
 * <pre>
 * &lt;p&gt;
 *   &lt;a href="/content/site/page.html"&gt;hello&lt;/a&gt;
 *   &lt;img src="/content/dam/image1.png"&gt;
 * &lt;/p&gt;
 * </pre>
 *   </li>
 * </ol>
 *
 * <p>A value of any other type, including {@code null}, yields an empty stream. When the
 * property key is listed in {@code htmlFields}, each string value is parsed as HTML and
 * replaced by the URIs of the links found in it; a value that fails to parse contributes
 * nothing.
 *
 * @param property an entry from a ValueMap
 * @param htmlFields names of the properties whose values contain HTML
 * @return stream containing the extracted references
 */
static Stream<String> collectPaths(Map.Entry<String, Object> property, Set<String> htmlFields) {
    Object value = property.getValue();
    Stream<String> stream;
    // instanceof (rather than getClass()) keeps a null value from raising a
    // NullPointerException; String is final, so the exact-type semantics are unchanged.
    if (value instanceof String[]) {
        stream = Arrays.stream((String[]) value);
    } else if (value instanceof String) {
        stream = Stream.of((String) value);
    } else {
        stream = Stream.empty();
    }
    if (htmlFields.contains(property.getKey())) {
        stream = stream.flatMap(val -> {
            try {
                // Parse the HTML and gather its links through the link content handler.
                LinkContentHandler linkHandler = new LinkContentHandler();
                HtmlParser parser = new HtmlParser();
                parser.parse(new ByteArrayInputStream(val.getBytes("utf-8")), linkHandler, new Metadata(), new ParseContext());
                return linkHandler.getLinks().stream().map(Link::getUri);
            } catch (Exception e) {
                // Best effort: a fragment that cannot be parsed yields no references.
                return Stream.empty();
            }
        });
    }
    return stream;
}
示例5: toOutlinks
import org.apache.tika.sax.Link; //导入依赖的package包/类
/**
 * Builds one {@code Outlink} per usable link found on the page at {@code parentURL}.
 *
 * <p>Each link URI is resolved against the parent URL and, when {@code urlFilters} is
 * set, passed through it; links with a blank URI, an unresolvable URI, or a filter
 * result of {@code null} are dropped. Kept outlinks receive the link text as anchor and
 * the metadata computed by {@code metadataTransfer}. Counters on {@code eventCounter}
 * record the outcome of each step.
 *
 * @param parentURL URL of the document the links were extracted from
 * @param links links extracted from that document
 * @param parentMetadata metadata of the parent document
 * @return the outlinks that were kept; empty if {@code parentURL} is not a valid URL
 */
private List<Outlink> toOutlinks(String parentURL, List<Link> links,
        Metadata parentMetadata) {
    List<Outlink> kept = new ArrayList<Outlink>(links.size());

    URL sourceUrl;
    try {
        sourceUrl = new URL(parentURL);
    } catch (MalformedURLException e1) {
        // Not expected here: earlier components already check
        // that the source URL is valid.
        LOG.error("MalformedURLException on {}", parentURL);
        eventCounter.scope("error_invalid_source_url").incrBy(1);
        return kept;
    }

    for (Link link : links) {
        String href = link.getUri();
        if (StringUtils.isBlank(href)) {
            continue;
        }

        // Turn the (possibly relative) link into an absolute URL.
        String targetUrl;
        try {
            targetUrl = URLUtil.resolveURL(sourceUrl, href).toExternalForm();
        } catch (MalformedURLException e) {
            LOG.debug("MalformedURLException on {}", href);
            String counterName = "error_outlink_parsing_"
                    + e.getClass().getSimpleName();
            eventCounter.scope(counterName).incrBy(1);
            continue;
        }

        // Run the URL filters; a null result means the outlink is rejected.
        if (urlFilters != null) {
            targetUrl = urlFilters.filter(sourceUrl, parentMetadata, targetUrl);
            if (targetUrl == null) {
                eventCounter.scope("outlink_filtered").incrBy(1);
                continue;
            }
        }

        eventCounter.scope("outlink_kept").incrBy(1);

        Outlink outlink = new Outlink(targetUrl);
        outlink.setAnchor(link.getText());
        // The outlink's metadata is derived from the parent's.
        outlink.setMetadata(metadataTransfer.getMetaForOutlink(targetUrl,
                parentURL, parentMetadata));
        kept.add(outlink);
    }
    return kept;
}
示例6: toOutlinks
import org.apache.tika.sax.Link; //导入依赖的package包/类
/**
 * Builds the outlinks for the page at {@code parentURL}, keeping at most one
 * {@code Outlink} per distinct target URL.
 *
 * <p>Each link URI is resolved against the parent URL and, when {@code urlFilters} is
 * set, passed through it; links with a blank URI, an unresolvable URI, or a filter
 * result of {@code null} are dropped. For every remaining link an outlink is created
 * with the link text as anchor and the metadata computed by {@code metadataTransfer};
 * only the first outlink seen for a given URL is retained. Counters on
 * {@code eventCounter} record the outcome of each step.
 *
 * @param parentURL URL of the document the links were extracted from
 * @param links links extracted from that document
 * @param parentMetadata metadata of the parent document
 * @return the retained outlinks; empty if {@code parentURL} is not a valid URL
 */
private List<Outlink> toOutlinks(String parentURL, List<Link> links,
        Metadata parentMetadata) {
    Map<String, Outlink> byUrl = new HashMap<String, Outlink>();

    URL sourceUrl;
    try {
        sourceUrl = new URL(parentURL);
    } catch (MalformedURLException e1) {
        // Not expected here: earlier components already check
        // that the source URL is valid.
        LOG.error("MalformedURLException on {}", parentURL);
        eventCounter.scope("error_invalid_source_url").incrBy(1);
        return new LinkedList<Outlink>();
    }

    for (Link link : links) {
        String href = link.getUri();
        if (StringUtils.isBlank(href)) {
            continue;
        }

        // Turn the (possibly relative) link into an absolute URL.
        String targetUrl;
        try {
            targetUrl = URLUtil.resolveURL(sourceUrl, href).toExternalForm();
        } catch (MalformedURLException e) {
            LOG.debug("MalformedURLException on {}", href);
            String counterName = "error_outlink_parsing_"
                    + e.getClass().getSimpleName();
            eventCounter.scope(counterName).incrBy(1);
            continue;
        }

        // Run the URL filters; a null result means the outlink is rejected.
        if (urlFilters != null) {
            targetUrl = urlFilters.filter(sourceUrl, parentMetadata, targetUrl);
            if (targetUrl == null) {
                eventCounter.scope("outlink_filtered").incrBy(1);
                continue;
            }
        }

        eventCounter.scope("outlink_kept").incrBy(1);

        Outlink outlink = new Outlink(targetUrl);
        outlink.setAnchor(link.getText());
        // The outlink's metadata is derived from the parent's.
        outlink.setMetadata(metadataTransfer.getMetaForOutlink(targetUrl,
                parentURL, parentMetadata));

        // First occurrence wins: later outlinks to the same URL are discarded.
        if (!byUrl.containsKey(targetUrl)) {
            byUrl.put(targetUrl, outlink);
        }
    }
    return new ArrayList<Outlink>(byUrl.values());
}