本文整理汇总了Java中edu.uci.ics.crawler4j.parser.HtmlParseData.getOutgoingUrls方法的典型用法代码示例。如果您正苦于以下问题:Java HtmlParseData.getOutgoingUrls方法的具体用法?Java HtmlParseData.getOutgoingUrls怎么用?Java HtmlParseData.getOutgoingUrls使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类edu.uci.ics.crawler4j.parser.HtmlParseData的用法示例。
在下文中一共展示了HtmlParseData.getOutgoingUrls方法的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: visit
import edu.uci.ics.crawler4j.parser.HtmlParseData; //导入方法依赖的package包/类
/**
 * Logs the URL of the fetched page and, when the page was parsed as HTML,
 * its text length, HTML length and outgoing-link count, then passes the
 * page to {@code postgresDBService.store}.
 *
 * @param page the fetched page handed to this crawler
 */
@Override
public void visit(Page page) {
    final String pageUrl = page.getWebURL().getURL();
    logger.info("URL: " + pageUrl);

    // Pages whose parse data is not HTML are only logged by URL.
    if (!(page.getParseData() instanceof HtmlParseData)) {
        return;
    }

    final HtmlParseData parsed = (HtmlParseData) page.getParseData();
    final String plainText = parsed.getText();
    final String markup = parsed.getHtml();
    final Set<WebURL> outgoing = parsed.getOutgoingUrls();

    logger.info("Text length: " + plainText.length());
    logger.info("Html length: " + markup.length());
    logger.info("Number of outgoing links: " + outgoing.size());

    try {
        postgresDBService.store(page);
    } catch (RuntimeException storeFailure) {
        // A runtime failure while storing is logged and not rethrown.
        logger.error("Storing failed", storeFailure);
    }
}
示例2: visit
import edu.uci.ics.crawler4j.parser.HtmlParseData; //导入方法依赖的package包/类
/**
 * Invoked once a page has been fetched and is ready to be processed.
 *
 * <p>Prints the page URL and, for pages parsed as HTML, the length of the
 * extracted text, the length of the HTML and the number of outgoing links.
 *
 * @param page the fetched page handed to this crawler
 */
@Override
public void visit(Page page) {
    System.out.println("URL: " + page.getWebURL().getURL());

    // Nothing more to report for pages that were not parsed as HTML.
    if (!(page.getParseData() instanceof HtmlParseData)) {
        return;
    }

    final HtmlParseData parsed = (HtmlParseData) page.getParseData();
    System.out.println("Text length: " + parsed.getText().length());
    System.out.println("Html length: " + parsed.getHtml().length());
    System.out.println("Number of outgoing links: " + parsed.getOutgoingUrls().size());
}
示例3: visit
import edu.uci.ics.crawler4j.parser.HtmlParseData; //导入方法依赖的package包/类
/**
 * Prints the document id, URL and parent document id of the fetched page,
 * followed by text/HTML/link statistics when the page was parsed as HTML,
 * and finally a separator line.
 *
 * @param page the fetched page handed to this crawler
 */
@Override
public void visit(Page page) {
    final WebURL webUrl = page.getWebURL();
    System.out.println("Docid: " + webUrl.getDocid());
    System.out.println("URL: " + webUrl.getURL());
    System.out.println("Docid of parent page: " + webUrl.getParentDocid());

    final boolean isHtml = page.getParseData() instanceof HtmlParseData;
    if (isHtml) {
        final HtmlParseData parsed = (HtmlParseData) page.getParseData();
        System.out.println("Text length: " + parsed.getText().length());
        System.out.println("Html length: " + parsed.getHtml().length());
        System.out.println("Number of outgoing links: " + parsed.getOutgoingUrls().size());
    }

    // The separator is printed for every page, HTML or not.
    System.out.println("=============");
}
示例4: visit
import edu.uci.ics.crawler4j.parser.HtmlParseData; //导入方法依赖的package包/类
/**
 * Updates the crawl statistics for a fetched page.
 *
 * <p>Every page increments the processed-page counter. For pages parsed as
 * HTML, the number of outgoing links and the UTF-8 encoded size of the
 * extracted text are added to the totals. After every 50 processed pages
 * {@code dumpMyData()} is called.
 *
 * @param page the fetched page handed to this crawler
 */
@Override
public void visit(Page page) {
    System.out.println("Visited: " + page.getWebURL().getURL());
    myCrawlStat.incProcessedPages();

    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData parseData = (HtmlParseData) page.getParseData();
        myCrawlStat.incTotalLinks(parseData.getOutgoingUrls().size());
        // Using the Charset constant instead of the "UTF-8" name avoids the
        // checked UnsupportedEncodingException, so no failure can be
        // silently swallowed here.
        myCrawlStat.incTotalTextSize(
                parseData.getText().getBytes(java.nio.charset.StandardCharsets.UTF_8).length);
    }

    // We dump this crawler's statistics after processing every 50 pages.
    if (myCrawlStat.getTotalProcessedPages() % 50 == 0) {
        dumpMyData();
    }
}
示例5: visit
import edu.uci.ics.crawler4j.parser.HtmlParseData; //导入方法依赖的package包/类
/**
 * Invoked once a page has been fetched and is ready to be processed.
 *
 * <p>Prints the page URL and, for pages parsed as HTML, the text length,
 * HTML length and outgoing-link count. The HTML is then re-parsed with
 * Jsoup and every anchor whose {@code title} is
 * "Full text at publisher's site" is recorded in {@code localData}, keyed
 * by the last path segment of the page URL parsed as an integer.
 *
 * @param page the fetched page handed to this crawler
 */
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    System.out.println("URL: " + url);

    if (!(page.getParseData() instanceof HtmlParseData)) {
        return;
    }

    HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
    String text = htmlParseData.getText();
    String html = htmlParseData.getHtml();
    Set<WebURL> links = htmlParseData.getOutgoingUrls();
    System.out.println("Text length: " + text.length());
    System.out.println("Html length: " + html.length());
    System.out.println("Number of outgoing links: " + links.size());

    // The key comes from the page URL, not from the anchor, so it is the
    // same for every anchor on the page: derive it once, outside the loop.
    String[] urlParts = url.split("/");
    String lastSegment = urlParts.length == 0 ? "" : urlParts[urlParts.length - 1];

    // Pass the page URL as base URI; without it "abs:href" resolves every
    // relative link to an empty string.
    org.jsoup.nodes.Document doc = Jsoup.parseBodyFragment(html, url);
    for (Element a : doc.select("a[href]")) {
        if (!"Full text at publisher's site".equals(a.attr("title"))) {
            continue;
        }
        String aurl = a.attr("abs:href");
        System.out.println(lastSegment + ", " + aurl);

        int key;
        try {
            key = Integer.parseInt(lastSegment);
        } catch (NumberFormatException e) {
            // A page whose URL does not end in a number cannot be keyed;
            // report it and skip the link instead of aborting the visit.
            System.err.println("Skipping publisher link; page URL has no numeric id: " + url);
            continue;
        }
        localData.put(key, aurl);
    }
}
示例6: visit
import edu.uci.ics.crawler4j.parser.HtmlParseData; //导入方法依赖的package包/类
/**
 * Invoked once a page has been fetched and is ready to be processed.
 *
 * <p>Prints the URL details of the page (doc id, URL, domain, sub-domain,
 * path, parent URL, anchor text), then text/HTML/link statistics if the
 * page was parsed as HTML, then any fetch response headers, and finally a
 * separator line.
 *
 * @param page the fetched page handed to this crawler
 */
@Override
public void visit(Page page) {
    final WebURL webUrl = page.getWebURL();
    System.out.println("Docid: " + webUrl.getDocid());
    System.out.println("URL: " + webUrl.getURL());
    System.out.println("Domain: '" + webUrl.getDomain() + "'");
    System.out.println("Sub-domain: '" + webUrl.getSubDomain() + "'");
    System.out.println("Path: '" + webUrl.getPath() + "'");
    System.out.println("Parent page: " + webUrl.getParentUrl());
    System.out.println("Anchor text: " + webUrl.getAnchor());

    if (page.getParseData() instanceof HtmlParseData) {
        final HtmlParseData parsed = (HtmlParseData) page.getParseData();
        final String plainText = parsed.getText();
        final String markup = parsed.getHtml();
        final List<WebURL> outgoing = parsed.getOutgoingUrls();
        System.out.println("Text length: " + plainText.length());
        System.out.println("Html length: " + markup.length());
        System.out.println("Number of outgoing links: " + outgoing.size());
    }

    // Headers are optional; the section is printed only when they are present.
    final Header[] headers = page.getFetchResponseHeaders();
    if (headers != null) {
        System.out.println("Response headers:");
        for (int i = 0; i < headers.length; i++) {
            final Header current = headers[i];
            System.out.println("\t" + current.getName() + ": " + current.getValue());
        }
    }

    System.out.println("=============");
}