This article collects typical usage examples of the Java method edu.uci.ics.crawler4j.crawler.Page.getParseData. If you are unsure what Page.getParseData does, how to call it, or want to see it used in context, the curated code examples below may help. You can also explore further usage examples of the containing class, edu.uci.ics.crawler4j.crawler.Page.
The following presents 15 code examples of Page.getParseData, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Java code examples.
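All of the examples below override WebCrawler.visit(Page) (or a similar callback) and then inspect Page.getParseData on the fetched page. For context, here is a minimal sketch of how such a crawler is typically launched with crawler4j. This assumes the 4.x API; the storage folder, seed URL, thread count, and the MyCrawler class name are placeholders, not values from any of the examples.

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public static void main(String[] args) throws Exception {
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder("/tmp/crawler4j"); // placeholder storage folder
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    controller.addSeed("https://www.example.com/"); // placeholder seed URL
    // MyCrawler stands for any WebCrawler subclass like the ones below; 7 is the thread count
    controller.start(MyCrawler.class, 7);
}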
Example 1: visit
import edu.uci.ics.crawler4j.crawler.Page; // import the package/class this method depends on
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    System.out.println("URL: " + url);
    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String html = htmlParseData.getHtml();
        String title = htmlParseData.getTitle();
        System.out.println("Title: " + title);
        String baseUri = url;
        Elements validLinks = PageParser.getLinks(html, baseUri);
        writeContentToDB(url, validLinks); // write to the database
        System.out.println("Saved updates to database.");
    }
}
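PageParser.getLinks in Example 1 is a project-specific helper that is not shown; since it returns Jsoup Elements, a plausible minimal sketch under that assumption is:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

// Hypothetical helper body: resolve links against the base URI and collect anchor elements.
public static Elements getLinks(String html, String baseUri) {
    Document doc = Jsoup.parse(html, baseUri);
    return doc.select("a[href]");
}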
Example 2: store
import edu.uci.ics.crawler4j.crawler.Page; // import the package/class this method depends on
@Override
public void store(Page page) {
    if (page.getParseData() instanceof HtmlParseData) {
        try {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            insertKeyStatement.setString(1, htmlParseData.getHtml());
            insertKeyStatement.setString(2, htmlParseData.getText());
            insertKeyStatement.setString(3, page.getWebURL().getURL());
            insertKeyStatement.setTimestamp(4, new Timestamp(new java.util.Date().getTime()));
            insertKeyStatement.executeUpdate();
        } catch (SQLException e) {
            logger.error("SQL Exception while storing webpage for url '{}'", page.getWebURL().getURL(), e);
            throw new RuntimeException(e);
        }
    }
}
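The insertKeyStatement above is a PreparedStatement created elsewhere in the class. A minimal sketch of how it might be prepared; the connection details, table, and column names here are assumptions, not from the original source:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;

// Hypothetical setup; parameter order matches the setString/setTimestamp calls above.
String jdbcUrl = "jdbc:postgresql://localhost/crawl"; // placeholder connection details
Connection connection = DriverManager.getConnection(jdbcUrl, "user", "password");
PreparedStatement insertKeyStatement = connection.prepareStatement(
        "INSERT INTO webpage (html, text, url, seen) VALUES (?, ?, ?, ?)");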
Example 3: visit
import edu.uci.ics.crawler4j.crawler.Page; // import the package/class this method depends on
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    logger.info("URL: " + url);
    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        String html = htmlParseData.getHtml();
        Set<WebURL> links = htmlParseData.getOutgoingUrls();
        logger.info("Text length: " + text.length());
        logger.info("Html length: " + html.length());
        logger.info("Number of outgoing links: " + links.size());
        try {
            postgresDBService.store(page);
        } catch (RuntimeException e) {
            logger.error("Storing failed", e);
        }
    }
}
Example 4: visit
import edu.uci.ics.crawler4j.crawler.Page; // import the package/class this method depends on
/**
 * This function is called when a page is fetched and ready
 * to be processed by your program.
 */
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    System.out.println("URL: " + url);
    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        String html = htmlParseData.getHtml();
        Set<WebURL> links = htmlParseData.getOutgoingUrls();
        System.out.println("Text length: " + text.length());
        System.out.println("Html length: " + html.length());
        System.out.println("Number of outgoing links: " + links.size());
    }
}
Example 5: visit
import edu.uci.ics.crawler4j.crawler.Page; // import the package/class this method depends on
@Override
public void visit(Page page) {
    int docid = page.getWebURL().getDocid();
    String url = page.getWebURL().getURL();
    int parentDocid = page.getWebURL().getParentDocid();
    System.out.println("Docid: " + docid);
    System.out.println("URL: " + url);
    System.out.println("Docid of parent page: " + parentDocid);
    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        String html = htmlParseData.getHtml();
        List<WebURL> links = htmlParseData.getOutgoingUrls();
        System.out.println("Text length: " + text.length());
        System.out.println("Html length: " + html.length());
        System.out.println("Number of outgoing links: " + links.size());
    }
    System.out.println("=============");
}
Example 6: processUrl
import edu.uci.ics.crawler4j.crawler.Page; // import the package/class this method depends on
public void processUrl(String url) {
    System.out.println("Processing: " + url);
    Page page = download(url);
    if (page != null) {
        ParseData parseData = page.getParseData();
        if (parseData != null) {
            if (parseData instanceof HtmlParseData) {
                HtmlParseData htmlParseData = (HtmlParseData) parseData;
                System.out.println("Title: " + htmlParseData.getTitle());
                System.out.println("Text length: " + htmlParseData.getText().length());
                System.out.println("Html length: " + htmlParseData.getHtml().length());
            }
        } else {
            System.out.println("Couldn't parse the content of the page.");
        }
    } else {
        System.out.println("Couldn't fetch the content of the page.");
    }
    System.out.println("==============");
}
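The download(String) helper is not shown in this example. crawler4j's own Downloader sample implements it roughly as sketched below; this assumes a 4.x-style API where pageFetcher and parser are fields built from a CrawlConfig, and note that the exact signatures of PageFetchResult.fetchContent and Parser.parse vary between crawler4j versions:

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
import edu.uci.ics.crawler4j.url.WebURL;
import org.apache.http.HttpStatus;

// Sketch in the style of crawler4j's Downloader example; returns null on any failure.
private Page download(String url) {
    WebURL curURL = new WebURL();
    curURL.setURL(url);
    PageFetchResult fetchResult = null;
    try {
        fetchResult = pageFetcher.fetchPage(curURL);
        if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
            Page page = new Page(curURL);
            fetchResult.fetchContent(page);      // some versions take a max-size argument
            parser.parse(page, curURL.getURL()); // may return boolean or throw, by version
            return page;
        }
    } catch (Exception e) {
        // fall through and report failure to the caller via null
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
    return null;
}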
Example 7: visit
import edu.uci.ics.crawler4j.crawler.Page; // import the package/class this method depends on
@Override
public void visit(Page page) {
    System.out.println("Visited: " + page.getWebURL().getURL());
    myCrawlStat.incProcessedPages();
    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData parseData = (HtmlParseData) page.getParseData();
        List<WebURL> links = parseData.getOutgoingUrls();
        myCrawlStat.incTotalLinks(links.size());
        try {
            myCrawlStat.incTotalTextSize(parseData.getText().getBytes("UTF-8").length);
        } catch (UnsupportedEncodingException ignored) {
            // Do nothing
        }
    }
    // Dump the crawler statistics after every 50 processed pages
    if (myCrawlStat.getTotalProcessedPages() % 50 == 0) {
        dumpMyData();
    }
}
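myCrawlStat and dumpMyData() belong to the surrounding crawler class, which is not shown. A minimal sketch of what such a statistics holder might look like; the class name and field types are assumptions inferred from the calls above:

// Hypothetical statistics holder matching the inc*/get* calls in the example above.
public class MyCrawlStat {
    private long totalProcessedPages;
    private long totalLinks;
    private long totalTextSize;

    public void incProcessedPages() { totalProcessedPages++; }
    public void incTotalLinks(int count) { totalLinks += count; }
    public void incTotalTextSize(int size) { totalTextSize += size; }
    public long getTotalProcessedPages() { return totalProcessedPages; }
    public long getTotalLinks() { return totalLinks; }
    public long getTotalTextSize() { return totalTextSize; }
}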
Example 8: visit
import edu.uci.ics.crawler4j.crawler.Page; // import the package/class this method depends on
/**
 * This function is called when a page is fetched and ready to be processed. It is important that we don't let any
 * Graph exceptions escape this method, as this would cause the calling thread to die and eventually the crawler
 * would run out of threads.
 */
@Override
public void visit(Page page) {
    WebURL webUrl = page.getWebURL();
    String statusMessage = FailedUrls.getInstance().getStatusMessage("FailedRequest", webUrl);
    if (statusMessage != null) {
        logger.warn("Ignoring bad URL " + webUrl + " - " + statusMessage);
        return;
    }
    int pageCounter = atomicPageCounter.incrementAndGet();
    if (graphImporter != null) {
        logger.info("Importing page # " + pageCounter + ": " + webUrl + " (node count so far: "
                + graphImporter.getNumberOfPageNodes() + ")");
        if (page.getParseData() instanceof HtmlParseData) {
            visitHtmlPage(webUrl.getURL(), (HtmlParseData) page.getParseData());
        } else {
            visitNonHtmlPage(webUrl.getURL());
        }
    }
}
Example 9: visit
import edu.uci.ics.crawler4j.crawler.Page; // import the package/class this method depends on
@Override
public void visit(Page page) {
    int docid = page.getWebURL().getDocid();
    String url = page.getWebURL().getURL();
    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        if (text.contains("shipping route")) {
            out.println("\nURL: " + url);
            out.println("Text: " + text);
            out.println("Text length: " + text.length());
        }
    }
}
Source: PacktPublishing / Machine-Learning-End-to-Endguide-for-Java-developers, SampleCrawler.java
Example 10: visit
import edu.uci.ics.crawler4j.crawler.Page; // import the package/class this method depends on
/**
 * Called by crawler4j; link (visit) filtering is matched here.
 */
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    if (App.visitFilterPattern.matcher(url).find() && page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        Platform.runLater(() -> {
            App.mainController.stautsLabel.setText("validating url: " + url);
            App.mainController.htmlContent.appendText(Values.VISITING_TIP + url + "\r\n");
        });
        downloadURL(url, htmlParseData.getHtml());
    }
}
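downloadURL(String, String) is a helper from the original project and is not shown. A minimal sketch of such a helper that saves the fetched HTML to disk; the file-naming scheme and target directory are assumptions:

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

// Hypothetical helper: persist the page HTML under a sanitized file name.
private void downloadURL(String url, String html) {
    try {
        String fileName = url.replaceAll("[^A-Za-z0-9]", "_") + ".html";
        Path target = Paths.get("downloads", fileName);
        Files.createDirectories(target.getParent());
        Files.write(target, html.getBytes(StandardCharsets.UTF_8));
    } catch (IOException e) {
        e.printStackTrace(); // a real implementation would log/report this
    }
}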
Example 11: visit
import edu.uci.ics.crawler4j.crawler.Page; // import the package/class this method depends on
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    // We are only interested in processing images which are bigger than 10k
    if (!imgPatterns.matcher(url).matches()
            || !(page.getParseData() instanceof BinaryParseData)
            || page.getContentData().length < (10 * 1024)) {
        return;
    }
    gatheredURLs.add(url);
    System.out.println("Fetched URL : " + url);
}
Example 12: visit
import edu.uci.ics.crawler4j.crawler.Page; // import the package/class this method depends on
@Override
public void visit(Page page) {
    logger.info("URL: " + page.getWebURL().getURL());
    if (page.getParseData() instanceof HtmlParseData) {
        String text = getPageText(page);
        for (Memo memo : memos) {
            findAndSaveMemo(text, memo);
        }
    }
}
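getPageText(Page) is a helper from the original project; a plausible minimal version, where the method name comes from the call above but the body is an assumption:

// Hypothetical helper: return the plain text when the page parsed as HTML, else an empty string.
private String getPageText(Page page) {
    if (page.getParseData() instanceof HtmlParseData) {
        return ((HtmlParseData) page.getParseData()).getText();
    }
    return "";
}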
Example 13: visit
import edu.uci.ics.crawler4j.crawler.Page; // import the package/class this method depends on
/**
 * This function is called when a page is fetched and ready
 * to be processed by your program.
 */
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    System.out.println("URL: " + url);
    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        String html = htmlParseData.getHtml();
        Set<WebURL> links = htmlParseData.getOutgoingUrls();
        System.out.println("Text length: " + text.length());
        System.out.println("Html length: " + html.length());
        System.out.println("Number of outgoing links: " + links.size());
        // System.out.println(html);
        org.jsoup.nodes.Document doc = Jsoup.parseBodyFragment(html);
        Elements alinks = doc.select("a[href]");
        for (Element a : alinks) {
            if (a.attr("title").equals("Full text at publisher's site")) {
                String aurl = a.attr("abs:href");
                String[] aurlParts = url.split("/");
                System.out.println("" + aurlParts[aurlParts.length - 1] + ", " + aurl);
                localData.put(Integer.parseInt(aurlParts[aurlParts.length - 1]), aurl);
            }
        }
    }
}
Example 14: visit
import edu.uci.ics.crawler4j.crawler.Page; // import the package/class this method depends on
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    // We are only interested in processing images
    if (!(page.getParseData() instanceof BinaryParseData)) {
        return;
    }
    if (!imgPatterns.matcher(url).matches()) {
        return;
    }
    // Not interested in very small images
    if (page.getContentData().length < 10 * 1024) {
        return;
    }
    // Get a unique name for storing this image
    String extension = url.substring(url.lastIndexOf("."));
    String hashedName = Cryptography.MD5(url) + extension;
    // Store the image
    IO.writeBytesToFile(page.getContentData(), storageFolder.getAbsolutePath() + "/" + hashedName);
    System.out.println("Stored: " + url);
}
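Cryptography.MD5 and IO.writeBytesToFile are small utilities bundled with the original sample. Equivalent one-method sketches using only the JDK; the class and method names match the calls above, but the bodies are assumptions:

import java.io.IOException;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

// Hypothetical utility bodies matching the calls in the example above.
public static String MD5(String input) {
    try {
        byte[] digest = MessageDigest.getInstance("MD5").digest(input.getBytes(StandardCharsets.UTF_8));
        return String.format("%032x", new BigInteger(1, digest)); // zero-padded hex digest
    } catch (NoSuchAlgorithmException e) {
        throw new IllegalStateException(e); // MD5 is always available on the JVM
    }
}

public static void writeBytesToFile(byte[] bytes, String destination) {
    try {
        Files.write(Paths.get(destination), bytes);
    } catch (IOException e) {
        e.printStackTrace(); // a real implementation would log/report this
    }
}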
Example 15: visit
import edu.uci.ics.crawler4j.crawler.Page; // import the package/class this method depends on
/**
 * This function is called when a page is fetched and ready to be processed
 * by your program.
 */
@Override
public void visit(Page page) {
    int docid = page.getWebURL().getDocid();
    String url = page.getWebURL().getURL();
    String domain = page.getWebURL().getDomain();
    String path = page.getWebURL().getPath();
    String subDomain = page.getWebURL().getSubDomain();
    String parentUrl = page.getWebURL().getParentUrl();
    String anchor = page.getWebURL().getAnchor();
    System.out.println("Docid: " + docid);
    System.out.println("URL: " + url);
    System.out.println("Domain: '" + domain + "'");
    System.out.println("Sub-domain: '" + subDomain + "'");
    System.out.println("Path: '" + path + "'");
    System.out.println("Parent page: " + parentUrl);
    System.out.println("Anchor text: " + anchor);
    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        String html = htmlParseData.getHtml();
        List<WebURL> links = htmlParseData.getOutgoingUrls();
        System.out.println("Text length: " + text.length());
        System.out.println("Html length: " + html.length());
        System.out.println("Number of outgoing links: " + links.size());
    }
    Header[] responseHeaders = page.getFetchResponseHeaders();
    if (responseHeaders != null) {
        System.out.println("Response headers:");
        for (Header header : responseHeaders) {
            System.out.println("\t" + header.getName() + ": " + header.getValue());
        }
    }
    System.out.println("=============");
}