本文整理汇总了Java中edu.uci.ics.crawler4j.crawler.Page类的典型用法代码示例。如果您正苦于以下问题:Java Page类的具体用法?Java Page怎么用?Java Page使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。
Page类属于edu.uci.ics.crawler4j.crawler包,在下文中一共展示了Page类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: visit
import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * Prints the URL of the visited page and, when the page was parsed as HTML,
 * prints its title and hands the links returned by
 * {@code PageParser.getLinks} to {@code writeContentToDB}.
 *
 * @param page the page passed in by the crawler
 */
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    System.out.println("URL: " + url);

    // Only HTML parse data carries a title and markup to extract links from.
    if (!(page.getParseData() instanceof HtmlParseData)) {
        return;
    }

    HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
    System.out.println("Title: " + htmlParseData.getTitle());

    // The page's own URL is passed to the link parser as the base URI.
    Elements validLinks = PageParser.getLinks(htmlParseData.getHtml(), url);
    writeContentToDB(url, validLinks); // write to the database
    System.out.println("Saved updates to database.");
}
示例2: store
import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * Stores a page through {@code insertKeyStatement} when its parse data is
 * HTML; any other page is ignored.
 *
 * <p>Statement parameters are bound in this order: HTML (1), text (2),
 * page URL (3), current timestamp (4).
 *
 * @param page the page to store
 * @throws RuntimeException wrapping the {@link SQLException} if binding or
 *         executing the statement fails (the failure is logged first)
 */
@Override
public void store(Page page) {
    if (!(page.getParseData() instanceof HtmlParseData)) {
        return;
    }

    try {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String html = htmlParseData.getHtml();
        String text = htmlParseData.getText();
        String pageUrl = page.getWebURL().getURL();
        Timestamp now = new Timestamp(System.currentTimeMillis());

        insertKeyStatement.setString(1, html);
        insertKeyStatement.setString(2, text);
        insertKeyStatement.setString(3, pageUrl);
        insertKeyStatement.setTimestamp(4, now);
        insertKeyStatement.executeUpdate();
    } catch (SQLException e) {
        logger.error("SQL Exception while storing webpage for url'{}'", page.getWebURL().getURL(), e);
        throw new RuntimeException(e);
    }
}
示例3: visit
import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * Logs the URL of the visited page and, for HTML pages, logs the text length,
 * HTML length and number of outgoing links before passing the page to
 * {@code postgresDBService.store}.
 *
 * <p>A {@link RuntimeException} thrown by the store call is logged and not
 * rethrown.
 *
 * @param page the page passed in by the crawler
 */
@Override
public void visit(Page page) {
    logger.info("URL: " + page.getWebURL().getURL());

    if (!(page.getParseData() instanceof HtmlParseData)) {
        return;
    }

    HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
    Set<WebURL> links = htmlParseData.getOutgoingUrls();
    int textLength = htmlParseData.getText().length();
    int htmlLength = htmlParseData.getHtml().length();

    logger.info("Text length: " + textLength);
    logger.info("Html length: " + htmlLength);
    logger.info("Number of outgoing links: " + links.size());

    try {
        postgresDBService.store(page);
    } catch (RuntimeException e) {
        // Storage problems are reported but do not propagate out of visit().
        logger.error("Storing failed", e);
    }
}
示例4: visit
import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * Invoked once a page has been fetched and is ready to be processed.
 *
 * <p>Prints the page URL and, for HTML pages, the text length, the HTML
 * length and the number of outgoing links.
 *
 * @param page the fetched page
 */
@Override
public void visit(Page page) {
    System.out.println("URL: " + page.getWebURL().getURL());

    if (!(page.getParseData() instanceof HtmlParseData)) {
        return;
    }

    HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
    Set<WebURL> links = htmlParseData.getOutgoingUrls();
    int textLength = htmlParseData.getText().length();
    int htmlLength = htmlParseData.getHtml().length();

    System.out.println("Text length: " + textLength);
    System.out.println("Html length: " + htmlLength);
    System.out.println("Number of outgoing links: " + links.size());
}
示例5: shouldVisit
import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * Decides whether the given URL should be crawled.
 *
 * <p>A URL is rejected when its lower-cased form matches
 * {@code IMAGE_EXTENSIONS}; otherwise it is accepted only when it starts
 * with {@code http://www.misstamchiak.com/}.
 *
 * @param referringPage the page on which the URL was found (not used)
 * @param url the candidate URL
 * @return {@code true} if the URL should be crawled, {@code false} otherwise
 */
@Override
public boolean shouldVisit(Page referringPage, WebURL url) {
    // Locale.ROOT keeps the lower-casing independent of the JVM default
    // locale (e.g. under a Turkish locale "I" would otherwise become a
    // dotless "ı" and the comparisons below would fail).
    String href = url.getURL().toLowerCase(java.util.Locale.ROOT);

    // Ignore the url if it has an extension that matches our defined set of
    // image extensions.
    if (IMAGE_EXTENSIONS.matcher(href).matches()) {
        return false;
    }

    // Alternative site prefix that was used previously: http://www.ladyironchef.com/
    return href.startsWith("http://www.misstamchiak.com/");
}
示例6: visit
import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * Prints the doc id, URL and parent doc id of the visited page, followed by
 * the text length, HTML length and outgoing-link count when the page was
 * parsed as HTML. A separator line is printed at the end in every case.
 *
 * @param page the page passed in by the crawler
 */
@Override
public void visit(Page page) {
    WebURL webUrl = page.getWebURL();
    System.out.println("Docid: " + webUrl.getDocid());
    System.out.println("URL: " + webUrl.getURL());
    System.out.println("Docid of parent page: " + webUrl.getParentDocid());

    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        List<WebURL> links = htmlParseData.getOutgoingUrls();
        int textLength = htmlParseData.getText().length();
        int htmlLength = htmlParseData.getHtml().length();

        System.out.println("Text length: " + textLength);
        System.out.println("Html length: " + htmlLength);
        System.out.println("Number of outgoing links: " + links.size());
    }

    System.out.println("=============");
}
示例7: download
import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * Fetches and parses the page at the given URL.
 *
 * @param url the address to download
 * @return the parsed page, or {@code null} when the status code is not
 *         {@code HttpStatus.SC_OK}, when parsing reports failure, or when
 *         fetching the content or parsing throws
 */
private Page download(String url) {
    WebURL curURL = new WebURL();
    curURL.setURL(url);

    PageFetchResult fetchResult = null;
    try {
        fetchResult = pageFetcher.fetchHeader(curURL);
        if (fetchResult.getStatusCode() != HttpStatus.SC_OK) {
            return null;
        }

        try {
            Page page = new Page(curURL);
            fetchResult.fetchContent(page);
            return parser.parse(page, curURL.getURL()) ? page : null;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    } finally {
        // Runs on every exit path once a fetch result exists.
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
}
示例8: processUrl
import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * Downloads the given URL and prints a short report about it.
 *
 * <p>For HTML pages the title, text length and HTML length are printed. A
 * message is printed when the page could not be fetched or has no parse
 * data. A separator line is printed at the end in every case.
 *
 * @param url the address to process
 */
public void processUrl(String url) {
    System.out.println("Processing: " + url);

    Page page = download(url);
    ParseData parseData = (page == null) ? null : page.getParseData();

    if (page == null) {
        System.out.println("Couldn't fetch the content of the page.");
    } else if (parseData == null) {
        System.out.println("Couldn't parse the content of the page.");
    } else if (parseData instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) parseData;
        System.out.println("Title: " + htmlParseData.getTitle());
        System.out.println("Text length: " + htmlParseData.getText().length());
        System.out.println("Html length: " + htmlParseData.getHtml().length());
    }

    System.out.println("==============");
}
示例9: visit
import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * Records statistics for a visited page in {@code myCrawlStat}.
 *
 * <p>Every page increments the processed-page counter. For HTML pages the
 * number of outgoing links and the UTF-8 byte length of the page text are
 * added as well. {@code dumpMyData()} is called whenever the processed-page
 * total is a multiple of 50.
 *
 * @param page the page passed in by the crawler
 */
@Override
public void visit(Page page) {
    System.out.println("Visited: " + page.getWebURL().getURL());
    myCrawlStat.incProcessedPages();

    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData parseData = (HtmlParseData) page.getParseData();
        List<WebURL> links = parseData.getOutgoingUrls();
        myCrawlStat.incTotalLinks(links.size());

        // The Charset overload cannot throw UnsupportedEncodingException, so
        // no empty catch block is needed to silence an impossible failure.
        byte[] textBytes = parseData.getText().getBytes(java.nio.charset.StandardCharsets.UTF_8);
        myCrawlStat.incTotalTextSize(textBytes.length);
    }

    // We dump this crawler statistics after processing every 50 pages
    if (myCrawlStat.getTotalProcessedPages() % 50 == 0) {
        dumpMyData();
    }
}
示例10: visit
import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * Invoked when a page has been fetched and is ready to be processed.
 *
 * <p>It is important that no Graph exception escapes this method: that would
 * kill the calling thread, and the crawler would eventually run out of
 * threads.
 *
 * <p>URLs for which {@code FailedUrls} reports a "FailedRequest" status
 * message are logged and skipped. Otherwise the page counter is incremented
 * and, when a graph importer is present, the page is dispatched to the HTML
 * or non-HTML handler.
 *
 * @param page the fetched page
 */
@Override
public void visit(Page page) {
    WebURL webUrl = page.getWebURL();

    String statusMessage = FailedUrls.getInstance().getStatusMessage("FailedRequest", webUrl);
    if (statusMessage != null) {
        logger.warn("Ignoring bad URL " + webUrl + " - " + statusMessage);
        return;
    }

    // The counter advances even when no importer is configured.
    int pageCounter = atomicPageCounter.incrementAndGet();
    if (graphImporter == null) {
        return;
    }

    logger.info("Importing page # " + pageCounter + ": " + webUrl + " (node count so far: "
            + graphImporter.getNumberOfPageNodes() + ")");

    boolean isHtml = page.getParseData() instanceof HtmlParseData;
    if (isHtml) {
        visitHtmlPage(webUrl.getURL(), (HtmlParseData) page.getParseData());
    } else {
        visitNonHtmlPage(webUrl.getURL());
    }
}
示例11: shouldVisit
import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * Decides whether the given URL should be crawled.
 *
 * <p>A URL is rejected when its lower-cased form matches
 * {@code IMAGE_EXTENSIONS}; otherwise it is accepted only when it starts
 * with {@code https://en.wikipedia.org/wiki/}.
 *
 * @param referringPage the page on which the URL was found (not used)
 * @param url the candidate URL
 * @return {@code true} if the URL should be crawled, {@code false} otherwise
 */
@Override
public boolean shouldVisit(Page referringPage, WebURL url) {
    // Locale.ROOT keeps the lower-casing independent of the JVM default
    // locale (e.g. under a Turkish locale "I" would otherwise become a
    // dotless "ı" and the prefix check below would fail).
    String href = url.getURL().toLowerCase(java.util.Locale.ROOT);
    if (IMAGE_EXTENSIONS.matcher(href).matches()) {
        return false;
    }
    return href.startsWith("https://en.wikipedia.org/wiki/");
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-End-to-Endguide-for-Java-developers,代码行数:9,代码来源:SampleCrawler.java
示例12: visit
import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * Prints the URL, text and text length of every HTML page whose text
 * contains the phrase "shipping route". Other pages produce no output.
 *
 * @param page the page passed in by the crawler
 */
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        if (text.contains("shipping route")) {
            out.println("\nURL: " + url);
            out.println("Text: " + text);
            out.println("Text length: " + text.length());
        }
    }
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-End-to-Endguide-for-Java-developers,代码行数:16,代码来源:SampleCrawler.java
示例13: shouldVisit
import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * Called by crawler4j; the pre-crawl (crawler) filter is matched here.
 *
 * <p>A URL is accepted when it matches {@code App.crawlFilterPattern}, is
 * not yet contained in {@code App.visitUrls}, and contains one of the
 * entries of {@code App.domains}. An accepted URL is added to
 * {@code App.visitUrls} before returning.
 *
 * @param referringPage the page on which the URL was found (not used)
 * @param url the candidate URL
 * @return {@code true} if the URL should be crawled, {@code false} otherwise
 */
@Override
public boolean shouldVisit(Page referringPage, WebURL url) {
    String urlStr = url.getURL();

    boolean passesFilter = App.crawlFilterPattern.matcher(urlStr).find();
    if (!passesFilter || App.visitUrls.contains(urlStr)) {
        return false;
    }

    for (String domain : App.domains) {
        if (!urlStr.contains(domain)) {
            continue;
        }
        App.visitUrls.add(urlStr);
        return true;
    }
    return false;
}
示例14: visit
import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * Called by crawler4j; the link (visit) filter is matched here.
 *
 * <p>When the page URL matches {@code App.visitFilterPattern} and the page
 * was parsed as HTML, a UI update is scheduled through
 * {@code Platform.runLater} and the page HTML is passed to
 * {@code downloadURL}. All other pages are ignored.
 *
 * @param page the page passed in by the crawler
 */
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();

    // The parse data is only inspected when the URL passes the visit filter.
    if (!App.visitFilterPattern.matcher(url).find()
            || !(page.getParseData() instanceof HtmlParseData)) {
        return;
    }

    HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();

    // UI controls are updated through Platform.runLater.
    Platform.runLater(() -> {
        App.mainController.stautsLabel.setText("validating url: " + url);
        App.mainController.htmlContent.appendText(Values.VISITING_TIP + url + "\r\n");
    });

    downloadURL(url, htmlParseData.getHtml());
}
示例15: shouldVisit
import edu.uci.ics.crawler4j.crawler.Page; //导入依赖的package包/类
/**
 * Decides whether the given URL should be crawled.
 *
 * <p>A URL is accepted when its lower-cased form either matches
 * {@code imgPatterns} or starts with {@code crawlDomain}.
 *
 * @param referringPage the page on which the URL was found (not used)
 * @param url the candidate URL
 * @return {@code true} if the URL should be crawled, {@code false} otherwise
 */
@Override
public boolean shouldVisit(Page referringPage, WebURL url) {
    // Locale.ROOT keeps the lower-casing independent of the JVM default
    // locale (e.g. under a Turkish locale "I" would otherwise become a
    // dotless "ı" and the comparisons below would fail).
    // NOTE(review): href is lower-cased, so crawlDomain presumably has to be
    // lower case as well for the prefix check to match — confirm.
    String href = url.getURL().toLowerCase(java.util.Locale.ROOT);
    return imgPatterns.matcher(href).matches() || href.startsWith(crawlDomain);
}