This article collects typical usage examples of the Java class edu.uci.ics.crawler4j.url.WebURL. If you are unsure what the WebURL class does or how to use it, the curated examples below may help.
WebURL belongs to the edu.uci.ics.crawler4j.url package. Fifteen code examples of the class are shown below, ordered by popularity.
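Most of the snippets below override crawler4j's WebCrawler callbacks (visit, shouldVisit). For context, here is a minimal sketch of how such a crawler is typically wired up and started; the storage folder, seed URL, thread count, and the MyCrawler class name are placeholders, not part of the examples:

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class CrawlerLauncher {
    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder("/tmp/crawl");  // placeholder folder
        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtServer robotstxtServer =
                new RobotstxtServer(new RobotstxtConfig(), pageFetcher);
        CrawlController controller =
                new CrawlController(config, pageFetcher, robotstxtServer);
        controller.addSeed("https://example.com/");  // placeholder seed
        // MyCrawler stands for any WebCrawler subclass like those shown below.
        controller.start(MyCrawler.class, 4);        // 4 crawler threads
    }
}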
Example 1: visit
import edu.uci.ics.crawler4j.url.WebURL; // import the required package/class
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    logger.info("URL: " + url);
    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        String html = htmlParseData.getHtml();
        Set<WebURL> links = htmlParseData.getOutgoingUrls();
        logger.info("Text length: " + text.length());
        logger.info("Html length: " + html.length());
        logger.info("Number of outgoing links: " + links.size());
        try {
            postgresDBService.store(page);
        } catch (RuntimeException e) {
            logger.error("Storing failed", e);
        }
    }
}
Example 2: visit
import edu.uci.ics.crawler4j.url.WebURL; // import the required package/class
/**
 * This function is called when a page is fetched and ready
 * to be processed by your program.
 */
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    System.out.println("URL: " + url);
    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        String html = htmlParseData.getHtml();
        Set<WebURL> links = htmlParseData.getOutgoingUrls();
        System.out.println("Text length: " + text.length());
        System.out.println("Html length: " + html.length());
        System.out.println("Number of outgoing links: " + links.size());
    }
}
Example 3: shouldVisit
import edu.uci.ics.crawler4j.url.WebURL; // import the required package/class
/**
 * You should implement this function to specify whether the given url
 * should be crawled or not (based on your crawling logic).
 */
@Override
public boolean shouldVisit(Page referringPage, WebURL url) {
    String href = url.getURL().toLowerCase();
    // Ignore the url if it has an extension that matches our defined set of
    // image extensions.
    if (IMAGE_EXTENSIONS.matcher(href).matches()) {
        return false;
    }
    // return href.startsWith("http://www.ladyironchef.com/");
    return href.startsWith("http://www.misstamchiak.com/");
}
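The IMAGE_EXTENSIONS pattern referenced above is not part of the snippet. A plausible definition, following the common crawler4j image-filter idiom (the exact extension list is an assumption):

// Hypothetical field (java.util.regex.Pattern); the original source may differ.
private static final Pattern IMAGE_EXTENSIONS =
        Pattern.compile(".*\\.(bmp|gif|jpe?g|png)$");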
Example 4: put
import edu.uci.ics.crawler4j.url.WebURL; // import the required package/class
public void put(WebURL url) throws DatabaseException {
    DatabaseEntry value = new DatabaseEntry();
    webURLBinding.objectToEntry(url, value);
    Transaction txn;
    if (resumable) {
        // In resumable mode each put runs in its own transaction so the
        // queue survives a crawler restart.
        txn = env.beginTransaction(null, null);
    } else {
        txn = null;
    }
    urlsDB.put(txn, getDatabaseEntryKey(url), value);
    if (resumable && txn != null) {
        txn.commit();
    }
}
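getDatabaseEntryKey is not shown here. In crawler4j's work-queue implementation the Berkeley DB key orders queued URLs by priority, then depth, then docid; a sketch of that idea (the exact byte layout is an assumption):

// Sketch only: 6-byte key = priority byte, depth byte, then 4-byte docid.
protected DatabaseEntry getDatabaseEntryKey(WebURL url) {
    byte[] keyData = new byte[6];
    keyData[0] = url.getPriority();
    keyData[1] = (url.getDepth() > Byte.MAX_VALUE) ? Byte.MAX_VALUE : (byte) url.getDepth();
    int docid = url.getDocid();
    keyData[2] = (byte) (docid >>> 24);
    keyData[3] = (byte) (docid >>> 16);
    keyData[4] = (byte) (docid >>> 8);
    keyData[5] = (byte) docid;
    return new DatabaseEntry(keyData);
}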
Example 5: scheduleAll
import edu.uci.ics.crawler4j.url.WebURL; // import the required package/class
public void scheduleAll(List<WebURL> urls) {
    int maxPagesToFetch = config().maxPagesToFetch();
    synchronized (mutex) {
        int newScheduledPage = 0;
        for (WebURL url : urls) {
            if (maxPagesToFetch > 0 && (scheduledPages + newScheduledPage) >= maxPagesToFetch) {
                break;
            }
            try {
                workQueues.put(url);
                newScheduledPage++;
            } catch (DatabaseException e) {
                logger.error("Error while putting the url in the work queue.", e);
            }
        }
        if (newScheduledPage > 0) {
            scheduledPages += newScheduledPage;
            counters.increment(Counters.ReservedCounterNames.SCHEDULED_PAGES, newScheduledPage);
        }
        synchronized (waitingList) {
            waitingList.notifyAll();
        }
    }
}
Example 6: shouldVisit
import edu.uci.ics.crawler4j.url.WebURL; // import the required package/class
@Override
public boolean shouldVisit(WebURL url) {
    String href = url.getURL().toLowerCase();
    if (filters.matcher(href).matches()) {
        return false;
    }
    if (imgPatterns.matcher(href).matches()) {
        return true;
    }
    for (String domain : crawlDomains) {
        if (href.startsWith(domain)) {
            return true;
        }
    }
    return false;
}
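Neither filters nor imgPatterns is defined in this snippet. Plausible definitions, modeled on crawler4j's bundled image-crawler example (both extension lists are assumptions):

// Hypothetical java.util.regex.Pattern fields; the original lists may differ.
private static final Pattern filters = Pattern.compile(
        ".*\\.(css|js|mid|mp2|mp3|mp4|wav|avi|mov|mpeg|ram|m4v|pdf|rm|smil|wmv|swf|wma|zip|rar|gz)$");
private static final Pattern imgPatterns = Pattern.compile(
        ".*\\.(bmp|gif|jpe?g|png|tiff?)$");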
Example 7: download
import edu.uci.ics.crawler4j.url.WebURL; // import the required package/class
private Page download(String url) {
    WebURL curURL = new WebURL();
    curURL.setURL(url);
    PageFetchResult fetchResult = null;
    try {
        fetchResult = pageFetcher.fetchHeader(curURL);
        if (fetchResult.statusCode() == HttpStatus.SC_OK) {
            try {
                Page page = new Page(curURL);
                fetchResult.fetchContent(page);
                if (parser.parse(page, curURL.getURL())) {
                    return page;
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
    return null;
}
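A typical call site for this helper, assuming the surrounding class already has pageFetcher and parser configured (the URL is a placeholder):

Page page = download("https://example.com/some-page");  // placeholder URL
if (page != null && page.getParseData() instanceof HtmlParseData) {
    HtmlParseData data = (HtmlParseData) page.getParseData();
    System.out.println("Outgoing links: " + data.getOutgoingUrls().size());
}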
Example 8: scheduleAll
import edu.uci.ics.crawler4j.url.WebURL; // import the required package/class
public void scheduleAll(List<WebURL> urls) {
    int maxPagesToFetch = config.getMaxPagesToFetch();
    synchronized (mutex) {
        int newScheduledPage = 0;
        for (WebURL url : urls) {
            if (maxPagesToFetch > 0 && (scheduledPages + newScheduledPage) >= maxPagesToFetch) {
                break;
            }
            try {
                workQueues.put(url);
                newScheduledPage++;
            } catch (DatabaseException e) {
                logger.error("Error while putting the url in the work queue.", e);
            }
        }
        if (newScheduledPage > 0) {
            scheduledPages += newScheduledPage;
            counters.increment(Counters.ReservedCounterNames.SCHEDULED_PAGES, newScheduledPage);
        }
        synchronized (waitingList) {
            waitingList.notifyAll();
        }
    }
}
Example 9: visit
import edu.uci.ics.crawler4j.url.WebURL; // import the required package/class
@Override
public void visit(Page page) {
    int docid = page.getWebURL().getDocid();
    String url = page.getWebURL().getURL();
    int parentDocid = page.getWebURL().getParentDocid();
    System.out.println("Docid: " + docid);
    System.out.println("URL: " + url);
    System.out.println("Docid of parent page: " + parentDocid);
    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        String html = htmlParseData.getHtml();
        List<WebURL> links = htmlParseData.getOutgoingUrls();
        System.out.println("Text length: " + text.length());
        System.out.println("Html length: " + html.length());
        System.out.println("Number of outgoing links: " + links.size());
    }
    System.out.println("=============");
}
Example 10: download
import edu.uci.ics.crawler4j.url.WebURL; // import the required package/class
private Page download(String url) {
    WebURL curURL = new WebURL();
    curURL.setURL(url);
    PageFetchResult fetchResult = null;
    try {
        fetchResult = pageFetcher.fetchHeader(curURL);
        if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
            try {
                Page page = new Page(curURL);
                fetchResult.fetchContent(page);
                if (parser.parse(page, curURL.getURL())) {
                    return page;
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
    return null;
}
Example 11: visit
import edu.uci.ics.crawler4j.url.WebURL; // import the required package/class
@Override
public void visit(Page page) {
    System.out.println("Visited: " + page.getWebURL().getURL());
    myCrawlStat.incProcessedPages();
    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData parseData = (HtmlParseData) page.getParseData();
        List<WebURL> links = parseData.getOutgoingUrls();
        myCrawlStat.incTotalLinks(links.size());
        try {
            myCrawlStat.incTotalTextSize(parseData.getText().getBytes("UTF-8").length);
        } catch (UnsupportedEncodingException ignored) {
            // Do nothing
        }
    }
    // Dump this crawler's statistics after every 50 processed pages.
    if (myCrawlStat.getTotalProcessedPages() % 50 == 0) {
        dumpMyData();
    }
}
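The CrawlStat type behind myCrawlStat is not shown. A minimal sketch consistent with the calls above (field names and types are assumptions):

// Minimal per-crawler statistics holder; only the methods used above are known.
public class CrawlStat {
    private long totalProcessedPages;
    private long totalLinks;
    private long totalTextSize;

    public void incProcessedPages() { totalProcessedPages++; }
    public void incTotalLinks(int count) { totalLinks += count; }
    public void incTotalTextSize(int size) { totalTextSize += size; }
    public long getTotalProcessedPages() { return totalProcessedPages; }
}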
Example 12: shouldVisit
import edu.uci.ics.crawler4j.url.WebURL; // import the required package/class
@Override
public boolean shouldVisit(WebURL url) {
    String href = url.getURL().toLowerCase();
    String domain = url.getDomain().toLowerCase();
    //System.out.println("should visit "+href+"?");
    boolean result = true;
    // reject css, js, etc. as in filters
    if (filters.matcher(href).matches()) {
        result = false;
    }
    if (ImageCrawler.domainFilter != null
            && !url.getDomain().contains(ImageCrawler.domainFilter)) {
        result = false;
    }
    //System.out.format("%s (%d): %s\n", domain, url.getDepth(), href);
    System.out.print(".");  // progress marker: one dot per considered URL
    return result;
}
Example 13: visit
import edu.uci.ics.crawler4j.url.WebURL; // import the required package/class
/**
 * This function is called when a page is fetched and ready to be processed. It is important that we don't let any
 * graph exceptions escape this method, as that would kill the calling thread and eventually leave the crawler
 * with no threads.
 */
@Override
public void visit(Page page) {
    WebURL webUrl = page.getWebURL();
    String statusMessage = FailedUrls.getInstance().getStatusMessage("FailedRequest", webUrl);
    if (statusMessage != null) {
        logger.warn("Ignoring bad URL " + webUrl + " - " + statusMessage);
        return;
    }
    int pageCounter = atomicPageCounter.incrementAndGet();
    if (graphImporter != null) {
        logger.info("Importing page # " + pageCounter + ": " + webUrl + " (node count so far: "
                + graphImporter.getNumberOfPageNodes() + ")");
        if (page.getParseData() instanceof HtmlParseData) {
            visitHtmlPage(webUrl.getURL(), (HtmlParseData) page.getParseData());
        } else {
            visitNonHtmlPage(webUrl.getURL());
        }
    }
}
Example 14: visitHtmlLinks
import edu.uci.ics.crawler4j.url.WebURL; // import the required package/class
private void visitHtmlLinks(Node pageNode, List<WebURL> links) {
    // Guard against a null or empty list before dereferencing it.
    if (links == null || links.isEmpty()) {
        return;
    }
    logger.trace("Number of outgoing links from " + PageNode.getUrl(pageNode) + ": " + links.size());
    List<String> crawlableLinks = new ArrayList<String>(links.size());
    for (WebURL link : links) {
        if (shouldVisit(link, pageNode)) {
            crawlableLinks.add(link.getURL());
        }
    }
    try {
        graphImporter.addLinks(pageNode, crawlableLinks);
    } catch (Exception e) {
        logger.error("Error creating " + crawlableLinks.size() + " links for " + PageNode.getUrl(pageNode), e);
    }
}
Example 15: shouldVisit
import edu.uci.ics.crawler4j.url.WebURL; // import the required package/class
@Override
public boolean shouldVisit(Page referringPage, WebURL url) {
    String href = url.getURL().toLowerCase();
    if (IMAGE_EXTENSIONS.matcher(href).matches()) {
        return false;
    }
    return href.startsWith("https://en.wikipedia.org/wiki/");
}
Source: PacktPublishing / Machine-Learning-End-to-Endguide-for-Java-developers, SampleCrawler.java (9 lines)