This article collects typical usage examples of the Java method edu.uci.ics.crawler4j.crawler.CrawlConfig.setMaxPagesToFetch. If you are unsure how CrawlConfig.setMaxPagesToFetch is used in practice, or are looking for concrete examples of it, the selected code samples below may help. You can also explore the containing class edu.uci.ics.crawler4j.crawler.CrawlConfig for further details.
The following presents 9 code examples of the CrawlConfig.setMaxPagesToFetch method.
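As the comments in Examples 8 and 9 below note, the value passed to setMaxPagesToFetch caps the total number of pages the crawl will fetch; the default is -1, meaning no limit. A minimal sketch of the call (the limit of 100 is an arbitrary illustrative value):

CrawlConfig config = new CrawlConfig();
// Stop fetching new pages once roughly 100 pages have been downloaded; pass -1 for no limit (the default).
config.setMaxPagesToFetch(100);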
Example 1: main
import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
    int numberOfCrawlers = 2;
    CrawlConfig config = new CrawlConfig();
    String crawlStorageFolder = "data";
    config.setCrawlStorageFolder(crawlStorageFolder);
    config.setPolitenessDelay(500);
    config.setMaxDepthOfCrawling(2);
    config.setMaxPagesToFetch(20);
    config.setIncludeBinaryContentInCrawling(false);
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    controller.addSeed("https://en.wikipedia.org/wiki/Bishop_Rock,_Isles_of_Scilly");
    controller.start(SampleCrawler.class, numberOfCrawlers);
}
Developer ID: PacktPublishing, Project: Machine-Learning-End-to-Endguide-for-Java-developers, Lines: 21, Source: CrawlerController.java
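Example 1 starts SampleCrawler, which, like the other crawler classes referenced on this page, is not shown here. As a point of reference, a minimal sketch of what such a WebCrawler subclass might look like follows; the class body, URL filter, and visit logic are assumptions for illustration, not the project's actual code:

import java.util.regex.Pattern;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class SampleCrawler extends WebCrawler {

    // Skip common binary resources; everything else is eligible for crawling.
    private static final Pattern BINARY = Pattern.compile(".*(\\.(css|js|gif|jpe?g|png|pdf|zip))$");

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String href = url.getURL().toLowerCase();
        return !BINARY.matcher(href).matches();
    }

    @Override
    public void visit(Page page) {
        System.out.println("Visited: " + page.getWebURL().getURL());
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData html = (HtmlParseData) page.getParseData();
            System.out.println("Text length: " + html.getText().length());
        }
    }
}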
Example 2: main
import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
    String crawlStorageFolder = "/data/crawl/root";
    int numberOfCrawlers = 7;
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorageFolder);
    config.setPolitenessDelay(500);
    config.setMaxDepthOfCrawling(2);
    config.setMaxPagesToFetch(1000);
    config.setResumableCrawling(false);
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController crawlController = new CrawlController(config, pageFetcher, robotstxtServer);
    crawlController.addSeed("http://www.11st.co.kr/html/main.html");
    crawlController.addSeed("http://www.11st.co.kr/html/category/1930.html");
    crawlController.start(MyCrawler.class, numberOfCrawlers);
}
Example 3: init
import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the package/class the method depends on
/**
 * Initialize the crawler.
 *
 * @param numberOfCrawlers
 *            number of crawler threads
 * @param maxDepthOfCrawling
 *            maximum crawl depth
 * @param maxPagesToFetch
 *            maximum number of pages to fetch
 * @param politenessDelay
 *            politeness delay in milliseconds
 * @param links
 *            seed links to crawl
 */
public void init(int numberOfCrawlers, int maxDepthOfCrawling, int maxPagesToFetch, int politenessDelay,
        String[] links) {
    this.numberOfCrawlers = numberOfCrawlers;
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(DefaultConfigValues.CRAWL_STORAGE_FOLDER);
    config.setMaxDepthOfCrawling(maxDepthOfCrawling);
    config.setIncludeHttpsPages(true);
    config.setMaxPagesToFetch(maxPagesToFetch);
    config.setIncludeBinaryContentInCrawling(false);
    config.setPolitenessDelay(politenessDelay);
    config.setUserAgentString(DefaultConfigValues.USER_AGENT);
    config.setResumableCrawling(true);
    if (com.zhazhapan.vspider.models.CrawlConfig.getTurnOnProxy().get()) {
        logger.info("open proxy");
        config.setProxyHost(com.zhazhapan.vspider.models.CrawlConfig.getProxyServer().get());
        config.setProxyPort(Formatter.stringToInt(com.zhazhapan.vspider.models.CrawlConfig.getProxyPort().get()));
        config.setProxyUsername(com.zhazhapan.vspider.models.CrawlConfig.getProxyUser().get());
        config.setProxyPassword(com.zhazhapan.vspider.models.CrawlConfig.getProxyPass().get());
    }
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    robotstxtConfig.setEnabled(false);
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    try {
        controller = new CrawlController(config, pageFetcher, robotstxtServer);
        for (String link : links) {
            if (Checker.isHyperLink(link)) {
                controller.addSeed(link);
            }
        }
        isInited = true;
    } catch (Exception e) {
        logger.error("start to crawl urls error: " + e.getMessage());
    }
}
Example 4: setCrawlConfig
import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the package/class the method depends on
private CrawlConfig setCrawlConfig() {
    CrawlConfig crawlConfig = new CrawlConfig();
    crawlConfig.setCrawlStorageFolder(CRAWL_TEMP_DIR);
    crawlConfig.setMaxDepthOfCrawling(MAX_DEPTH_OF_CRAWLING);
    crawlConfig.setMaxPagesToFetch(MAX_PAGES_TO_FETCH);
    return crawlConfig;
}
Example 5: setCrawlConfig
import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the package/class the method depends on
private CrawlConfig setCrawlConfig() {
    CrawlConfig crawlConfig = new CrawlConfig();
    crawlConfig.setCrawlStorageFolder("src/resources/test/crawlerTemporaryDirectory");
    crawlConfig.setMaxDepthOfCrawling(1);
    crawlConfig.setMaxPagesToFetch(1);
    return crawlConfig;
}
Example 6: main
import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        System.out.println("Needed parameters: ");
        System.out.println("\t rootFolder (it will contain intermediate crawl data)");
        System.out.println("\t numberOfCrawlers (number of concurrent threads)");
        return;
    }
    String rootFolder = args[0];
    int numberOfCrawlers = Integer.parseInt(args[1]);
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(rootFolder);
    config.setMaxPagesToFetch(10);
    config.setPolitenessDelay(1000);
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    controller.addSeed("http://www.ics.uci.edu/");
    controller.start(LocalDataCollectorCrawler.class, numberOfCrawlers);
    List<Object> crawlersLocalData = controller.getCrawlersLocalData();
    long totalLinks = 0;
    long totalTextSize = 0;
    int totalProcessedPages = 0;
    for (Object localData : crawlersLocalData) {
        CrawlStat stat = (CrawlStat) localData;
        totalLinks += stat.getTotalLinks();
        totalTextSize += stat.getTotalTextSize();
        totalProcessedPages += stat.getTotalProcessedPages();
    }
    System.out.println("Aggregated Statistics:");
    System.out.println(" Processed Pages: " + totalProcessedPages);
    System.out.println(" Total Links found: " + totalLinks);
    System.out.println(" Total Text Size: " + totalTextSize);
}
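Example 6 aggregates per-crawler statistics through a CrawlStat object that is not shown here. Judging from the getters used above, it is presumably a simple data holder along the following lines; the field names and the increment methods are assumptions:

public class CrawlStat {
    private int totalProcessedPages;
    private long totalLinks;
    private long totalTextSize;

    public int getTotalProcessedPages() { return totalProcessedPages; }
    public long getTotalLinks() { return totalLinks; }
    public long getTotalTextSize() { return totalTextSize; }

    // Hypothetical mutators that the crawler would call from its visit() method:
    public void incProcessedPages() { totalProcessedPages++; }
    public void incTotalLinks(int count) { totalLinks += count; }
    public void incTotalTextSize(int size) { totalTextSize += size; }
}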
Example 7: main
import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        System.out.println("Needed parameter: ");
        System.out.println("\t rootFolder (it will contain intermediate crawl data)");
        return;
    }
    /*
     * crawlStorageFolder is a folder where intermediate crawl data is
     * stored.
     */
    String crawlStorageFolder = args[0];
    CrawlConfig config1 = new CrawlConfig();
    CrawlConfig config2 = new CrawlConfig();
    /*
     * The two crawlers should have different storage folders for their
     * intermediate data.
     */
    config1.setCrawlStorageFolder(crawlStorageFolder + "/crawler1");
    config2.setCrawlStorageFolder(crawlStorageFolder + "/crawler2");
    config1.setPolitenessDelay(1000);
    config2.setPolitenessDelay(2000);
    config1.setMaxPagesToFetch(50);
    config2.setMaxPagesToFetch(100);
    /*
     * We will use different PageFetchers for the two crawlers.
     */
    PageFetcher pageFetcher1 = new PageFetcher(config1);
    PageFetcher pageFetcher2 = new PageFetcher(config2);
    /*
     * We will use the same RobotstxtServer for both of the crawlers.
     */
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher1);
    CrawlController controller1 = new CrawlController(config1, pageFetcher1, robotstxtServer);
    CrawlController controller2 = new CrawlController(config2, pageFetcher2, robotstxtServer);
    String[] crawler1Domains = new String[] { "http://www.ics.uci.edu/", "http://www.cnn.com/" };
    String[] crawler2Domains = new String[] { "http://en.wikipedia.org/" };
    controller1.setCustomData(crawler1Domains);
    controller2.setCustomData(crawler2Domains);
    controller1.addSeed("http://www.ics.uci.edu/");
    controller1.addSeed("http://www.cnn.com/");
    controller1.addSeed("http://www.ics.uci.edu/~lopes/");
    controller1.addSeed("http://www.cnn.com/POLITICS/");
    controller2.addSeed("http://en.wikipedia.org/wiki/Main_Page");
    controller2.addSeed("http://en.wikipedia.org/wiki/Obama");
    controller2.addSeed("http://en.wikipedia.org/wiki/Bing");
    /*
     * The first crawler will have 5 concurrent threads and the second
     * crawler will have 7 threads.
     */
    controller1.startNonBlocking(BasicCrawler.class, 5);
    controller2.startNonBlocking(BasicCrawler.class, 7);
    controller1.waitUntilFinish();
    System.out.println("Crawler 1 is finished.");
    controller2.waitUntilFinish();
    System.out.println("Crawler 2 is finished.");
}
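Example 7 scopes each controller to its own domains via setCustomData. BasicCrawler itself is not shown; one plausible way for it to read that data back inside shouldVisit is sketched below. The override would live inside a WebCrawler subclass such as the SampleCrawler sketch shown earlier; retrieving the domains through getMyController().getCustomData() and the prefix check are assumptions about how the omitted class works:

@Override
public boolean shouldVisit(Page referringPage, WebURL url) {
    // Presumably read back the domain list that was attached to this crawler's controller.
    String[] myDomains = (String[]) getMyController().getCustomData();
    String href = url.getURL().toLowerCase();
    for (String domain : myDomains) {
        if (href.startsWith(domain)) {
            return true;
        }
    }
    return false;
}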
Example 8: main
import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        System.out.println("Needed parameters: ");
        System.out.println("\t rootFolder (it will contain intermediate crawl data)");
        System.out.println("\t numberOfCrawlers (number of concurrent threads)");
        return;
    }
    /*
     * crawlStorageFolder is a folder where intermediate crawl data is
     * stored.
     */
    String crawlStorageFolder = args[0];
    /*
     * numberOfCrawlers shows the number of concurrent threads that should
     * be initiated for crawling.
     */
    int numberOfCrawlers = Integer.parseInt(args[1]);
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorageFolder);
    config.setPolitenessDelay(1000);
    // Unlimited number of pages can be crawled.
    config.setMaxPagesToFetch(-1);
    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    /*
     * For each crawl, you need to add some seed URLs. These are the first
     * URLs that are fetched; the crawler then starts following the links
     * found in these pages.
     */
    controller.addSeed("http://www.ics.uci.edu/~welling/");
    controller.addSeed("http://www.ics.uci.edu/~lopes/");
    controller.addSeed("http://www.ics.uci.edu/");
    /*
     * Start the crawl in non-blocking mode, let it run for a while, and
     * then ask it to shut down gracefully.
     */
    controller.startNonBlocking(BasicCrawler.class, numberOfCrawlers);
    // Wait for 30 seconds.
    Thread.sleep(30 * 1000);
    // Send the shutdown request and then wait for the crawl to finish.
    controller.shutdown();
    controller.waitUntilFinish();
}
Example 9: buildCrawlConfig
import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the package/class the method depends on
private CrawlConfig buildCrawlConfig() {
    CrawlConfig crawlConfig = new CrawlConfig();
    crawlConfig.setCrawlStorageFolder(appConfig.getCrawlStorageDir());
    crawlConfig.setIncludeBinaryContentInCrawling(appConfig.isIncludeBinaryContentInCrawling());
    crawlConfig.setIncludeHttpsPages(appConfig.isIncludeHttpsPages());
    /*
     * Be polite (200 ms min).
     */
    crawlConfig.setPolitenessDelay(appConfig.getPolitenessDelay());
    /*
     * You can set the maximum crawl depth here. The default value is -1 for unlimited depth.
     */
    crawlConfig.setMaxDepthOfCrawling(appConfig.getMaxDepthOfCrawling());
    /*
     * You can set the maximum number of pages to crawl. The default value is -1 for an unlimited number of pages.
     */
    crawlConfig.setMaxPagesToFetch(appConfig.getMaxPagesToFetch());
    /*
     * This config parameter can be used to make your crawl resumable (meaning that you can resume the crawl
     * from a previously interrupted/crashed crawl). Note: if you enable the resuming feature and want to start a
     * fresh crawl, you need to delete the contents of the root folder manually.
     */
    crawlConfig.setResumableCrawling(appConfig.isResumableImport());
    crawlConfig.setMaxConnectionsPerHost(appConfig.getMaxConnectionsPerHost());
    crawlConfig.setMaxTotalConnections(appConfig.getMaxTotalConnections());
    if (appConfig.getProxyHost() != null) {
        crawlConfig.setProxyHost(appConfig.getProxyHost());
        crawlConfig.setProxyPort(appConfig.getProxyPort());
    }
    if (appConfig.getProxyUsername() != null) {
        crawlConfig.setProxyUsername(appConfig.getProxyUsername());
    }
    if (appConfig.getProxyPassword() != null) {
        crawlConfig.setProxyPassword(appConfig.getProxyPassword());
    }
    return crawlConfig;
}