This article collects and summarizes typical usage examples of the Java method edu.uci.ics.crawler4j.crawler.CrawlController.waitUntilFinish. If you have been wondering what CrawlController.waitUntilFinish does, how to call it, or what it looks like in real code, the curated examples below may help. You can also explore further usage of its containing class, edu.uci.ics.crawler4j.crawler.CrawlController.
The following presents 5 code examples of CrawlController.waitUntilFinish, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
Example 1: execute
import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class the method depends on
public static void execute() throws Exception {
urlMap = GetURL.getAllUrl();
String crawlStorageFolder = "/data/crawl/root";
// number of concurrent crawler threads
int numberOfCrawlers = 2;
CrawlConfig config = new CrawlConfig();
// folder where intermediate crawl data is stored
config.setCrawlStorageFolder(crawlStorageFolder);
// maximum depth of crawling (0 = crawl only the seed pages)
config.setMaxDepthOfCrawling(0);
// whether to include pages with binary content in the crawl
config.setIncludeBinaryContentInCrawling(false);
// politeness delay: wait 200 ms (the default) between two consecutive requests, to avoid overloading the site
config.setPolitenessDelay(200);
// resumable crawling (resume a previously stopped or crashed crawl)
//config.setResumableCrawling(true);
// initialize the objects that control the crawl
PageFetcher pageFetcher = new PageFetcher(config);
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
/*
* Add the initial seed pages for the crawl; every link discovered on these
* pages becomes a new crawl task. Here the URLs to crawl (obtained from the
* database via GetURL.getAllUrl()) are added to the seed list.
*/
//note: check whether map.values() and map.keySet() iterate in the same order
for (String url : urlMap.keySet()) {
controller.addSeed(url);
}
/*
* Start the crawl. startNonBlocking returns immediately, so we explicitly
* wait for the crawlers to finish with waitUntilFinish() below.
*/
controller.startNonBlocking(MyCrawler.class, numberOfCrawlers);
// wait 1 second so the crawler does not hit the site too aggressively
Thread.sleep(1000);
controller.waitUntilFinish();
}
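Example 1 passes MyCrawler.class to the controller but does not show that class. Below is a minimal sketch of what such a crawler might look like; the class body, the URL filter, and the crawler4j 4.x shouldVisit(Page, WebURL) signature are assumptions rather than part of the original example (older 3.x versions use shouldVisit(WebURL) instead).
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;
import java.util.regex.Pattern;

public class MyCrawler extends WebCrawler {

    // Skip common binary/static resources (illustrative filter, adjust as needed)
    private static final Pattern FILTERS =
            Pattern.compile(".*(\\.(css|js|gif|jpe?g|png|pdf|zip))$");

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String href = url.getURL().toLowerCase();
        return !FILTERS.matcher(href).matches();
    }

    @Override
    public void visit(Page page) {
        // Called once for every successfully fetched page
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            String text = htmlParseData.getText();
            System.out.println("Visited: " + page.getWebURL().getURL()
                    + " (" + text.length() + " characters of text)");
        }
    }
}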
Example 2: main
import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
if (args.length != 1) {
System.out.println("Needed parameter: ");
System.out.println("\t rootFolder (it will contain intermediate crawl data)");
return;
}
/*
* crawlStorageFolder is a folder where intermediate crawl data is
* stored.
*/
String crawlStorageFolder = args[0];
CrawlConfig config1 = new CrawlConfig();
CrawlConfig config2 = new CrawlConfig();
/*
* The two crawlers should have different storage folders for their
* intermediate data
*/
config1.crawlStorageFolder_$eq(crawlStorageFolder + "/crawler1");
config2.crawlStorageFolder_$eq(crawlStorageFolder + "/crawler2");
config1.politenessDelay_$eq(1000);
config2.politenessDelay_$eq(2000);
config1.maxPagesToFetch_$eq(50);
config2.maxPagesToFetch_$eq(100);
/*
* We will use different PageFetchers for the two crawlers.
*/
PageFetcher pageFetcher1 = new PageFetcher(config1);
PageFetcher pageFetcher2 = new PageFetcher(config2);
/*
* We will use the same RobotstxtServer for both of the crawlers.
*/
RobotsTxtConfig robotstxtConfig = new RobotsTxtConfig();
RobotsTxtServer robotstxtServer = new RobotsTxtServer(robotstxtConfig, pageFetcher1);
CrawlController controller1 = new CrawlController(config1, pageFetcher1, robotstxtServer);
CrawlController controller2 = new CrawlController(config2, pageFetcher2, robotstxtServer);
String[] crawler1Domains = new String[] { "http://www.ics.uci.edu/", "http://www.cnn.com/" };
String[] crawler2Domains = new String[] { "http://en.wikipedia.org/" };
controller1.setCustomData(crawler1Domains);
controller2.setCustomData(crawler2Domains);
controller1.addSeed("http://www.ics.uci.edu/");
controller1.addSeed("http://www.cnn.com/");
controller1.addSeed("http://www.ics.uci.edu/~lopes/");
controller1.addSeed("http://www.cnn.com/POLITICS/");
controller2.addSeed("http://en.wikipedia.org/wiki/Main_Page");
controller2.addSeed("http://en.wikipedia.org/wiki/Obama");
controller2.addSeed("http://en.wikipedia.org/wiki/Bing");
/*
* The first crawler will have 5 concurrent threads and the second
* crawler will have 7 threads.
*/
controller1.startNonBlocking(BasicCrawler.class, 5);
controller2.startNonBlocking(BasicCrawler.class, 7);
controller1.waitUntilFinish();
System.out.println("Crawler 1 is finished.");
controller2.waitUntilFinish();
System.out.println("Crawler 2 is finished.");
}
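This example (and Example 4 below) hands each controller a domain list via setCustomData, but the crawler that reads it back is not shown. In crawler4j versions that still provide setCustomData/getCustomData (the pair was deprecated in later 4.x releases), the crawler could restrict itself to those domains roughly as sketched here; the class body and filter logic are illustrative, not taken from the original example.
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

public class BasicCrawler extends WebCrawler {

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // The controller's custom data is the String[] of allowed URL prefixes
        String[] myDomains = (String[]) getMyController().getCustomData();
        String href = url.getURL().toLowerCase();
        for (String domain : myDomains) {
            if (href.startsWith(domain)) {
                return true;
            }
        }
        return false;
    }

    @Override
    public void visit(Page page) {
        System.out.println("Visited: " + page.getWebURL().getURL());
    }
}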
Example 3: main
import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
if (args.length != 2) {
System.out.println("Needed parameters: ");
System.out.println("\t rootFolder (it will contain intermediate crawl data)");
System.out.println("\t numberOfCralwers (number of concurrent threads)");
return;
}
/*
* crawlStorageFolder is a folder where intermediate crawl data is
* stored.
*/
String crawlStorageFolder = args[0];
/*
* numberOfCrawlers shows the number of concurrent threads that should
* be initiated for crawling.
*/
int numberOfCrawlers = Integer.parseInt(args[1]);
CrawlConfig config = new CrawlConfig();
config.crawlStorageFolder_$eq(crawlStorageFolder);
config.politenessDelay_$eq(1000);
// Unlimited number of pages can be crawled.
config.maxPagesToFetch_$eq(-1);
/*
* Instantiate the controller for this crawl.
*/
PageFetcher pageFetcher = new PageFetcher(config);
RobotsTxtConfig robotstxtConfig = new RobotsTxtConfig();
RobotsTxtServer robotstxtServer = new RobotsTxtServer(robotstxtConfig, pageFetcher);
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
/*
* For each crawl, you need to add some seed urls. These are the first
* URLs that are fetched and then the crawler starts following links
* which are found in these pages
*/
controller.addSeed("http://www.ics.uci.edu/~welling/");
controller.addSeed("http://www.ics.uci.edu/~lopes/");
controller.addSeed("http://www.ics.uci.edu/");
/*
* Start the crawl. startNonBlocking returns immediately; the code below
* lets the crawl run for a while, then requests a shutdown and waits for
* the crawler threads to finish.
*/
controller.startNonBlocking(BasicCrawler.class, numberOfCrawlers);
// Wait for 30 seconds
Thread.sleep(30 * 1000);
// Send the shutdown request and then wait for the crawlers to finish
controller.shutdown();
controller.waitUntilFinish();
}
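Example 3 stops the crawl after a fixed 30-second sleep. If the crawl should instead run until the process is interrupted, one possible variation (a sketch, not part of the original example; it reuses controller, BasicCrawler and numberOfCrawlers from above and needs Java 8+) is to trigger the same shutdown()/waitUntilFinish() pair from a JVM shutdown hook:
// Register a shutdown hook so Ctrl+C stops the crawl gracefully
Runtime.getRuntime().addShutdownHook(new Thread(() -> {
    controller.shutdown();         // ask the crawler threads to stop
    controller.waitUntilFinish();  // keep the JVM alive until they have stopped
}));
controller.startNonBlocking(BasicCrawler.class, numberOfCrawlers);
controller.waitUntilFinish();      // main thread blocks here instead of sleeping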
Example 4: main
import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
if (args.length != 1) {
System.out.println("Needed parameter: ");
System.out.println("\t rootFolder (it will contain intermediate crawl data)");
return;
}
/*
* crawlStorageFolder is a folder where intermediate crawl data is
* stored.
*/
String crawlStorageFolder = args[0];
CrawlConfig config1 = new CrawlConfig();
CrawlConfig config2 = new CrawlConfig();
/*
* The two crawlers should have different storage folders for their
* intermediate data
*/
config1.setCrawlStorageFolder(crawlStorageFolder + "/crawler1");
config2.setCrawlStorageFolder(crawlStorageFolder + "/crawler2");
config1.setPolitenessDelay(1000);
config2.setPolitenessDelay(2000);
config1.setMaxPagesToFetch(50);
config2.setMaxPagesToFetch(100);
/*
* We will use different PageFetchers for the two crawlers.
*/
PageFetcher pageFetcher1 = new PageFetcher(config1);
PageFetcher pageFetcher2 = new PageFetcher(config2);
/*
* We will use the same RobotstxtServer for both of the crawlers.
*/
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher1);
CrawlController controller1 = new CrawlController(config1, pageFetcher1, robotstxtServer);
CrawlController controller2 = new CrawlController(config2, pageFetcher2, robotstxtServer);
String[] crawler1Domains = new String[] { "http://www.ics.uci.edu/", "http://www.cnn.com/" };
String[] crawler2Domains = new String[] { "http://en.wikipedia.org/" };
controller1.setCustomData(crawler1Domains);
controller2.setCustomData(crawler2Domains);
controller1.addSeed("http://www.ics.uci.edu/");
controller1.addSeed("http://www.cnn.com/");
controller1.addSeed("http://www.ics.uci.edu/~lopes/");
controller1.addSeed("http://www.cnn.com/POLITICS/");
controller2.addSeed("http://en.wikipedia.org/wiki/Main_Page");
controller2.addSeed("http://en.wikipedia.org/wiki/Obama");
controller2.addSeed("http://en.wikipedia.org/wiki/Bing");
/*
* The first crawler will have 5 concurrent threads and the second
* crawler will have 7 threads.
*/
controller1.startNonBlocking(BasicCrawler.class, 5);
controller2.startNonBlocking(BasicCrawler.class, 7);
controller1.waitUntilFinish();
System.out.println("Crawler 1 is finished.");
controller2.waitUntilFinish();
System.out.println("Crawler 2 is finished.");
}
Example 5: main
import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
if (args.length != 2) {
System.out.println("Needed parameters: ");
System.out.println("\t rootFolder (it will contain intermediate crawl data)");
System.out.println("\t numberOfCralwers (number of concurrent threads)");
return;
}
/*
* crawlStorageFolder is a folder where intermediate crawl data is
* stored.
*/
String crawlStorageFolder = args[0];
/*
* numberOfCrawlers shows the number of concurrent threads that should
* be initiated for crawling.
*/
int numberOfCrawlers = Integer.parseInt(args[1]);
CrawlConfig config = new CrawlConfig();
config.setCrawlStorageFolder(crawlStorageFolder);
config.setPolitenessDelay(1000);
// Unlimited number of pages can be crawled.
config.setMaxPagesToFetch(-1);
/*
* Instantiate the controller for this crawl.
*/
PageFetcher pageFetcher = new PageFetcher(config);
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
/*
* For each crawl, you need to add some seed urls. These are the first
* URLs that are fetched and then the crawler starts following links
* which are found in these pages
*/
controller.addSeed("http://www.ics.uci.edu/~welling/");
controller.addSeed("http://www.ics.uci.edu/~lopes/");
controller.addSeed("http://www.ics.uci.edu/");
/*
* Start the crawl. startNonBlocking returns immediately; the code below
* lets the crawl run for a while, then requests a shutdown and waits for
* the crawler threads to finish.
*/
controller.startNonBlocking(BasicCrawler.class, numberOfCrawlers);
// Wait for 30 seconds
Thread.sleep(30 * 1000);
// Send the shutdown request and then wait for the crawlers to finish
controller.shutdown();
controller.waitUntilFinish();
}
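As a closing note: when no timed shutdown is required, the explicit waitUntilFinish() call can be skipped altogether, because crawler4j's blocking start() only returns once every crawler thread has finished. The last four statements of Example 5 are then roughly equivalent to the single call below (minus the 30-second cutoff).
// Blocking alternative: start() internally waits for all crawler threads,
// so no explicit waitUntilFinish() is needed afterwards.
controller.start(BasicCrawler.class, numberOfCrawlers);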