

Java CrawlController.waitUntilFinish Method Code Examples

This article collects typical usages and code examples of the Java method edu.uci.ics.crawler4j.crawler.CrawlController.waitUntilFinish. If you have been wondering what CrawlController.waitUntilFinish does, how to call it, or what it looks like in real code, the curated method examples below should help. You can also explore further usage examples of its containing class, edu.uci.ics.crawler4j.crawler.CrawlController.


The sections below present 5 code examples of the CrawlController.waitUntilFinish method, sorted by popularity by default.
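At its core the pattern is simple: startNonBlocking launches the crawler threads and returns immediately, and waitUntilFinish then blocks the calling thread until every crawler thread has terminated. A minimal end-to-end sketch of that pattern (the storage path, seed URL and thread count are placeholders; MyCrawler is a WebCrawler subclass like the one sketched after Example 1):

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class WaitUntilFinishDemo {
    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder("/tmp/crawl"); // placeholder path

        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtServer robotstxtServer =
                new RobotstxtServer(new RobotstxtConfig(), pageFetcher);
        CrawlController controller =
                new CrawlController(config, pageFetcher, robotstxtServer);

        controller.addSeed("http://example.com/"); // placeholder seed

        // Launch the crawler threads and return immediately...
        controller.startNonBlocking(MyCrawler.class, 4);

        // ...then block until every crawler thread has terminated.
        controller.waitUntilFinish();
    }
}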

Example 1: execute

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the class that provides the method
public static void execute() throws Exception {
    urlMap = GetURL.getAllUrl();

    String crawlStorageFolder = "/data/crawl/root";

    // number of concurrent crawler threads
    int numberOfCrawlers = 2;

    CrawlConfig config = new CrawlConfig();
    // folder where intermediate crawl data is stored
    config.setCrawlStorageFolder(crawlStorageFolder);

    // maximum crawl depth (0 = crawl only the seed pages)
    config.setMaxDepthOfCrawling(0);

    // whether to also crawl pages with binary content (images, PDFs, ...)
    config.setIncludeBinaryContentInCrawling(false);

    // be polite: wait at least 200 ms (the crawler4j default) between requests, so the target site is not overloaded
    config.setPolitenessDelay(200);

    // resumable crawling: resume a previously interrupted crawl
    //config.setResumableCrawling(true);

    // initialize the crawl controller
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    /*
     * Add the initial seed pages for the crawl; every link discovered on
     * those pages becomes a further crawl target. Here the URLs to crawl
     * are read from the database and added to the seed list.
     */


    // note: check whether map.values() and map.keySet() iterate in the same order

    for (String url : urlMap.keySet()) {
    	controller.addSeed(url);
    }

    /*
     * Start the crawl. startNonBlocking returns immediately; the crawler
     * threads continue running in the background.
     */
    controller.startNonBlocking(MyCrawler.class, numberOfCrawlers);

    // wait 1 second so that overly fast fetching does not get the crawler banned by the site
    Thread.sleep(1000);

    controller.waitUntilFinish();
}
 
Developer ID: wrayzheng, Project: webpage-update-subscribe, Lines of code: 54, Source: Controller.java
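All of the examples pass a WebCrawler subclass (MyCrawler, BasicCrawler) to startNonBlocking, but none of them defines it. A minimal sketch of such a class, assuming the crawler4j 4.x API (in older 3.x releases shouldVisit takes only a WebURL parameter; the domain filter here is a placeholder):

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

public class MyCrawler extends WebCrawler {

    // Decide whether a discovered link should be scheduled for fetching.
    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // example.com is a placeholder domain filter
        return url.getURL().toLowerCase().startsWith("http://www.example.com/");
    }

    // Called after a page has been fetched and parsed successfully.
    @Override
    public void visit(Page page) {
        System.out.println("Visited: " + page.getWebURL().getURL());
    }
}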

Example 2: main

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the class that provides the method
public static void main(String[] args) throws Exception {
	if (args.length != 1) {
		System.out.println("Needed parameter: ");
		System.out.println("\t rootFolder (it will contain intermediate crawl data)");
		return;
	}

	/*
	 * crawlStorageFolder is a folder where intermediate crawl data is
	 * stored.
	 */
	String crawlStorageFolder = args[0];

	CrawlConfig config1 = new CrawlConfig();
	CrawlConfig config2 = new CrawlConfig();

	/*
	 * The two crawlers should have different storage folders for their
	 * intermediate data
	 */
	config1.crawlStorageFolder_$eq(crawlStorageFolder + "/crawler1");
	config2.crawlStorageFolder_$eq(crawlStorageFolder + "/crawler2");

	config1.politenessDelay_$eq(1000);
	config2.politenessDelay_$eq(2000);

	config1.maxPagesToFetch_$eq(50);
	config2.maxPagesToFetch_$eq(100);

	/*
	 * We will use different PageFetchers for the two crawlers.
	 */
	PageFetcher pageFetcher1 = new PageFetcher(config1);
	PageFetcher pageFetcher2 = new PageFetcher(config2);

	/*
	 * We will use the same RobotstxtServer for both of the crawlers.
	 */
	RobotsTxtConfig robotstxtConfig = new RobotsTxtConfig();
	RobotsTxtServer robotstxtServer = new RobotsTxtServer(robotstxtConfig, pageFetcher1);

	CrawlController controller1 = new CrawlController(config1, pageFetcher1, robotstxtServer);
	CrawlController controller2 = new CrawlController(config2, pageFetcher2, robotstxtServer);

	String[] crawler1Domains = new String[] { "http://www.ics.uci.edu/", "http://www.cnn.com/" };
	String[] crawler2Domains = new String[] { "http://en.wikipedia.org/" };

	controller1.setCustomData(crawler1Domains);
	controller2.setCustomData(crawler2Domains);

	controller1.addSeed("http://www.ics.uci.edu/");
	controller1.addSeed("http://www.cnn.com/");
	controller1.addSeed("http://www.ics.uci.edu/~lopes/");
	controller1.addSeed("http://www.cnn.com/POLITICS/");

	controller2.addSeed("http://en.wikipedia.org/wiki/Main_Page");
	controller2.addSeed("http://en.wikipedia.org/wiki/Obama");
	controller2.addSeed("http://en.wikipedia.org/wiki/Bing");

	/*
	 * The first crawler will have 5 concurrent threads and the second
	 * crawler will have 7 threads.
	 */
	controller1.startNonBlocking(BasicCrawler.class, 5);
	controller2.startNonBlocking(BasicCrawler.class, 7);

	controller1.waitUntilFinish();
	System.out.println("Crawler 1 is finished.");

	controller2.waitUntilFinish();
	System.out.println("Crawler 2 is finished.");
}
 
Developer ID: sapienapps, Project: scrawler, Lines of code: 73, Source: MultipleCrawlerController.java
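Examples 2 and 4 install a per-crawler domain list with setCustomData but never show it being read. Inside the crawler the data comes back through the controller; a sketch of how BasicCrawler might consume it (this mirrors the classic crawler4j multi-crawler example; note that setCustomData/getCustomData are deprecated in recent crawler4j releases):

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

public class BasicCrawler extends WebCrawler {

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // Read the domain whitelist installed via controller.setCustomData(...)
        String[] myDomains = (String[]) getMyController().getCustomData();
        String href = url.getURL().toLowerCase();
        for (String domain : myDomains) {
            if (href.startsWith(domain)) {
                return true;
            }
        }
        return false;
    }
}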

Example 3: main

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the class that provides the method
public static void main(String[] args) throws Exception {
	if (args.length != 2) {
		System.out.println("Needed parameters: ");
		System.out.println("\t rootFolder (it will contain intermediate crawl data)");
		System.out.println("\t numberOfCrawlers (number of concurrent threads)");
		return;
	}

	/*
	 * crawlStorageFolder is a folder where intermediate crawl data is
	 * stored.
	 */
	String crawlStorageFolder = args[0];

	/*
	 * numberOfCrawlers shows the number of concurrent threads that should
	 * be initiated for crawling.
	 */
	int numberOfCrawlers = Integer.parseInt(args[1]);

	CrawlConfig config = new CrawlConfig();

	config.crawlStorageFolder_$eq(crawlStorageFolder);

	config.politenessDelay_$eq(1000);

	// Unlimited number of pages can be crawled.
	config.maxPagesToFetch_$eq(-1);

	/*
	 * Instantiate the controller for this crawl.
	 */
	PageFetcher pageFetcher = new PageFetcher(config);
	RobotsTxtConfig robotstxtConfig = new RobotsTxtConfig();
	RobotsTxtServer robotstxtServer = new RobotsTxtServer(robotstxtConfig, pageFetcher);
	CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

	/*
	 * For each crawl, you need to add some seed urls. These are the first
	 * URLs that are fetched and then the crawler starts following links
	 * which are found in these pages
	 */
	controller.addSeed("http://www.ics.uci.edu/~welling/");
	controller.addSeed("http://www.ics.uci.edu/~lopes/");
	controller.addSeed("http://www.ics.uci.edu/");

	/*
	 * Start the crawl. startNonBlocking returns immediately; the crawler
	 * threads continue running in the background.
	 */
	controller.startNonBlocking(BasicCrawler.class, numberOfCrawlers);

	// Wait for 30 seconds
	Thread.sleep(30 * 1000);

	// Send the shutdown request and then wait for finishing
	controller.shutdown();
	controller.waitUntilFinish();
}
 
Developer ID: sapienapps, Project: scrawler, Lines of code: 60, Source: ControllerWithShutdown.java
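Examples 3 and 5 trigger the shutdown from a fixed 30-second timer. The same shutdown()/waitUntilFinish() pair can also be driven from a JVM shutdown hook so that Ctrl-C ends the crawl cleanly; a hypothetical variant (config, pageFetcher and robotstxtServer set up as in the example above):

// Stop the crawl gracefully when the JVM shuts down (e.g. on Ctrl-C).
final CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

Runtime.getRuntime().addShutdownHook(new Thread(() -> {
    controller.shutdown();        // ask every crawler thread to stop
    controller.waitUntilFinish(); // block until they have all exited
}));

controller.startNonBlocking(BasicCrawler.class, numberOfCrawlers);
controller.waitUntilFinish(); // normal path: wait for the crawl to complete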

Example 4: main

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the class that provides the method
public static void main(String[] args) throws Exception {
	if (args.length != 1) {
		System.out.println("Needed parameter: ");
		System.out.println("\t rootFolder (it will contain intermediate crawl data)");
		return;
	}

	/*
	 * crawlStorageFolder is a folder where intermediate crawl data is
	 * stored.
	 */
	String crawlStorageFolder = args[0];

	CrawlConfig config1 = new CrawlConfig();
	CrawlConfig config2 = new CrawlConfig();

	/*
	 * The two crawlers should have different storage folders for their
	 * intermediate data
	 */
	config1.setCrawlStorageFolder(crawlStorageFolder + "/crawler1");
	config2.setCrawlStorageFolder(crawlStorageFolder + "/crawler2");

	config1.setPolitenessDelay(1000);
	config2.setPolitenessDelay(2000);

	config1.setMaxPagesToFetch(50);
	config2.setMaxPagesToFetch(100);

	/*
	 * We will use different PageFetchers for the two crawlers.
	 */
	PageFetcher pageFetcher1 = new PageFetcher(config1);
	PageFetcher pageFetcher2 = new PageFetcher(config2);

	/*
	 * We will use the same RobotstxtServer for both of the crawlers.
	 */
	RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
	RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher1);

	CrawlController controller1 = new CrawlController(config1, pageFetcher1, robotstxtServer);
	CrawlController controller2 = new CrawlController(config2, pageFetcher2, robotstxtServer);

	String[] crawler1Domains = new String[] { "http://www.ics.uci.edu/", "http://www.cnn.com/" };
	String[] crawler2Domains = new String[] { "http://en.wikipedia.org/" };

	controller1.setCustomData(crawler1Domains);
	controller2.setCustomData(crawler2Domains);

	controller1.addSeed("http://www.ics.uci.edu/");
	controller1.addSeed("http://www.cnn.com/");
	controller1.addSeed("http://www.ics.uci.edu/~lopes/");
	controller1.addSeed("http://www.cnn.com/POLITICS/");

	controller2.addSeed("http://en.wikipedia.org/wiki/Main_Page");
	controller2.addSeed("http://en.wikipedia.org/wiki/Obama");
	controller2.addSeed("http://en.wikipedia.org/wiki/Bing");

	/*
	 * The first crawler will have 5 concurrent threads and the second
	 * crawler will have 7 threads.
	 */
	controller1.startNonBlocking(BasicCrawler.class, 5);
	controller2.startNonBlocking(BasicCrawler.class, 7);

	controller1.waitUntilFinish();
	System.out.println("Crawler 1 is finished.");

	controller2.waitUntilFinish();
	System.out.println("Crawler 2 is finished.");
}
 
Developer ID: Chaiavi, Project: Crawler4j, Lines of code: 73, Source: MultipleCrawlerController.java

Example 5: main

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the class that provides the method
public static void main(String[] args) throws Exception {
	if (args.length != 2) {
		System.out.println("Needed parameters: ");
		System.out.println("\t rootFolder (it will contain intermediate crawl data)");
		System.out.println("\t numberOfCrawlers (number of concurrent threads)");
		return;
	}

	/*
	 * crawlStorageFolder is a folder where intermediate crawl data is
	 * stored.
	 */
	String crawlStorageFolder = args[0];

	/*
	 * numberOfCrawlers shows the number of concurrent threads that should
	 * be initiated for crawling.
	 */
	int numberOfCrawlers = Integer.parseInt(args[1]);

	CrawlConfig config = new CrawlConfig();

	config.setCrawlStorageFolder(crawlStorageFolder);

	config.setPolitenessDelay(1000);

	// Unlimited number of pages can be crawled.
	config.setMaxPagesToFetch(-1);

	/*
	 * Instantiate the controller for this crawl.
	 */
	PageFetcher pageFetcher = new PageFetcher(config);
	RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
	RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
	CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

	/*
	 * For each crawl, you need to add some seed urls. These are the first
	 * URLs that are fetched and then the crawler starts following links
	 * which are found in these pages
	 */
	controller.addSeed("http://www.ics.uci.edu/~welling/");
	controller.addSeed("http://www.ics.uci.edu/~lopes/");
	controller.addSeed("http://www.ics.uci.edu/");

	/*
	 * Start the crawl. startNonBlocking returns immediately; the crawler
	 * threads continue running in the background.
	 */
	controller.startNonBlocking(BasicCrawler.class, numberOfCrawlers);

	// Wait for 30 seconds
	Thread.sleep(30 * 1000);

	// Send the shutdown request and then wait for finishing
	controller.shutdown();
	controller.waitUntilFinish();
}
 
Developer ID: Chaiavi, Project: Crawler4j, Lines of code: 60, Source: ControllerWithShutdown.java
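For comparison: when nothing needs to happen while the crawl is in progress, crawler4j's blocking start method makes the explicit waitUntilFinish call unnecessary, since it behaves like startNonBlocking followed by waitUntilFinish:

// Blocking variant: start(...) returns only after all crawler threads finish,
// so no separate waitUntilFinish() call is needed.
controller.start(BasicCrawler.class, numberOfCrawlers);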


Note: The edu.uci.ics.crawler4j.crawler.CrawlController.waitUntilFinish method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective developers; copyright in the source code remains with the original authors. Refer to each project's license before distributing or using the code; reproduction without permission is prohibited.