

Java CrawlController.start Method Code Examples

This article collects typical usage examples of the Java method edu.uci.ics.crawler4j.crawler.CrawlController.start. If you are wondering what CrawlController.start does, how to call it, or what working uses of it look like, the hand-picked examples below should help. You can also explore further usage examples of the enclosing class, edu.uci.ics.crawler4j.crawler.CrawlController.


Twelve code examples of the CrawlController.start method are shown below, sorted by popularity by default.
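Every one of these examples passes a subclass of WebCrawler (SampleCrawler, MyCrawler, and so on) to CrawlController.start, but none of them shows that class. As a point of reference, here is a minimal sketch of such a crawler against the crawler4j 4.x WebCrawler API; the class name and URL filter are illustrative and not taken from any of the projects below.

import java.util.regex.Pattern;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class SampleCrawler extends WebCrawler {

    // Skip obvious binary resources (illustrative filter).
    private static final Pattern FILTERS =
            Pattern.compile(".*(\\.(css|js|gif|jpe?g|png|pdf|zip))$");

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // Decide whether a discovered link should be scheduled for fetching.
        return !FILTERS.matcher(url.getURL().toLowerCase()).matches();
    }

    @Override
    public void visit(Page page) {
        // Called once per fetched page; extract whatever you need here.
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData html = (HtmlParseData) page.getParseData();
            System.out.println(page.getWebURL().getURL() + " -> "
                    + html.getOutgoingUrls().size() + " outgoing links");
        }
    }
}

A call such as controller.start(SampleCrawler.class, n) then spins up n crawler threads, each running its own instance of this class, and blocks until the crawl finishes.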

Example 1: main

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
  int numberOfCrawlers = 2;
  CrawlConfig config = new CrawlConfig();
  String crawlStorageFolder = "data";
  
  config.setCrawlStorageFolder(crawlStorageFolder);
  config.setPolitenessDelay(500);
  config.setMaxDepthOfCrawling(2);
  config.setMaxPagesToFetch(20);
  config.setIncludeBinaryContentInCrawling(false);

  PageFetcher pageFetcher = new PageFetcher(config);
  RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
  RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
  CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

  controller.addSeed("https://en.wikipedia.org/wiki/Bishop_Rock,_Isles_of_Scilly");

  controller.start(SampleCrawler.class, numberOfCrawlers);
}
 
Author: PacktPublishing, Project: Machine-Learning-End-to-Endguide-for-Java-developers, Lines: 21, Source: CrawlerController.java

Example 2: main

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
	String crawlStorageFolder = "/data/crawl/root";
	int numberOfCrawlers = 7;
	
	CrawlConfig config = new CrawlConfig();
	config.setCrawlStorageFolder(crawlStorageFolder);
	config.setPolitenessDelay(500);
	config.setMaxDepthOfCrawling(2);
	config.setMaxPagesToFetch(1000);
	config.setResumableCrawling(false);
	
	PageFetcher pageFetcher = new PageFetcher(config);
	RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
	RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
	CrawlController crawlController = new CrawlController(config, pageFetcher, robotstxtServer);
	
	crawlController.addSeed("http://www.11st.co.kr/html/main.html");
	crawlController.addSeed("http://www.11st.co.kr/html/category/1930.html");
	
	crawlController.start(MyCrawler.class, numberOfCrawlers);
}
 
Author: UCJung, Project: javalab, Lines: 22, Source: MyCrawlerController.java

Example 3: crawlAndImport

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class this method depends on
/**
 * This is where everything happens!
 */
private void crawlAndImport() throws Exception {

    CrawlConfig crawlConfig = buildCrawlConfig();
    PageFetcher pageFetcher = new PageFetcher(crawlConfig);
    RobotstxtConfig robotsTxtConfig = new RobotstxtConfig();
    robotsTxtConfig.setEnabled(appConfig.isRespectRobotsTxt());
    RobotstxtServer robotsTxtServer = new RobotstxtServer(robotsTxtConfig, pageFetcher);
    CrawlController crawlController = new CrawlController(crawlConfig, pageFetcher, robotsTxtServer);

    // "dependency injection" into crawlers
    Object[] customData = new Object[] { appConfig, graphImporter };
    crawlController.setCustomData(customData);

    addSeedUrls(crawlController);

    logger.info("Start crawling");

    /*
     * Start the crawl. This is a blocking operation, meaning that your code will reach the line after this only
     * when crawling is finished.
     */
    crawlController.start(HtmlOnlyCrawler.class, appConfig.getNumberOfCrawlers());

    logger.info("Finished crawling");
}
 
Author: fgavilondo, Project: neo4j-webgraph, Lines: 29, Source: App.java
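The setCustomData call above is how this project hands shared objects to the crawler instances; inside the crawler they are read back through the controller. A minimal sketch of the receiving side, assuming HtmlOnlyCrawler unpacks the array in the same order it was packed (the method body here is illustrative, not the project's actual code):

// Inside a WebCrawler subclass such as HtmlOnlyCrawler (illustrative sketch):
@Override
public void onStart() {
    // Runs once per crawler thread before crawling begins.
    Object[] customData = (Object[]) getMyController().getCustomData();
    // By the packing order above: customData[0] is the appConfig,
    // customData[1] is the graphImporter.
}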

Example 4: main

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
    String crawlStorageFolder = "/data/crawl/root";
    int numberOfCrawlers = 7;

    CrawlConfig config = new CrawlConfig();

    config.setPolitenessDelay(100);

    config.setCrawlStorageFolder(crawlStorageFolder);

    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    /*
     * For each crawl, you need to add some seed urls. These are the first
     * URLs that are fetched and then the crawler starts following links
     * which are found in these pages
     */
    controller.addSeed("https://de.wikipedia.org/wiki/Java_Database_Connectivity");
    controller.addSeed("https://de.wikipedia.org/wiki/Relationale_Datenbank");
    controller.addSeed("https://pt.wikipedia.org/wiki/JDBC");
    controller.addSeed("https://pt.wikipedia.org/wiki/Protocolo");
    controller.addSeed("https://de.wikipedia.org/wiki/Datenbank");

    /*
     * Start the crawl. This is a blocking operation, meaning that your code
     * will reach the line after this only when crawling is finished.
     */

    controller.start(new PostgresCrawlerFactory("jdbc:postgresql://localhost/crawler4j","postgres","postgres"), numberOfCrawlers);
}
 
Author: rzo1, Project: crawler4j-postgres-sample, Lines: 37, Source: SampleLauncher.java
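Unlike the other examples, this one hands start a WebCrawlerFactory rather than a Class object, which is the overload to use (in recent crawler4j versions) when crawler instances need constructor arguments, here JDBC connection details. The factory interface has a single newInstance method; a sketch of what such a factory could look like follows. The real PostgresCrawlerFactory in the sample project may differ, and PostgresWebCrawler is an assumed class name.

import java.sql.DriverManager;

import edu.uci.ics.crawler4j.crawler.CrawlController.WebCrawlerFactory;

public class PostgresCrawlerFactory implements WebCrawlerFactory<PostgresWebCrawler> {

    private final String jdbcUrl;
    private final String user;
    private final String password;

    public PostgresCrawlerFactory(String jdbcUrl, String user, String password) {
        this.jdbcUrl = jdbcUrl;
        this.user = user;
        this.password = password;
    }

    @Override
    public PostgresWebCrawler newInstance() throws Exception {
        // Called once per crawler thread, so each crawler gets its own connection.
        return new PostgresWebCrawler(DriverManager.getConnection(jdbcUrl, user, password));
    }
}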

Example 5: run

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class this method depends on
public void run(final CrawlerSettings crawlerSettings, final List<Memo> memos) throws Exception {
    CrawlConfig config = crawlerSettings.getCrawlConfig();

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    for (String seed : crawlerSettings.getSeeds()) {
        controller.addSeed(seed);
    }

    ActionsCrawler.configure(memoEntryFinder, memoMatching, servicesContext, memos);
    controller.start(ActionsCrawler.class, crawlerSettings.getNumberOfCrawlers());
}
 
Author: edu-xored, Project: memorise, Lines: 16, Source: CrawlerRunner.java

Example 6: main

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        return;
    }

    String crawlStorageFolder = args[0];
    int numberOfCrawlers = 1;

    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorageFolder);

    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    /*
     * For each crawl, you need to add some seed urls. These are the first
     * URLs that are fetched and then the crawler starts following links
     * which are found in these pages
     */
    controller.addSeed("http://www.senado.leg.br/senadores/default.asp");

    /*
     * Start the crawl. This is a blocking operation, meaning that your code
     * will reach the line after this only when crawling is finished.
     */
    controller.start(SenatorsCrawler.class, numberOfCrawlers);
}
 
Author: TekkLabs, Project: memoria-politica, Lines: 33, Source: CrawlerController.java

Example 7: main

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
    String crawlStorageFolder = "D:\\etc\\storage";
    int numberOfCrawlers = 7;

    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorageFolder);

    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    /*
     * For each crawl, you need to add some seed urls. These are the first
     * URLs that are fetched and then the crawler starts following links
     * which are found in these pages
     */
    controller.addSeed("http://www.ics.uci.edu/~lopes/");
    controller.addSeed("http://www.ics.uci.edu/~welling/");
    controller.addSeed("http://www.ics.uci.edu/");

    /*
     * Start the crawl. This is a blocking operation, meaning that your code
     * will reach the line after this only when crawling is finished.
     */
    controller.start(MyCrawler.class, numberOfCrawlers);
}
 
Author: vjymits, Project: musicFinder, Lines: 31, Source: MyCrawler.java

Example 8: main

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
	if (args.length < 3) {
		System.out.println("Needed parameters: ");
		System.out.println("\t rootFolder (it will contain intermediate crawl data)");
		System.out.println("\t numberOfCralwers (number of concurrent threads)");
		System.out.println("\t storageFolder (a folder for storing downloaded images)");
		return;
	}
	String rootFolder = args[0];
	int numberOfCrawlers = Integer.parseInt(args[1]);
	String storageFolder = args[2];

	CrawlConfig config = new CrawlConfig();

	config.crawlStorageFolder_$eq(rootFolder);

	/*
	 * Since images are binary content, we need to set this parameter to
	 * true to make sure they are included in the crawl.
	 */
	config.includeBinaryContentInCrawling_$eq(true);

	String[] crawlDomains = new String[] { "http://uci.edu/" };

	PageFetcher pageFetcher = new PageFetcher(config);
	RobotsTxtConfig robotstxtConfig = new RobotsTxtConfig();
	RobotsTxtServer robotstxtServer = new RobotsTxtServer(robotstxtConfig, pageFetcher);
	CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
	for (String domain : crawlDomains) {
		controller.addSeed(domain);
	}

	ImageCrawler.configure(crawlDomains, storageFolder);

	controller.start(ImageCrawler.class, numberOfCrawlers);
}
 
Author: sapienapps, Project: scrawler, Lines: 37, Source: ImageCrawlController.java

Example 9: main

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
	if (args.length != 2) {
		System.out.println("Needed parameters: ");
		System.out.println("\t rootFolder (it will contain intermediate crawl data)");
		System.out.println("\t numberOfCralwers (number of concurrent threads)");
		return;
	}
	String rootFolder = args[0];
	int numberOfCrawlers = Integer.parseInt(args[1]);

	CrawlConfig config = new CrawlConfig();
	config.crawlStorageFolder_$eq(rootFolder);
	config.maxPagesToFetch_$eq(10);
	config.politenessDelay_$eq(1000);

	PageFetcher pageFetcher = new PageFetcher(config);
	RobotsTxtConfig robotstxtConfig = new RobotsTxtConfig();
	RobotsTxtServer robotstxtServer = new RobotsTxtServer(robotstxtConfig, pageFetcher);
	CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

	controller.addSeed("http://www.ics.uci.edu/");
	controller.start(LocalDataCollectorCrawler.class, numberOfCrawlers);

	List<Object> crawlersLocalData = controller.getCrawlersLocalData();
	long totalLinks = 0;
	long totalTextSize = 0;
	int totalProcessedPages = 0;
	for (Object localData : crawlersLocalData) {
		CrawlStat stat = (CrawlStat) localData;
		totalLinks += stat.getTotalLinks();
		totalTextSize += stat.getTotalTextSize();
		totalProcessedPages += stat.getTotalProcessedPages();
	}
	System.out.println("Aggregated Statistics:");
	System.out.println("   Processed Pages: " + totalProcessedPages);
	System.out.println("   Total Links found: " + totalLinks);
	System.out.println("   Total Text Size: " + totalTextSize);
}
 
Author: sapienapps, Project: scrawler, Lines: 39, Source: LocalDataCollectorController.java
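LocalDataCollectorCrawler works because each WebCrawler can expose a per-thread result object via getMyLocalData(), and controller.getCrawlersLocalData() gathers those objects once the blocking start call returns. A sketch of a CrawlStat holder matching the getters used above follows; the increment methods are assumed names, not taken from the project.

// Per-crawler statistics holder; one instance lives inside each crawler thread.
public class CrawlStat {
    private int totalProcessedPages;
    private long totalLinks;
    private long totalTextSize;

    public int getTotalProcessedPages() { return totalProcessedPages; }
    public long getTotalLinks() { return totalLinks; }
    public long getTotalTextSize() { return totalTextSize; }

    public void incProcessedPages() { totalProcessedPages++; }
    public void incTotalLinks(int count) { totalLinks += count; }
    public void incTotalTextSize(int size) { totalTextSize += size; }
}

The crawler then overrides getMyLocalData() to return its CrawlStat instance, which is what the aggregation loop above casts and sums.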

Example 10: main

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
	if (args.length < 3) {
		System.out.println("Needed parameters: ");
		System.out.println("\t rootFolder (it will contain intermediate crawl data)");
		System.out.println("\t numberOfCralwers (number of concurrent threads)");
		System.out.println("\t storageFolder (a folder for storing downloaded images)");
		return;
	}
	String rootFolder = args[0];
	int numberOfCrawlers = Integer.parseInt(args[1]);
	String storageFolder = args[2];

	CrawlConfig config = new CrawlConfig();

	config.setCrawlStorageFolder(rootFolder);

	/*
	 * Since images are binary content, we need to set this parameter to
	 * true to make sure they are included in the crawl.
	 */
	config.setIncludeBinaryContentInCrawling(true);

	String[] crawlDomains = new String[] { "http://uci.edu/" };

	PageFetcher pageFetcher = new PageFetcher(config);
	RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
	RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
	CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
	for (String domain : crawlDomains) {
		controller.addSeed(domain);
	}

	ImageCrawler.configure(crawlDomains, storageFolder);

	controller.start(ImageCrawler.class, numberOfCrawlers);
}
 
Author: Chaiavi, Project: Crawler4j, Lines: 37, Source: ImageCrawlController.java

Example 11: main

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
	if (args.length != 2) {
		System.out.println("Needed parameters: ");
		System.out.println("\t rootFolder (it will contain intermediate crawl data)");
		System.out.println("\t numberOfCralwers (number of concurrent threads)");
		return;
	}
	String rootFolder = args[0];
	int numberOfCrawlers = Integer.parseInt(args[1]);

	CrawlConfig config = new CrawlConfig();
	config.setCrawlStorageFolder(rootFolder);
	config.setMaxPagesToFetch(10);
	config.setPolitenessDelay(1000);

	PageFetcher pageFetcher = new PageFetcher(config);
	RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
	RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
	CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

	controller.addSeed("http://www.ics.uci.edu/");
	controller.start(LocalDataCollectorCrawler.class, numberOfCrawlers);

	List<Object> crawlersLocalData = controller.getCrawlersLocalData();
	long totalLinks = 0;
	long totalTextSize = 0;
	int totalProcessedPages = 0;
	for (Object localData : crawlersLocalData) {
		CrawlStat stat = (CrawlStat) localData;
		totalLinks += stat.getTotalLinks();
		totalTextSize += stat.getTotalTextSize();
		totalProcessedPages += stat.getTotalProcessedPages();
	}
	System.out.println("Aggregated Statistics:");
	System.out.println("   Processed Pages: " + totalProcessedPages);
	System.out.println("   Total Links found: " + totalLinks);
	System.out.println("   Total Text Size: " + totalTextSize);
}
 
Author: Chaiavi, Project: Crawler4j, Lines: 39, Source: LocalDataCollectorController.java

Example 12: main

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
	if (args.length <= 0) {
		System.out.println("You must specify seed url.");
		return;
	}

	ConfigFile configFile = new ConfigFile("settings.cfg");

	// create database connection object
	String dbUrl = configFile.get("dbUrl");
	String dbUser = configFile.get("dbUser");
	String dbPass = configFile.get("dbPass");

	Connection dbConnection = DriverManager.getConnection("jdbc:" + dbUrl,
			dbUser, dbPass);
	System.out.println("Connected to " + dbUrl);

	int numberOfCrawlers = Integer.parseInt(configFile.get("crawlerCount"));
	CrawlConfig config = new CrawlConfig();
	config.setCrawlStorageFolder("storage");
	config.setPolitenessDelay(Integer.parseInt(configFile.get("crawlerPolitenessDelay")));
	config.setIncludeBinaryContentInCrawling(true);
	config.setResumableCrawling(Integer.parseInt(configFile.get("crawlerResumeCrawling"))>0);

	PageFetcher pageFetcher = new PageFetcher(config);
	RobotstxtConfig robotsTxtConfig = new RobotstxtConfig();
	RobotstxtServer robotstxtServer = new RobotstxtServer(robotsTxtConfig,
			pageFetcher);

	try {
		String domainFilter = null;
		CrawlController controller = new CrawlController(config,
				pageFetcher, robotstxtServer);
		controller.addSeed(args[0]);
		ImageCrawler.configure(domainFilter, configFile.get("imagesDir"),
				dbConnection, configFile);
		System.out.println("Starting crawler @" + args[0]);
		controller.start(ImageCrawler.class, numberOfCrawlers);
	} catch (Exception e) {
		// TODO Auto-generated catch block
		e.printStackTrace();
	}
	dbConnection.close();
	System.out.println("Done!");		
}
 
Author: pgorecki, Project: visearch, Lines: 46, Source: CrawlerManager.java


Note: The edu.uci.ics.crawler4j.crawler.CrawlController.start examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets come from open-source projects contributed by their respective authors, and copyright remains with the original authors; consult each project's license before reusing or redistributing the code, and do not republish without permission.