Java CrawlConfig.setResumableCrawling方法代码示例

本文整理汇总了Java中edu.uci.ics.crawler4j.crawler.CrawlConfig.setResumableCrawling方法的典型用法代码示例。如果您正苦于以下问题：Java CrawlConfig.setResumableCrawling方法的具体用法？Java CrawlConfig.setResumableCrawling怎么用？Java CrawlConfig.setResumableCrawling使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类edu.uci.ics.crawler4j.crawler.CrawlConfig的用法示例。

在下文中一共展示了CrawlConfig.setResumableCrawling方法的4个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: main

import edu.uci.ics.crawler4j.crawler.CrawlConfig; //导入方法依赖的package包/类
/**
 * Entry point: builds a non-resumable crawl configuration, registers two
 * seed URLs and starts the crawl with {@code MyCrawler}, passing a crawler
 * count of 7.
 *
 * @param args command-line arguments (not read by this method)
 * @throws Exception propagated from the crawler set-up or start calls
 */
public static void main(String[] args) throws Exception {
	final String storageDir = "/data/crawl/root";
	final int crawlerCount = 7;
	final String[] seedUrls = {
		"http://www.11st.co.kr/html/main.html",
		"http://www.11st.co.kr/html/category/1930.html"
	};

	// Crawl limits; resumable crawling is explicitly switched off.
	CrawlConfig crawlConfig = new CrawlConfig();
	crawlConfig.setResumableCrawling(false);
	crawlConfig.setMaxPagesToFetch(1000);
	crawlConfig.setMaxDepthOfCrawling(2);
	crawlConfig.setPolitenessDelay(500);
	crawlConfig.setCrawlStorageFolder(storageDir);

	// The fetcher is shared by the robots.txt server and the controller.
	PageFetcher fetcher = new PageFetcher(crawlConfig);
	RobotstxtServer robotsServer = new RobotstxtServer(new RobotstxtConfig(), fetcher);
	CrawlController controller = new CrawlController(crawlConfig, fetcher, robotsServer);

	for (String seedUrl : seedUrls) {
		controller.addSeed(seedUrl);
	}

	controller.start(MyCrawler.class, crawlerCount);
}

开发者ID:UCJung，项目名称:javalab，代码行数:22，代码来源:MyCrawlerController.java

示例2: init

import edu.uci.ics.crawler4j.crawler.CrawlConfig; //导入方法依赖的package包/类
/**
 * Initializes the crawl controller: builds a resumable crawl configuration
 * (optionally routed through the configured proxy), creates the controller
 * with robots.txt handling disabled, and registers every valid link as a seed.
 * <p>
 * {@code isInited} is set to {@code true} only when the controller was
 * created and all seeds were processed without an exception; failures are
 * logged and not rethrown.
 *
 * @param numberOfCrawlers
 *            number of crawler threads
 * @param maxDepthOfCrawling
 *            crawl depth
 * @param maxPagesToFetch
 *            maximum number of pages to fetch
 * @param politenessDelay
 *            delay between requests
 * @param links
 *            links to crawl; entries that are not hyperlinks are skipped
 */
public void init(int numberOfCrawlers, int maxDepthOfCrawling, int maxPagesToFetch, int politenessDelay,
		String[] links) {
	this.numberOfCrawlers = numberOfCrawlers;
	CrawlConfig config = new CrawlConfig();
	config.setCrawlStorageFolder(DefaultConfigValues.CRAWL_STORAGE_FOLDER);
	config.setMaxDepthOfCrawling(maxDepthOfCrawling);
	config.setIncludeHttpsPages(true);
	config.setMaxPagesToFetch(maxPagesToFetch);
	config.setIncludeBinaryContentInCrawling(false);
	config.setPolitenessDelay(politenessDelay);
	config.setUserAgentString(DefaultConfigValues.USER_AGENT);
	config.setResumableCrawling(true);

	// Proxy settings come from the application's own CrawlConfig model, which
	// shares its simple name with crawler4j's class and is therefore fully qualified.
	if (com.zhazhapan.vspider.models.CrawlConfig.getTurnOnProxy().get()) {
		logger.info("open proxy");
		config.setProxyHost(com.zhazhapan.vspider.models.CrawlConfig.getProxyServer().get());
		config.setProxyPort(Formatter.stringToInt(com.zhazhapan.vspider.models.CrawlConfig.getProxyPort().get()));
		config.setProxyUsername(com.zhazhapan.vspider.models.CrawlConfig.getProxyUser().get());
		config.setProxyPassword(com.zhazhapan.vspider.models.CrawlConfig.getProxyPass().get());
	}

	PageFetcher pageFetcher = new PageFetcher(config);
	RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
	robotstxtConfig.setEnabled(false);
	RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
	try {
		controller = new CrawlController(config, pageFetcher, robotstxtServer);
		for (String link : links) {
			if (Checker.isHyperLink(link)) {
				controller.addSeed(link);
			}
		}
		isInited = true;
	} catch (Exception e) {
		// Pass the exception itself so the stack trace and cause are preserved in the log.
		logger.error("start to crawl urls error: " + e.getMessage(), e);
	}
}

开发者ID:zhazhapan，项目名称:visual-spider，代码行数:52，代码来源:VsController.java

示例3: main

import edu.uci.ics.crawler4j.crawler.CrawlConfig; //导入方法依赖的package包/类
/**
 * Entry point: reads {@code settings.cfg}, opens the database connection,
 * and runs an {@code ImageCrawler} crawl seeded with the first command-line
 * argument.
 * <p>
 * The database connection is held in a try-with-resources block so it is
 * closed even when configuration parsing or crawler set-up throws.
 *
 * @param args command-line arguments; {@code args[0]} is the seed URL
 * @throws Exception if the configuration cannot be read or parsed, the
 *             database connection cannot be opened or closed, or crawler
 *             set-up before the crawl itself fails
 */
public static void main(String[] args) throws Exception {
	if (args.length <= 0) {
		System.out.println("You must specify seed url.");
		return;
	}

	ConfigFile configFile = new ConfigFile("settings.cfg");

	// database connection settings
	String dbUrl = configFile.get("dbUrl");
	String dbUser = configFile.get("dbUser");
	String dbPass = configFile.get("dbPass");

	// try-with-resources guarantees the connection is released on every exit path,
	// including failures in the set-up code that runs before the crawl starts.
	try (Connection dbConnection = DriverManager.getConnection("jdbc:" + dbUrl,
			dbUser, dbPass)) {
		System.out.println("Connected to " + dbUrl);

		int numberOfCrawlers = Integer.parseInt(configFile.get("crawlerCount"));
		CrawlConfig config = new CrawlConfig();
		config.setCrawlStorageFolder("storage");
		config.setPolitenessDelay(Integer.parseInt(configFile.get("crawlerPolitenessDelay")));
		config.setIncludeBinaryContentInCrawling(true);
		// Any positive integer in the config file enables resumable crawling.
		config.setResumableCrawling(Integer.parseInt(configFile.get("crawlerResumeCrawling")) > 0);

		PageFetcher pageFetcher = new PageFetcher(config);
		RobotstxtConfig robotsTxtConfig = new RobotstxtConfig();
		RobotstxtServer robotstxtServer = new RobotstxtServer(robotsTxtConfig,
				pageFetcher);

		try {
			String domainFilter = null;
			CrawlController controller = new CrawlController(config,
					pageFetcher, robotstxtServer);
			controller.addSeed(args[0]);
			ImageCrawler.configure(domainFilter, configFile.get("imagesDir"),
					dbConnection, configFile);
			System.out.println("Starting crawler @" + args[0]);
			controller.start(ImageCrawler.class, numberOfCrawlers);
		} catch (Exception e) {
			// Deliberate best-effort: report the crawl failure, then fall through
			// so the connection is closed and the final message is printed.
			e.printStackTrace();
		}
	}
	System.out.println("Done!");
}

开发者ID:pgorecki，项目名称:visearch，代码行数:46，代码来源:CrawlerManager.java

示例4: buildCrawlConfig

import edu.uci.ics.crawler4j.crawler.CrawlConfig; //导入方法依赖的package包/类
/**
 * Creates a crawler4j {@code CrawlConfig} populated from {@code appConfig}.
 * Proxy host/port, username and password are applied only when the
 * corresponding application setting is non-null.
 *
 * @return the populated crawl configuration
 */
private CrawlConfig buildCrawlConfig() {
    final CrawlConfig cfg = new CrawlConfig();

    // Storage location and content filters.
    cfg.setCrawlStorageFolder(appConfig.getCrawlStorageDir());
    cfg.setIncludeBinaryContentInCrawling(appConfig.isIncludeBinaryContentInCrawling());
    cfg.setIncludeHttpsPages(appConfig.isIncludeHttpsPages());

    // Politeness delay between requests (be polite: 200 ms minimum).
    cfg.setPolitenessDelay(appConfig.getPolitenessDelay());

    // Maximum crawl depth; the default of -1 means unlimited depth.
    cfg.setMaxDepthOfCrawling(appConfig.getMaxDepthOfCrawling());

    // Maximum number of pages to crawl; the default of -1 means unlimited.
    cfg.setMaxPagesToFetch(appConfig.getMaxPagesToFetch());

    // Resumable crawling lets an interrupted/crashed crawl be continued later.
    // Note: with resuming enabled, starting a fresh crawl requires manually
    // deleting the contents of rootFolder.
    cfg.setResumableCrawling(appConfig.isResumableImport());

    // Connection pool limits.
    cfg.setMaxConnectionsPerHost(appConfig.getMaxConnectionsPerHost());
    cfg.setMaxTotalConnections(appConfig.getMaxTotalConnections());

    // Optional proxy settings; the port is only applied together with a host.
    final String proxyHost = appConfig.getProxyHost();
    if (proxyHost != null) {
        cfg.setProxyHost(proxyHost);
        cfg.setProxyPort(appConfig.getProxyPort());
    }

    final String proxyUsername = appConfig.getProxyUsername();
    if (proxyUsername != null) {
        cfg.setProxyUsername(proxyUsername);
    }

    final String proxyPassword = appConfig.getProxyPassword();
    if (proxyPassword != null) {
        cfg.setProxyPassword(proxyPassword);
    }

    return cfg;
}

开发者ID:fgavilondo，项目名称:neo4j-webgraph，代码行数:46，代码来源:App.java

注：本文中的edu.uci.ics.crawler4j.crawler.CrawlConfig.setResumableCrawling方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。