

Java CrawlConfig Class Code Examples

This article collects typical usage examples of the Java class edu.uci.ics.crawler4j.crawler.CrawlConfig. If you are wondering what the CrawlConfig class is for, how to use it, or what working examples look like, the selected code examples below may help.


The CrawlConfig class belongs to the edu.uci.ics.crawler4j.crawler package. The sections below present 15 code examples of the CrawlConfig class, sorted by popularity by default.

Example 1: main

import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
public static void main(String[] args) throws Exception {
  int numberOfCrawlers = 2;
  CrawlConfig config = new CrawlConfig();
  String crawlStorageFolder = "data";
  
  config.setCrawlStorageFolder(crawlStorageFolder);
  config.setPolitenessDelay(500);
  config.setMaxDepthOfCrawling(2);
  config.setMaxPagesToFetch(20);
  config.setIncludeBinaryContentInCrawling(false);

  PageFetcher pageFetcher = new PageFetcher(config);
  RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
  RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
  CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

  controller.addSeed("https://en.wikipedia.org/wiki/Bishop_Rock,_Isles_of_Scilly");

  controller.start(SampleCrawler.class, numberOfCrawlers);
}
 
Developer: PacktPublishing, Project: Machine-Learning-End-to-Endguide-for-Java-developers, Lines: 21, Source file: CrawlerController.java
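
The SampleCrawler class passed to controller.start(...) is not part of this excerpt. As a rough orientation, a minimal WebCrawler subclass might look like the sketch below, assuming a recent crawler4j 4.x release; the class body, the URL filter, and the logging are illustrative assumptions, not code from the PacktPublishing project:

import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class SampleCrawler extends WebCrawler {

    private static final Logger LOG = LoggerFactory.getLogger(SampleCrawler.class);

    // Skip common binary resources so only HTML pages are processed.
    private static final Pattern BINARY_EXTENSIONS =
            Pattern.compile(".*\\.(css|js|gif|jpe?g|png|ico|zip|pdf)$");

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String href = url.getURL().toLowerCase();
        return !BINARY_EXTENSIONS.matcher(href).matches()
                && href.startsWith("https://en.wikipedia.org/");
    }

    @Override
    public void visit(Page page) {
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData html = (HtmlParseData) page.getParseData();
            LOG.info("Visited {} ({} outgoing links)",
                    page.getWebURL().getURL(), html.getOutgoingUrls().size());
        }
    }
}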

Example 2: main

import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
public static void main(String[] args) throws Exception {
	String crawlStorageFolder = "/data/crawl/root";
	int numberOfCrawlers = 7;
	
	CrawlConfig config = new CrawlConfig();
	config.setCrawlStorageFolder(crawlStorageFolder);
	config.setPolitenessDelay(500);
	config.setMaxDepthOfCrawling(2);
	config.setMaxPagesToFetch(1000);
	config.setResumableCrawling(false);
	
	PageFetcher pageFetcher = new PageFetcher(config);
	RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
	RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
	CrawlController crawlController = new CrawlController(config, pageFetcher, robotstxtServer);
	
	crawlController.addSeed("http://www.11st.co.kr/html/main.html");
	crawlController.addSeed("http://www.11st.co.kr/html/category/1930.html");
	
	crawlController.start(MyCrawler.class, numberOfCrawlers);
}
 
Developer: UCJung, Project: javalab, Lines: 22, Source file: MyCrawlerController.java

Example 3: DocIDServer

import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
public DocIDServer(Environment env, CrawlConfig config) throws DatabaseException {
	super(config);
	DatabaseConfig dbConfig = new DatabaseConfig();
	dbConfig.setAllowCreate(true);
	dbConfig.setTransactional(config.isResumableCrawling());
	dbConfig.setDeferredWrite(!config.isResumableCrawling());
	docIDsDB = env.openDatabase(null, "DocIDs", dbConfig);
	if (config.isResumableCrawling()) {
		int docCount = getDocCount();
		if (docCount > 0) {
			logger.info("Loaded {} URLs that had been detected in previous crawl.", docCount);
			lastDocID = docCount;
		}
	} else {
		lastDocID = 0;
	}
}
 
Developer: Chaiavi, Project: Crawler4j, Lines: 18, Source file: DocIDServer.java

Example 4: crawlAndImport

import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
/**
 * This is where everything happens!
 */
private void crawlAndImport() throws Exception {

    CrawlConfig crawlConfig = buildCrawlConfig();
    PageFetcher pageFetcher = new PageFetcher(crawlConfig);
    RobotstxtConfig robotsTxtConfig = new RobotstxtConfig();
    robotsTxtConfig.setEnabled(appConfig.isRespectRobotsTxt());
    RobotstxtServer robotsTxtServer = new RobotstxtServer(robotsTxtConfig, pageFetcher);
    CrawlController crawlController = new CrawlController(crawlConfig, pageFetcher, robotsTxtServer);

    // "dependency injection" into crawlers
    Object[] customData = new Object[] { appConfig, graphImporter };
    crawlController.setCustomData(customData);

    addSeedUrls(crawlController);

    logger.info("Start crawling");

    /*
     * Start the crawl. This is a blocking operation, meaning that your code will reach the line after this only
     * when crawling is finished.
     */
    crawlController.start(HtmlOnlyCrawler.class, appConfig.getNumberOfCrawlers());

    logger.info("Finished crawling");
}
 
Developer: fgavilondo, Project: neo4j-webgraph, Lines: 29, Source file: App.java
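
On the crawler side, the array injected via setCustomData(...) has to be read back from the controller. A minimal sketch of how HtmlOnlyCrawler might do this with the crawler4j 4.x API follows; the field names are assumptions, and setCustomData/getCustomData were later deprecated in favour of a crawler factory:

import edu.uci.ics.crawler4j.crawler.WebCrawler;

public class HtmlOnlyCrawler extends WebCrawler {

    private Object appConfig;       // assumption: an AppConfig instance from the host project
    private Object graphImporter;   // assumption: a GraphImporter instance from the host project

    @Override
    public void onStart() {
        // getMyController() exposes the CrawlController that started this crawler thread.
        Object[] customData = (Object[]) getMyController().getCustomData();
        this.appConfig = customData[0];
        this.graphImporter = customData[1];
    }
}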

Example 5: init

import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
/**
 * Initialize the crawl controller.
 *
 * @param numberOfCrawlers
 *            number of crawler threads
 * @param maxDepthOfCrawling
 *            maximum crawl depth
 * @param maxPagesToFetch
 *            maximum number of pages to fetch
 * @param politenessDelay
 *            politeness delay between requests, in milliseconds
 * @param links
 *            seed links to crawl
 */
public void init(int numberOfCrawlers, int maxDepthOfCrawling, int maxPagesToFetch, int politenessDelay,
		String[] links) {
	this.numberOfCrawlers = numberOfCrawlers;
	CrawlConfig config = new CrawlConfig();
	config.setCrawlStorageFolder(DefaultConfigValues.CRAWL_STORAGE_FOLDER);
	config.setMaxDepthOfCrawling(maxDepthOfCrawling);
	config.setIncludeHttpsPages(true);
	config.setMaxPagesToFetch(maxPagesToFetch);
	config.setIncludeBinaryContentInCrawling(false);
	config.setPolitenessDelay(politenessDelay);
	config.setUserAgentString(DefaultConfigValues.USER_AGENT);
	config.setResumableCrawling(true);

	if (com.zhazhapan.vspider.models.CrawlConfig.getTurnOnProxy().get()) {
		logger.info("open proxy");
		config.setProxyHost(com.zhazhapan.vspider.models.CrawlConfig.getProxyServer().get());
		config.setProxyPort(Formatter.stringToInt(com.zhazhapan.vspider.models.CrawlConfig.getProxyPort().get()));
		config.setProxyUsername(com.zhazhapan.vspider.models.CrawlConfig.getProxyUser().get());
		config.setProxyPassword(com.zhazhapan.vspider.models.CrawlConfig.getProxyPass().get());
	}

	PageFetcher pageFetcher = new PageFetcher(config);
	RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
	robotstxtConfig.setEnabled(false);
	RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
	try {
		controller = new CrawlController(config, pageFetcher, robotstxtServer);
		for (String link : links) {
			if (Checker.isHyperLink(link)) {
				controller.addSeed(link);
			}
		}
		isInited = true;
	} catch (Exception e) {
		logger.error("start to crawl urls error: " + e.getMessage());
	}
}
 
Developer: zhazhapan, Project: visual-spider, Lines: 52, Source file: VsController.java
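
For context, a hypothetical call to init(...) could look like the following; the values are illustrative only, and it is assumed that VsController has a no-argument constructor. Seed links that fail Checker.isHyperLink(...) are silently skipped:

VsController vsController = new VsController();
// 4 crawler threads, crawl depth 2, at most 200 pages, 500 ms politeness delay
vsController.init(4, 2, 200, 500, new String[] {
        "https://example.com/",
        "https://example.org/"
});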

Example 6: main

import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
public static void main(String[] args) throws Exception {
    String crawlStorageFolder = "/data/crawl/root";
    int numberOfCrawlers = 7;

    CrawlConfig config = new CrawlConfig();

    config.setPolitenessDelay(100);

    config.setCrawlStorageFolder(crawlStorageFolder);

    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    /*
     * For each crawl, you need to add some seed urls. These are the first
     * URLs that are fetched and then the crawler starts following links
     * which are found in these pages
     */
    controller.addSeed("https://de.wikipedia.org/wiki/Java_Database_Connectivity");
    controller.addSeed("https://de.wikipedia.org/wiki/Relationale_Datenbank");
    controller.addSeed("https://pt.wikipedia.org/wiki/JDBC");
    controller.addSeed("https://pt.wikipedia.org/wiki/Protocolo");
    controller.addSeed("https://de.wikipedia.org/wiki/Datenbank");

    /*
     * Start the crawl. This is a blocking operation, meaning that your code
     * will reach the line after this only when crawling is finished.
     */

    controller.start(new PostgresCrawlerFactory("jdbc:postgresql://localhost/crawler4j","postgres","postgres"), numberOfCrawlers);
}
 
Developer: rzo1, Project: crawler4j-postgres-sample, Lines: 37, Source file: SampleLauncher.java
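
Unlike the other examples, this one passes a crawler factory rather than a class literal to controller.start(...), so every crawler instance can be constructed with its own JDBC connection. A rough sketch of such a factory is shown below, assuming crawler4j's CrawlController.WebCrawlerFactory interface and a hypothetical PostgresWebCrawler class; the actual class names in the crawler4j-postgres-sample project may differ:

import java.sql.Connection;
import java.sql.DriverManager;

import edu.uci.ics.crawler4j.crawler.CrawlController;

public class PostgresCrawlerFactory implements CrawlController.WebCrawlerFactory<PostgresWebCrawler> {

    private final String jdbcUrl;
    private final String user;
    private final String password;

    public PostgresCrawlerFactory(String jdbcUrl, String user, String password) {
        this.jdbcUrl = jdbcUrl;
        this.user = user;
        this.password = password;
    }

    @Override
    public PostgresWebCrawler newInstance() throws Exception {
        // Each crawler thread gets its own database connection.
        Connection connection = DriverManager.getConnection(jdbcUrl, user, password);
        return new PostgresWebCrawler(connection);  // hypothetical WebCrawler subclass
    }
}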

Example 7: run

import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
public void run(final CrawlerSettings crawlerSettings, final List<Memo> memos) throws Exception {
    CrawlConfig config = crawlerSettings.getCrawlConfig();

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    for (String seed : crawlerSettings.getSeeds()) {
        controller.addSeed(seed);
    }

    ActionsCrawler.configure(memoEntryFinder, memoMatching, servicesContext, memos);
    controller.start(ActionsCrawler.class, crawlerSettings.getNumberOfCrawlers());
}
 
Developer: edu-xored, Project: memorise, Lines: 16, Source file: CrawlerRunner.java

Example 8: executeInternal

import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
@Override
protected void executeInternal(JobExecutionContext jobExecutionContext) throws JobExecutionException {
    //if crawler already has been running, skip it
    if (!crawlerExecuteLock.tryLock())
        return;
    try {
        logger.info("MemoCrawlerJob is running");
        long startTime, workingTime;

        JobDataMap jobDataMap = jobExecutionContext.getJobDetail().getJobDataMap();
        CrawlerRunner crawlerRunner = (CrawlerRunner) jobDataMap.get("crawlRun");

        startTime = System.currentTimeMillis();

        CrawlConfig crawlConfig = setCrawlConfig();
        CrawlerSettings crawlerSettings = new CrawlerSettings(crawlConfig, NUMBER_OF_CRAWLERS, SEEDS);

        crawlerRunner.run(crawlerSettings, MEMOS);

        workingTime = System.currentTimeMillis() - startTime;
        logger.info("Crawler has worked for " + workingTime + " milliseconds");
    } catch (Exception e) {
        logger.info("Crawler crash");
        //TODO: add more info about cause crawler error
        throw new JobExecutionException("Crawler error");
    } finally {
        crawlerExecuteLock.unlock();
    }
}
 
Developer: edu-xored, Project: memorise, Lines: 30, Source file: MemoCrawlerJob.java
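
The job above pulls its CrawlerRunner out of the Quartz JobDataMap under the key "crawlRun". How that entry gets there is not shown in the excerpt; a minimal sketch of wiring the job up with plain Quartz is given below. The scheduling class and the hourly trigger are illustrative assumptions, not code from the memorise project:

import org.quartz.JobBuilder;
import org.quartz.JobDetail;
import org.quartz.Scheduler;
import org.quartz.SimpleScheduleBuilder;
import org.quartz.Trigger;
import org.quartz.TriggerBuilder;
import org.quartz.impl.StdSchedulerFactory;

public class MemoCrawlerJobScheduler {

    public static void schedule(CrawlerRunner crawlerRunner) throws Exception {
        JobDetail job = JobBuilder.newJob(MemoCrawlerJob.class)
                .withIdentity("memoCrawlerJob")
                .build();
        // The job reads this entry back via jobExecutionContext.getJobDetail().getJobDataMap().
        job.getJobDataMap().put("crawlRun", crawlerRunner);

        Trigger trigger = TriggerBuilder.newTrigger()
                .withIdentity("memoCrawlerTrigger")
                .startNow()
                .withSchedule(SimpleScheduleBuilder.repeatHourlyForever())
                .build();

        Scheduler scheduler = StdSchedulerFactory.getDefaultScheduler();
        scheduler.scheduleJob(job, trigger);
        scheduler.start();
    }
}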

Example 9: setCrawlConfig

import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
private CrawlConfig setCrawlConfig() {
    CrawlConfig crawlConfig = new CrawlConfig();
    crawlConfig.setCrawlStorageFolder(CRAWL_TEMP_DIR);
    crawlConfig.setMaxDepthOfCrawling(MAX_DEPTH_OF_CRAWLING);
    crawlConfig.setMaxPagesToFetch(MAX_PAGES_TO_FETCH);
    return crawlConfig;
}
 
Developer: edu-xored, Project: memorise, Lines: 8, Source file: MemoCrawlerJob.java

Example 10: setUp

import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
@Before
public void setUp() throws Exception {
    ArrayList<String> seeds = new ArrayList<String>();
    seeds.add("http://www.eurosport.ru/football/champions-league/2016-2017/story_sto5959402.shtml");
    CrawlConfig crawlConfig = setCrawlConfig();
    crawlerSettings = new CrawlerSettings(crawlConfig, 1, seeds);
    Memo memo = new Memo();
    memo.setTitle("Ростов");
    memos = Arrays.asList(memo);
}
 
Developer: edu-xored, Project: memorise, Lines: 11, Source file: CrawlerRunnerTest.java

Example 11: setCrawlConfig

import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
private CrawlConfig setCrawlConfig() {
    CrawlConfig crawlConfig = new CrawlConfig();
    crawlConfig.setCrawlStorageFolder("src/resources/test/crawlerTemporaryDirectory");
    crawlConfig.setMaxDepthOfCrawling(1);
    crawlConfig.setMaxPagesToFetch(1);
    return crawlConfig;
}
 
Developer: edu-xored, Project: memorise, Lines: 8, Source file: CrawlerRunnerTest.java

Example 12: main

import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        return;
    }

    String crawlStorageFolder = args[0];
    int numberOfCrawlers = 1;

    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorageFolder);

    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    /*
     * For each crawl, you need to add some seed urls. These are the first
     * URLs that are fetched and then the crawler starts following links
     * which are found in these pages
     */
    controller.addSeed("http://www.senado.leg.br/senadores/default.asp");

    /*
     * Start the crawl. This is a blocking operation, meaning that your code
     * will reach the line after this only when crawling is finished.
     */
    controller.start(SenatorsCrawler.class, numberOfCrawlers);
}
 
Developer: TekkLabs, Project: memoria-politica, Lines: 33, Source file: CrawlerController.java

Example 13: main

import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
public static void main(String[] args) throws Exception {
    String crawlStorageFolder = "D:\\etc\\storage";
    int numberOfCrawlers = 7;

    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorageFolder);

    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    /*
     * For each crawl, you need to add some seed urls. These are the first
     * URLs that are fetched and then the crawler starts following links
     * which are found in these pages
     */
    controller.addSeed("http://www.ics.uci.edu/~lopes/");
    controller.addSeed("http://www.ics.uci.edu/~welling/");
    controller.addSeed("http://www.ics.uci.edu/");

    /*
     * Start the crawl. This is a blocking operation, meaning that your code
     * will reach the line after this only when crawling is finished.
     */
    controller.start(MyCrawler.class, numberOfCrawlers);
}
 
Developer: vjymits, Project: musicFinder, Lines: 31, Source file: MyCrawler.java

Example 14: Counters

import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
public Counters(Environment env, CrawlConfig config) throws DatabaseException {
	super(config);

	this.env = env;
	this.counterValues = new HashMap<>();

	/*
	 * When crawling is set to be resumable, we have to keep the statistics
	 * in a transactional database to make sure they are not lost if crawler
	 * is crashed or terminated unexpectedly.
	 */
	if (config.isResumableCrawling()) {
		DatabaseConfig dbConfig = new DatabaseConfig();
		dbConfig.setAllowCreate(true);
		dbConfig.setTransactional(true);
		dbConfig.setDeferredWrite(false);
		statisticsDB = env.openDatabase(null, "Statistics", dbConfig);
		
		OperationStatus result;
		DatabaseEntry key = new DatabaseEntry();
		DatabaseEntry value = new DatabaseEntry();
		Transaction tnx = env.beginTransaction(null, null);
		Cursor cursor = statisticsDB.openCursor(tnx, null);
		result = cursor.getFirst(key, value, null);

		while (result == OperationStatus.SUCCESS) {
			if (value.getData().length > 0) {
				String name = new String(key.getData());
				long counterValue = Util.byteArray2Long(value.getData());
				counterValues.put(name, counterValue);
			}
			result = cursor.getNext(key, value, null);
		}
		cursor.close();
		tnx.commit();
	}
}
 
Developer: Chaiavi, Project: Crawler4j, Lines: 38, Source file: Counters.java

Example 15: Frontier

import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
public Frontier(Environment env, CrawlConfig config, DocIDServer docIdServer) {
	super(config);
	this.counters = new Counters(env, config);
	this.docIdServer = docIdServer;
	try {
		workQueues = new WorkQueues(env, "PendingURLsDB", config.isResumableCrawling());
		if (config.isResumableCrawling()) {
			scheduledPages = counters.getValue(ReservedCounterNames.SCHEDULED_PAGES);
			inProcessPages = new InProcessPagesDB(env);
			long numPreviouslyInProcessPages = inProcessPages.getLength();
			if (numPreviouslyInProcessPages > 0) {
				logger.info("Rescheduling {} URLs from previous crawl.", numPreviouslyInProcessPages);
				scheduledPages -= numPreviouslyInProcessPages;
				while (true) {
					List<WebURL> urls = inProcessPages.get(100);
					if (urls.size() == 0) {
						break;
					}
					scheduleAll(urls);
					inProcessPages.delete(urls.size());
				}
			}
		} else {
			inProcessPages = null;
			scheduledPages = 0;
		}
	} catch (DatabaseException e) {
		logger.error("Error while initializing the Frontier: {}", e.getMessage());
		workQueues = null;
	}
}
 
Developer: Chaiavi, Project: Crawler4j, Lines: 32, Source file: Frontier.java


Note: The edu.uci.ics.crawler4j.crawler.CrawlConfig class examples in this article were compiled by 纯净天空 from open-source code and documentation hosted on GitHub, MSDocs, and similar platforms. The snippets were selected from open-source projects contributed by their respective developers; copyright remains with the original authors. Please follow the corresponding project's license when distributing or using the code, and do not republish without permission.