This article compiles typical usage examples of the Java class edu.uci.ics.crawler4j.crawler.CrawlConfig. If you are unsure what the CrawlConfig class is for, or how and where to use it, the curated class code examples below may help.
The CrawlConfig class belongs to the edu.uci.ics.crawler4j.crawler package. 15 code examples of the class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
Example 1: main
import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
public static void main(String[] args) throws Exception {
int numberOfCrawlers = 2;
CrawlConfig config = new CrawlConfig();
String crawlStorageFolder = "data";
config.setCrawlStorageFolder(crawlStorageFolder);
config.setPolitenessDelay(500);
config.setMaxDepthOfCrawling(2);
config.setMaxPagesToFetch(20);
config.setIncludeBinaryContentInCrawling(false);
PageFetcher pageFetcher = new PageFetcher(config);
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
controller.addSeed("https://en.wikipedia.org/wiki/Bishop_Rock,_Isles_of_Scilly");
controller.start(SampleCrawler.class, numberOfCrawlers);
}
Author: PacktPublishing, Project: Machine-Learning-End-to-Endguide-for-Java-developers, Lines: 21, Source: CrawlerController.java
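Example 1 starts SampleCrawler, whose source is not shown on this page. As a rough, hypothetical sketch (the class body and the filter regex are assumptions, not the original SampleCrawler), a minimal crawler4j crawler extends WebCrawler and overrides shouldVisit and visit:

import java.util.regex.Pattern;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class SampleCrawler extends WebCrawler {

    // Skip common binary resources (assumed filter, not from the original example)
    private static final Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpe?g|png|zip|pdf))$");

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String href = url.getURL().toLowerCase();
        return !FILTERS.matcher(href).matches();
    }

    @Override
    public void visit(Page page) {
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            logger.info("Visited {} ({} outgoing links)",
                    page.getWebURL().getURL(), htmlParseData.getOutgoingUrls().size());
        }
    }
}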
Example 2: main
import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
public static void main(String[] args) throws Exception {
String crawlStorageFolder = "/data/crawl/root";
int numberOfCrawlers = 7;
CrawlConfig config = new CrawlConfig();
config.setCrawlStorageFolder(crawlStorageFolder);
config.setPolitenessDelay(500);
config.setMaxDepthOfCrawling(2);
config.setMaxPagesToFetch(1000);
config.setResumableCrawling(false);
PageFetcher pageFetcher = new PageFetcher(config);
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
CrawlController crawlController = new CrawlController(config, pageFetcher, robotstxtServer);
crawlController.addSeed("http://www.11st.co.kr/html/main.html");
crawlController.addSeed("http://www.11st.co.kr/html/category/1930.html");
crawlController.start(MyCrawler.class, numberOfCrawlers);
}
Example 3: DocIDServer
import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
public DocIDServer(Environment env, CrawlConfig config) throws DatabaseException {
super(config);
DatabaseConfig dbConfig = new DatabaseConfig();
dbConfig.setAllowCreate(true);
dbConfig.setTransactional(config.isResumableCrawling());
dbConfig.setDeferredWrite(!config.isResumableCrawling());
docIDsDB = env.openDatabase(null, "DocIDs", dbConfig);
if (config.isResumableCrawling()) {
int docCount = getDocCount();
if (docCount > 0) {
logger.info("Loaded {} URLs that had been detected in previous crawl.", docCount);
lastDocID = docCount;
}
} else {
lastDocID = 0;
}
}
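The Environment passed into DocIDServer is a Berkeley DB JE environment that crawler4j normally creates from the crawl storage folder inside CrawlController. A minimal sketch of that setup, assuming the standard JE API (the "frontier" subfolder and variable names are assumptions):

import java.io.File;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;

File envHome = new File(config.getCrawlStorageFolder(), "frontier");
envHome.mkdirs();

EnvironmentConfig envConfig = new EnvironmentConfig();
envConfig.setAllowCreate(true);
// Transactions and locking are only needed when the crawl must be resumable
envConfig.setTransactional(config.isResumableCrawling());
envConfig.setLocking(config.isResumableCrawling());

Environment env = new Environment(envHome, envConfig);
DocIDServer docIdServer = new DocIDServer(env, config);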
Example 4: crawlAndImport
import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
/**
* This is where everything happens!
*/
private void crawlAndImport() throws Exception {
CrawlConfig crawlConfig = buildCrawlConfig();
PageFetcher pageFetcher = new PageFetcher(crawlConfig);
RobotstxtConfig robotsTxtConfig = new RobotstxtConfig();
robotsTxtConfig.setEnabled(appConfig.isRespectRobotsTxt());
RobotstxtServer robotsTxtServer = new RobotstxtServer(robotsTxtConfig, pageFetcher);
CrawlController crawlController = new CrawlController(crawlConfig, pageFetcher, robotsTxtServer);
// "dependency injection" into crawlers
Object[] customData = new Object[] { appConfig, graphImporter };
crawlController.setCustomData(customData);
addSeedUrls(crawlController);
logger.info("Start crawling");
/*
* Start the crawl. This is a blocking operation, meaning that your code will reach the line after this only
* when crawling is finished.
*/
crawlController.start(HtmlOnlyCrawler.class, appConfig.getNumberOfCrawlers());
logger.info("Finished crawling");
}
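Example 4 passes an Object[] to setCustomData so that every crawler instance can reach the application config and the graph importer. A hypothetical sketch of how HtmlOnlyCrawler might read it back (the field types and names are assumptions, and newer crawler4j versions deprecate customData in favour of a custom WebCrawlerFactory):

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;

public class HtmlOnlyCrawler extends WebCrawler {

    private Object appConfig;      // assumed types; the real classes are not shown here
    private Object graphImporter;

    @Override
    public void onStart() {
        // customData was set on the controller in crawlAndImport()
        Object[] customData = (Object[]) getMyController().getCustomData();
        this.appConfig = customData[0];
        this.graphImporter = customData[1];
    }

    @Override
    public void visit(Page page) {
        // use appConfig / graphImporter to import the visited page into the graph
    }
}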
Example 5: init
import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
/**
 * Initializes the crawler.
 *
 * @param numberOfCrawlers
 *            number of crawler threads
 * @param maxDepthOfCrawling
 *            maximum crawl depth
 * @param maxPagesToFetch
 *            maximum number of pages to fetch
 * @param politenessDelay
 *            politeness delay in milliseconds
 * @param links
 *            seed links to crawl
 */
public void init(int numberOfCrawlers, int maxDepthOfCrawling, int maxPagesToFetch, int politenessDelay,
String[] links) {
this.numberOfCrawlers = numberOfCrawlers;
CrawlConfig config = new CrawlConfig();
config.setCrawlStorageFolder(DefaultConfigValues.CRAWL_STORAGE_FOLDER);
config.setMaxDepthOfCrawling(maxDepthOfCrawling);
config.setIncludeHttpsPages(true);
config.setMaxPagesToFetch(maxPagesToFetch);
config.setIncludeBinaryContentInCrawling(false);
config.setPolitenessDelay(politenessDelay);
config.setUserAgentString(DefaultConfigValues.USER_AGENT);
config.setResumableCrawling(true);
if (com.zhazhapan.vspider.models.CrawlConfig.getTurnOnProxy().get()) {
logger.info("open proxy");
config.setProxyHost(com.zhazhapan.vspider.models.CrawlConfig.getProxyServer().get());
config.setProxyPort(Formatter.stringToInt(com.zhazhapan.vspider.models.CrawlConfig.getProxyPort().get()));
config.setProxyUsername(com.zhazhapan.vspider.models.CrawlConfig.getProxyUser().get());
config.setProxyPassword(com.zhazhapan.vspider.models.CrawlConfig.getProxyPass().get());
}
PageFetcher pageFetcher = new PageFetcher(config);
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
robotstxtConfig.setEnabled(false);
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
try {
controller = new CrawlController(config, pageFetcher, robotstxtServer);
for (String link : links) {
if (Checker.isHyperLink(link)) {
controller.addSeed(link);
}
}
isInited = true;
} catch (Exception e) {
logger.error("start to crawl urls error: " + e.getMessage());
}
}
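Once init() has succeeded, the stored controller can be started. A minimal usage sketch under the assumption that a crawler class (here called VsCrawler, not shown in the original code) exists in the project:

// non-blocking start so the calling (e.g. UI) thread is not blocked while crawling
public void start() {
    if (isInited) {
        controller.startNonBlocking(VsCrawler.class, numberOfCrawlers);
    }
}

// block until all crawler threads have finished
public void waitUntilFinished() {
    controller.waitUntilFinish();
}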
Example 6: main
import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
public static void main(String[] args) throws Exception {
String crawlStorageFolder = "/data/crawl/root";
int numberOfCrawlers = 7;
CrawlConfig config = new CrawlConfig();
config.setPolitenessDelay(100);
config.setCrawlStorageFolder(crawlStorageFolder);
/*
* Instantiate the controller for this crawl.
*/
PageFetcher pageFetcher = new PageFetcher(config);
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
/*
* For each crawl, you need to add some seed urls. These are the first
* URLs that are fetched and then the crawler starts following links
* which are found in these pages
*/
controller.addSeed("https://de.wikipedia.org/wiki/Java_Database_Connectivity");
controller.addSeed("https://de.wikipedia.org/wiki/Relationale_Datenbank");
controller.addSeed("https://pt.wikipedia.org/wiki/JDBC");
controller.addSeed("https://pt.wikipedia.org/wiki/Protocolo");
controller.addSeed("https://de.wikipedia.org/wiki/Datenbank");
/*
* Start the crawl. This is a blocking operation, meaning that your code
* will reach the line after this only when crawling is finished.
*/
controller.start(new PostgresCrawlerFactory("jdbc:postgresql://localhost/crawler4j","postgres","postgres"), numberOfCrawlers);
}
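Example 6 starts the crawl with a factory instead of a crawler class, which is how crawler4j (4.2+) lets you construct crawlers that need constructor arguments such as a database connection. A hypothetical sketch of what a factory like PostgresCrawlerFactory could look like (the PostgresCrawler class and its constructor are assumptions; the real implementation is not shown here):

import java.sql.DriverManager;
import edu.uci.ics.crawler4j.crawler.CrawlController;

public class PostgresCrawlerFactory implements CrawlController.WebCrawlerFactory<PostgresCrawler> {

    private final String jdbcUrl;
    private final String user;
    private final String password;

    public PostgresCrawlerFactory(String jdbcUrl, String user, String password) {
        this.jdbcUrl = jdbcUrl;
        this.user = user;
        this.password = password;
    }

    @Override
    public PostgresCrawler newInstance() throws Exception {
        // each crawler thread gets its own database connection
        return new PostgresCrawler(DriverManager.getConnection(jdbcUrl, user, password));
    }
}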
Example 7: run
import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
public void run(final CrawlerSettings crawlerSettings, final List<Memo> memos) throws Exception {
CrawlConfig config = crawlerSettings.getCrawlConfig();
PageFetcher pageFetcher = new PageFetcher(config);
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
for (String seed : crawlerSettings.getSeeds()) {
controller.addSeed(seed);
}
ActionsCrawler.configure(memoEntryFinder, memoMatching, servicesContext, memos);
controller.start(ActionsCrawler.class, crawlerSettings.getNumberOfCrawlers());
}
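Example 7 wires its dependencies into the crawler through a static configure method rather than through setCustomData. A hypothetical sketch of that pattern, inferred only from the call in run() (the field types are assumptions; the real classes are not shown):

import java.util.List;
import edu.uci.ics.crawler4j.crawler.WebCrawler;

public class ActionsCrawler extends WebCrawler {

    // shared by all crawler instances; set once before CrawlController.start()
    private static Object memoEntryFinder;
    private static Object memoMatching;
    private static Object servicesContext;
    private static List<Memo> memos;

    public static void configure(Object finder, Object matching, Object services, List<Memo> memoList) {
        memoEntryFinder = finder;
        memoMatching = matching;
        servicesContext = services;
        memos = memoList;
    }
}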
Example 8: executeInternal
import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
@Override
protected void executeInternal(JobExecutionContext jobExecutionContext) throws JobExecutionException {
//if crawler already has been running, skip it
if (!crawlerExecuteLock.tryLock())
return;
try {
logger.info("MemoCrawlerJob is running");
long startTime, workingTime;
JobDataMap jobDataMap = jobExecutionContext.getJobDetail().getJobDataMap();
CrawlerRunner crawlerRunner = (CrawlerRunner) jobDataMap.get("crawlRun");
startTime = System.currentTimeMillis();
CrawlConfig crawlConfig = setCrawlConfig();
CrawlerSettings crawlerSettings = new CrawlerSettings(crawlConfig, NUMBER_OF_CRAWLERS, SEEDS);
crawlerRunner.run(crawlerSettings, MEMOS);
workingTime = System.currentTimeMillis() - startTime;
logger.info("Crawler has worked for " + workingTime + " milliseconds");
} catch (Exception e) {
logger.info("Crawler crash");
//TODO: include more information about the cause of the crawler error
throw new JobExecutionException("Crawler error");
} finally {
crawlerExecuteLock.unlock();
}
}
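Example 8 runs the crawl from a Quartz job. In the original project this is probably wired through Spring (executeInternal comes from QuartzJobBean), but as a plain-Quartz sketch the scheduling could look like this; the job key, the hourly interval, and the way crawlRun is placed into the JobDataMap are assumptions:

import org.quartz.JobBuilder;
import org.quartz.JobDetail;
import org.quartz.Scheduler;
import org.quartz.SchedulerException;
import org.quartz.SimpleScheduleBuilder;
import org.quartz.Trigger;
import org.quartz.TriggerBuilder;
import org.quartz.impl.StdSchedulerFactory;

public static void schedule(CrawlerRunner crawlerRunner) throws SchedulerException {
    Scheduler scheduler = StdSchedulerFactory.getDefaultScheduler();

    JobDetail job = JobBuilder.newJob(MemoCrawlerJob.class)
            .withIdentity("memoCrawlerJob")
            .build();
    // the job reads this entry back via jobDataMap.get("crawlRun")
    job.getJobDataMap().put("crawlRun", crawlerRunner);

    Trigger trigger = TriggerBuilder.newTrigger()
            .startNow()
            .withSchedule(SimpleScheduleBuilder.repeatHourlyForever())
            .build();

    scheduler.scheduleJob(job, trigger);
    scheduler.start();
}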
Example 9: setCrawlConfig
import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
private CrawlConfig setCrawlConfig() {
CrawlConfig crawlConfig = new CrawlConfig();
crawlConfig.setCrawlStorageFolder(CRAWL_TEMP_DIR);
crawlConfig.setMaxDepthOfCrawling(MAX_DEPTH_OF_CRAWLING);
crawlConfig.setMaxPagesToFetch(MAX_PAGES_TO_FETCH);
return crawlConfig;
}
Example 10: setUp
import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
@Before
public void setUp() throws Exception {
ArrayList<String> seeds = new ArrayList<String>();
seeds.add("http://www.eurosport.ru/football/champions-league/2016-2017/story_sto5959402.shtml");
CrawlConfig crawlConfig = setCrawlConfig();
crawlerSettings = new CrawlerSettings(crawlConfig, 1, seeds);
Memo memo = new Memo();
memo.setTitle("Ростов");
memos = Arrays.asList(memo);
}
Example 11: setCrawlConfig
import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
private CrawlConfig setCrawlConfig() {
CrawlConfig crawlConfig = new CrawlConfig();
crawlConfig.setCrawlStorageFolder("src/resources/test/crawlerTemporaryDirectory");
crawlConfig.setMaxDepthOfCrawling(1);
crawlConfig.setMaxPagesToFetch(1);
return crawlConfig;
}
Example 12: main
import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
public static void main(String[] args) throws Exception {
if (args.length != 1) {
return;
}
String crawlStorageFolder = args[0];
int numberOfCrawlers = 1;
CrawlConfig config = new CrawlConfig();
config.setCrawlStorageFolder(crawlStorageFolder);
/*
* Instantiate the controller for this crawl.
*/
PageFetcher pageFetcher = new PageFetcher(config);
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
/*
* For each crawl, you need to add some seed urls. These are the first
* URLs that are fetched and then the crawler starts following links
* which are found in these pages
*/
controller.addSeed("http://www.senado.leg.br/senadores/default.asp");
/*
* Start the crawl. This is a blocking operation, meaning that your code
* will reach the line after this only when crawling is finished.
*/
controller.start(SenatorsCrawler.class, numberOfCrawlers);
}
Example 13: main
import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
public static void main(String[] args) throws Exception {
String crawlStorageFolder = "D:\\etc\\storage";
int numberOfCrawlers = 7;
CrawlConfig config = new CrawlConfig();
config.setCrawlStorageFolder(crawlStorageFolder);
/*
* Instantiate the controller for this crawl.
*/
PageFetcher pageFetcher = new PageFetcher(config);
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
/*
* For each crawl, you need to add some seed urls. These are the first
* URLs that are fetched and then the crawler starts following links
* which are found in these pages
*/
controller.addSeed("http://www.ics.uci.edu/~lopes/");
controller.addSeed("http://www.ics.uci.edu/~welling/");
controller.addSeed("http://www.ics.uci.edu/");
/*
* Start the crawl. This is a blocking operation, meaning that your code
* will reach the line after this only when crawling is finished.
*/
controller.start(MyCrawler.class, numberOfCrawlers);
}
Example 14: Counters
import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
public Counters(Environment env, CrawlConfig config) throws DatabaseException {
super(config);
this.env = env;
this.counterValues = new HashMap<>();
/*
* When crawling is set to be resumable, we have to keep the statistics
* in a transactional database to make sure they are not lost if crawler
* is crashed or terminated unexpectedly.
*/
if (config.isResumableCrawling()) {
DatabaseConfig dbConfig = new DatabaseConfig();
dbConfig.setAllowCreate(true);
dbConfig.setTransactional(true);
dbConfig.setDeferredWrite(false);
statisticsDB = env.openDatabase(null, "Statistics", dbConfig);
OperationStatus result;
DatabaseEntry key = new DatabaseEntry();
DatabaseEntry value = new DatabaseEntry();
Transaction tnx = env.beginTransaction(null, null);
Cursor cursor = statisticsDB.openCursor(tnx, null);
result = cursor.getFirst(key, value, null);
while (result == OperationStatus.SUCCESS) {
if (value.getData().length > 0) {
String name = new String(key.getData());
long counterValue = Util.byteArray2Long(value.getData());
counterValues.put(name, new Long(counterValue));
}
result = cursor.getNext(key, value, null);
}
cursor.close();
tnx.commit();
}
}
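The Counters constructor above only restores the persisted statistics. For completeness, a rough sketch of how a counter value might be written back into the same transactional database, modelled on the read path shown above (the method signature and synchronization are assumptions, not necessarily the real crawler4j implementation):

import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.Transaction;
import edu.uci.ics.crawler4j.util.Util;

public void setValue(String name, long value) {
    synchronized (counterValues) {
        counterValues.put(name, value);
        if (statisticsDB != null) {
            // persist the counter so it survives a crash when resumable crawling is enabled
            Transaction txn = env.beginTransaction(null, null);
            statisticsDB.put(txn,
                    new DatabaseEntry(name.getBytes()),
                    new DatabaseEntry(Util.long2ByteArray(value)));
            txn.commit();
        }
    }
}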
Example 15: Frontier
import edu.uci.ics.crawler4j.crawler.CrawlConfig; // import the required package/class
public Frontier(Environment env, CrawlConfig config, DocIDServer docIdServer) {
super(config);
this.counters = new Counters(env, config);
this.docIdServer = docIdServer;
try {
workQueues = new WorkQueues(env, "PendingURLsDB", config.isResumableCrawling());
if (config.isResumableCrawling()) {
scheduledPages = counters.getValue(ReservedCounterNames.SCHEDULED_PAGES);
inProcessPages = new InProcessPagesDB(env);
long numPreviouslyInProcessPages = inProcessPages.getLength();
if (numPreviouslyInProcessPages > 0) {
logger.info("Rescheduling {} URLs from previous crawl.", numPreviouslyInProcessPages);
scheduledPages -= numPreviouslyInProcessPages;
while (true) {
List<WebURL> urls = inProcessPages.get(100);
if (urls.size() == 0) {
break;
}
scheduleAll(urls);
inProcessPages.delete(urls.size());
}
}
} else {
inProcessPages = null;
scheduledPages = 0;
}
} catch (DatabaseException e) {
logger.error("Error while initializing the Frontier: {}", e.getMessage());
workQueues = null;
}
}