This article collects typical usage examples of the Java method edu.uci.ics.crawler4j.crawler.CrawlController.start. If you are unsure what CrawlController.start does, how to call it, or what working code looks like, the curated examples below may help. You can also explore the containing class edu.uci.ics.crawler4j.crawler.CrawlController
for further details.
The following lists 12 code examples of CrawlController.start, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
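All of the examples below hand CrawlController.start a crawler class (or a factory) together with a thread count, and the call blocks until the crawl finishes. None of them show the crawler itself, so as context here is a minimal sketch of what such a class might look like; SampleCrawler, the host filter, and the visit logic are illustrative assumptions, not code from the cited projects.

import java.util.regex.Pattern;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

// Hypothetical crawler class of the kind passed to CrawlController.start(...).
public class SampleCrawler extends WebCrawler {

    private static final Pattern BINARY = Pattern.compile(".*\\.(css|js|gif|jpe?g|png|zip|pdf)$");

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // Skip obvious static resources and stay on one host (illustrative choice).
        String href = url.getURL().toLowerCase();
        return !BINARY.matcher(href).matches() && href.startsWith("https://en.wikipedia.org/");
    }

    @Override
    public void visit(Page page) {
        // Called once per fetched page; process the parsed HTML here.
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData html = (HtmlParseData) page.getParseData();
            System.out.println(page.getWebURL().getURL() + " -> " + html.getTitle());
        }
    }
}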
Example 1: main
import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
    int numberOfCrawlers = 2;
    CrawlConfig config = new CrawlConfig();
    String crawlStorageFolder = "data";
    config.setCrawlStorageFolder(crawlStorageFolder);
    config.setPolitenessDelay(500);
    config.setMaxDepthOfCrawling(2);
    config.setMaxPagesToFetch(20);
    config.setIncludeBinaryContentInCrawling(false);
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    controller.addSeed("https://en.wikipedia.org/wiki/Bishop_Rock,_Isles_of_Scilly");
    controller.start(SampleCrawler.class, numberOfCrawlers);
}
Developer: PacktPublishing; Project: Machine-Learning-End-to-Endguide-for-Java-developers; Lines: 21; Source: CrawlerController.java
Example 2: main
import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
    String crawlStorageFolder = "/data/crawl/root";
    int numberOfCrawlers = 7;
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorageFolder);
    config.setPolitenessDelay(500);
    config.setMaxDepthOfCrawling(2);
    config.setMaxPagesToFetch(1000);
    config.setResumableCrawling(false);
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController crawlController = new CrawlController(config, pageFetcher, robotstxtServer);
    crawlController.addSeed("http://www.11st.co.kr/html/main.html");
    crawlController.addSeed("http://www.11st.co.kr/html/category/1930.html");
    crawlController.start(MyCrawler.class, numberOfCrawlers);
}
Example 3: crawlAndImport
import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class the method depends on
/**
 * This is where everything happens!
 */
private void crawlAndImport() throws Exception {
    CrawlConfig crawlConfig = buildCrawlConfig();
    PageFetcher pageFetcher = new PageFetcher(crawlConfig);
    RobotstxtConfig robotsTxtConfig = new RobotstxtConfig();
    robotsTxtConfig.setEnabled(appConfig.isRespectRobotsTxt());
    RobotstxtServer robotsTxtServer = new RobotstxtServer(robotsTxtConfig, pageFetcher);
    CrawlController crawlController = new CrawlController(crawlConfig, pageFetcher, robotsTxtServer);
    // "dependency injection" into crawlers
    Object[] customData = new Object[] { appConfig, graphImporter };
    crawlController.setCustomData(customData);
    addSeedUrls(crawlController);
    logger.info("Start crawling");
    /*
     * Start the crawl. This is a blocking operation, meaning that your code will reach the line after this only
     * when crawling is finished.
     */
    crawlController.start(HtmlOnlyCrawler.class, appConfig.getNumberOfCrawlers());
    logger.info("Finished crawling");
}
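Example 3 shares objects with its crawlers through setCustomData. As a rough sketch of the receiving side, assuming crawler4j 4.x where the crawler can reach its controller via getMyController() (the setCustomData/getCustomData pair is deprecated in newer releases in favor of a WebCrawlerFactory, as in Example 4), HtmlOnlyCrawler might unpack the array in onStart; the field types here are placeholders, not the cited project's code.

import edu.uci.ics.crawler4j.crawler.WebCrawler;

// Hypothetical counterpart inside HtmlOnlyCrawler: unpack the customData array
// that Example 3 attached to the controller before start() was called.
public class HtmlOnlyCrawler extends WebCrawler {

    private Object appConfig;      // placeholder types; the real project defines its own
    private Object graphImporter;

    @Override
    public void onStart() {
        // getMyController() returns the CrawlController that started this crawler;
        // getCustomData() returns whatever was passed to setCustomData(...).
        Object[] customData = (Object[]) getMyController().getCustomData();
        this.appConfig = customData[0];
        this.graphImporter = customData[1];
    }
}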
Example 4: main
import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
    String crawlStorageFolder = "/data/crawl/root";
    int numberOfCrawlers = 7;
    CrawlConfig config = new CrawlConfig();
    config.setPolitenessDelay(100);
    config.setCrawlStorageFolder(crawlStorageFolder);
    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    /*
     * For each crawl, you need to add some seed URLs. These are the first
     * URLs that are fetched, and then the crawler starts following links
     * found in these pages.
     */
    controller.addSeed("https://de.wikipedia.org/wiki/Java_Database_Connectivity");
    controller.addSeed("https://de.wikipedia.org/wiki/Relationale_Datenbank");
    controller.addSeed("https://pt.wikipedia.org/wiki/JDBC");
    controller.addSeed("https://pt.wikipedia.org/wiki/Protocolo");
    controller.addSeed("https://de.wikipedia.org/wiki/Datenbank");
    /*
     * Start the crawl. This is a blocking operation, meaning that your code
     * will reach the line after this only when crawling is finished.
     */
    controller.start(new PostgresCrawlerFactory("jdbc:postgresql://localhost/crawler4j", "postgres", "postgres"), numberOfCrawlers);
}
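Unlike the other examples, Example 4 passes a factory instance to start instead of a crawler class, which lets each crawler thread be constructed with its own dependencies (here, a database connection). A minimal sketch of such a factory, assuming crawler4j's CrawlController.WebCrawlerFactory interface; the PostgresWebCrawler stub and its constructor are illustrative assumptions, not the cited project's implementation:

import java.sql.Connection;
import java.sql.DriverManager;

import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;

// Illustrative crawler that would write visited pages to a database.
class PostgresWebCrawler extends WebCrawler {
    private final Connection connection;

    PostgresWebCrawler(Connection connection) {
        this.connection = connection;
    }

    @Override
    public void visit(Page page) {
        // e.g. INSERT the page URL/content via this.connection (omitted here).
        System.out.println("Visited: " + page.getWebURL().getURL());
    }
}

// Factory of the kind passed to controller.start(factory, numberOfCrawlers):
// newInstance() is called once per crawler thread, so each gets its own connection.
public class PostgresCrawlerFactory implements CrawlController.WebCrawlerFactory<PostgresWebCrawler> {
    private final String jdbcUrl;
    private final String user;
    private final String password;

    public PostgresCrawlerFactory(String jdbcUrl, String user, String password) {
        this.jdbcUrl = jdbcUrl;
        this.user = user;
        this.password = password;
    }

    @Override
    public PostgresWebCrawler newInstance() throws Exception {
        return new PostgresWebCrawler(DriverManager.getConnection(jdbcUrl, user, password));
    }
}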
Example 5: run
import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class the method depends on
public void run(final CrawlerSettings crawlerSettings, final List<Memo> memos) throws Exception {
    CrawlConfig config = crawlerSettings.getCrawlConfig();
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    for (String seed : crawlerSettings.getSeeds()) {
        controller.addSeed(seed);
    }
    ActionsCrawler.configure(memoEntryFinder, memoMatching, servicesContext, memos);
    controller.start(ActionsCrawler.class, crawlerSettings.getNumberOfCrawlers());
}
Example 6: main
import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        return;
    }
    String crawlStorageFolder = args[0];
    int numberOfCrawlers = 1;
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorageFolder);
    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    /*
     * For each crawl, you need to add some seed URLs. These are the first
     * URLs that are fetched, and then the crawler starts following links
     * found in these pages.
     */
    controller.addSeed("http://www.senado.leg.br/senadores/default.asp");
    /*
     * Start the crawl. This is a blocking operation, meaning that your code
     * will reach the line after this only when crawling is finished.
     */
    controller.start(SenatorsCrawler.class, numberOfCrawlers);
}
Example 7: main
import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
    String crawlStorageFolder = "D:\\etc\\storage";
    int numberOfCrawlers = 7;
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorageFolder);
    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    /*
     * For each crawl, you need to add some seed URLs. These are the first
     * URLs that are fetched, and then the crawler starts following links
     * found in these pages.
     */
    controller.addSeed("http://www.ics.uci.edu/~lopes/");
    controller.addSeed("http://www.ics.uci.edu/~welling/");
    controller.addSeed("http://www.ics.uci.edu/");
    /*
     * Start the crawl. This is a blocking operation, meaning that your code
     * will reach the line after this only when crawling is finished.
     */
    controller.start(MyCrawler.class, numberOfCrawlers);
}
Example 8: main
import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
    if (args.length < 3) {
        System.out.println("Needed parameters: ");
        System.out.println("\t rootFolder (it will contain intermediate crawl data)");
        System.out.println("\t numberOfCrawlers (number of concurrent threads)");
        System.out.println("\t storageFolder (a folder for storing downloaded images)");
        return;
    }
    String rootFolder = args[0];
    int numberOfCrawlers = Integer.parseInt(args[1]);
    String storageFolder = args[2];
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(rootFolder);
    /*
     * Since images are binary content, we need to set this parameter to
     * true to make sure they are included in the crawl.
     */
    config.setIncludeBinaryContentInCrawling(true);
    String[] crawlDomains = new String[] { "http://uci.edu/" };
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    for (String domain : crawlDomains) {
        controller.addSeed(domain);
    }
    ImageCrawler.configure(crawlDomains, storageFolder);
    controller.start(ImageCrawler.class, numberOfCrawlers);
}
Example 9: main
import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        System.out.println("Needed parameters: ");
        System.out.println("\t rootFolder (it will contain intermediate crawl data)");
        System.out.println("\t numberOfCrawlers (number of concurrent threads)");
        return;
    }
    String rootFolder = args[0];
    int numberOfCrawlers = Integer.parseInt(args[1]);
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(rootFolder);
    config.setMaxPagesToFetch(10);
    config.setPolitenessDelay(1000);
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    controller.addSeed("http://www.ics.uci.edu/");
    controller.start(LocalDataCollectorCrawler.class, numberOfCrawlers);
    List<Object> crawlersLocalData = controller.getCrawlersLocalData();
    long totalLinks = 0;
    long totalTextSize = 0;
    int totalProcessedPages = 0;
    for (Object localData : crawlersLocalData) {
        CrawlStat stat = (CrawlStat) localData;
        totalLinks += stat.getTotalLinks();
        totalTextSize += stat.getTotalTextSize();
        totalProcessedPages += stat.getTotalProcessedPages();
    }
    System.out.println("Aggregated Statistics:");
    System.out.println(" Processed Pages: " + totalProcessedPages);
    System.out.println(" Total Links found: " + totalLinks);
    System.out.println(" Total Text Size: " + totalTextSize);
}
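Example 9 (and Example 11 below) aggregates per-thread statistics after start returns by calling getCrawlersLocalData. On the crawler side this works by overriding WebCrawler.getMyLocalData. Here is a rough sketch of that side, assuming a CrawlStat value object with the fields implied by the getters used above; both classes belong to the cited projects, so the details are illustrative.

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;

// Minimal stand-in for the projects' CrawlStat value object (fields inferred from the getters above).
class CrawlStat {
    private long totalLinks;
    private long totalTextSize;
    private int totalProcessedPages;

    void incProcessedPages() { totalProcessedPages++; }
    void incTotalLinks(int count) { totalLinks += count; }
    void incTotalTextSize(int size) { totalTextSize += size; }

    public long getTotalLinks() { return totalLinks; }
    public long getTotalTextSize() { return totalTextSize; }
    public int getTotalProcessedPages() { return totalProcessedPages; }
}

// Illustrative crawler that accumulates per-thread statistics and hands them back
// to the controller, which collects them via getCrawlersLocalData() after start() returns.
public class LocalDataCollectorCrawler extends WebCrawler {

    private final CrawlStat myCrawlStat = new CrawlStat();

    @Override
    public void visit(Page page) {
        myCrawlStat.incProcessedPages();
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData parseData = (HtmlParseData) page.getParseData();
            myCrawlStat.incTotalLinks(parseData.getOutgoingUrls().size());
            myCrawlStat.incTotalTextSize(parseData.getText().length());
        }
    }

    @Override
    public Object getMyLocalData() {
        // Whatever is returned here ends up in controller.getCrawlersLocalData().
        return myCrawlStat;
    }
}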
Example 10: main
import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
    if (args.length < 3) {
        System.out.println("Needed parameters: ");
        System.out.println("\t rootFolder (it will contain intermediate crawl data)");
        System.out.println("\t numberOfCrawlers (number of concurrent threads)");
        System.out.println("\t storageFolder (a folder for storing downloaded images)");
        return;
    }
    String rootFolder = args[0];
    int numberOfCrawlers = Integer.parseInt(args[1]);
    String storageFolder = args[2];
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(rootFolder);
    /*
     * Since images are binary content, we need to set this parameter to
     * true to make sure they are included in the crawl.
     */
    config.setIncludeBinaryContentInCrawling(true);
    String[] crawlDomains = new String[] { "http://uci.edu/" };
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    for (String domain : crawlDomains) {
        controller.addSeed(domain);
    }
    ImageCrawler.configure(crawlDomains, storageFolder);
    controller.start(ImageCrawler.class, numberOfCrawlers);
}
Example 11: main
import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        System.out.println("Needed parameters: ");
        System.out.println("\t rootFolder (it will contain intermediate crawl data)");
        System.out.println("\t numberOfCrawlers (number of concurrent threads)");
        return;
    }
    String rootFolder = args[0];
    int numberOfCrawlers = Integer.parseInt(args[1]);
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(rootFolder);
    config.setMaxPagesToFetch(10);
    config.setPolitenessDelay(1000);
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    controller.addSeed("http://www.ics.uci.edu/");
    controller.start(LocalDataCollectorCrawler.class, numberOfCrawlers);
    List<Object> crawlersLocalData = controller.getCrawlersLocalData();
    long totalLinks = 0;
    long totalTextSize = 0;
    int totalProcessedPages = 0;
    for (Object localData : crawlersLocalData) {
        CrawlStat stat = (CrawlStat) localData;
        totalLinks += stat.getTotalLinks();
        totalTextSize += stat.getTotalTextSize();
        totalProcessedPages += stat.getTotalProcessedPages();
    }
    System.out.println("Aggregated Statistics:");
    System.out.println(" Processed Pages: " + totalProcessedPages);
    System.out.println(" Total Links found: " + totalLinks);
    System.out.println(" Total Text Size: " + totalTextSize);
}
Example 12: main
import edu.uci.ics.crawler4j.crawler.CrawlController; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
    if (args.length <= 0) {
        System.out.println("You must specify seed url.");
        return;
    }
    ConfigFile configFile = new ConfigFile("settings.cfg");
    // create database connection object
    String dbUrl = configFile.get("dbUrl");
    String dbUser = configFile.get("dbUser");
    String dbPass = configFile.get("dbPass");
    Connection dbConnection = DriverManager.getConnection("jdbc:" + dbUrl, dbUser, dbPass);
    System.out.println("Connected to " + dbUrl);
    int numberOfCrawlers = Integer.parseInt(configFile.get("crawlerCount"));
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder("storage");
    config.setPolitenessDelay(Integer.parseInt(configFile.get("crawlerPolitenessDelay")));
    config.setIncludeBinaryContentInCrawling(true);
    config.setResumableCrawling(Integer.parseInt(configFile.get("crawlerResumeCrawling")) > 0);
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotsTxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotsTxtConfig, pageFetcher);
    try {
        String domainFilter = null;
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
        controller.addSeed(args[0]);
        ImageCrawler.configure(domainFilter, configFile.get("imagesDir"), dbConnection, configFile);
        System.out.println("Starting crawler @" + args[0]);
        controller.start(ImageCrawler.class, numberOfCrawlers);
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
    dbConnection.close();
    System.out.println("Done!");
}