本文整理汇总了Java中us.codecraft.webmagic.scheduler.QueueScheduler类的典型用法代码示例。如果您正苦于以下问题:Java QueueScheduler类的具体用法?Java QueueScheduler怎么用?Java QueueScheduler使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
QueueScheduler类属于us.codecraft.webmagic.scheduler包,在下文中一共展示了QueueScheduler类的4个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: testSpiderInfo
import us.codecraft.webmagic.scheduler.QueueScheduler; //导入依赖的package包/类
/**
 * Runs a trial crawl for a spider template and returns the pages it collected.
 *
 * <p>A task is initialised under a freshly generated UUID, a spider is built for it with a
 * result-collecting pipeline and a new {@code QueueScheduler}, and {@code spider.run()} is
 * invoked. Once that call returns, every collected result is converted to a {@code Webpage}.
 *
 * @param info spider template to try out; its domain, callback URL, id and start URL are read here
 * @return a new list with one converted {@code Webpage} per collected result, in the
 *         iteration order of the collector
 * @throws JMException declared by the signature; NOTE(review): no visible call obviously raises
 *                     it - presumably {@code makeSpider} or the disabled monitor registration
 */
public List<Webpage> testSpiderInfo(SpiderInfo info) throws JMException {
    final ResultItemsCollectorPipeline collector = new ResultItemsCollectorPipeline();
    final String uuid = UUID.randomUUID().toString();

    // Ties the task back to this template and to this particular run.
    final String taskParams = "spiderInfoId=" + info.getId() + "&spiderUUID=" + uuid;
    Task task = taskManager.initTask(uuid, info.getDomain(), info.getCallbackURL(), taskParams);
    task.addExtraInfo("spiderInfo", info);

    QueueScheduler scheduler = new QueueScheduler();
    MySpider spider = (MySpider) makeSpider(info, task)
            .addPipeline(collector)
            .setScheduler(scheduler);
    spider.startUrls(info.getStartURL());

    // Use spider monitoring with caution: it may cause memory leaks, so the
    // registration below stays disabled.
    // spiderMonitor.register(spider);
    spiderMap.put(uuid, spider);
    taskManager.getTaskById(uuid).setState(State.RUNNING);
    spider.run();

    // Convert whatever the pipeline gathered during the run.
    List<Webpage> pages = Lists.newLinkedList();
    collector.getCollected().forEach(collected -> {
        Webpage page = CommonWebpagePipeline.convertResultItems2Webpage(collected);
        pages.add(page);
    });
    return pages;
}
示例2: testSpiderInfo
import us.codecraft.webmagic.scheduler.QueueScheduler; //导入依赖的package包/类
/**
 * Runs a trial crawl for a spider template and returns the pages it collected.
 *
 * <p>A task is initialised under a freshly generated UUID and a spider is built for it with a
 * result-collecting pipeline and a new {@code QueueScheduler}. The downloader is chosen from
 * the template's AJAX flag and the configured AJAX downloader value, then {@code spider.run()}
 * is invoked. Once that call returns, every collected result is converted to a {@code Webpage}.
 *
 * @param info spider template to try out; its domain, callback URL, id, AJAX flag and start
 *             URL are read here
 * @return a new list with one converted {@code Webpage} per collected result, in the
 *         iteration order of the collector
 * @throws JMException declared by the signature; NOTE(review): no visible call obviously raises
 *                     it - presumably {@code makeSpider} or the disabled monitor registration
 */
public List<Webpage> testSpiderInfo(SpiderInfo info) throws JMException {
    final ResultItemsCollectorPipeline collector = new ResultItemsCollectorPipeline();
    final String uuid = UUID.randomUUID().toString();

    // Ties the task back to this template and to this particular run.
    final String taskParams = "spiderInfoId=" + info.getId() + "&spiderUUID=" + uuid;
    Task task = taskManager.initTask(uuid, info.getDomain(), info.getCallbackURL(), taskParams);
    task.addExtraInfo("spiderInfo", info);

    QueueScheduler scheduler = new QueueScheduler();
    MySpider spider = (MySpider) makeSpider(info, task)
            .addPipeline(collector)
            .setScheduler(scheduler);

    // The AJAX downloader is used only when the site is flagged as AJAX AND an
    // AJAX downloader value is configured; everything else gets the default one.
    final boolean useAjaxDownloader =
            info.isAjaxSite() && StringUtils.isNotBlank(staticValue.getAjaxDownloader());
    if (!useAjaxDownloader) {
        spider.setDownloader(contentLengthLimitHttpClientDownloader);
    } else {
        spider.setDownloader(casperjsDownloader);
    }
    spider.startUrls(info.getStartURL());

    // Use spider monitoring with caution: it may cause memory leaks, so the
    // registration below stays disabled.
    // spiderMonitor.register(spider);
    spiderMap.put(uuid, spider);
    taskManager.getTaskById(uuid).setState(State.RUNNING);
    spider.run();

    // Convert whatever the pipeline gathered during the run.
    List<Webpage> pages = Lists.newLinkedList();
    collector.getCollected().forEach(collected -> {
        Webpage page = CommonWebpagePipeline.convertResultItems2Webpage(collected);
        pages.add(page);
    });
    return pages;
}
示例3: testSpiderInfo
import us.codecraft.webmagic.scheduler.QueueScheduler; //导入依赖的package包/类
/**
 * Runs a trial crawl for a spider template and returns the pages it collected.
 *
 * <p>A task is initialised under a freshly generated UUID and a spider is built for it with a
 * result-collecting pipeline and a new {@code QueueScheduler}. The downloader is chosen from
 * the template's AJAX flag and the configured AJAX downloader value, then {@code spider.run()}
 * is invoked. Once that call returns, each collected result is converted to a {@code Webpage};
 * results whose conversion yields {@code null} are skipped.
 *
 * @param info spider template to try out; its domain, callback URL, id, AJAX flag and start
 *             URL are read here
 * @return a new, possibly empty list of the non-null converted pages, in the iteration order
 *         of the collector; never {@code null}
 */
public List<Webpage> testSpiderInfo(SpiderInfo info) {
    final ResultItemsCollectorPipeline ricp = new ResultItemsCollectorPipeline();
    final String uuid = UUID.randomUUID().toString();
    Task task = manager.initTask(uuid, info.getDomain(), info.getCallbackUrl(), "spiderInfoId=" + info.getId() + "&spiderUUID=" + uuid);
    task.addExtraInfo("spiderInfo", info);
    QueueScheduler scheduler = new QueueScheduler();
    NestSpider spider = makeSpider(info, task);
    spider.addPipeline(ricp).setScheduler(scheduler);
    // The AJAX downloader is used only when the site is flagged as AJAX AND an
    // AJAX downloader value is configured.
    if (info.isAjaxSite() && StringUtils.isNotBlank(sValue.getAjaxDownloader())) {
        spider.setDownloader(jsDownloader);
    } else {
        spider.setDownloader(contentDownloader);
    }
    spider.startUrls(info.getStartUrl());
    spiderMap.put(uuid, spider);
    manager.findTask(uuid).setState(State.RUNNING);
    spider.run();
    // One list type for every outcome: the previous code returned a LinkedList
    // when non-empty but swapped an empty one for a fresh ArrayList, which was
    // a redundant branch and an inconsistent concrete return type.
    List<Webpage> pages = new ArrayList<>();
    ricp.getCollected().forEach(items -> {
        Webpage page = WebpagePipeline.convertToWebpage(items);
        if (page != null) {
            pages.add(page);
        }
    });
    return pages;
}
示例4: main
import us.codecraft.webmagic.scheduler.QueueScheduler; //导入依赖的package包/类
/**
 * Starts a crawl of a single blog URL using a {@code QueueScheduler} whose duplicate
 * remover is a {@code BloomFilterDuplicateRemover}, registers the spider with the
 * {@code SpiderMonitor} singleton, and then invokes {@code spider.run()}.
 *
 * @param args command-line arguments; not read
 * @throws JMException declared by the signature; presumably raised by the monitor
 *                     registration - confirm against the SpiderMonitor API
 */
public static void main(String[] args) throws JMException {
    final String startUrl = "http://my.oschina.net/flashsword/blog";
    // Value handed to the Bloom filter constructor (presumably its expected
    // number of URLs - confirm against BloomFilterDuplicateRemover).
    final int bloomFilterSize = 2000;

    // The configuration calls are fluent; they are issued one per statement
    // here, in the same order as a single chained expression would run them.
    Spider spider = Spider.create(new OschinaBlogPageProcesser());
    spider.setScheduler(
            new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(bloomFilterSize)));
    spider.addUrl(startUrl);

    SpiderMonitor.instance().register(spider);
    spider.run();
}