本文整理汇总了Java中us.codecraft.webmagic.scheduler.FileCacheQueueScheduler类的典型用法代码示例。如果您正苦于以下问题:Java FileCacheQueueScheduler类的具体用法?Java FileCacheQueueScheduler怎么用?Java FileCacheQueueScheduler使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
FileCacheQueueScheduler类属于us.codecraft.webmagic.scheduler包,在下文中一共展示了FileCacheQueueScheduler类的10个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: test
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; //导入依赖的package包/类
/**
 * Builds and runs a spider starting from {@code http://www.cnblogs.com/}.
 *
 * <p>The page processor and the pipeline are both constructed from {@code htmlService},
 * which is defined outside this snippet. The scheduler is a {@code FileCacheQueueScheduler}
 * pointed at a hard-coded directory on the F: drive, and the thread setting is 10.
 *
 * @throws Exception declared by the signature; the source of any checked exception is not
 *     visible in this snippet
 */
@Test
public void test() throws Exception {
    // Objects are created in the same order a single chained expression would create them.
    MySqlPageProcessor pageProcessor = new MySqlPageProcessor(htmlService);
    Spider crawler = Spider.create(pageProcessor);

    FileCacheQueueScheduler urlCache = new FileCacheQueueScheduler("F:\\webmagic\\cache\\");
    crawler.scheduler(urlCache)
            .addUrl("http://www.cnblogs.com/")
            .addPipeline(new MySqlPipeline(htmlService))
            .thread(10)
            .run();
}
示例2: main
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; //导入依赖的package包/类
/**
 * Entry point that runs a spider over {@code http://www.mama.cn/photo/t1-p1.html}.
 *
 * <p>Uses a {@code FileCacheQueueScheduler} rooted at {@code /data/webmagic/mamacn} and a
 * {@code OneFilePipeline} constructed with {@code /data/webmagic/mamacn/data}; the thread
 * setting is 5.
 *
 * @param args command-line arguments (unused)
 * @throws FileNotFoundException declared by the signature; presumably raised by the
 *     {@code OneFilePipeline} constructor (confirm against that class)
 * @throws UnsupportedEncodingException declared by the signature; presumably raised by the
 *     {@code OneFilePipeline} constructor (confirm against that class)
 */
public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
    final String schedulerDir = "/data/webmagic/mamacn";
    final String entryUrl = "http://www.mama.cn/photo/t1-p1.html";
    final String pipelineTarget = "/data/webmagic/mamacn/data";

    Spider crawler = Spider.create(new MamacnPageProcessor());
    FileCacheQueueScheduler urlCache = new FileCacheQueueScheduler(schedulerDir);

    crawler.setScheduler(urlCache)
            .addUrl(entryUrl)
            .addPipeline(new OneFilePipeline(pipelineTarget))
            .thread(5)
            .run();
}
示例3: main
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; //导入依赖的package包/类
/**
 * Entry point that runs a spider with {@code GithubRepoPageProcessor}.
 *
 * <p>URLs are scheduled through a {@code FileCacheQueueScheduler} pointed at a hard-coded
 * directory on the K: drive, and a default-constructed {@code FilePipeline} is attached.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // Start crawling from "https://github.com/code4craft".
    final String startUrl = "https://github.com/code4craft";
    final String cacheDir = "K:\\data\\webmagic";
    // Crawl with 5 threads.
    final int threadCount = 5;

    Spider crawler = Spider.create(new GithubRepoPageProcessor());
    crawler.addUrl(startUrl)
            .scheduler(new FileCacheQueueScheduler(cacheDir))
            .addPipeline(new FilePipeline())
            .thread(threadCount)
            // Launch the crawler.
            .run();
}
示例4: test
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; //导入依赖的package包/类
/**
 * Runs a spider built around {@code DiaoyuwengProcessor}.
 *
 * <p>Two pipelines are attached in order: a default {@code FilePipeline}, then a
 * {@code JsonFilePipeline} constructed with {@code /data/webmagic/}. The scheduler is a
 * {@code FileCacheQueueScheduler} rooted at {@code /data/temp/webmagic/cache/}.
 * The method carries {@code @Ignore}, so it is presumably skipped in normal test runs.
 *
 * @throws IOException declared by the signature; the throwing call is not identifiable from
 *     this snippet alone
 */
@Ignore
@Test
public void test() throws IOException {
    DiaoyuwengProcessor processor = new DiaoyuwengProcessor();
    JsonFilePipeline jsonSink = new JsonFilePipeline("/data/webmagic/");

    Spider.create(processor)
            .pipeline(new FilePipeline())
            .pipeline(jsonSink)
            .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/"))
            .run();
}
示例5: test
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; //导入依赖的package包/类
/**
 * Runs a spider built around {@code SinaBlogProcesser}.
 *
 * <p>Original author's notes, translated (they mention {@code Spider.me()} and
 * {@code ConsolePipeline}, which the code below does not actually use):
 * <ul>
 *   <li>A pipeline is the processing applied after a page has been fetched; by default
 *       output goes under {@code /data/webmagic/ftl/[domain]}.</li>
 *   <li>{@code Spider.me()} is shorthand that simply creates a new instance.</li>
 *   <li>{@code Spider.pipeline()} registers a pipeline and supports call chaining.</li>
 *   <li>{@code ConsolePipeline} prints results to the console.</li>
 *   <li>{@code FileCacheQueueScheduler} persists URLs so an interrupted crawl can resume;
 *       its temporary files are written to {@code /data/temp/webmagic/cache}.</li>
 *   <li>{@code Spider.run()} starts execution.</li>
 * </ul>
 *
 * @throws IOException declared by the signature; the throwing call is not identifiable from
 *     this snippet alone
 */
@Ignore
@Test
public void test() throws IOException {
    SinaBlogProcesser processor = new SinaBlogProcesser();
    JsonFilePipeline jsonPipeline = new JsonFilePipeline("/data/webmagic/");

    Spider spider = Spider.create(processor);
    spider.pipeline(new FilePipeline())
            .pipeline(jsonPipeline)
            .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/"))
            .run();
}
示例6: testGlobalSpider
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; //导入依赖的package包/类
/**
 * Runs a spider using a two-argument {@code SimplePageProcessor} for diaoyuweng.com.
 *
 * <p>Prints the charset reported by the processor's site, sets the site's sleep time to 500,
 * then runs with a default {@code FilePipeline} and a {@code FileCacheQueueScheduler} rooted
 * at {@code /data/temp/webmagic/cache/}. The method carries {@code @Ignore}, so it is
 * presumably skipped in normal test runs.
 */
@Ignore
@Test
public void testGlobalSpider() {
    // Disabled earlier variant kept from the original source:
    // PageProcessor pageProcessor = new MeicanProcessor();
    // Spider.me().pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
    // processor(pageProcessor).run();
    final String userThreadListUrl =
            "http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space";
    final String threadPagePattern = "http://www.diaoyuweng.com/thread-*-1-1.html";

    SimplePageProcessor processor = new SimplePageProcessor(userThreadListUrl, threadPagePattern);
    System.out.println(processor.getSite().getCharset());
    processor.getSite().setSleepTime(500);

    Spider.create(processor)
            .pipeline(new FilePipeline())
            .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/"))
            .run();
}
示例7: test
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; //导入依赖的package包/类
/**
 * Runs a spider built around {@code SinaBlogProcessor}.
 *
 * <p>Two pipelines are attached in order: a default {@code FilePipeline}, then a
 * {@code JsonFilePipeline} constructed with {@code /data/webmagic/}.
 *
 * @throws IOException declared by the signature; the throwing call is not identifiable from
 *     this snippet alone
 */
@Ignore
@Test
public void test() throws IOException {
    SinaBlogProcessor blogProcessor = new SinaBlogProcessor();

    // Original author's note: a pipeline is the processing applied after a page is fetched;
    // by default output is placed under /data/webmagic/ftl/[domain].
    JsonFilePipeline jsonOutput = new JsonFilePipeline("/data/webmagic/");

    // Original author's notes (they refer to Spider.me() and ConsolePipeline, which this
    // code does not actually use):
    //  - Spider.me() is shorthand that simply creates a new instance.
    //  - Spider.pipeline() registers a pipeline and supports call chaining.
    //  - ConsolePipeline prints results to the console.
    //  - FileCacheQueueScheduler persists URLs so an interrupted crawl can resume; its
    //    temporary files are written to /data/temp/webmagic/cache.
    //  - Spider.run() starts execution.
    Spider.create(blogProcessor)
            .pipeline(new FilePipeline())
            .pipeline(jsonOutput)
            .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/"))
            .run();
}
示例8: testGlobalSpider
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; //导入依赖的package包/类
/**
 * Runs a spider using a single-argument {@code SimplePageProcessor} for diaoyuweng.com,
 * with the starting URL supplied through {@code addUrl}.
 *
 * <p>Prints the charset reported by the processor's site, sets the site's sleep time to 500,
 * then runs with a default {@code FilePipeline} and a {@code FileCacheQueueScheduler} rooted
 * at {@code /data/temp/webmagic/cache/}. The method carries {@code @Ignore}, so it is
 * presumably skipped in normal test runs.
 */
@Ignore
@Test
public void testGlobalSpider() {
    // Disabled earlier variant kept from the original source:
    // PageProcessor pageProcessor = new MeicanProcessor();
    // Spider.me().pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
    // processor(pageProcessor).run();
    final String threadPagePattern = "http://www.diaoyuweng.com/thread-*-1-1.html";
    final String userThreadListUrl =
            "http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space";

    SimplePageProcessor processor = new SimplePageProcessor(threadPagePattern);
    System.out.println(processor.getSite().getCharset());
    processor.getSite().setSleepTime(500);

    Spider.create(processor)
            .addUrl(userThreadListUrl)
            .addPipeline(new FilePipeline())
            .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/"))
            .run();
}
示例9: main
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; //导入依赖的package包/类
/**
 * Entry point that runs an {@code OOSpider} for the {@code GithubRepo} page model.
 *
 * <p>The site is configured with start URL {@code https://github.com/explore}, sleep time 0
 * and retry times 3. Results go to a default {@code JsonFilePageModelPipeline}; URLs are
 * scheduled through a {@code FileCacheQueueScheduler} rooted at {@code /data/webmagic/cache/},
 * and the thread setting is 15.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // Site configuration is built first, matching the original argument evaluation order.
    Site site = Site.me()
            .addStartUrl("https://github.com/explore")
            .setSleepTime(0)
            .setRetryTimes(3);

    OOSpider.create(site, new JsonFilePageModelPipeline(), GithubRepo.class)
            .scheduler(new FileCacheQueueScheduler("/data/webmagic/cache/"))
            .thread(15)
            .run();
}
示例10: main
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; //导入依赖的package包/类
/**
 * Entry point that runs an {@code OOSpider} for the {@code GithubRepo} page model, with the
 * starting URL {@code https://github.com/explore} supplied through {@code addUrl}.
 *
 * <p>The site is configured with sleep time 0 and retry times 3. Results go to a default
 * {@code JsonFilePageModelPipeline}; URLs are scheduled through a
 * {@code FileCacheQueueScheduler} rooted at {@code /data/webmagic/cache/}, and the thread
 * setting is 15.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // Locals are created in the same order the original evaluated its arguments.
    Site site = Site.me().setSleepTime(0).setRetryTimes(3);
    JsonFilePageModelPipeline modelPipeline = new JsonFilePageModelPipeline();

    OOSpider.create(site, modelPipeline, GithubRepo.class)
            .addUrl("https://github.com/explore")
            .setScheduler(new FileCacheQueueScheduler("/data/webmagic/cache/"))
            .thread(15)
            .run();
}