本文整理汇总了 Java 中 us.codecraft.webmagic.pipeline.FilePipeline 类的典型用法代码示例。如果您正苦于以下问题:Java FilePipeline 类的具体用法?Java FilePipeline 怎么用?Java FilePipeline 使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
FilePipeline 类属于 us.codecraft.webmagic.pipeline 包,在下文中一共展示了 FilePipeline 类的 15 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Java 代码示例。
示例1: main
import us.codecraft.webmagic.pipeline.FilePipeline; //导入依赖的package包/类
public static void main(String[] args) throws JMException {
int threadNumber = Runtime.getRuntime().availableProcessors();
Spider spider = Spider.create(new Crawler()).addUrl("http://list.iqiyi.com/www/1/----------------iqiyi--.html")
.addPipeline(new FilePipeline("/root/iqiyi")).thread(threadNumber * 16);
SpiderMonitor.instance().register(spider);
spider.start();
}
示例2: main
import us.codecraft.webmagic.pipeline.FilePipeline; //导入依赖的package包/类
public static void main(String[] args) {
Spider.create(new ZhihuPageProcessor()).
addUrl("http://www.zhihu.com/search?type=question&q=java").
addPipeline(new FilePipeline("D:\\webmagic\\")).
thread(5).
run();
}
示例3: main
import us.codecraft.webmagic.pipeline.FilePipeline; //导入依赖的package包/类
public static void main(String[] args) {
Spider.create(new GithubRepoPageProcessor())
//从"https://github.com/code4craft"开始抓
.addUrl("https://github.com/code4craft")
.scheduler(new FileCacheQueueScheduler("K:\\data\\webmagic"))
.addPipeline(new FilePipeline())
//开启5个线程抓取
.thread(5)
//启动爬虫
.run();
}
示例4: main
import us.codecraft.webmagic.pipeline.FilePipeline; //导入依赖的package包/类
public static void main(String[] args) {
Spider.create(new InfoQMiniBookProcessor())
.scheduler(new RedisScheduler("localhost"))
.pipeline(new FilePipeline("/data/temp/webmagic/"))
.thread(5)
.run();
}
示例5: test
import us.codecraft.webmagic.pipeline.FilePipeline; //导入依赖的package包/类
@Ignore
@Test
public void test() throws IOException {
DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();
JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/");
Spider.create(diaoyuwengProcessor).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
run();
}
示例6: test
import us.codecraft.webmagic.pipeline.FilePipeline; //导入依赖的package包/类
@Ignore
@Test
public void test() throws IOException {
SinaBlogProcesser sinaBlogProcesser = new SinaBlogProcesser();
//pipeline是抓取结束后的处理
//默认放到/data/webmagic/ftl/[domain]目录下
JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/");
//Spider.me()是简化写法,其实就是new一个啦
//Spider.pipeline()设定一个pipeline,支持链式调用
//ConsolePipeline输出结果到控制台
//FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录
//Spider.run()执行
Spider.create(sinaBlogProcesser).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
run();
}
示例7: testGlobalSpider
import us.codecraft.webmagic.pipeline.FilePipeline; //导入依赖的package包/类
@Ignore
@Test
public void testGlobalSpider(){
// PageProcessor pageProcessor = new MeicanProcessor();
// Spider.me().pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
// processor(pageProcessor).run();
SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html");
System.out.println(pageProcessor2.getSite().getCharset());
pageProcessor2.getSite().setSleepTime(500);
Spider.create(pageProcessor2).pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
run();
}
示例8: main
import us.codecraft.webmagic.pipeline.FilePipeline; //导入依赖的package包/类
public static void main(String[] args) {
Spider.create(new HuabanProcessor()).thread(5)
.addPipeline(new FilePipeline("/data/webmagic/test/"))
.setDownloader(new SeleniumDownloader("/Users/yihua/Downloads/chromedriver"))
.addUrl("http://huaban.com/")
.runAsync();
}
示例9: main
import us.codecraft.webmagic.pipeline.FilePipeline; //导入依赖的package包/类
public static void main(String[] args) {
Spider.create(new GooglePlayProcessor())
.thread(5)
.addPipeline(
new FilePipeline(
"/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/data/"))
.setDownloader(new SeleniumDownloader())
.addUrl("https://play.google.com/store/apps/details?id=com.tencent.mm")
.runAsync();
}
示例10: test
import us.codecraft.webmagic.pipeline.FilePipeline; //导入依赖的package包/类
@Ignore
@Test
public void test() throws IOException {
SinaBlogProcessor sinaBlogProcessor = new SinaBlogProcessor();
//pipeline是抓取结束后的处理
//默认放到/data/webmagic/ftl/[domain]目录下
JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/");
//Spider.me()是简化写法,其实就是new一个啦
//Spider.pipeline()设定一个pipeline,支持链式调用
//ConsolePipeline输出结果到控制台
//FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录
//Spider.run()执行
Spider.create(sinaBlogProcessor).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
run();
}
示例11: testGlobalSpider
import us.codecraft.webmagic.pipeline.FilePipeline; //导入依赖的package包/类
@Ignore
@Test
public void testGlobalSpider(){
// PageProcessor pageProcessor = new MeicanProcessor();
// Spider.me().pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
// processor(pageProcessor).run();
SimplePageProcessor pageProcessor2 = new SimplePageProcessor( "http://www.diaoyuweng.com/thread-*-1-1.html");
System.out.println(pageProcessor2.getSite().getCharset());
pageProcessor2.getSite().setSleepTime(500);
Spider.create(pageProcessor2).addUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").addPipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
run();
}
示例12: main
import us.codecraft.webmagic.pipeline.FilePipeline; //导入依赖的package包/类
public static void main(String[] args) {
Spider.create(new HuabanProcessor()).thread(5)
.pipeline(new FilePipeline("/data/webmagic/test/"))
.downloader(new SeleniumDownloader("/Users/yihua/Downloads/chromedriver"))
.runAsync();
}
示例13: main
import us.codecraft.webmagic.pipeline.FilePipeline; //导入依赖的package包/类
public static void main(String[] args) {
Spider.create(new IteyeBlogProcessor()).thread(5).pipeline(new FilePipeline("/data/webmagic/")).run();
}
示例14: testSpider
import us.codecraft.webmagic.pipeline.FilePipeline; //导入依赖的package包/类
@Ignore
@Test
public void testSpider() throws InterruptedException {
Spider me = Spider.create(new HuxiuProcessor()).pipeline(new FilePipeline());
me.run();
}
示例15: testSpider
import us.codecraft.webmagic.pipeline.FilePipeline; //导入依赖的package包/类
@Ignore
@Test
public void testSpider() throws InterruptedException {
Spider me = Spider.create(new HuxiuProcessor()).addPipeline(new FilePipeline());
me.run();
}