本文整理汇总了Java中us.codecraft.webmagic.pipeline.Pipeline类的典型用法代码示例。如果您正苦于以下问题：Java Pipeline类的具体用法？Java Pipeline怎么用？Java Pipeline使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。
Pipeline类属于us.codecraft.webmagic.pipeline包，在下文中一共展示了Pipeline类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: scratch
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Runs a five-thread spider over the configured start URL / target URL pattern and
 * forwards every result to the persister registered for the configured host and category.
 * <p>
 * For each result the pipeline:
 * <ol>
 *   <li>looks up the persister; if none is registered it logs a warning and skips the page
 *       without parsing anything;</li>
 *   <li>reads the {@code "html"} entry; if it is absent it logs a warning and skips the page
 *       instead of passing {@code null} to {@code Jsoup.parse};</li>
 *   <li>parses the HTML and calls {@code persist} with the request URL, the config and the
 *       parsed document.</li>
 * </ol>
 */
public void scratch(){
    us.codecraft.webmagic.Spider.create(new SimplePageProcessor(config.getStartUrl(), config.getTargetUrlPattern()))
            .addPipeline(new Pipeline() {
                @Override
                public void process(ResultItems resultItems, Task task) {
                    // Resolve the persister first so no HTML is parsed when there is nowhere to store it.
                    IPersist persist = PersistManager.getInstance().getPersist(config.getHost(), config.getCategory());
                    if (persist == null) {
                        logger.warn("persistNotExists: host={}, category={}", config.getHost(), config.getCategory());
                        return;
                    }

                    // A result without an "html" entry cannot be parsed; skip it rather than fail inside Jsoup.
                    String html = resultItems.get("html");
                    if (html == null) {
                        logger.warn("htmlNotExists: url={}", resultItems.getRequest().getUrl());
                        return;
                    }

                    Document doc = Jsoup.parse(html);
                    persist.persist(resultItems.getRequest().getUrl(), config, doc);
                }
            })
            .thread(5).run();
}
示例2: processRequest
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Downloads one request and drives the resulting page through the login page
 * processor and the registered pipelines.
 * <p>
 * Flow:
 * <ul>
 *   <li>no page returned by the downloader: {@code sleep} with the site's sleep time,
 *       call {@code onError(request)} and stop;</li>
 *   <li>page flagged for cycle retry: call {@code extractAndAddRequests(page, true)},
 *       {@code sleep} with the site's retry sleep time and stop;</li>
 *   <li>otherwise: process the page, extract follow-up requests, run every pipeline
 *       unless the result items are marked as skipped, record the status code on the
 *       request and {@code sleep} with the site's sleep time.</li>
 * </ul>
 *
 * @param request the request to download and process
 */
protected void processRequest(Request request) {
    final Page fetched = downloader.download(request, this);

    if (fetched == null) {
        sleep(site.getSleepTime());
        onError(request);
        return;
    }

    // for cycle retry
    final boolean retryRequested = fetched.isNeedCycleRetry();
    if (retryRequested) {
        extractAndAddRequests(fetched, true);
        sleep(site.getRetrySleepTime());
        return;
    }

    pageLoginProcessor.process(fetched);
    extractAndAddRequests(fetched, spawnUrl);

    final boolean skipPipelines = fetched.getResultItems().isSkip();
    if (!skipPipelines) {
        for (Pipeline stage : pipelines) {
            stage.process(fetched.getResultItems(), this);
        }
    }

    // for proxy status management
    request.putExtra(Request.STATUS_CODE, fetched.getStatusCode());
    sleep(site.getSleepTime());
}
示例3: testStartAndStop
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Starts a single-threaded spider, stops it, and starts it again, pausing ten
 * seconds after each step. The attached pipeline only prints {@code 1} for every
 * result it receives. Ignored by default because of the long pauses.
 *
 * @throws InterruptedException if the test thread is interrupted while pausing
 */
@Ignore("long time")
@Test
public void testStartAndStop() throws InterruptedException {
    final long pauseMillis = 10000;

    Pipeline printingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            System.out.println(1);
        }
    };

    Spider spider = Spider.create(new SimplePageProcessor( "http://www.oschina.net/*"))
            .addPipeline(printingPipeline)
            .thread(1)
            .addUrl("http://www.oschina.net/");

    // start -> stop -> start again, pausing after each step
    spider.start();
    Thread.sleep(pauseMillis);

    spider.stop();
    Thread.sleep(pauseMillis);

    spider.start();
    Thread.sleep(pauseMillis);
}
示例4: startSpider
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Builds a script-driven page processor from the given parameters and runs a spider
 * over the supplied URLs.
 * <p>
 * The spider's pipelines are cleared and replaced with a single pipeline that does
 * nothing. If no URL was supplied, an error and the usage line are printed and the
 * JVM exits with status {@code -1}.
 *
 * @param params parsed command-line parameters (language, script file, thread count,
 *               sleep time and URLs)
 */
private static void startSpider(Params params) {
    ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom()
            .language(params.getLanguage())
            .scriptFromFile(params.getScriptFileName())
            .thread(params.getThread())
            .build();
    pageProcessor.getSite().setSleepTime(params.getSleepTime());
    pageProcessor.getSite().setRetryTimes(3);
    pageProcessor.getSite().setAcceptStatCode(WMCollections.<Integer>newHashSet(200, 404,403, 500,502));

    // Pipeline that intentionally ignores every result.
    Pipeline noOpPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
        }
    };

    Spider spider = Spider.create(pageProcessor).thread(params.getThread());
    spider.clearPipeline().addPipeline(noOpPipeline);

    boolean noUrls = params.getUrls() == null || params.getUrls().isEmpty();
    if (noUrls) {
        System.err.println("Need at least one argument");
        System.out.println("Usage: java -jar webmagic.jar [-l language] -f script file [-t threadnum] [-s sleep time] url1 [url2 url3]");
        System.exit(-1);
    }

    for (String startUrl : params.getUrls()) {
        spider.addUrl(startUrl);
    }
    spider.run();
}
示例5: close
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Hands the downloader, the login page processor, the scheduler and every
 * registered pipeline to {@code destroyEach}, then calls {@code shutdown()}
 * on the thread pool.
 */
public void close() {
    destroyEach(downloader);
    destroyEach(pageLoginProcessor);
    destroyEach(scheduler);

    for (Pipeline registered : pipelines) {
        destroyEach(registered);
    }

    threadPool.shutdown();
}
示例6: destroy
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Hands the downloader, the page processor and every registered pipeline to
 * {@code destroyEach}, in that order.
 */
protected void destroy() {
    destroyEach(downloader);
    destroyEach(pageProcessor);

    for (Pipeline registered : pipelines) {
        destroyEach(registered);
    }
}
示例7: processRequest
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Downloads one request, processes the page and feeds the result to the pipelines.
 * <p>
 * When the downloader returns no page, the method only calls {@code sleep} with the
 * site's sleep time and returns. Otherwise the page is processed, passed to
 * {@code addRequest}, run through every pipeline unless its result items are marked
 * as skipped, and finally {@code sleep} is called with the site's sleep time.
 *
 * @param request the request to download and process
 */
protected void processRequest(Request request) {
    final Page fetched = downloader.download(request, this);

    if (fetched == null) {
        sleep(site.getSleepTime());
        return;
    }

    pageProcessor.process(fetched);
    addRequest(fetched);

    final boolean skipPipelines = fetched.getResultItems().isSkip();
    if (!skipPipelines) {
        for (Pipeline stage : pipelines) {
            stage.process(fetched.getResultItems(), this);
        }
    }

    sleep(site.getSleepTime());
}
示例8: close
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Hands the downloader, the page processor, the scheduler and every registered
 * pipeline to {@code destroyEach}, then calls {@code shutdown()} on the thread pool.
 */
public void close() {
    destroyEach(downloader);
    destroyEach(pageProcessor);
    destroyEach(scheduler);

    for (Pipeline registered : pipelines) {
        destroyEach(registered);
    }

    threadPool.shutdown();
}
示例9: onDownloadSuccess
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Handles a downloaded page.
 * <p>
 * If the page's status code is not among the site's accepted status codes, the
 * mismatch is logged at info level. Otherwise the page is processed, follow-up
 * requests are extracted, and every pipeline is run unless the result items are
 * marked as skipped. In both cases {@code sleep} is called with the site's sleep
 * time before returning.
 *
 * @param request the request that produced the page
 * @param page    the downloaded page
 */
private void onDownloadSuccess(Request request, Page page) {
    final boolean statusAccepted = site.getAcceptStatCode().contains(page.getStatusCode());

    if (!statusAccepted) {
        logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
    } else {
        pageProcessor.process(page);
        extractAndAddRequests(page, spawnUrl);

        final boolean skipPipelines = page.getResultItems().isSkip();
        if (!skipPipelines) {
            for (Pipeline stage : pipelines) {
                stage.process(page.getResultItems(), this);
            }
        }
    }

    sleep(site.getSleepTime());
}
示例10: test_github
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Runs {@code GithubRepoPageProcessor} against the mock GitHub downloader and checks
 * that the trimmed {@code "name"} and {@code "author"} result fields equal
 * {@code "webmagic"} and {@code "code4craft"}.
 *
 * @throws Exception if the spider run fails
 */
@Test
public void test_github() throws Exception {
    Pipeline verifyingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            String repoName = ((String) resultItems.get("name")).trim();
            assertThat(repoName).isEqualTo("webmagic");

            String repoAuthor = ((String) resultItems.get("author")).trim();
            assertThat(repoAuthor).isEqualTo("code4craft");
        }
    };

    Spider.create(new GithubRepoPageProcessor())
            .addPipeline(verifyingPipeline)
            .setDownloader(new MockGithubDownloader())
            .test("https://github.com/code4craft/webmagic");
}
示例11: test
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Runs {@code GithubRepoProcessor} through {@code OOSpider} against the mock GitHub
 * downloader and checks that the trimmed {@code "star"} and {@code "fork"} result
 * fields equal {@code "78"} and {@code "65"}.
 */
@Test
public void test() {
    Pipeline verifyingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            String starCount = ((String)resultItems.get("star")).trim();
            Assert.assertEquals("78", starCount);

            String forkCount = ((String)resultItems.get("fork")).trim();
            Assert.assertEquals("65", forkCount);
        }
    };

    OOSpider.create(new GithubRepoProcessor())
            .addPipeline(verifyingPipeline)
            .setDownloader(new MockGithubDownloader())
            .test("https://github.com/code4craft/webmagic");
}
示例12: getPipelineList
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Returns the pipeline list held by this object.
 * <p>
 * The stored list itself is returned, not a copy.
 *
 * @return the current pipeline list
 */
public List<Pipeline> getPipelineList() {
    return this.pipelineList;
}
示例13: setPipelineList
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Replaces the pipeline list held by this object.
 * <p>
 * The given list is stored as-is, without copying.
 *
 * @param pipelineList the pipeline list to store
 * @return this instance, so calls can be chained
 */
public CommonSpider setPipelineList(List<Pipeline> pipelineList) {
    this.pipelineList = pipelineList;
    return this;
}
示例14: getPipelines
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Returns the pipelines held by this object.
 * <p>
 * The stored list itself is returned, not a copy.
 *
 * @return the current list of pipelines
 */
public List<Pipeline> getPipelines() {
    return this.pipelines;
}
示例15: setPipelines
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Replaces the pipelines held by this object.
 * <p>
 * The given list is stored as-is, without copying.
 *
 * @param pipelines the pipelines to store
 * @return this instance, so calls can be chained
 */
public CommonSpider setPipelines(List<Pipeline> pipelines) {
    this.pipelines = pipelines;
    return this;
}