当前位置: 首页>>代码示例>>Java>>正文


Java Pipeline类代码示例

本文整理汇总了Java中us.codecraft.webmagic.pipeline.Pipeline的典型用法代码示例。如果您正苦于以下问题：Java Pipeline类的具体用法？Java Pipeline怎么用？Java Pipeline使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。


Pipeline类属于us.codecraft.webmagic.pipeline包，在下文中一共展示了Pipeline类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: scratch

import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Crawls from the configured start URL and hands each page's HTML to the
 * persister registered for the configured host and category.
 *
 * <p>The spider is built from a {@code SimplePageProcessor} using the start URL
 * and target URL pattern from {@code config}, is given five threads, and is
 * started via {@code run()}.
 */
public void scratch() {
    SimplePageProcessor processor =
            new SimplePageProcessor(config.getStartUrl(), config.getTargetUrlPattern());

    Pipeline persistingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            // The HTML is parsed before the persister lookup, so a parse failure
            // surfaces whether or not a persister is registered.
            Document parsedHtml = Jsoup.parse(resultItems.get("html"));
            IPersist target = PersistManager.getInstance().getPersist(config.getHost(), config.getCategory());
            if (target != null) {
                target.persist(resultItems.getRequest().getUrl(), config, parsedHtml);
                return;
            }
            // No persister for this host/category: report it and drop the page.
            logger.warn("persistNotExists: host={}, category={}", config.getHost(), config.getCategory());
        }
    };

    // Fully qualified name kept from the original.
    us.codecraft.webmagic.Spider.create(processor)
            .addPipeline(persistingPipeline)
            .thread(5)
            .run();
}
 
开发者ID:wangdamu,项目名称:SpiderApplication,代码行数:17,代码来源:Spider.java

示例2: processRequest

import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Downloads a single request and pushes the resulting page through the login
 * page processor and every registered pipeline.
 *
 * <p>Three outcomes are handled:
 * <ul>
 *   <li>no page was returned: sleep for the site's sleep time, report the
 *       request via {@code onError} and stop;</li>
 *   <li>the page asks for a cycle retry: re-queue its requests, sleep for the
 *       site's retry sleep time and stop;</li>
 *   <li>otherwise: process the page, queue follow-up requests, feed the
 *       pipelines unless the result is marked as skipped, record the status
 *       code on the request and sleep.</li>
 * </ul>
 *
 * @param request the request to download and process
 */
protected void processRequest(Request request) {
    final Page downloaded = downloader.download(request, this);

    if (downloaded == null) {
        sleep(site.getSleepTime());
        onError(request);
        return;
    }

    // Cycle retry: only re-queue the page's requests, then back off.
    if (downloaded.isNeedCycleRetry()) {
        extractAndAddRequests(downloaded, true);
        sleep(site.getRetrySleepTime());
        return;
    }

    pageLoginProcessor.process(downloaded);
    extractAndAddRequests(downloaded, spawnUrl);

    final boolean publishResults = !downloaded.getResultItems().isSkip();
    if (publishResults) {
        for (Pipeline sink : pipelines) {
            sink.process(downloaded.getResultItems(), this);
        }
    }

    // The status code is stored on the request for proxy status management.
    request.putExtra(Request.STATUS_CODE, downloaded.getStatusCode());
    sleep(site.getSleepTime());
}
 
开发者ID:hexiaohong-code,项目名称:LoginCrawler,代码行数:25,代码来源:SpiderLogin.java

示例3: testStartAndStop

import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Starts a single-threaded spider, stops it and starts it again, pausing ten
 * seconds after each step. Ignored by default because of the long pauses.
 *
 * @throws InterruptedException if the test thread is interrupted while sleeping
 */
@Ignore("long time")
@Test
public void testStartAndStop() throws InterruptedException {
    final long pauseMillis = 10000;

    // Pipeline that only prints a marker for every processed result.
    Pipeline printingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            System.out.println(1);
        }
    };

    Spider spider = Spider.create(new SimplePageProcessor( "http://www.oschina.net/*"))
            .addPipeline(printingPipeline)
            .thread(1)
            .addUrl("http://www.oschina.net/");

    spider.start();
    Thread.sleep(pauseMillis);

    spider.stop();
    Thread.sleep(pauseMillis);

    // Start again after the stop.
    spider.start();
    Thread.sleep(pauseMillis);
}
 
开发者ID:code4craft,项目名称:webmagic,代码行数:17,代码来源:SpiderTest.java

示例4: startSpider

import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Builds a script-driven spider from the command-line parameters and runs it
 * over the supplied URLs.
 *
 * <p>The URL list is validated first, so an invocation without URLs prints the
 * usage text and exits with status {@code -1} before the script file is loaded
 * or the spider is built.
 *
 * @param params parsed command-line parameters (language, script file name,
 *               thread count, sleep time and start URLs)
 */
private static void startSpider(Params params) {
    // Fail fast: without start URLs there is nothing to crawl.
    if (params.getUrls() == null || params.getUrls().isEmpty()) {
        System.err.println("Need at least one argument");
        System.out.println("Usage: java -jar webmagic.jar [-l language] -f script file [-t threadnum] [-s sleep time] url1 [url2 url3]");
        System.exit(-1);
    }

    ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom()
            .language(params.getLanguage()).scriptFromFile(params.getScriptFileName()).thread(params.getThread()).build();
    pageProcessor.getSite().setSleepTime(params.getSleepTime());
    pageProcessor.getSite().setRetryTimes(3);
    pageProcessor.getSite().setAcceptStatCode(WMCollections.<Integer>newHashSet(200, 404,403, 500,502));

    Spider spider = Spider.create(pageProcessor).thread(params.getThread());
    // Replace whatever pipelines are configured with a single no-op pipeline.
    // NOTE(review): presumably the script itself is responsible for output - confirm.
    spider.clearPipeline().addPipeline(new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            // Intentionally empty.
        }
    });

    for (String url : params.getUrls()) {
        spider.addUrl(url);
    }
    spider.run();
}
 
开发者ID:code4craft,项目名称:webmagic,代码行数:24,代码来源:ScriptConsole.java

示例5: close

import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Releases the components used by this spider and shuts down its thread pool.
 *
 * <p>{@code destroyEach} is applied to the downloader, the login page
 * processor, the scheduler and each pipeline, in that order. The thread pool
 * shutdown sits in a {@code finally} block so that it still runs if one of
 * those calls throws; the exception then propagates to the caller.
 */
public void close() {
    try {
        destroyEach(downloader);
        destroyEach(pageLoginProcessor);
        destroyEach(scheduler);
        for (Pipeline pipeline : pipelines) {
            destroyEach(pipeline);
        }
    } finally {
        // Always shut the pool down, even when a component failed to close.
        threadPool.shutdown();
    }
}
 
开发者ID:hexiaohong-code,项目名称:LoginCrawler,代码行数:10,代码来源:SpiderLogin.java

示例6: destroy

import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Releases the components used by this spider by passing the downloader, the
 * page processor and then every registered pipeline to {@code destroyEach}.
 */
protected void destroy() {
    destroyEach(downloader);
    destroyEach(pageProcessor);

    // Pipelines are released last, in iteration order.
    for (Pipeline registered : pipelines) {
        destroyEach(registered);
    }
}
 
开发者ID:yuany,项目名称:en-webmagic,代码行数:8,代码来源:Spider.java

示例7: processRequest

import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Downloads a single request, runs the page processor over the result and
 * feeds the extracted items to every registered pipeline.
 *
 * <p>If the downloader returns no page, the method only sleeps for the site's
 * sleep time. Pipelines are not called when the page's result items are marked
 * as skipped. The method sleeps for the site's sleep time before returning in
 * either case.
 *
 * @param request the request to download and process
 */
protected void processRequest(Request request) {
    final Page fetched = downloader.download(request, this);
    if (fetched == null) {
        sleep(site.getSleepTime());
        return;
    }

    pageProcessor.process(fetched);
    addRequest(fetched);

    final boolean publishResults = !fetched.getResultItems().isSkip();
    if (publishResults) {
        for (Pipeline sink : pipelines) {
            sink.process(fetched.getResultItems(), this);
        }
    }

    sleep(site.getSleepTime());
}
 
开发者ID:yuany,项目名称:en-webmagic,代码行数:16,代码来源:Spider.java

示例8: close

import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Releases the components used by this spider and shuts down its thread pool.
 *
 * <p>{@code destroyEach} is applied to the downloader, the page processor, the
 * scheduler and each pipeline, in that order. The thread pool shutdown sits in
 * a {@code finally} block so that it still runs if one of those calls throws;
 * the exception then propagates to the caller.
 */
public void close() {
    try {
        destroyEach(downloader);
        destroyEach(pageProcessor);
        destroyEach(scheduler);
        for (Pipeline pipeline : pipelines) {
            destroyEach(pipeline);
        }
    } finally {
        // Always shut the pool down, even when a component failed to close.
        threadPool.shutdown();
    }
}
 
开发者ID:code4craft,项目名称:webmagic,代码行数:10,代码来源:Spider.java

示例9: onDownloadSuccess

import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Handles a downloaded page.
 *
 * <p>When the page's status code is among the site's accepted status codes,
 * the page is processed, its follow-up requests are queued and, unless the
 * result items are marked as skipped, every pipeline receives them. Otherwise
 * the URL and status code are logged at info level. The method sleeps for the
 * site's sleep time before returning in both cases.
 *
 * @param request the request that produced the page
 * @param page    the downloaded page
 */
private void onDownloadSuccess(Request request, Page page) {
    final boolean accepted = site.getAcceptStatCode().contains(page.getStatusCode());

    if (!accepted) {
        logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
    } else {
        pageProcessor.process(page);
        extractAndAddRequests(page, spawnUrl);

        final boolean publishResults = !page.getResultItems().isSkip();
        if (publishResults) {
            for (Pipeline sink : pipelines) {
                sink.process(page.getResultItems(), this);
            }
        }
    }

    sleep(site.getSleepTime());
}
 
开发者ID:code4craft,项目名称:webmagic,代码行数:16,代码来源:Spider.java

示例10: test_github

import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Runs {@code GithubRepoPageProcessor} against a mocked download of the
 * webmagic repository page and checks the extracted {@code name} and
 * {@code author} fields.
 *
 * @throws Exception if the spider run fails
 */
@Test
public void test_github() throws Exception {
    // Pipeline that verifies the extracted fields instead of storing them.
    Pipeline verifyingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            String repoName = (String) resultItems.get("name");
            assertThat(repoName.trim()).isEqualTo("webmagic");

            String repoAuthor = (String) resultItems.get("author");
            assertThat(repoAuthor.trim()).isEqualTo("code4craft");
        }
    };

    Spider.create(new GithubRepoPageProcessor())
            .addPipeline(verifyingPipeline)
            .setDownloader(new MockGithubDownloader())
            .test("https://github.com/code4craft/webmagic");
}
 
开发者ID:code4craft,项目名称:webmagic,代码行数:11,代码来源:GithubRepoPageProcessorTest.java

示例11: test

import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Runs {@code GithubRepoProcessor} through an {@code OOSpider} against a mocked
 * download of the webmagic repository page and checks the extracted
 * {@code star} and {@code fork} values.
 */
@Test
public void test() {
    // Pipeline that verifies the extracted fields instead of storing them.
    Pipeline verifyingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            String starCount = (String) resultItems.get("star");
            Assert.assertEquals("78", starCount.trim());

            String forkCount = (String) resultItems.get("fork");
            Assert.assertEquals("65", forkCount.trim());
        }
    };

    OOSpider.create(new GithubRepoProcessor())
            .addPipeline(verifyingPipeline)
            .setDownloader(new MockGithubDownloader())
            .test("https://github.com/code4craft/webmagic");
}
 
开发者ID:code4craft,项目名称:webmagic,代码行数:11,代码来源:GithubRepoProcessor.java

示例12: getPipelineList

import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Returns the pipelines configured on this spider.
 *
 * <p>The stored list itself is returned, not a copy.
 *
 * @return the current pipeline list
 */
public List<Pipeline> getPipelineList() {
    return this.pipelineList;
}
 
开发者ID:bruceq,项目名称:Gather-Platform,代码行数:4,代码来源:CommonSpider.java

示例13: setPipelineList

import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Replaces the pipelines configured on this spider.
 *
 * <p>The given list is stored as-is, not copied.
 *
 * @param pipelineList the pipelines to use
 * @return this spider, to allow call chaining
 */
public CommonSpider setPipelineList(List<Pipeline> pipelineList) {
    this.pipelineList = pipelineList;
    return this;
}
 
开发者ID:bruceq,项目名称:Gather-Platform,代码行数:5,代码来源:CommonSpider.java

示例14: getPipelines

import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Returns the pipelines configured on this spider.
 *
 * <p>The stored list itself is returned, not a copy.
 *
 * @return the current pipeline list
 */
public List<Pipeline> getPipelines() {
    return this.pipelines;
}
 
开发者ID:TransientBuckwheat,项目名称:nest-spider,代码行数:4,代码来源:CommonSpider.java

示例15: setPipelines

import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Replaces the pipelines configured on this spider.
 *
 * <p>The given list is stored as-is, not copied.
 *
 * @param pipelines the pipelines to use
 * @return this spider, to allow call chaining
 */
public CommonSpider setPipelines(List<Pipeline> pipelines) {
    this.pipelines = pipelines;
    return this;
}
 
开发者ID:TransientBuckwheat,项目名称:nest-spider,代码行数:5,代码来源:CommonSpider.java


注:本文中的us.codecraft.webmagic.pipeline.Pipeline类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。