当前位置: 首页>>代码示例>>Java>>正文


Java Page.setSkip方法代码示例

本文整理汇总了Java中us.codecraft.webmagic.Page.setSkip方法的典型用法代码示例。如果您正苦于以下问题：Java Page.setSkip方法的具体用法？Java Page.setSkip怎么用？Java Page.setSkip使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类us.codecraft.webmagic.Page的用法示例。


在下文中一共展示了Page.setSkip方法的12个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Queues follow-up links found on the page and extracts answers whose vote
 * count is at least {@code voteNum}. The page is marked as skipped when no
 * answer qualifies.
 *
 * <p>Fields are written under fixed keys ("vote", "content", "userid"), so if
 * several answers qualify, the values of the last qualifying answer remain.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    List<String> relativeUrl = page.getHtml().xpath("//li[@class='item clearfix']/div/a/@href").all();
    page.addTargetRequests(relativeUrl);
    relativeUrl = page.getHtml().xpath("//div[@id='zh-question-related-questions']//a[@class='question_link']/@href").all();
    page.addTargetRequests(relativeUrl);
    List<String> answers = page.getHtml().xpath("//div[@id='zh-question-answer-wrap']/div").all();
    boolean exist = false;
    for (String answer : answers) {
        // Parse the answer fragment once and reuse it for every selector below.
        Html answerHtml = new Html(answer);
        String vote = answerHtml.xpath("//div[@class='zm-votebar']//span[@class='count']/text()").toString();
        if (vote == null) {
            // No vote counter in this fragment: nothing to compare against the threshold.
            continue;
        }
        int voteCount;
        try {
            voteCount = Integer.parseInt(vote.trim());
        } catch (NumberFormatException ignored) {
            // Counter text is not a plain integer; ignore this answer instead of
            // letting one malformed fragment abort processing of the whole page.
            continue;
        }
        if (voteCount >= voteNum) {
            page.putField("vote", vote);
            page.putField("content", answerHtml.xpath("//div[@class='zm-editable-content']"));
            page.putField("userid", answerHtml.xpath("//a[@class='author-link']/@href"));
            exist = true;
        }
    }
    if (!exist) {
        page.setSkip(true);
    }
}
 
开发者ID:mikeqian,项目名称:house,代码行数:22,代码来源:ZhihuPageProcessor.java

示例2: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Applies every configured extract rule to the page. A rule marked not-null
 * whose selection comes back empty (multi) or {@code null} (single) flags the
 * page as skipped; otherwise the selection is stored under the rule's field name.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    for (ExtractRule rule : extractRules) {
        Object extracted;
        boolean requiredButMissing;
        if (rule.isMulti()) {
            List<String> values = page.getHtml().selectDocumentForList(rule.getSelector());
            requiredButMissing = rule.isNotNull() && values.isEmpty();
            extracted = values;
        } else {
            String value = page.getHtml().selectDocument(rule.getSelector());
            requiredButMissing = rule.isNotNull() && value == null;
            extracted = value;
        }
        if (requiredButMissing) {
            // Remaining rules are still evaluated; only this field is left unset.
            page.setSkip(true);
            continue;
        }
        page.getResultItems().put(rule.getFieldName(), extracted);
    }
}
 
开发者ID:code4craft,项目名称:webmagic,代码行数:21,代码来源:ConfigurablePageProcessor.java

示例3: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
@Override
// process is the core hook for custom crawler logic; the extraction rules are written here
public void process(Page page) {
    // Part 2: define how page information is extracted and stored
    String author = page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString();
    page.putField("author", author);

    String name = page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString();
    page.putField("name", name);

    boolean nameMissing = page.getResultItems().get("name") == null;
    if (nameMissing) {
        // no repository name was extracted, so flag this page as skipped
        page.setSkip(true);
    }

    // the readme field is stored regardless of the skip flag
    page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));

    // Part 3: discover follow-up URLs on the page to crawl next
    page.addTargetRequests(
            page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
}
 
开发者ID:leon66666,项目名称:financehelper,代码行数:16,代码来源:GithubRepoPageProcessorSamples.java

示例4: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
@Override
// process is the core hook for custom crawler logic; the extraction rules are written here
public void process(Page page) {
    final String url = page.getRequest().getUrl();

    // Article URLs look like:
    //   http://www.cnblogs.com/rick168/p/5260265.html
    //   http://www.cnblogs.com/java-zhao/archive/2016/09/01/5831002.html
    // Removing the article pattern leaves the URL unchanged when it did not match.
    final String stripped = url.replaceAll("http://www.cnblogs.com/[^#]+\\.(?:html|htm)", "");
    final boolean notAnArticle = url.equals(stripped);
    if (notAnArticle || htmlService.exists(url)) {
        logger.info("### 跳过: " + url);
        // Only the skip flag is set; the rest of the method still runs.
        page.setSkip(true);
    }

    String pageTitle = page.getHtml().$("head title").get();
    String pageContent = page.getHtml().get();

    Html record = new Html();
    record.setUrl(url);
    record.setTitle(pageTitle);
    record.setContent(pageContent);
    page.putField("html", record);

    // Part 3: discover follow-up URLs on the page to crawl next, e.g.
    //   http://www.cnblogs.com/rick168/p/5260265.html
    //   http://www.cnblogs.com/netfocus/
    //   http://www.cnblogs.com/yixianyong/p/5091812.html
    page.addTargetRequests(
            page.getHtml().links().regex("http://www.cnblogs.com/[^#]+(?:/p/|)[^#]*").all());
}
 
开发者ID:Lzw2016,项目名称:study,代码行数:32,代码来源:MySqlPageProcessor.java

示例5: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Collects the link and alt text of every thumbnail list item into one text
 * blob stored under the empty field key, flags the page as skipped when
 * nothing was collected, and queues further photo pages.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    List<Selectable> thumbnails = page.getHtml().xpath("//ul[@id=ma-thumb-list]/li").nodes();
    StringBuilder summary = new StringBuilder();
    for (Selectable thumbnail : thumbnails) {
        summary.append("img:");
        summary.append(thumbnail.xpath("//a/@href").get());
        summary.append("\n");
        summary.append("title:");
        summary.append(thumbnail.xpath("//img/@alt").get());
        summary.append("\n");
    }
    page.putField("", summary.toString());

    boolean nothingCollected = summary.length() == 0;
    if (nothingCollected) {
        page.setSkip(true);
    }

    // Follow-up links are queued whether or not this page was skipped.
    page.addTargetRequests(
            page.getHtml().links().regex("http://www\\.mama\\.cn/photo/.*\\.html").all());
}
 
开发者ID:mikeqian,项目名称:house,代码行数:15,代码来源:MamacnPageProcessor.java

示例6: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Queues repository and account links, then builds a {@code GithubRepo} from
 * the page. The repo is stored under "repo" when a name was extracted;
 * otherwise the page is flagged as skipped.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    // Queue follow-up links first: repository pages, then account pages.
    page.addTargetRequests(
            page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
    page.addTargetRequests(
            page.getHtml().links().regex("(https://github\\.com/\\w+)").all());

    GithubRepo repo = new GithubRepo();
    repo.setAuthor(page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
    repo.setName(page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
    repo.setReadme(page.getHtml().xpath("//div[@id='readme']/tidyText()").toString());

    if (repo.getName() != null) {
        page.putField("repo", repo);
        return;
    }
    // No repository name on this page: skip it.
    page.setSkip(true);
}
 
开发者ID:mikeqian,项目名称:house,代码行数:16,代码来源:GithubRepoPageProcessor.java

示例7: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Extracts product details from pages whose URL matches {@code URL_POST};
 * for any other page, queues the product and list links found on it.
 *
 * <p>A product page on which the goods-name selection matches nothing is
 * flagged as skipped. The remaining fields are still written in that case.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    if (!page.getUrl().regex(URL_POST).match()) {
        // Not a product page: only discover further links.
        page.addTargetRequests(page.getHtml().links().regex(URL_POST).all(), 1000);
        page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all(), 1);
        return;
    }

    final String goodsNameXpath = "//div[@id='description']/h1/tidyText()";
    page.putField("goodsName", page.getHtml().xpath(goodsNameXpath));
    // The stored field is the selection wrapper returned by xpath(), not a string,
    // so a null test on the stored value would never fire. Ask the selection
    // itself whether it matched anything.
    if (!page.getHtml().xpath(goodsNameXpath).match()) {
        page.setSkip(true);
    }
    page.putField("currency", page.getHtml().xpath("//div[@id='description']//div[@class='itemBoxPrice']/span//span[@class='currency']/tidyText()"));
    page.putField("goodsPrice", page.getHtml().xpath("//div[@id='description']//div[@class='itemBoxPrice']/span//span[@class='priceValue']/tidyText()"));
    page.putField("description", page.getHtml()
            .xpath("//div[@id='tabbedDescription']//div[@class='tabbedDescription']//ul[@id='tabs']//li[@id='tab_description']/div[@id='description_pane']/tidyText()"));
    page.putField("material", page.getHtml()
            .xpath("//div[@id='tabbedDescription']" +
                    "//div[@class='tabbedDescription']" +
                    "//ul[@id='tabs']" +
                    "//li[@id='tab_description']" +
                    "//div[@class='productProperty']" +
                    "//div[@class='productPropertyRow']/span[2]/tidyText()"));
    page.putField("goodsCode", page.getHtml()
            .xpath("//div[@id='tabbedDescription']" +
                    "//div[@class='tabbedDescription']" +
                    "//ul[@id='tabs']" +
                    "//li[@id='tab_description']" +
                    "//div[@class='productProperty']" +
                    "//div[@class='productPropertyRow']//span[@id='modelFabricColorContainer']/tidyText()"));
    page.putField("goodsSize", page.getHtml()
            .xpath("//div[@id='sizesContainer']//div[@id='sizes']//ul[@class='SizeW']"));
    page.putField("goodsColors", page.getHtml()
            .xpath("//div[@id='colors']/ul/html()"));
}
 
开发者ID:mikeqian,项目名称:house,代码行数:35,代码来源:AlexanderMcqueenGoodsProcessor.java

示例8: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
@Override
// process is the core hook for custom crawler logic; the extraction rules are written here
public void process(Page page) {
    // Part 2: define how page information is extracted and stored
    page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
    page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());

    if (page.getResultItems().get("name") == null) {
        // no repository name was extracted, so flag this page as skipped
        page.setSkip(true);
    }
    // The readme selection is evaluated once and stored as-is (not converted to a string).
    page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));

    // Part 3: discover follow-up URLs on the page to crawl next
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
}
 
开发者ID:lawlite19,项目名称:SmartEducation,代码行数:28,代码来源:GithubRepoPageProcessor.java

示例9: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Queues repository and account links found on the page, then stores the
 * author, repository name and readme. The page is flagged as skipped when
 * no repository name was extracted.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
    // The `+` quantifier is required so the whole account name is captured;
    // without it only the first character after the host would be queued.
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+)").all());
    page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
    page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());
    if (page.getResultItems().get("name") == null) {
        // no repository name on this page: skip it
        page.setSkip(true);
    }
    // The readme field is written regardless of the skip flag.
    page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
}
 
开发者ID:code4craft,项目名称:webmagic,代码行数:13,代码来源:GithubRepoPageProcessor.java

示例10: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Queues answer links found on the page and stores the question title,
 * question body and answer text. The page is flagged as skipped when no
 * title was extracted.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    page.addTargetRequests(
            page.getHtml().links().regex("https://www\\.zhihu\\.com/question/\\d+/answer/\\d+.*").all());

    String title = page.getHtml().xpath("//h1[@class='QuestionHeader-title']/text()").toString();
    page.putField("title", title);

    String question = page.getHtml().xpath("//div[@class='QuestionRichText']//tidyText()").toString();
    page.putField("question", question);

    String answer = page.getHtml().xpath("//div[@class='QuestionAnswer-content']/tidyText()").toString();
    page.putField("answer", answer);

    boolean titleMissing = page.getResultItems().get("title") == null;
    if (titleMissing) {
        // no title on this page: skip it
        page.setSkip(true);
    }
}
 
开发者ID:code4craft,项目名称:webmagic,代码行数:12,代码来源:ZhihuPageProcessor.java

示例11: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Queues repository and account links, then fills a {@code GithubRepo} from
 * the page. The repo is published under "repo" only when its name was found;
 * otherwise the page is flagged as skipped.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    page.addTargetRequests(
            page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
    page.addTargetRequests(
            page.getHtml().links().regex("(https://github\\.com/\\w+)").all());

    GithubRepo extracted = new GithubRepo();
    extracted.setAuthor(
            page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
    extracted.setName(
            page.getHtml()
                    .xpath("//h1[contains(@class, 'entry-title') and contains(@class, 'public')]/strong/a/text()")
                    .toString());
    extracted.setReadme(
            page.getHtml().xpath("//div[@id='readme']/tidyText()").toString());

    boolean hasName = extracted.getName() != null;
    if (hasName) {
        page.putField("repo", extracted);
    } else {
        // no repository name on this page: skip it
        page.setSkip(true);
    }
}
 
开发者ID:code4craft,项目名称:webmagic,代码行数:16,代码来源:GithubRepoPageProcessor.java

示例12: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Queues repository and account links, then asks {@code githubRepoPageMapper}
 * to map the page. A non-null result is stored under "repo"; a null result
 * flags the page as skipped.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    page.addTargetRequests(
            page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
    page.addTargetRequests(
            page.getHtml().links().regex("(https://github\\.com/\\w+)").all());

    GithubRepo mapped = githubRepoPageMapper.get(page);
    if (mapped != null) {
        page.putField("repo", mapped);
        return;
    }
    // The mapper produced nothing for this page.
    page.setSkip(true);
}
 
开发者ID:code4craft,项目名称:webmagic,代码行数:13,代码来源:GithubRepoPageMapper.java


注:本文中的us.codecraft.webmagic.Page.setSkip方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。