本文整理汇总了Java中us.codecraft.webmagic.Page.setSkip方法的典型用法代码示例。如果您正苦于以下问题:Java Page.setSkip方法的具体用法?Java Page.setSkip怎么用?Java Page.setSkip使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类us.codecraft.webmagic.Page的用法示例。
在下文中一共展示了Page.setSkip方法的12个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Queues follow-up question links and keeps the page only when at least one
 * answer reaches the {@code voteNum} vote threshold.
 *
 * <p>The "vote", "content" and "userid" fields are overwritten by every
 * qualifying answer, so the last qualifying answer on the page is the one that
 * remains stored. Answers whose vote count is missing or is not a plain integer
 * are ignored instead of aborting the whole page with a
 * {@link NumberFormatException}.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    // Queue the listing links and the "related questions" links.
    List<String> relativeUrl = page.getHtml().xpath("//li[@class='item clearfix']/div/a/@href").all();
    page.addTargetRequests(relativeUrl);
    relativeUrl = page.getHtml().xpath("//div[@id='zh-question-related-questions']//a[@class='question_link']/@href").all();
    page.addTargetRequests(relativeUrl);

    List<String> answers = page.getHtml().xpath("//div[@id='zh-question-answer-wrap']/div").all();
    boolean exist = false;
    for (String answer : answers) {
        // Parse the answer fragment once and reuse it for every selector.
        Html answerHtml = new Html(answer);
        String vote = answerHtml.xpath("//div[@class='zm-votebar']//span[@class='count']/text()").toString();
        if (!reachesVoteThreshold(vote)) {
            continue;
        }
        page.putField("vote", vote);
        page.putField("content", answerHtml.xpath("//div[@class='zm-editable-content']"));
        page.putField("userid", answerHtml.xpath("//a[@class='author-link']/@href"));
        exist = true;
    }
    if (!exist) {
        // No answer was popular enough: mark the page as skipped.
        page.setSkip(true);
    }
}

/**
 * Tells whether the extracted vote text is an integer of at least {@code voteNum}.
 *
 * @param vote raw vote text taken from the page; may be {@code null}
 * @return {@code true} only when {@code vote} parses as an integer that is
 *         greater than or equal to {@code voteNum}
 */
private boolean reachesVoteThreshold(String vote) {
    if (vote == null) {
        return false;
    }
    try {
        return Integer.parseInt(vote.trim()) >= voteNum;
    } catch (NumberFormatException ignored) {
        // Not a plain integer (e.g. empty or abbreviated text): treat as below threshold.
        return false;
    }
}
示例2: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Applies every configured extract rule to the page.
 *
 * <p>For each rule the selector is evaluated either as a list (multi rules) or
 * as a single value. When the rule is flagged not-null and nothing was
 * extracted, the page is marked as skipped; otherwise the extracted value is
 * stored in the result items under the rule's field name.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    for (ExtractRule rule : extractRules) {
        if (rule.isMulti()) {
            List<String> values = page.getHtml().selectDocumentForList(rule.getSelector());
            // A required multi-value rule that produced nothing skips the page.
            if (rule.isNotNull() && values.isEmpty()) {
                page.setSkip(true);
                continue;
            }
            page.getResultItems().put(rule.getFieldName(), values);
            continue;
        }

        String value = page.getHtml().selectDocument(rule.getSelector());
        // A required single-value rule that produced nothing skips the page.
        if (rule.isNotNull() && value == null) {
            page.setSkip(true);
            continue;
        }
        page.getResultItems().put(rule.getFieldName(), value);
    }
}
示例3: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Core crawler callback: extracts repository details from a GitHub page and
 * queues further repository links.
 *
 * <p>Stores "author" (taken from the URL), "name" and "readme". When no
 * repository name is found the page is marked as skipped, but link discovery
 * still runs so the crawl can continue from this page.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    // Part two: define how to extract page information and store it.
    // The owner segment may contain hyphens, matching the link pattern used below.
    page.putField("author", page.getUrl().regex("https://github\\.com/([\\w\\-]+)/.*").toString());
    page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
    if (page.getResultItems().get("name") == null) {
        // Skip this page: it has no repository name.
        page.setSkip(true);
    }
    page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
    // Part three: discover follow-up URLs on the page to crawl.
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
}
示例4: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Core crawler callback: stores a cnblogs article page and queues further
 * cnblogs links.
 *
 * <p>The page is marked as skipped when its URL is not an article URL or when
 * {@code htmlService.exists(url)} reports true (presumably "already stored" --
 * confirm against the service). The "html" field and the follow-up links are
 * produced in either case.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    String url = page.getRequest().getUrl();
    // Article URLs look like:
    //   http://www.cnblogs.com/rick168/p/5260265.html
    //   http://www.cnblogs.com/java-zhao/archive/2016/09/01/5831002.html
    // Removing the article pattern leaves the URL unchanged when it is NOT an article.
    // The dots of the host name are escaped so they only match literal dots.
    String tmp = url.replaceAll("http://www\\.cnblogs\\.com/[^#]+\\.(?:html|htm)", "");
    if (url.equals(tmp) || htmlService.exists(url)) {
        logger.info("### 跳过: " + url);
        page.setSkip(true);
    }

    String title = page.getHtml().$("head title").get();
    String content = page.getHtml().get();

    Html html = new Html();
    html.setUrl(url);
    html.setTitle(title);
    html.setContent(content);
    page.putField("html", html);

    // Part three: discover follow-up URLs on the page to crawl, e.g.
    //   http://www.cnblogs.com/rick168/p/5260265.html
    //   http://www.cnblogs.com/netfocus/
    //   http://www.cnblogs.com/yixianyong/p/5091812.html
    page.addTargetRequests(page.getHtml().links().regex("http://www\\.cnblogs\\.com/[^#]+(?:/p/|)[^#]*").all());
}
示例5: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
@Override
public void process(Page page) {
List<Selectable> nodes = page.getHtml().xpath("//ul[@id=ma-thumb-list]/li").nodes();
StringBuilder accum = new StringBuilder();
for (Selectable node : nodes) {
accum.append("img:").append(node.xpath("//a/@href").get()).append("\n");
accum.append("title:").append(node.xpath("//img/@alt").get()).append("\n");
}
page.putField("",accum.toString());
if (accum.length() == 0) {
page.setSkip(true);
}
page.addTargetRequests(page.getHtml().links().regex("http://www\\.mama\\.cn/photo/.*\\.html").all());
}
示例6: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Queues GitHub repository and owner links, then stores the page as a
 * {@code GithubRepo} under the "repo" field.
 *
 * <p>When the repository name cannot be extracted the page is marked as
 * skipped and no field is stored.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    // Link discovery: repository pages first, then owner pages.
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all());

    String author = page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString();
    String name = page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString();
    String readme = page.getHtml().xpath("//div[@id='readme']/tidyText()").toString();

    GithubRepo repo = new GithubRepo();
    repo.setAuthor(author);
    repo.setName(name);
    repo.setReadme(readme);

    if (repo.getName() != null) {
        page.putField("repo", repo);
    } else {
        // No repository name on this page: skip it.
        page.setSkip(true);
    }
}
示例7: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Extracts product details from pages matching {@code URL_POST}; on every other
 * page it only queues product and list links.
 *
 * <p>A product page without a goods name is marked as skipped. The check looks
 * at the first match of the selection itself: the "goodsName" field holds the
 * selection object rather than a string, so comparing the stored field with
 * {@code null} would never detect a missing name.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    if (page.getUrl().regex(URL_POST).match()) {
        us.codecraft.webmagic.selector.Selectable goodsName =
                page.getHtml().xpath("//div[@id='description']/h1/tidyText()");
        page.putField("goodsName", goodsName);
        if (goodsName.get() == null) {
            // The selection matched nothing: this is not a usable product page.
            page.setSkip(true);
        }
        page.putField("currency", page.getHtml().xpath("//div[@id='description']//div[@class='itemBoxPrice']/span//span[@class='currency']/tidyText()"));
        page.putField("goodsPrice", page.getHtml().xpath("//div[@id='description']//div[@class='itemBoxPrice']/span//span[@class='priceValue']/tidyText()"));
        page.putField("description", page.getHtml()
                .xpath("//div[@id='tabbedDescription']//div[@class='tabbedDescription']//ul[@id='tabs']//li[@id='tab_description']/div[@id='description_pane']/tidyText()"));
        page.putField("material", page.getHtml()
                .xpath("//div[@id='tabbedDescription']" +
                        "//div[@class='tabbedDescription']" +
                        "//ul[@id='tabs']" +
                        "//li[@id='tab_description']" +
                        "//div[@class='productProperty']" +
                        "//div[@class='productPropertyRow']/span[2]/tidyText()"));
        page.putField("goodsCode", page.getHtml()
                .xpath("//div[@id='tabbedDescription']" +
                        "//div[@class='tabbedDescription']" +
                        "//ul[@id='tabs']" +
                        "//li[@id='tab_description']" +
                        "//div[@class='productProperty']" +
                        "//div[@class='productPropertyRow']//span[@id='modelFabricColorContainer']/tidyText()"));
        page.putField("goodsSize", page.getHtml()
                .xpath("//div[@id='sizesContainer']//div[@id='sizes']//ul[@class='SizeW']"));
        page.putField("goodsColors", page.getHtml()
                .xpath("//div[@id='colors']/ul/html()"));
    } else {
        // Not a product page: queue product links and list links with the
        // numeric arguments 1000 and 1 respectively.
        page.addTargetRequests(page.getHtml().links().regex(URL_POST).all(), 1000);
        page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all(), 1);
    }
}
示例8: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Core crawler callback: extracts repository details from a GitHub page and
 * queues further repository links.
 *
 * <p>Stores "author" (taken from the URL), "name" and "readme". When no
 * repository name is found the page is marked as skipped; the readme field and
 * the follow-up links are still produced.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    // Part two: define how to extract page information and store it.
    String author = page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString();
    page.putField("author", author);
    String name = page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString();
    page.putField("name", name);
    if (page.getResultItems().get("name") == null) {
        // Skip this page: it has no repository name.
        page.setSkip(true);
    }
    // The readme is stored as the selection itself, not as a string.
    page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
    // Part three: discover follow-up URLs on the page to crawl.
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
}
示例9: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Queues GitHub repository and owner links, then extracts "author", "name" and
 * "readme" from the page.
 *
 * <p>When no repository name is found the page is marked as skipped; the readme
 * field is still stored.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
    // Owner pages: the "+" quantifier captures the whole owner name, not just its first character.
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+)").all());
    // The owner segment may contain hyphens, matching the link patterns above.
    page.putField("author", page.getUrl().regex("https://github\\.com/([\\w\\-]+)/.*").toString());
    page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());
    if (page.getResultItems().get("name") == null) {
        // Skip this page: it has no repository name.
        page.setSkip(true);
    }
    page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
}
示例10: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Queues Zhihu answer links and stores the question title, question body and
 * answer text of the page.
 *
 * <p>When no title is found the page is marked as skipped.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    page.addTargetRequests(page.getHtml().links().regex("https://www\\.zhihu\\.com/question/\\d+/answer/\\d+.*").all());

    String title = page.getHtml().xpath("//h1[@class='QuestionHeader-title']/text()").toString();
    page.putField("title", title);

    String question = page.getHtml().xpath("//div[@class='QuestionRichText']//tidyText()").toString();
    page.putField("question", question);

    String answer = page.getHtml().xpath("//div[@class='QuestionAnswer-content']/tidyText()").toString();
    page.putField("answer", answer);

    // Skip this page when the stored title is missing.
    boolean hasTitle = page.getResultItems().get("title") != null;
    if (!hasTitle) {
        page.setSkip(true);
    }
}
示例11: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Queues GitHub repository and owner links, then stores the page as a
 * {@code GithubRepo} under the "repo" field.
 *
 * <p>The repository name is read from the heading whose class attribute
 * contains both "entry-title" and "public". When it cannot be extracted the
 * page is marked as skipped and no field is stored.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    // Link discovery: repository pages first, then owner pages.
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all());

    String author = page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString();
    String name = page.getHtml()
            .xpath("//h1[contains(@class, 'entry-title') and contains(@class, 'public')]/strong/a/text()")
            .toString();
    String readme = page.getHtml().xpath("//div[@id='readme']/tidyText()").toString();

    GithubRepo repo = new GithubRepo();
    repo.setAuthor(author);
    repo.setName(name);
    repo.setReadme(readme);

    if (repo.getName() != null) {
        page.putField("repo", repo);
    } else {
        // No repository name on this page: skip it.
        page.setSkip(true);
    }
}
示例12: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Queues GitHub repository and owner links, then stores the object produced by
 * {@code githubRepoPageMapper} under the "repo" field.
 *
 * <p>When the mapper returns {@code null} the page is marked as skipped.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    // Link discovery: repository pages first, then owner pages.
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all());

    GithubRepo mapped = githubRepoPageMapper.get(page);
    if (mapped != null) {
        page.putField("repo", mapped);
    } else {
        // The mapper produced nothing for this page.
        page.setSkip(true);
    }
}