本文整理汇总了 Java 中 us.codecraft.webmagic.Page.putField 方法的典型用法代码示例。如果您正苦于以下问题：Java Page.putField 方法的具体用法？Java Page.putField 怎么用？Java Page.putField 使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 us.codecraft.webmagic.Page 的用法示例。
在下文中一共展示了 Page.putField 方法的 15 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Java 代码示例。
示例1: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Queues further question links and stores an answer whose vote count reaches {@code voteNum}.
 *
 * <p>Every qualifying answer writes the same three fields ("vote", "content", "userid"), so
 * when several answers qualify the last one wins. The page is skipped when none qualifies.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    // Follow the links of the listing items and of the "related questions" box.
    List<String> relativeUrl = page.getHtml().xpath("//li[@class='item clearfix']/div/a/@href").all();
    page.addTargetRequests(relativeUrl);
    relativeUrl = page.getHtml().xpath("//div[@id='zh-question-related-questions']//a[@class='question_link']/@href").all();
    page.addTargetRequests(relativeUrl);

    List<String> answers = page.getHtml().xpath("//div[@id='zh-question-answer-wrap']/div").all();
    boolean exist = false;
    for (String answer : answers) {
        // Parse the answer fragment once and reuse it for all three extractions.
        Html answerHtml = new Html(answer);
        String vote = answerHtml.xpath("//div[@class='zm-votebar']//span[@class='count']/text()").toString();
        if (vote == null) {
            // No vote counter in this fragment; Integer.valueOf(null) would throw.
            continue;
        }
        int voteCount;
        try {
            voteCount = Integer.parseInt(vote.trim());
        } catch (NumberFormatException ignored) {
            // The counter is not a plain integer; skip this answer instead of
            // aborting the whole page.
            continue;
        }
        if (voteCount >= voteNum) {
            page.putField("vote", vote);
            page.putField("content", answerHtml.xpath("//div[@class='zm-editable-content']"));
            page.putField("userid", answerHtml.xpath("//a[@class='author-link']/@href"));
            exist = true;
        }
    }
    if (!exist) {
        page.setSkip(true);
    }
}
示例2: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Extracts the houses listed on the page into the "house" field and queues the next listing page.
 *
 * <p>List items that lack the key attribute or the community link are ignored instead of
 * failing the whole page with a NullPointerException.
 *
 * @param page the downloaded listing page
 */
@Override
public void process(Page page) {
    List<String> lis = page.getHtml().xpath("//ul[@id='house-lst']/li").all();
    List<House> housees = new ArrayList<>();
    for (String li : lis) {
        // Parse each list item once and reuse it for both lookups.
        Html item = new Html(li);
        String key = item.xpath("//div[@class='pic-panel']/a/@key").get();
        String href = item.xpath("//div[@class='where']/a[@class='laisuzhou']/@href").get();
        if (key == null || href == null) {
            // get() returns null when nothing matches; skip incomplete items.
            continue;
        }
        // Reduce the community link to its bare identifier.
        String area = href
                .replace("/xiaoqu/", "")
                .replace(".html", "")
                .replace("http://sh.lianjia.com", "");
        House house = new House();
        house.setId(key.replace("sh", ""));
        house.setXiaoqu(area);
        housees.add(house);
    }
    // Advance the shared page counter and follow the next page while below maxIndex.
    long pageIndex = seed.incrementAndGet();
    List<String> urls = Lists.newArrayList(String.format(Constants.House_SEED, pageIndex));
    if (pageIndex < maxIndex) {
        page.addTargetRequests(urls);
    }
    page.putField("house", housees);
}
示例3: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Extracts title, content, date and id of a post and queues every link containing "/post/".
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    // page.putField(key, value) adds an extracted value to the result map.
    // The title is the part of the <title> element that precedes the first '|'.
    String title = page.getHtml().xpath("//title").regex("(.*?)\\|").toString();
    page.putField("title", title);
    // smartContent() extracts the main text of the page automatically.
    page.putField("content", page.getHtml().smartContent());
    // Date and id are both taken from the page URL.
    page.putField("date", page.getUrl().regex("post/(\\d+-\\d+-\\d+)/"));
    page.putField("id", page.getUrl().regex("post/\\d+-\\d+-\\d+/(\\d+)"));

    // links() collects the links of the page, regex() filters them, all() returns every match.
    List<String> postLinks = page.getHtml().links().regex("(.*/post/.*)").all();
    // page.addTargetRequests() puts the links into the crawl queue.
    page.addTargetRequests(postLinks);
}
示例4: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Core crawler hook: extracts author, name and readme of a GitHub repository page and
 * queues the repository links found on it.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    // Part two: define how page information is extracted and stored.
    String author = page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString();
    page.putField("author", author);

    String name = page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString();
    page.putField("name", name);
    // A page without a repository name is skipped.
    if (name == null) {
        page.setSkip(true);
    }

    page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));

    // Part three: discover follow-up URLs on the page.
    page.addTargetRequests(
            page.getHtml()
                    .links()
                    .regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)")
                    .all());
}
示例5: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Parses the last JSON array embedded in the page into {@code IndustryInfo} records and
 * stores them under "industryInfos".
 *
 * <p>The page is skipped when it contains no bracketed array. Rows with fewer than 13
 * comma-separated columns are ignored.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    String now = DateUtil.dateToString(new Date(), DateUtil.DATE_FORMAT_DAY_SHORT);
    String temp = page.getHtml().toString();
    int beginIndex = temp.lastIndexOf('[');
    int endIndex = temp.lastIndexOf(']');
    if (beginIndex < 0 || endIndex < beginIndex) {
        // No "[...]" payload: substring would throw StringIndexOutOfBoundsException.
        page.setSkip(true);
        return;
    }
    temp = temp.substring(beginIndex, endIndex + 1);
    // No follow-up URLs are discovered here; the list is passed on empty.
    List<String> targetUrls = new ArrayList<String>();
    List<Object> items = JsonUtil.jsonToList(temp, Object.class);
    List<IndustryInfo> industryInfos = new ArrayList<IndustryInfo>();
    for (Object object : items) {
        String[] arr = object.toString().split(",");
        if (arr.length < 13) {
            // Columns up to index 12 are read below; skip malformed rows.
            continue;
        }
        IndustryInfo industryInfo = new IndustryInfo();
        //// TODO: 2017/9/13 setIndustryId
        industryInfo.setIndustryCode(arr[1]);
        industryInfo.setIndustryName(arr[2]);
        industryInfo.setRise(new BigDecimal(arr[3]));
        industryInfo.setMain(integerPart(arr[4]));
        industryInfo.setSuper_(integerPart(arr[6]));
        industryInfo.setBig(integerPart(arr[8]));
        industryInfo.setMedium(integerPart(arr[10]));
        industryInfo.setSmall(integerPart(arr[12]));
        industryInfo.setTotal(industryInfo.getMain() + industryInfo.getSuper_() + industryInfo.getBig() + industryInfo.getMedium() + industryInfo.getSmall());
        industryInfo.setDate(Integer.parseInt(now));
        industryInfos.add(industryInfo);
    }
    page.putField("industryInfos", industryInfos);
    page.addTargetRequests(targetUrls);
}

/**
 * Returns the digits before the last '.' of {@code value} as an int, or the whole value
 * when it has no decimal point.
 */
private int integerPart(String value) {
    int dot = value.lastIndexOf('.');
    return Integer.parseInt(dot < 0 ? value : value.substring(0, dot));
}
示例6: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Core crawler hook: parses the last JSON array embedded in the page into {@code Industry}
 * entries and stores them under "industries".
 *
 * <p>The page is skipped when it contains no bracketed array. Rows with fewer than three
 * comma-separated columns are ignored.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    String temp = page.getHtml().toString();
    int beginIndex = temp.lastIndexOf('[');
    int endIndex = temp.lastIndexOf(']');
    if (beginIndex < 0 || endIndex < beginIndex) {
        // No "[...]" payload: substring would throw StringIndexOutOfBoundsException.
        page.setSkip(true);
        return;
    }
    temp = temp.substring(beginIndex, endIndex + 1);
    List<Object> items = JsonUtil.jsonToList(temp, Object.class);
    List<Industry> industries = new ArrayList<Industry>();
    for (Object object : items) {
        String[] arr = object.toString().split(",");
        if (arr.length < 3) {
            // Columns 1 and 2 are read below; skip malformed rows.
            continue;
        }
        industries.add(new Industry(arr[2], arr[1]));
    }
    page.putField("industries", industries);
}
示例7: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Extracts title, content and tags of a blog entry and queues further blog links.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    String title = page.getHtml()
            .xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()")
            .toString();
    String content = page.getHtml()
            .xpath("//div[@class='BlogContent']/tidyText()")
            .toString();
    List<String> tags = page.getHtml()
            .xpath("//div[@class='BlogTags']/a/text()")
            .all();

    page.putField("title", title);
    page.putField("content", content);
    page.putField("tags", tags);

    // Follow every link to another blog entry of the same author.
    List<String> blogLinks = page.getHtml()
            .links()
            .regex("http://my\\.oschina\\.net/flashsword/blog/\\d+")
            .all();
    page.addTargetRequests(blogLinks);
}
示例8: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Queues repository and user links, then stores the page as a {@code GithubRepo} under
 * "repo"; pages without a repository name are skipped.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    // NOTE(review): \w+ does not match '-', so links with hyphenated names are cut short.
    page.addTargetRequests(
            page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
    page.addTargetRequests(
            page.getHtml().links().regex("(https://github\\.com/\\w+)").all());

    String author = page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString();
    String name = page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString();
    String readme = page.getHtml().xpath("//div[@id='readme']/tidyText()").toString();

    GithubRepo repo = new GithubRepo();
    repo.setAuthor(author);
    repo.setName(name);
    repo.setReadme(readme);

    if (repo.getName() != null) {
        page.putField("repo", repo);
    } else {
        // Skip this page.
        page.setSkip(true);
    }
}
示例9: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Collects the link and alt text of every thumbnail list item into one text field and
 * queues the photo pages linked from this page.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    List<Selectable> thumbs = page.getHtml().xpath("//ul[@id=ma-thumb-list]/li").nodes();
    StringBuilder text = new StringBuilder();
    for (int i = 0; i < thumbs.size(); i++) {
        Selectable thumb = thumbs.get(i);
        text.append("img:");
        text.append(thumb.xpath("//a/@href").get());
        text.append("\n");
        text.append("title:");
        text.append(thumb.xpath("//img/@alt").get());
        text.append("\n");
    }
    String summary = text.toString();
    // The result is stored under the empty key.
    page.putField("", summary);
    // Nothing was collected, so the page is skipped.
    if (summary.isEmpty()) {
        page.setSkip(true);
    }
    page.addTargetRequests(
            page.getHtml().links().regex("http://www\\.mama\\.cn/photo/.*\\.html").all());
}
示例10: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Stores title and body of a forum post and queues the relative "/post-free...shtml" links.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    page.putField(
            "title",
            page.getHtml().xpath("//div[@id='post_head']//span[@class='s_title']//b"));
    page.putField("body", page.getHtml().smartContent());

    // Pull the href of every anchor that points to a post page.
    List<String> postLinks = page.getHtml()
            .regex("<a[^<>]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}")
            .all();
    page.addTargetRequests(postLinks);
}
示例11: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Stores title and content of a post and queues the 17dujingdian.com post links of the page.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    // Sample URLs kept from the original author:
    //http://progressdaily.diandian.com/post/2013-01-24/40046867275
    //http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106
    // &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone
    page.putField("title", page.getHtml().xpath("//div[@id='content']//h2/a"));
    page.putField("content", page.getHtml().smartContent());

    // Anchors whose href is an absolute post URL; the fragment part is excluded.
    List<String> postLinks = page.getHtml()
            .regex("<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}")
            .all();
    page.addTargetRequests(postLinks);
}
示例12: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Core crawler hook: extracts author, name and readme of a GitHub repository page and
 * queues the repository links found on it.
 *
 * <p>Pages without a repository name are marked as skipped.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    // Part two: define how page information is extracted and stored.
    String author = page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString();
    page.putField("author", author);
    String name = page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString();
    page.putField("name", name);
    if (name == null) {
        // Skip this page.
        page.setSkip(true);
    }
    page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
    // Part three: discover follow-up URLs on the page. The character class includes '-'
    // so that links to hyphenated user or repository names are not cut short.
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
}
示例13: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Stores title and content of a question and queues the question links found on the page.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    page.putField("title", page.getHtml().xpath("//div[@class='QTitle']/h1/a"));
    page.putField(
            "content",
            page.getHtml().xpath("//div[@class='Question']//div[@class='Content']/div[@class='detail']"));

    // Anchors whose href is an absolute question URL.
    List<String> questionLinks = page.getHtml()
            .regex("<a[^<>]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}")
            .all();
    page.addTargetRequests(questionLinks);
}
示例14: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Extracts title, content and tags of a blog entry and queues further blog links.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    String blogTitle = page.getHtml()
            .xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()")
            .toString();
    page.putField("title", blogTitle);

    String blogText = page.getHtml()
            .xpath("//div[@class='BlogContent']/tidyText()")
            .toString();
    page.putField("content", blogText);

    List<String> tagNames = page.getHtml()
            .xpath("//div[@class='BlogTags']/a/text()")
            .all();
    page.putField("tags", tagNames);

    // Follow every link to another blog entry of the same author.
    page.addTargetRequests(
            page.getHtml()
                    .links()
                    .regex("http://my\\.oschina\\.net/flashsword/blog/\\d+")
                    .all());
}
示例15: processDistrict
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Stores "province TAB district TAB zip code" as the "result" field and queues the six-digit
 * ip138.com links of the page, carrying province and district along as request extras.
 *
 * @param page the downloaded district page; its request supplies the "province" and
 *             "district" extras
 */
private void processDistrict(Page page) {
    String provinceName = page.getRequest().getExtra("province").toString();
    String districtName = page.getRequest().getExtra("district").toString();
    String postalCode = page.getHtml().regex("<h2>邮编:(\\d+)</h2>").toString();

    String[] columns = new String[]{provinceName, districtName, postalCode};
    page.putField("result", StringUtils.join(columns, "\t"));

    List<String> zipLinks = page.getHtml()
            .links()
            .regex("http://www\\.ip138\\.com/\\d{6}[/]?$")
            .all();
    for (int i = 0; i < zipLinks.size(); i++) {
        page.addTargetRequest(
                new Request(zipLinks.get(i))
                        .setPriority(2)
                        .putExtra("province", provinceName)
                        .putExtra("district", districtName));
    }
}