本文整理汇总了Java中us.codecraft.webmagic.Page.addTargetRequest方法的典型用法代码示例。如果您正苦于以下问题:Java Page.addTargetRequest方法的具体用法?Java Page.addTargetRequest怎么用?Java Page.addTargetRequest使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类us.codecraft.webmagic.Page的用法示例。
在下文中一共展示了Page.addTargetRequest方法的10个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: extractLinks
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Gathers candidate strings from the page and queues a request for each one
 * that matches a target URL pattern.
 *
 * @param page              page being processed; matching URLs are added to it as target requests
 * @param urlRegionSelector selector applied to the page HTML to obtain the candidates, or
 *                          {@code null} to use every link found on the page
 * @param urlPatterns       patterns tried against each candidate; capture group 1 of a match
 *                          becomes the request URL
 */
private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
    final List<String> candidates = (urlRegionSelector == null)
            ? page.getHtml().links().all()
            : urlRegionSelector.selectList(page.getHtml().toString());
    for (String candidate : candidates) {
        for (Pattern urlPattern : urlPatterns) {
            Matcher m = urlPattern.matcher(candidate);
            if (!m.find()) {
                continue;
            }
            // Group 1 is what gets queued, so each pattern must declare a capture group.
            page.addTargetRequest(new Request(m.group(1)));
        }
    }
}
示例2: extractLinks
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Queues a request for every page link that matches one of the given patterns.
 *
 * @param page              page being processed; matching links are added to it as target requests
 * @param urlRegionSelector selector narrowing the part of the page whose links are considered,
 *                          or {@code null} to consider all links of the page
 * @param urlPatterns       patterns tried against each link; the whole matched text (group 0)
 *                          becomes the request URL
 */
private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
    final List<String> candidates = (urlRegionSelector != null)
            ? page.getHtml().selectList(urlRegionSelector).links().all()
            : page.getHtml().links().all();
    for (String candidate : candidates) {
        for (Pattern urlPattern : urlPatterns) {
            Matcher m = urlPattern.matcher(candidate);
            if (!m.find()) {
                continue;
            }
            page.addTargetRequest(new Request(m.group(0)));
        }
    }
}
示例3: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Scans the page HTML for English-Wikipedia article links.
 *
 * <p>The first five links found on the page are queued as follow-up requests; every
 * further link is stored in {@code newUrls} (declared elsewhere in this class). Links
 * whose target starts with this page's own URL are skipped. Once {@code newUrls} holds
 * more than 100 entries they are uploaded and the JVM is terminated.
 *
 * @param page the downloaded page to process
 */
@Override
public void process(Page page) {
    /* Stop the crawl as soon as more than 100 URLs have been collected. */
    if (newUrls.size() > 100) {
        try {
            CrawlerBootstrap.upLoadNewUrls(newUrls);
            System.exit(0);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return;
    }
    String html = page.getHtml().toString();
    String selfUrl = page.getUrl().toString();
    // The page URL is arbitrary text (Wikipedia titles often contain '(', ')', '.', '+'),
    // so it must be quoted before being embedded in the regex; otherwise the lookahead
    // is corrupted or Pattern.compile throws PatternSyntaxException.
    Pattern pattern = Pattern.compile("(?<=<a href=\")(?!" + Pattern.quote(selfUrl)
            + ")https://en\\.wikipedia\\.org/wiki/.*?(?=\")");
    Matcher matcher = pattern.matcher(html);
    int counter = 0;
    while (matcher.find()) {
        String url = matcher.group();
        if (counter < 5) {
            // Only the first five links are crawled further from this page.
            page.addTargetRequest(url);
            ++counter;
        } else {
            newUrls.add(url);
        }
    }
    System.out.println("current size: " + newUrls.size());
}
示例4: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Dispatches on the page URL, using rule constants declared elsewhere in this class.
 *
 * <ul>
 *   <li>List page: queues every link matching the list rule or the video rule.</li>
 *   <li>Video page: extracts the album id from the HTML and, when one is present,
 *       queues the album's video-list API URL plus every video-page link.</li>
 *   <li>Result page: strips the {@code var tvInfoJs=} prefix, reads the
 *       {@code mixinVideos} array and prints one tab-separated line per video
 *       to standard output.</li>
 * </ul>
 *
 * @param page the downloaded page to process
 */
@Override
public void process(Page page) {
    if (page.getUrl().regex(LIST_PAGE_RULE).match()) {
        page.addTargetRequests(page.getHtml().links().regex(LIST_PAGE_RULE, 0).all());
        page.addTargetRequests(page.getHtml().links().regex(VIDEO_PAGE_RULE, 0).all());
    } else if (page.getUrl().regex(VIDEO_PAGE_RULE).match()) {
        // The extraction may yield null when the page carries no albumId; guard before
        // calling replace() so such pages are skipped instead of throwing.
        String rawAlbumId = page.getHtml().regex("albumId:(.*?\\d),", 1).toString();
        String albumId = rawAlbumId == null ? null : rawAlbumId.replace(" ", "");
        if (StringUtils.isNotBlank(albumId)) {
            page.addTargetRequest(
                    "http://mixer.video.iqiyi.com/jp/mixin/videos/avlist?albumId=" + albumId + "&size=4096");
            page.addTargetRequests(page.getHtml().links().regex(VIDEO_PAGE_RULE, 0).all());
        }
    } else if (page.getUrl().regex(RESULT_RULE).match()) {
        String json = page.getJson().toString().replace("var tvInfoJs=", "");
        List<String> mixinVideos = new JsonPathSelector("$.mixinVideos").selectList(json);
        if (!mixinVideos.isEmpty()) {
            JsonPathSelector jsonPathAlbumId = new JsonPathSelector("$.albumId");
            JsonPathSelector jsonPathTvId = new JsonPathSelector("$.tvId");
            JsonPathSelector jsonPathUrl = new JsonPathSelector("$.url");
            JsonPathSelector jsonPathPlayCount = new JsonPathSelector("$.playCount");
            JsonPathSelector jsonPathName = new JsonPathSelector("$.name");
            JsonPathSelector jsonPathDescription = new JsonPathSelector("$.description");
            // StringBuilder avoids re-copying the whole record on every iteration.
            StringBuilder record = new StringBuilder();
            for (String element : mixinVideos) {
                record.append(jsonPathAlbumId.select(element)).append('\t')
                        .append(jsonPathTvId.select(element)).append('\t')
                        .append(jsonPathUrl.select(element)).append('\t')
                        .append(jsonPathPlayCount.select(element)).append('\t')
                        // Tabs/newlines inside free text would break the column layout.
                        .append(jsonPathName.select(element).replaceAll("[\t\n]", "")).append('\t')
                        .append(jsonPathDescription.select(element).replaceAll("[\t\n]", " ")).append('\n');
            }
            System.out.print(record.toString());
        }
    }
}
示例5: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Handles both list responses and article responses, decided by matching the
 * page URL against {@code LIST_URL} (declared elsewhere in this class).
 *
 * <p>For a list response, every {@code _id} under {@code $.data} is turned into an
 * article API request. For any other response, the article's title and content are
 * stored as result fields.
 *
 * @param page the downloaded page to process
 */
@Override
public void process(Page page) {
    boolean isListPage = page.getUrl().regex(LIST_URL).match();
    if (!isListPage) {
        page.putField("title", new JsonPathSelector("$.data.title").select(page.getRawText()));
        page.putField("content", new JsonPathSelector("$.data.content").select(page.getRawText()));
        return;
    }

    JsonPathSelector idSelector = new JsonPathSelector("$.data[*]._id");
    List<String> articleIds = idSelector.selectList(page.getRawText());
    if (CollectionUtils.isNotEmpty(articleIds)) {
        for (String articleId : articleIds) {
            page.addTargetRequest("http://angularjs.cn/api/article/" + articleId);
        }
    }
}
示例6: processCountry
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Queues one request per table cell found under the {@code newAlexa} element.
 *
 * <p>Each cell's {@code href} becomes the request URL and its text is attached
 * to the request as the {@code "province"} extra. Requests get priority 0.
 *
 * @param page the downloaded page to process
 */
private void processCountry(Page page) {
    List<String> cells = page.getHtml().xpath("//*[@id=\"newAlexa\"]/table/tbody/tr/td").all();
    for (String cell : cells) {
        String href = xpath("//@href").select(cell);
        String provinceName = xpath("/text()").select(cell);
        page.addTargetRequest(new Request(href).setPriority(0).putExtra("province", provinceName));
    }
}
示例7: processProvince
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Queues one request per district listed on a province page.
 *
 * <p>XPath alone cannot pinpoint the district rows, so a regex is applied to each
 * selected row as an additional filter; rows that do not match are dropped. Each
 * request carries the {@code "province"} extra of the current request plus the
 * district name as {@code "district"}, and gets priority 1.
 *
 * @param page the downloaded page to process
 */
private void processProvince(Page page) {
    Pattern districtPattern = Pattern.compile("<td>([^<>]+)</td>.*?href=\"(.*?)\"",Pattern.DOTALL);
    List<String> rows = page.getHtml().xpath("//body/table/tbody/tr[@bgcolor=\"#ffffff\"]").all();
    for (String row : rows) {
        // Group 1 is the district name, group 2 the link target.
        for (Matcher m = districtPattern.matcher(row); m.find(); ) {
            String districtName = m.group(1);
            String href = m.group(2);
            Request districtRequest = new Request(href)
                    .setPriority(1)
                    .putExtra("province", page.getRequest().getExtra("province"))
                    .putExtra("district", districtName);
            page.addTargetRequest(districtRequest);
        }
    }
}
示例8: processDistrict
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Records the postal code of a district page and queues follow-up pages.
 *
 * <p>The province and district names are read from the current request's extras and
 * stored, together with the postal code taken from the page's {@code <h2>} heading,
 * as a tab-separated {@code "result"} field. Every link to an ip138.com six-digit
 * page is queued with priority 2, carrying the same two extras.
 *
 * @param page the downloaded page to process
 */
private void processDistrict(Page page) {
    final String province = page.getRequest().getExtra("province").toString();
    final String district = page.getRequest().getExtra("district").toString();
    final String zipCode = page.getHtml().regex("<h2>邮编:(\\d+)</h2>").toString();

    String[] columns = new String[]{province, district, zipCode};
    page.putField("result", StringUtils.join(columns, "\t"));

    List<String> zipPageLinks = page.getHtml().links().regex("http://www\\.ip138\\.com/\\d{6}[/]?$").all();
    for (String zipPageLink : zipPageLinks) {
        Request followUp = new Request(zipPageLink)
                .setPriority(2)
                .putExtra("province", province)
                .putExtra("district", district);
        page.addTargetRequest(followUp);
    }
}
示例9: process
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Extracts the title and food items of a shop page and queues the next shop.
 *
 * <p>The numeric shop id is read from the page URL ({@code shop/<id>}); the page for
 * id + 1 is queued unconditionally, so the crawl walks shop ids in ascending order.
 *
 * @param page the downloaded page to process
 */
@Override
public void process(Page page) {
    String currentShopId = page.getUrl().regex("shop/(\\d+)").toString();
    int nextShopId = Integer.parseInt(currentShopId) + 1;
    page.addTargetRequest("http://kaichiba.com/shop/" + nextShopId);

    page.putField("title",page.getHtml().xpath("//Title"));
    // Trim leading/trailing whitespace and drop embedded <span> elements from each item.
    page.putField("items", page.getHtml().xpath("//li[@class=\"foodTitle\"]")
            .replace("^\\s+", "")
            .replace("\\s+$", "")
            .replace("<span>.*?</span>", ""));
}
示例10: crawerCourse
import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Crawls the courses listed on a profession-type page.
 *
 * <p>Every course found on the page is saved through {@code spiderCourseService} and
 * its detail page is queued with priority 1, carrying the saved model as the
 * {@code "courseModel"} extra. Afterwards the listing page of each stored profession
 * type (from the third entry onward) is queued with priority 0.
 *
 * <p>Expected markup of one course entry:
 * <pre>
 * &lt;div class="label"&gt;哲学&lt;/div&gt;
 * &lt;li class="ans-slow-anim"&gt;
 *   &lt;div class="picArea ans-slow-anim"&gt;
 *     &lt;a href="/course/198413.html" target="_blank"&gt;
 *       &lt;img src="http://p.ananas.chaoxing.com/star/258_153c/1384413396917gvcrs.jpg"
 *            width="178" height="109"&gt;&lt;/a&gt;
 *   &lt;/div&gt;
 *   &lt;div class="introArea"&gt;&lt;a href="/course/198413.html" target="_blank"
 *        title="..."&gt;course name&lt;/a&gt;&lt;/div&gt;
 *   &lt;div class="introArea2" title="teachers and school"&gt;...&lt;/div&gt;
 * &lt;/li&gt;
 * </pre>
 *
 * @param page the downloaded page to process
 */
public void crawerCourse(Page page) {
    // Profession-type entity handed down by the request that led to this page.
    SpiderProfessionType professionTypeModel =
            (SpiderProfessionType) page.getRequest().getExtra("professionTypeModel");

    // Profession-type name shown in the page's label element.
    String professionTypeName = page.getHtml()
            .xpath("//div[@class='label']/text()").toString();

    // The four lists below are index-aligned: entry i of each describes the same course.
    List<String> courseNameList = page.getHtml()
            .xpath("//div[@class='introArea']/a/html()").all();
    List<String> courseUrlList = page.getHtml()
            .xpath("//div[@class='introArea']/a/@href").all();
    List<String> infoList = page.getHtml()
            .xpath("//div[@class='introArea2']/@title").all();
    List<String> courseImgUrlList = page.getHtml()
            .xpath("//div[@class='picArea ans-slow-anim']/a/img/@src").all();

    for (int i = 0; i < courseNameList.size(); i++) {
        String courseName = courseNameList.get(i).toString().trim();
        String courseUrl = courseUrlList.get(i).toString().trim();
        String courseInfo = infoList.get(i).toString();
        String courseImgUrl = courseImgUrlList.get(i).toString();

        SpiderCourse model = new SpiderCourse(courseName, courseUrl, courseInfo,
                professionTypeName, courseImgUrl, professionTypeModel, 0);
        spiderCourseService.save(model);

        // Course detail pages get priority 1; the untrimmed URL is queued.
        page.addTargetRequest(new Request(courseUrlList.get(i))
                .setPriority(1).putExtra("courseModel", model));
    }

    // Queue the listing page of every stored profession type with priority 0.
    // NOTE(review): iteration starts at index 2, skipping the first two entries;
    // the reason is not visible here, confirm with the data source.
    List<SpiderProfessionType> professionTypes = spiderProfessionTypeService.findAll();
    for (int j = 2; j < professionTypes.size(); j++) {
        SpiderProfessionType professionType = professionTypes.get(j);
        page.addTargetRequest(new Request(professionType.getUrl()+"/0/1400")
                .setPriority(0)
                .putExtra("professionTypeModel", professionType));
    }
}