当前位置: 首页>>代码示例>>Java>>正文


Java Page.addTargetRequest方法代码示例

本文整理汇总了 Java 中 us.codecraft.webmagic.Page.addTargetRequest 方法的典型用法代码示例。如果您想了解 Page.addTargetRequest 方法的具体用法、调用方式或使用示例,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 us.codecraft.webmagic.Page 的用法示例。


下文共展示了 Page.addTargetRequest 方法的 10 个代码示例,这些示例默认按受欢迎程度排序。您可以为喜欢或认为有用的代码点赞,您的评价将有助于系统推荐出更好的 Java 代码示例。

示例1: extractLinks

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Gathers candidate links from the page and queues those that match a target pattern.
 *
 * <p>With a {@code null} {@code urlRegionSelector} the candidates are all links found in the
 * page's HTML; otherwise they are the strings the selector yields from the HTML rendered as
 * text. Every candidate is tested against every pattern with {@code find()}, and capture
 * group 1 of each successful match is queued as a new request.
 *
 * @param page              the page being processed; receives the new target requests
 * @param urlRegionSelector optional selector restricting where candidates come from
 * @param urlPatterns       patterns to test; {@code group(1)} is read, so each is expected to
 *                          define at least one capturing group
 */
private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
    final List<String> candidates = (urlRegionSelector == null)
            ? page.getHtml().links().all()
            : urlRegionSelector.selectList(page.getHtml().toString());
    for (String candidate : candidates) {
        for (Pattern urlPattern : urlPatterns) {
            Matcher m = urlPattern.matcher(candidate);
            if (!m.find()) {
                continue;
            }
            page.addTargetRequest(new Request(m.group(1)));
        }
    }
}
 
开发者ID:yuany,项目名称:en-webmagic,代码行数:17,代码来源:ModelPageProcessor.java

示例2: extractLinks

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Gathers candidate links from the page and queues those that match a target pattern.
 *
 * <p>With a {@code null} {@code urlRegionSelector} the candidates are all links found in the
 * page's HTML; otherwise they are the links found inside the region picked by the selector.
 * Every candidate is tested against every pattern with {@code find()}, and the whole matched
 * text (group 0) of each successful match is queued as a new request.
 *
 * @param page              the page being processed; receives the new target requests
 * @param urlRegionSelector optional selector restricting the region links are taken from
 * @param urlPatterns       patterns a link must match to be followed
 */
private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
    final List<String> candidates = (urlRegionSelector == null)
            ? page.getHtml().links().all()
            : page.getHtml().selectList(urlRegionSelector).links().all();
    for (String candidate : candidates) {
        for (Pattern urlPattern : urlPatterns) {
            Matcher m = urlPattern.matcher(candidate);
            if (!m.find()) {
                continue;
            }
            page.addTargetRequest(new Request(m.group(0)));
        }
    }
}
 
开发者ID:code4craft,项目名称:webmagic,代码行数:17,代码来源:ModelPageProcessor.java

示例3: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Harvests English-Wikipedia article links from the page.
 *
 * <p>Once more than 100 URLs have been collected in {@code newUrls}, they are uploaded and the
 * JVM is terminated; if the upload fails the stack trace is printed and the page is skipped.
 * Otherwise the first five matching links on the page are queued as follow-up requests and
 * the remaining ones are stored in {@code newUrls}. Links that start with the page's own URL
 * are excluded.
 *
 * @param page the downloaded page to scan
 */
@Override
public void process(Page page) {
    /* Stop the crawl as soon as more than 100 URLs have been collected. */
    if (newUrls.size() > 100) {
        try {
            CrawlerBootstrap.upLoadNewUrls(newUrls);
            System.exit(0);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return;
    }
    String html = page.getHtml().toString();
    String selfUrl = page.getUrl().toString();
    // Quote the page URL so that regex metacharacters in it (".", "?", "(", ")" ...) are
    // matched literally instead of altering, or breaking, the compiled pattern.
    String selfExclusion = (selfUrl == null) ? "" : "(?!" + Pattern.quote(selfUrl) + ")";
    Pattern pattern = Pattern.compile("(?<=<a href=\")" + selfExclusion
            + "https://en.wikipedia.org/wiki/.*?(?=\")");
    Matcher matcher = pattern.matcher(html);
    int counter = 0;
    while (matcher.find()) {
        String url = matcher.group();
        if (counter < 5) {
            // Only the first five links are crawled from this page.
            page.addTargetRequest(url);
            ++counter;
        } else {
            newUrls.add(url);
        }
    }
    System.out.println("current size: " + newUrls.size());
}
 
开发者ID:xiongbeer,项目名称:Cobweb,代码行数:29,代码来源:WarmUp.java

示例4: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Dispatches on the page URL.
 *
 * <ul>
 *   <li>List pages: queue every linked list page and video page.</li>
 *   <li>Video pages: extract the album id from the HTML and, when present, queue the album's
 *       video-list endpoint plus every linked video page.</li>
 *   <li>Result pages: strip the {@code var tvInfoJs=} prefix, read {@code $.mixinVideos} and
 *       print one tab-separated line per video to standard output.</li>
 * </ul>
 *
 * @param page the downloaded page to process
 */
@Override
public void process(Page page) {
	if (page.getUrl().regex(LIST_PAGE_RULE).match()) {
		page.addTargetRequests(page.getHtml().links().regex(LIST_PAGE_RULE, 0).all());
		page.addTargetRequests(page.getHtml().links().regex(VIDEO_PAGE_RULE, 0).all());
	} else if (page.getUrl().regex(VIDEO_PAGE_RULE).match()) {
		// The regex result may be null when the page carries no album id (TODO confirm against
		// the Selectable contract); guard before calling replace() so the blank check below
		// can actually do its job instead of a NullPointerException being thrown.
		String rawAlbumId = page.getHtml().regex("albumId:(.*?\\d),", 1).toString();
		String albumId = (rawAlbumId == null) ? null : rawAlbumId.replace(" ", "");
		if (StringUtils.isNotBlank(albumId)) {
			page.addTargetRequest(
					"http://mixer.video.iqiyi.com/jp/mixin/videos/avlist?albumId=" + albumId + "&size=4096");
			page.addTargetRequests(page.getHtml().links().regex(VIDEO_PAGE_RULE, 0).all());
		}
	} else if (page.getUrl().regex(RESULT_RULE).match()) {
		String json = page.getJson().toString().replace("var tvInfoJs=", "");
		List<String> mixinVideos = new JsonPathSelector("$.mixinVideos").selectList(json);
		if (!mixinVideos.isEmpty()) {
			JsonPathSelector jsonPathAlbumId = new JsonPathSelector("$.albumId");
			JsonPathSelector jsonPathTvId = new JsonPathSelector("$.tvId");
			JsonPathSelector jsonPathUrl = new JsonPathSelector("$.url");
			JsonPathSelector jsonPathPlayCount = new JsonPathSelector("$.playCount");
			JsonPathSelector jsonPathName = new JsonPathSelector("$.name");
			JsonPathSelector jsonPathDescription = new JsonPathSelector("$.description");
			// StringBuilder avoids re-copying the accumulated text on every iteration.
			StringBuilder record = new StringBuilder();
			for (String element : mixinVideos) {
				record.append(jsonPathAlbumId.select(element)).append("\t")
						.append(jsonPathTvId.select(element)).append("\t")
						.append(jsonPathUrl.select(element)).append("\t")
						.append(jsonPathPlayCount.select(element)).append("\t")
						// Tabs and newlines would break the one-record-per-line TSV layout.
						.append(jsonPathName.select(element).replaceAll("[\t\n]", "")).append("\t")
						.append(jsonPathDescription.select(element).replaceAll("[\t\n]", " ")).append("\n");
			}
			System.out.print(record);
		}
	}
}
 
开发者ID:viixv,项目名称:iqiyi-crawler,代码行数:35,代码来源:Crawler.java

示例5: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Handles both kinds of page this processor sees.
 *
 * <p>A page whose URL matches {@code LIST_URL} is treated as an article listing: every
 * {@code _id} under {@code $.data} is turned into a request for the article API. Any other
 * page is treated as a single article whose title and content are stored as result fields.
 *
 * @param page the downloaded page; its raw text is parsed as JSON
 */
@Override
public void process(Page page) {
    boolean isListPage = page.getUrl().regex(LIST_URL).match();
    if (!isListPage) {
        page.putField("title", new JsonPathSelector("$.data.title").select(page.getRawText()));
        page.putField("content", new JsonPathSelector("$.data.content").select(page.getRawText()));
        return;
    }

    JsonPathSelector idSelector = new JsonPathSelector("$.data[*]._id");
    List<String> articleIds = idSelector.selectList(page.getRawText());
    if (CollectionUtils.isNotEmpty(articleIds)) {
        for (String articleId : articleIds) {
            page.addTargetRequest("http://angularjs.cn/api/article/" + articleId);
        }
    }
}
 
开发者ID:mikeqian,项目名称:house,代码行数:16,代码来源:AngularJSProcessor.java

示例6: processCountry

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Queues one priority-0 request per table cell of the {@code newAlexa} table.
 *
 * <p>For each cell, the {@code href} found in it becomes the request URL and the cell's text
 * is attached to the request as the {@code "province"} extra.
 *
 * @param page the country-level page
 */
private void processCountry(Page page) {
    List<String> provinceCells =
            page.getHtml().xpath("//*[@id=\"newAlexa\"]/table/tbody/tr/td").all();
    for (String cellHtml : provinceCells) {
        String provinceUrl = xpath("//@href").select(cellHtml);
        String provinceName = xpath("/text()").select(cellHtml);
        page.addTargetRequest(
                new Request(provinceUrl)
                        .setPriority(0)
                        .putExtra("province", provinceName));
    }
}
 
开发者ID:mikeqian,项目名称:house,代码行数:10,代码来源:ZipCodePageProcessor.java

示例7: processProvince

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Queues one priority-1 request per district found on a province page.
 *
 * <p>Each request carries the {@code "province"} extra of the current request and the
 * district's title as the {@code "district"} extra.
 *
 * @param page the province-level page
 */
private void processProvince(Page page) {
    // XPath alone cannot pin down the district rows precisely, so a regex acts as a second
    // filter; rows that do not match it are dropped.
    List<String> rows = page.getHtml().xpath("//body/table/tbody/tr[@bgcolor=\"#ffffff\"]").all();
    Pattern districtPattern = Pattern.compile("<td>([^<>]+)</td>.*?href=\"(.*?)\"",Pattern.DOTALL);
    for (String rowHtml : rows) {
        Matcher m = districtPattern.matcher(rowHtml);
        while (m.find()) {
            String districtName = m.group(1);
            String districtUrl = m.group(2);
            page.addTargetRequest(
                    new Request(districtUrl)
                            .setPriority(1)
                            .putExtra("province", page.getRequest().getExtra("province"))
                            .putExtra("district", districtName));
        }
    }
}
 
开发者ID:mikeqian,项目名称:house,代码行数:15,代码来源:ZipCodePageProcessor.java

示例8: processDistrict

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Records the zip code of a district page and queues further six-digit ip138 pages.
 *
 * <p>The {@code "result"} field is the tab-joined province, district and zip code. Every link
 * matching the six-digit ip138 URL pattern is queued at priority 2 with the same province and
 * district extras.
 *
 * @param page the district-level page; its request must carry the {@code "province"} and
 *             {@code "district"} extras
 */
private void processDistrict(Page page) {
    String province = page.getRequest().getExtra("province").toString();
    String district = page.getRequest().getExtra("district").toString();
    String zipCode = page.getHtml().regex("<h2>邮编:(\\d+)</h2>").toString();

    String[] columns = new String[]{province, district, zipCode};
    page.putField("result", StringUtils.join(columns, "\t"));

    List<String> zipPageLinks =
            page.getHtml().links().regex("http://www\\.ip138\\.com/\\d{6}[/]?$").all();
    for (String zipPageLink : zipPageLinks) {
        Request followUp = new Request(zipPageLink)
                .setPriority(2)
                .putExtra("province", province)
                .putExtra("district", district);
        page.addTargetRequest(followUp);
    }
}
 
开发者ID:mikeqian,项目名称:house,代码行数:13,代码来源:ZipCodePageProcessor.java

示例9: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
 * Walks the shop pages sequentially and extracts each shop's title and food items.
 *
 * <p>The numeric id after {@code shop/} in the current URL is incremented by one and the
 * resulting shop page is queued. The page title and the cleaned-up {@code foodTitle} list
 * items are stored as result fields.
 *
 * @param page the shop page; its URL must contain {@code shop/<digits>}
 */
@Override
public void process(Page page) {
    String currentShopId = page.getUrl().regex("shop/(\\d+)").toString();
    int nextShopId = Integer.parseInt(currentShopId) + 1;
    page.addTargetRequest("http://kaichiba.com/shop/" + nextShopId);

    page.putField("title", page.getHtml().xpath("//Title"));
    // Strip leading/trailing whitespace and any <span> fragments from the item entries.
    page.putField("items",
            page.getHtml()
                    .xpath("//li[@class=\"foodTitle\"]")
                    .replace("^\\s+", "")
                    .replace("\\s+$", "")
                    .replace("<span>.*?</span>", ""));
}
 
开发者ID:yuany,项目名称:en-webmagic,代码行数:9,代码来源:KaichibaProcessor.java

示例10: crawerCourse

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
/**
	 * Crawls a course-listing page.
	 *
	 * Every course found on the page is saved through {@code spiderCourseService} and its
	 * detail page is queued at priority 1 with the saved model attached as the
	 * {@code "courseModel"} extra. Afterwards a priority-0 listing request is queued for the
	 * profession types returned by {@code spiderProfessionTypeService.findAll()}, starting
	 * at index 2.
	 */
	public void crawerCourse(Page page) {
		// Profession type entity handed down by the request that led to this page.
		SpiderProfessionType professionTypeModel =
				(SpiderProfessionType) page.getRequest().getExtra("professionTypeModel");

		// Profession type name, taken from markup such as: <div class="label">哲学</div>
		String professionTypeName =
				page.getHtml().xpath("//div[@class='label']/text()").toString();

		// Each course is rendered roughly as:
		//   <li class="ans-slow-anim">
		//     <div class="picArea ans-slow-anim">
		//       <a href="/course/198413.html" target="_blank"><img src="http://p.ananas.chaoxing.com/..."></a>
		//     </div>
		//     <div class="introArea"><a href="/course/198413.html" title="...">course name</a></div>
		//     <div class="introArea2" title="teachers and school">...</div>
		//   </li>

		// Course names.
		List<String> courseNameList =
				page.getHtml().xpath("//div[@class='introArea']/a/html()").all();
		// Course URLs.
		List<String> courseUrlList =
				page.getHtml().xpath("//div[@class='introArea']/a/@href").all();
		// Course info (the title attribute of the introArea2 block).
		List<String> infoList =
				page.getHtml().xpath("//div[@class='introArea2']/@title").all();
		// Course image URLs.
		List<String> courseImgUrlList =
				page.getHtml().xpath("//div[@class='picArea ans-slow-anim']/a/img/@src").all();

		// The four lists are read by the same index; NOTE(review): this presumes they are
		// all at least as long as the name list — confirm against the page markup.
		for (int i = 0; i < courseNameList.size(); i++) {
			String courseName = courseNameList.get(i).toString().trim();
			String courseUrl = courseUrlList.get(i).toString().trim();
			String courseInfo = infoList.get(i).toString();
			String courseImgUrl = courseImgUrlList.get(i).toString();

			SpiderCourse model = new SpiderCourse(courseName, courseUrl, courseInfo,
					professionTypeName, courseImgUrl, professionTypeModel, 0);
			spiderCourseService.save(model);

			// Course detail pages are queued at priority 1; the request uses the URL exactly
			// as extracted (untrimmed).
			Request courseRequest = new Request(courseUrlList.get(i))
					.setPriority(1)
					.putExtra("courseModel", model);
			page.addTargetRequest(courseRequest);
		}

		// Look up all profession types and queue their listing pages at priority 0.
		List<SpiderProfessionType> professionTypes = spiderProfessionTypeService.findAll();
		for (int j = 2; j < professionTypes.size(); j++) {
			SpiderProfessionType professionType = professionTypes.get(j);
			page.addTargetRequest(new Request(professionType.getUrl()+"/0/1400")
					.setPriority(0)
					.putExtra("professionTypeModel", professionTypes.get(j)));
		}
	}
 
开发者ID:lawlite19,项目名称:SmartEducation,代码行数:77,代码来源:CourseSpider.java


注:本文中的us.codecraft.webmagic.Page.addTargetRequest方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。