当前位置: 首页>>代码示例>>Java>>正文


Java Page.addTargetRequests方法代码示例

本文整理汇总了 Java 中 us.codecraft.webmagic.Page.addTargetRequests 方法的典型用法代码示例。如果您正苦于以下问题:Java Page.addTargetRequests 方法的具体用法?Java Page.addTargetRequests 怎么用?Java Page.addTargetRequests 使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 us.codecraft.webmagic.Page 的用法示例。


下文一共展示了 Page.addTargetRequests 方法的 15 个代码示例,这些例子默认按受欢迎程度排序。您可以为喜欢或者觉得有用的代码点赞,您的评价将有助于系统推荐出更好的 Java 代码示例。

示例1: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
@Override
// process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑
public void process(Page page) {
    // 部分二:定义如何抽取页面信息,并保存下来
    page.putField("author", page.getUrl().regex("http://webmagic.io/docs/.*").toString());
    page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
    if (page.getResultItems().get("name") == null) {
        //skip this page
        // page.setSkip(true);
    }
    page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));

    String content = page.getHtml().toString();
    page.putField("content", content);
    // 部分三:从页面发现后续的url地址来抓取
    page.addTargetRequests(page.getHtml().links().regex("http://.*\\.jd\\..*com.*").all());
}
 
开发者ID:Lzw2016,项目名称:study,代码行数:18,代码来源:GithubRepoPageProcessor.java

示例2: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
@Override
public void process(Page page) {
    List<String> relativeUrl = page.getHtml().xpath("//li[@class='item clearfix']/div/a/@href").all();
    page.addTargetRequests(relativeUrl);
    relativeUrl = page.getHtml().xpath("//div[@id='zh-question-related-questions']//a[@class='question_link']/@href").all();
    page.addTargetRequests(relativeUrl);
    List<String> answers =  page.getHtml().xpath("//div[@id='zh-question-answer-wrap']/div").all();
    boolean exist = false;
    for(String answer:answers){
        String vote = new Html(answer).xpath("//div[@class='zm-votebar']//span[@class='count']/text()").toString();
        if(Integer.valueOf(vote) >= voteNum){
            page.putField("vote",vote);
            page.putField("content",new Html(answer).xpath("//div[@class='zm-editable-content']"));
            page.putField("userid", new Html(answer).xpath("//a[@class='author-link']/@href"));
            exist = true;
        }
    }
    if(!exist){
        page.setSkip(true);
    }
}
 
开发者ID:blogshun,项目名称:ants-project,代码行数:22,代码来源:ZhihuPageProcessor.java

示例3: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
@Override
public void process(Page page) {
    //a()表示提取链接,links()表示提取所有链接
    //getHtml()返回Html对象,支持链式调用
    //r()表示用正则表达式提取一条内容,regex()表示提取多条内容
    //toString()表示取单条结果,all()表示取多条
    List<String> requests = page.getHtml().links().regex("(.*/post/.*)").all();
    //使用page.addTargetRequests()方法将待抓取的链接加入队列
    page.addTargetRequests(requests);
    //page.putField(key,value)将抽取的内容加入结果Map
    //x()和xs()使用xpath进行抽取
    page.putField("title", page.getHtml().xpath("//title").regex("(.*?)\\|").toString());
    //smartContent()使用readability技术直接抽取正文,对于规整的文本有比较好的抽取正确率
    page.putField("content", page.getHtml().smartContent());
    page.putField("date", page.getUrl().regex("post/(\\d+-\\d+-\\d+)/"));
    page.putField("id", page.getUrl().regex("post/\\d+-\\d+-\\d+/(\\d+)"));
}
 
开发者ID:blogshun,项目名称:ants-project,代码行数:18,代码来源:DiandianBlogProcessor.java

示例4: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
@Override
public void process(Page page) {
    List<String> imageUrlList = page.getHtml().$(".BDE_Image", "src").all();
    String pageId = page.getUrl().toString().replace(tieBaConfiguration.getTiebaContentPageUrl(),"");
    List<String> list = new ArrayList<>();
    for (String imageUrl : imageUrlList) {
        if (imageUrl.startsWith(tieBaConfiguration.getTiebaImageUrl())) {
            imageUrl=convertImageUrl(imageUrl);
            if (null!=imageUrl)list.add(imageUrl);
        }
    }
    if (list.size() > 0) {
        map.put(WebmagicService.getByte(TieBaImageIdMessageListener.TIEBA_CONTENT_IMAGE_KEY+pageId), WebmagicService.getByte(JSONObject.toJSONString(list)));
    }else{
        redisTemplate.convertAndSend(tieBaConfiguration.getTiebaContentNoImageIdTopic(), JSONObject.toJSONString(new ContentBean(pageId,tiebaName)));
    }
    if (!isAddTarget) {
        for (String id : pageNumberList) {
            StringBuilder sb = new StringBuilder();
            sb.append(url).append(id);
            page.addTargetRequests(Arrays.asList(sb.toString()));
        }
        isAddTarget = true;
    }
}
 
开发者ID:ggj2010,项目名称:javabase,代码行数:26,代码来源:ContentImageProcessor.java

示例5: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
@Override
public void process(Page page) {
    for (int i = 1; i <= pageSize; i++) {
        String json = page.getHtml().xpath("//ul[@id='thread_list']/li[@class='j_thread_list clearfix'][" + (i) + "]/@data-field").toString();
        if(json!=null&&JSONObject.parseObject(json).containsKey("id")){
            JSONObject jsonObject = JSONObject.parseObject(json);
            String pageId=jsonObject.getString("id");
            String authorName=jsonObject.getString("author_name");
            String date = praseDate(page,i);
            String title=page.getHtml().xpath("a[@href='"+tieBaConfiguration.getTiebaContentPageUrl()+pageId+"']/@title").toString();

            pageNumberList.add(new ContentBean(pageId,date,tiebaName,authorName,title));
        }
    }

    if (!isAddTarget) {
        for (int i = 2; i <= endNum; i++) {
            StringBuilder sb = new StringBuilder();
            sb.append(tiebaUrl).append("&pn=" + i*pageSize);
            page.addTargetRequests(Arrays.asList(sb.toString()));
        }
        isAddTarget = true;
    }
}
 
开发者ID:ggj2010,项目名称:javabase,代码行数:25,代码来源:ContentIdProcessor.java

示例6: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
@Override
// process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑
public void process(Page page) {
    // 部分二:定义如何抽取页面信息,并保存下来
    page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
    page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
    if (page.getResultItems().get("name") == null) {
        //skip this page
        page.setSkip(true);
    }
    page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));

    // 部分三:从页面发现后续的url地址来抓取
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
}
 
开发者ID:leon66666,项目名称:financehelper,代码行数:16,代码来源:GithubRepoPageProcessorSamples.java

示例7: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
@Override
public void process(Page page) {
    String now = DateUtil.dateToString(new Date(), DateUtil.DATE_FORMAT_DAY_SHORT);
    String temp = page.getHtml().toString();
    int beginIndex = temp.lastIndexOf('[');
    int endIndex = temp.lastIndexOf(']');
    temp = temp.substring(beginIndex, endIndex + 1);
    List<String> targetUrls = new ArrayList<String>();
    List<Object> items = JsonUtil.jsonToList(temp, Object.class);
    List<IndustryInfo> industryInfos = new ArrayList<IndustryInfo>();
    for (Object object : items) {
        String[] arr = object.toString().split(",");
        IndustryInfo industryInfo = new IndustryInfo();
        //// TODO: 2017/9/13 setIndustryId
        industryInfo.setIndustryCode(arr[1]);
        industryInfo.setIndustryName(arr[2]);
        industryInfo.setRise(new BigDecimal(arr[3]));
        industryInfo.setMain(Integer.parseInt(arr[4].substring(0, arr[4].lastIndexOf("."))));
        industryInfo.setSuper_(Integer.parseInt(arr[6].substring(0, arr[6].lastIndexOf("."))));
        industryInfo.setBig(Integer.parseInt(arr[8].substring(0, arr[8].lastIndexOf("."))));
        industryInfo.setMedium(Integer.parseInt(arr[10].substring(0, arr[10].lastIndexOf("."))));
        industryInfo.setSmall(Integer.parseInt(arr[12].substring(0, arr[12].lastIndexOf("."))));
        industryInfo.setTotal(industryInfo.getMain() + industryInfo.getSuper_() + industryInfo.getBig() + industryInfo.getMedium() + industryInfo.getSmall());
        industryInfo.setDate(Integer.parseInt(now));
        industryInfos.add(industryInfo);
    }
    page.putField("industryInfos", industryInfos);
    page.addTargetRequests(targetUrls);
}
 
开发者ID:leon66666,项目名称:financehelper,代码行数:30,代码来源:IndustryInfoProcessor.java

示例8: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
@Override
public void process(Page page) {
	if (page.getUrl().regex(LIST_PAGE_RULE).match()) {
		page.addTargetRequests(page.getHtml().links().regex(LIST_PAGE_RULE, 0).all());
		page.addTargetRequests(page.getHtml().links().regex(VIDEO_PAGE_RULE, 0).all());
	} else if (page.getUrl().regex(VIDEO_PAGE_RULE).match()) {
		String albumId = page.getHtml().regex("albumId:(.*?\\d),", 1).toString().replace(" ", "");
		if (StringUtils.isNotBlank(albumId)) {
			page.addTargetRequest(
					"http://mixer.video.iqiyi.com/jp/mixin/videos/avlist?albumId=" + albumId + "&size=4096");
			page.addTargetRequests(page.getHtml().links().regex(VIDEO_PAGE_RULE, 0).all());
		}
	} else if (page.getUrl().regex(RESULT_RULE).match()) {
		String json = page.getJson().toString().replace("var tvInfoJs=", "");
		List<String> mixinVideos = new JsonPathSelector("$.mixinVideos").selectList(json);
		if (!mixinVideos.isEmpty()) {
			JsonPathSelector jsonPathAlbumId = new JsonPathSelector("$.albumId");
			JsonPathSelector jsonPathTvId = new JsonPathSelector("$.tvId");
			JsonPathSelector jsonPathUrl = new JsonPathSelector("$.url");
			JsonPathSelector jsonPathPlayCount = new JsonPathSelector("$.playCount");
			JsonPathSelector jsonPathName = new JsonPathSelector("$.name");
			JsonPathSelector jsonPathDescription = new JsonPathSelector("$.description");
			String record = "";
			for (Iterator<String> iterator = mixinVideos.iterator(); iterator.hasNext();) {
				String element = iterator.next();
				record += jsonPathAlbumId.select(element) + "\t" + jsonPathTvId.select(element) + "\t"
						+ jsonPathUrl.select(element) + "\t" + jsonPathPlayCount.select(element) + "\t"
						+ jsonPathName.select(element).replaceAll("[\t\n]", "") + "\t"
						+ jsonPathDescription.select(element).replaceAll("[\t\n]", " ") + "\n";
			}
			System.out.print(record);
		}
	}
}
 
开发者ID:viixv,项目名称:iqiyi-crawler,代码行数:35,代码来源:Crawler.java

示例9: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
@Override
    // process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑
    public void process(Page page) {
        String url = page.getRequest().getUrl();
        // http://www.cnblogs.com/rick168/p/5260265.html
        // http://www.cnblogs.com/java-zhao/archive/2016/09/01/5831002.html
        String tmp = url.replaceAll("http://www.cnblogs.com/[^#]+\\.(?:html|htm)", "");
        if(url.equals(tmp) || htmlService.exists(url)){
            logger.info("### 跳过: " + url);
            page.setSkip(true);
        }
        String title = page.getHtml().$("head title").get();
        String content = page.getHtml().get();

//        logger.info("url = " + url);
//        logger.info("title = " + title);
//        logger.info("content = " + content);
//        logger.info("------------------------------------------------------------------------------------------------------------------");

        Html html = new Html();
        html.setUrl(url);
        html.setTitle(title);
        html.setContent(content);
        page.putField("html", html);

        // 部分三:从页面发现后续的url地址来抓取
        // http://www.cnblogs.com/rick168/p/5260265.html
        // http://www.cnblogs.com/netfocus/
        // http://www.cnblogs.com/yixianyong/p/5091812.html
        page.addTargetRequests(page.getHtml().links().regex("http://www.cnblogs.com/[^#]+(?:/p/|)[^#]*").all());
    }
 
开发者ID:Lzw2016,项目名称:study,代码行数:32,代码来源:MySqlPageProcessor.java

示例10: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
@Override
public void process(Page page) {
    List<Selectable> nodes = page.getHtml().xpath("//ul[@id=ma-thumb-list]/li").nodes();
    StringBuilder accum = new StringBuilder();
    for (Selectable node : nodes) {
        accum.append("img:").append(node.xpath("//a/@href").get()).append("\n");
        accum.append("title:").append(node.xpath("//img/@alt").get()).append("\n");
    }
    page.putField("",accum.toString());
    if (accum.length() == 0) {
        page.setSkip(true);
    }
    page.addTargetRequests(page.getHtml().links().regex("http://www\\.mama\\.cn/photo/.*\\.html").all());
}
 
开发者ID:mikeqian,项目名称:house,代码行数:15,代码来源:MamacnPageProcessor.java

示例11: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
@Override
public void process(Page page) {
    List<String> requests = page.getHtml().regex("<a[^<>]*href=(bbstcon\\?board=Pictures&file=[^>]*)").all();
    page.addTargetRequests(requests);
    page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a"));
    page.putField("content",page.getHtml().smartContent());
}
 
开发者ID:blogshun,项目名称:ants-project,代码行数:8,代码来源:NjuBBSProcessor.java

示例12: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
@Override
public void process(Page page) {
    //列表页
    if (page.getUrl().regex(URL_LIST).match()) {
        page.addTargetRequests(page.getHtml().xpath("//div[@class=\"articleList\"]").links().regex(URL_POST).all());
        page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all());
        //文章页
    } else {
        page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2"));
        page.putField("content", page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']"));
        page.putField("date",
                page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)"));
    }
}
 
开发者ID:mikeqian,项目名称:house,代码行数:15,代码来源:SinaBlogProcessor.java

示例13: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
@Override
public void process(Page page) {
    List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
    page.addTargetRequests(links);
    page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()").toString());
    page.putField("content", page.getHtml().xpath("//div[@class='BlogContent']/tidyText()").toString());
    page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all());
}
 
开发者ID:blogshun,项目名称:ants-project,代码行数:9,代码来源:OschinaBlogPageProcesser.java

示例14: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
@Override
public void process(Page page) {
    List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").all();
    page.addTargetRequests(strings);
    page.putField("title", page.getHtml().xpath("//div[@class='QTitle']/h1/a"));
    page.putField("content", page.getHtml().xpath("//div[@class='Question']//div[@class='Content']/div[@class='detail']"));
}
 
开发者ID:mikeqian,项目名称:house,代码行数:8,代码来源:OschinaPageProcesser.java

示例15: process

import us.codecraft.webmagic.Page; //导入方法依赖的package包/类
@Override
public void process(Page page) {
    List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").all();
    page.addTargetRequests(strings);
    page.putField("title", page.getHtml().xpath("//div[@id='post_head']//span[@class='s_title']//b"));
    page.putField("body",page.getHtml().smartContent());
}
 
开发者ID:blogshun,项目名称:ants-project,代码行数:8,代码来源:TianyaPageProcesser.java


注:本文中的us.codecraft.webmagic.Page.addTargetRequests方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。