当前位置: 首页>>代码示例>>Java>>正文


Java Page类代码示例

本文整理汇总了 Java 中 us.codecraft.webmagic.Page 的典型用法代码示例。如果您正苦于以下问题:Java Page 类的具体用法?Java Page 怎么用?Java Page 使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。


Page 类属于 us.codecraft.webmagic 包,下文一共展示了 Page 类的 15 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Java 代码示例。

示例1: afterProcess

import us.codecraft.webmagic.Page; //导入依赖的package包/类
/**
 * Post-processes a crawled page whose body embeds a bracketed JSON array.
 *
 * <p>Takes the text from the last {@code '['} to the last {@code ']'} of the page HTML,
 * parses it with {@code JsonUtil.jsonToList} and splits every item on commas. Persisting
 * the parsed rows is currently disabled (see the commented-out DAO call).
 *
 * <p>If the page contains no {@code [...]} section the method returns without doing
 * anything instead of failing inside {@code substring}.
 *
 * @param page the page that has just been processed
 */
@Override
    public void afterProcess(Page page) {
        String body = page.getHtml().toString();
        int beginIndex = body.lastIndexOf('[');
        int endIndex = body.lastIndexOf(']');
        // lastIndexOf returns -1 when a bracket is missing; substring would then throw
        // StringIndexOutOfBoundsException, so bail out early on malformed responses.
        if (beginIndex < 0 || endIndex < beginIndex) {
            return;
        }
        String json = body.substring(beginIndex, endIndex + 1);
        List<Object> items = JsonUtil.jsonToList(json, Object.class);
        for (Object object : items) {
            String[] arr = object.toString().split(",");
//            industryDao.add(new IndustrySample(arr[2]));
        }
        // Discover follow-up URLs on the page to crawl (currently disabled):
        /*List<String> targetRequests = new ArrayList<String>();
        targetRequests.add("http://nufm.dfcfw.com/EM_Finance2014NumericApplication/JS.aspx?cmd=C._BKHY&type=ct&st=(BalFlowMain)&sr=-1&p=2&ps=50&js=var%20BscAxcAq={pages:(pc),data:[(x)]}&token=894050c76af8597a853f5b408b759f5d&sty=DCFFITABK&rt=50173371");
        page.addTargetRequests(targetRequests);*/
    }
 
开发者ID:leon66666,项目名称:financehelper,代码行数:17,代码来源:IndustrySample.java

示例2: process

import us.codecraft.webmagic.Page; //导入依赖的package包/类
/**
 * Collects English-Wikipedia article links found in the page into {@code newUrls}.
 *
 * <p>Increments {@code counter} once per processed page, then scans the raw HTML for
 * {@code <a href="https://en.wikipedia.org/wiki/...">} targets that do not start with
 * the page's own URL.
 *
 * @param page the downloaded page to scan
 */
@Override
public void process(Page page) {
    counter.addAndGet(1);
    String html = page.getHtml().toString();
    String selfUrl = page.getUrl().toString();
    // The page URL is data, not a pattern: quote it so that regex metacharacters that
    // commonly occur in URLs ('(', ')', '.', '+', '?') are matched literally and can
    // never make the pattern invalid.
    Pattern pattern = Pattern.compile("(?<=<a href=\")(?!" + Pattern.quote(selfUrl)
            + ")https://en.wikipedia.org/wiki/.*?(?=\")");
    Matcher matcher = pattern.matcher(html);
    while (matcher.find()) {
        // This is only a simple test crawl: too many tasks would take too long, so stop
        // collecting once more than 1000 URLs have been gathered.
        if (newUrls.size() > 1000) {
            return;
        }
        newUrls.add(matcher.group());
    }
}
 
开发者ID:xiongbeer,项目名称:Cobweb,代码行数:18,代码来源:Crawler.java

示例3: process

import us.codecraft.webmagic.Page; //导入依赖的package包/类
/**
 * Extracts image URLs and their titles from a JSON response.
 *
 * <p>The raw page text is parsed as a JSON object whose {@code "imgs"} array is walked
 * entry by entry. For every entry with a non-null {@code "objURL"}, that URL and the
 * entry's {@code "fromPageTitleEnc"} value are recorded. The two resulting lists are
 * handed to {@code setUrls} and {@code setNames}.
 *
 * @param page the page whose raw text holds the JSON response
 */
@Override
public void process(Page page) {
    JSONObject response = (JSONObject) JSONObject.parse(page.getRawText());
    JSONArray images = (JSONArray) response.get("imgs");

    List<String> imageUrls = new ArrayList<>();
    List<String> imageNames = new ArrayList<>();
    for (int index = 0; index < images.size(); index++) {
        JSONObject image = images.getJSONObject(index);
        String imageUrl = (String) image.get("objURL");
        String imageName = (String) image.get("fromPageTitleEnc");
        if (imageUrl == null) {
            // Entries without a target URL are ignored entirely.
            continue;
        }
        imageUrls.add(imageUrl);
        imageNames.add(imageName);
    }

    setUrls(imageUrls);
    setNames(imageNames);
}
 
开发者ID:bruceq,项目名称:Gather-Platform,代码行数:18,代码来源:DownloadPicture.java

示例4: download

import us.codecraft.webmagic.Page; //导入依赖的package包/类
/**
 * Downloads a page by delegating to {@code casperjs.gatherHtml}.
 *
 * <p>On an {@link IOException} the exception is stored in the request extras under
 * {@code "EXCEPTION"}, {@code onError} is invoked and {@code null} is returned.
 * Otherwise a {@code Page} carrying the fetched text, the request and its URL is built
 * and {@code onSuccess} is invoked.
 *
 * @param request the request to download
 * @param task    the owning task (not used by this implementation)
 * @return the populated page, or {@code null} if fetching failed
 */
@Override
public Page download(Request request, Task task) {
	final String fetchedHtml;
	try {
		fetchedHtml = casperjs.gatherHtml(new cn.nest.spider.entity.commons.Request(request.getUrl(), true));
	} catch (IOException e) {
		// Keep the failure on the request so later stages can inspect it.
		request.putExtra("EXCEPTION", e);
		onError(request);
		return null;
	}

	Page result = new Page();
	result.setRawText(fetchedHtml);
	result.setRequest(request);
	result.setUrl(new PlainText(request.getUrl()));
	onSuccess(request);
	return result;
}
 
开发者ID:TransientBuckwheat,项目名称:nest-spider,代码行数:17,代码来源:CasperjsDownloader.java

示例5: download

import us.codecraft.webmagic.Page; //导入依赖的package包/类
/**
 * Downloads a page by delegating to {@code casperjs.gatherHtml}.
 *
 * <p>If fetching fails and the task's site allows cycle retries, the request is handed
 * to {@code addToCycleRetry}. Otherwise the exception is stored in the request extras
 * under {@code "EXCEPTION"}, {@code onError} is invoked and {@code null} is returned.
 * On success a {@code Page} carrying the fetched text, URL and request is returned
 * after {@code onSuccess} has been invoked.
 *
 * @param request the request to download
 * @param task    the owning task; may be {@code null}, in which case no cycle retry is
 *                attempted
 * @return the populated page, the result of {@code addToCycleRetry}, or {@code null}
 *         if fetching failed without a retry
 */
@Override
public Page download(Request request, Task task) {
    Site site = (task != null) ? task.getSite() : null;
    String html;
    try {
        html = casperjs.gatherHtml(new com.gs.spider.model.commons.Request(request.getUrl(), true));
    } catch (Exception e) {
        // site is null when no task was supplied; dereferencing it here used to throw a
        // NullPointerException that hid the original download failure.
        if (site != null && site.getCycleRetryTimes() > 0) {
            return addToCycleRetry(request, site);
        }
        request.putExtra("EXCEPTION", e);
        onError(request);
        return null;
    }
    Page page = new Page();
    page.setRawText(html);
    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    onSuccess(request);
    return page;
}
 
开发者ID:gsh199449,项目名称:spider,代码行数:25,代码来源:CasperjsDownloader.java

示例6: process

import us.codecraft.webmagic.Page; //导入依赖的package包/类
/**
 * Core extraction hook of the crawler: pulls fields out of the page and queues
 * follow-up links.
 *
 * <p>Stores the fields {@code author}, {@code name}, {@code readme} and {@code content}
 * in the page's result items, then adds every link matching
 * {@code http://.*\.jd\..*com.*} as a target request.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
    // Part two: define how page information is extracted and stored.
    String author = page.getUrl().regex("http://webmagic.io/docs/.*").toString();
    page.putField("author", author);

    String name = page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString();
    page.putField("name", name);

    if (page.getResultItems().get("name") == null) {
        // Skipping pages without a name is currently switched off:
        // page.setSkip(true);
    }

    page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
    page.putField("content", page.getHtml().toString());

    // Part three: discover follow-up URLs on the page to crawl.
    page.addTargetRequests(page.getHtml().links().regex("http://.*\\.jd\\..*com.*").all());
}
 
开发者ID:Lzw2016,项目名称:study,代码行数:18,代码来源:GithubRepoPageProcessor.java

示例7: process

import us.codecraft.webmagic.Page; //导入依赖的package包/类
/**
 * Extracts house entries from a listing page and queues the next listing page.
 *
 * <p>Every {@code <li>} under {@code ul#house-lst} yields one {@code House}: its id is
 * the picture link's {@code key} attribute with {@code "sh"} removed, and its
 * {@code xiaoqu} is the location link's {@code href} stripped of the site prefix,
 * {@code "/xiaoqu/"} and {@code ".html"}. List items missing either attribute are
 * skipped rather than aborting the whole page with a {@code NullPointerException}.
 *
 * <p>The shared {@code seed} counter is advanced on every call; the next listing URL is
 * queued only while the new index is below {@code maxIndex}. The collected houses are
 * stored under the {@code "house"} field.
 *
 * @param page the listing page being processed
 */
@Override
public void process(Page page) {
    List<String> listItems = page.getHtml().xpath("//ul[@id='house-lst']/li").all();
    List<House> houses = new ArrayList<>();

    for (String listItem : listItems) {
        // Parse each fragment once and reuse it for both lookups.
        Html item = new Html(listItem);
        String key = item.xpath("//div[@class='pic-panel']/a/@key").get();
        String href = item.xpath("//div[@class='where']/a[@class='laisuzhou']/@href").get();
        if (key == null || href == null) {
            // get() yields null when the attribute is absent; skip incomplete items.
            continue;
        }
        String area = href
                .replace("/xiaoqu/", "")
                .replace(".html", "")
                .replace("http://sh.lianjia.com", "");
        House house = new House();
        house.setId(key.replace("sh", ""));
        house.setXiaoqu(area);
        houses.add(house);
    }

    long pageIndex = seed.incrementAndGet();
    if (pageIndex < maxIndex) {
        page.addTargetRequests(Lists.newArrayList(String.format(Constants.House_SEED, pageIndex)));
    }

    page.putField("house", houses);
}
 
开发者ID:mikeqian,项目名称:house,代码行数:26,代码来源:HousePageProcessor.java

示例8: process

import us.codecraft.webmagic.Page; //导入依赖的package包/类
/**
 * Queues linked questions and keeps the page only if it has a sufficiently up-voted
 * answer.
 *
 * <p>Links from the item list and from the related-questions box are added as target
 * requests. Each answer block's vote count is then read; for every answer whose count
 * is at least {@code voteNum}, the fields {@code vote}, {@code content} and
 * {@code userid} are written (a later qualifying answer overwrites an earlier one).
 * If no answer qualifies, the page is marked as skipped.
 *
 * <p>Answers whose vote text is missing or not a plain integer are treated as not
 * qualifying instead of aborting the page with a {@code NumberFormatException}.
 *
 * @param page the question page being processed
 */
@Override
public void process(Page page) {
    List<String> relativeUrl = page.getHtml().xpath("//li[@class='item clearfix']/div/a/@href").all();
    page.addTargetRequests(relativeUrl);
    relativeUrl = page.getHtml().xpath("//div[@id='zh-question-related-questions']//a[@class='question_link']/@href").all();
    page.addTargetRequests(relativeUrl);

    List<String> answers = page.getHtml().xpath("//div[@id='zh-question-answer-wrap']/div").all();
    boolean exist = false;
    for (String answer : answers) {
        // Parse the fragment once and reuse it for every lookup below.
        Html answerHtml = new Html(answer);
        String vote = answerHtml.xpath("//div[@class='zm-votebar']//span[@class='count']/text()").toString();
        if (vote == null) {
            continue;
        }
        int voteCount;
        try {
            voteCount = Integer.parseInt(vote.trim());
        } catch (NumberFormatException ignored) {
            // Scraped text is not guaranteed to be numeric; such answers do not qualify.
            continue;
        }
        if (voteCount >= voteNum) {
            page.putField("vote", vote);
            page.putField("content", answerHtml.xpath("//div[@class='zm-editable-content']"));
            page.putField("userid", answerHtml.xpath("//a[@class='author-link']/@href"));
            exist = true;
        }
    }
    if (!exist) {
        page.setSkip(true);
    }
}
 
开发者ID:mikeqian,项目名称:house,代码行数:22,代码来源:ZhihuPageProcessor.java

示例9: process

import us.codecraft.webmagic.Page; //导入依赖的package包/类
/**
 * Queues blog-post links and extracts title, content, date and id of the current page.
 *
 * <p>Selector notes carried over from the original author: {@code a()} extracts a link
 * and {@code links()} extracts all links; {@code getHtml()} returns an Html object that
 * supports chained calls; {@code r()} extracts one item by regular expression and
 * {@code regex()} extracts several; {@code toString()} takes a single result and
 * {@code all()} takes all of them; {@code x()} and {@code xs()} extract with XPath.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
    // Put every link containing "/post/" on the crawl queue.
    List<String> postLinks = page.getHtml().links().regex("(.*/post/.*)").all();
    page.addTargetRequests(postLinks);

    // putField(key, value) adds the extracted content to the result map.
    String title = page.getHtml().xpath("//title").regex("(.*?)\\|").toString();
    page.putField("title", title);

    // smartContent() uses the readability technique to extract the main text directly;
    // it has fairly good accuracy on well-structured text.
    page.putField("content", page.getHtml().smartContent());

    // Date and id are both taken from the URL path: post/<date>/<id>.
    page.putField("date", page.getUrl().regex("post/(\\d+-\\d+-\\d+)/"));
    page.putField("id", page.getUrl().regex("post/\\d+-\\d+-\\d+/(\\d+)"));
}
 
开发者ID:mikeqian,项目名称:house,代码行数:18,代码来源:DiandianBlogProcessor.java

示例10: process

import us.codecraft.webmagic.Page; //导入依赖的package包/类
@Override
public void process(Page page) {
    List<String> imageUrlList = page.getHtml().$(".BDE_Image", "src").all();
    String pageId = page.getUrl().toString().replace(tieBaConfiguration.getTiebaContentPageUrl(),"");
    List<String> list = new ArrayList<>();
    for (String imageUrl : imageUrlList) {
        if (imageUrl.startsWith(tieBaConfiguration.getTiebaImageUrl())) {
            imageUrl=convertImageUrl(imageUrl);
            if (null!=imageUrl)list.add(imageUrl);
        }
    }
    if (list.size() > 0) {
        map.put(WebmagicService.getByte(TieBaImageIdMessageListener.TIEBA_CONTENT_IMAGE_KEY+pageId), WebmagicService.getByte(JSONObject.toJSONString(list)));
    }else{
        redisTemplate.convertAndSend(tieBaConfiguration.getTiebaContentNoImageIdTopic(), JSONObject.toJSONString(new ContentBean(pageId,tiebaName)));
    }
    if (!isAddTarget) {
        for (String id : pageNumberList) {
            StringBuilder sb = new StringBuilder();
            sb.append(url).append(id);
            page.addTargetRequests(Arrays.asList(sb.toString()));
        }
        isAddTarget = true;
    }
}
 
开发者ID:ggj2010,项目名称:javabase,代码行数:26,代码来源:ContentImageProcessor.java

示例11: process

import us.codecraft.webmagic.Page; //导入依赖的package包/类
/**
 * Collects thread metadata from a listing page and queues the remaining listing pages.
 *
 * <p>For each of the first {@code pageSize} thread list items, the {@code data-field}
 * attribute is parsed as JSON. Items whose JSON carries an {@code "id"} produce a
 * {@code ContentBean} (id, date, {@code tiebaName}, author name, title) appended to
 * {@code pageNumberList}. Items with a missing, blank or id-less {@code data-field}
 * are ignored.
 *
 * <p>On the first call only (guarded by {@code isAddTarget}), listing pages 2 through
 * {@code endNum} are queued using the {@code &pn=} offset {@code i * pageSize}.
 *
 * @param page the listing page being processed
 */
@Override
public void process(Page page) {
    for (int i = 1; i <= pageSize; i++) {
        String json = page.getHtml().xpath("//ul[@id='thread_list']/li[@class='j_thread_list clearfix'][" + (i) + "]/@data-field").toString();
        if (json == null) {
            continue;
        }
        // Parse once and reuse; the parser yields null for blank input, which
        // previously caused a NullPointerException on containsKey.
        JSONObject jsonObject = JSONObject.parseObject(json);
        if (jsonObject == null || !jsonObject.containsKey("id")) {
            continue;
        }
        String pageId = jsonObject.getString("id");
        String authorName = jsonObject.getString("author_name");
        String date = praseDate(page, i);
        String title = page.getHtml().xpath("a[@href='" + tieBaConfiguration.getTiebaContentPageUrl() + pageId + "']/@title").toString();

        pageNumberList.add(new ContentBean(pageId, date, tiebaName, authorName, title));
    }

    if (!isAddTarget) {
        for (int i = 2; i <= endNum; i++) {
            String target = new StringBuilder().append(tiebaUrl).append("&pn=").append(i * pageSize).toString();
            page.addTargetRequests(Arrays.asList(target));
        }
        isAddTarget = true;
    }
}
 
开发者ID:ggj2010,项目名称:javabase,代码行数:25,代码来源:ContentIdProcessor.java

示例12: extractItemDetails

import us.codecraft.webmagic.Page; //导入依赖的package包/类
/**
 * Fetches the image list referenced by an item page.
 *
 * <p>Locates the {@code //tds.alicdn.com/json/item_imgs.htm...} reference in the page
 * HTML, drops its last two characters, prefixes {@code https:} and issues a GET through
 * {@code restTemplate}. The response body is unwrapped from its {@code $callback(...)}
 * envelope and read as a JSON map; every key other than {@code success}, {@code size},
 * {@code conflict} and {@code req} is appended to a fixed {@code img.alicdn.com} prefix
 * and returned.
 *
 * @param page the item page whose HTML contains the image-list reference
 * @return the image URLs built from the response keys
 */
private List<String> extractItemDetails(Page page) {
    String matched = page.getHtml().regex("//tds.alicdn.com/json/item_imgs.htm.+?,").get();
    String detailUrl = "https:" + matched.substring(0, matched.length() - 2);

    return restTemplate.execute(
            detailUrl, HttpMethod.GET, null, clientHttpResponse -> {
                String raw = IOUtils.toString(clientHttpResponse.getBody(), StandardCharsets.UTF_8);
                // Strip the JSONP-style wrapper so that plain JSON remains.
                String json = raw.replace("$callback(", "").replace("})", "}");
                HashMap<String, Object> parsed = TcSerializationUtils.readJson(json,
                        new TypeReference<HashMap<String, Object>>() {
                        });

                List<String> pictures = Lists.newArrayList();
                for (String key : parsed.keySet()) {
                    boolean excluded = key.equals("success")
                            || key.equals("size")
                            || key.equals("conflict")
                            || key.equals("req");
                    if (!excluded) {
                        pictures.add("https://img.alicdn.com/imgextra/i3/2928049528/" + key);
                    }
                }
                return pictures;
            });
}
 
开发者ID:srarcbrsent,项目名称:tc,代码行数:24,代码来源:TcCrawler.java

示例13: process

import us.codecraft.webmagic.Page; //导入依赖的package包/类
/**
 * Dispatches a page to the matching crawl routine based on its URL.
 *
 * <p>Category listing URLs (e.g. {@code http://mooc.chaoxing.com/category/01/0/1000})
 * go to {@code crawerCourse}; course detail URLs (e.g.
 * {@code http://mooc.chaoxing.com/course/55672.html}) go to {@code crawCourseInfo}.
 * Any other URL is ignored.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
	// Format: http://mooc.chaoxing.com/category/01/0/1000
	String categoryMatch = page.getUrl()
			.regex("http://mooc\\.chaoxing\\.com/category/\\d+/\\d/\\d+")
			.toString();
	if (categoryMatch != null) {
		System.out.println("第一层");
		crawerCourse(page);
		return;
	}

	// Format: http://mooc.chaoxing.com/course/55672.html
	String courseMatch = page.getUrl()
			.regex("http://mooc\\.chaoxing\\.com/course/\\d+\\.html")
			.toString();
	if (courseMatch != null) {
		System.out.println("第二层");
		crawCourseInfo(page);
	}
}
 
开发者ID:lawlite19,项目名称:SmartEducation,代码行数:17,代码来源:CourseSpider.java

示例14: process

import us.codecraft.webmagic.Page; //导入依赖的package包/类
/**
 * Extracts the profession categories (name and URL) from the page and saves them.
 *
 * <p>Category names are stored under the {@code "professionName"} field and category
 * URLs under {@code "professionUrl"}. Previously both lists were written to
 * {@code "professionName"}, so the URLs overwrote the names. Each name/URL pair is then
 * saved through {@code spiderProfessionTypeService}; pairing stops at the shorter of
 * the two lists so a link without an {@code href} cannot trigger an
 * {@code IndexOutOfBoundsException}.
 *
 * @param page the category page being processed
 */
@Override
public void process(Page page) {
	// Example markup: <li><a href="/category/01">Philosophy</a></li>
	// Select the names.
	List<String> professionTypeNameList = page.getHtml()
			.xpath("//ul[@class='category']/li/a/html()").all();
	page.putField("professionName", professionTypeNameList);
	// Select the URLs.
	List<String> professionTypeUrlList = page.getHtml().xpath("//ul[@class='category']/li/a/@href").all();
	page.putField("professionUrl", professionTypeUrlList);

	int pairCount = Math.min(professionTypeNameList.size(), professionTypeUrlList.size());
	for (int i = 0; i < pairCount; i++) {
		SpiderProfessionType model = new SpiderProfessionType(professionTypeNameList.get(i), professionTypeUrlList.get(i));
		spiderProfessionTypeService.save(model);
	}
}
 
开发者ID:lawlite19,项目名称:SmartEducation,代码行数:19,代码来源:ProfessionTypeSpider.java

示例15: test

import us.codecraft.webmagic.Page; //导入依赖的package包/类
/**
 * Manual timing run for {@code SeleniumDownloader}; ignored by default because it
 * needs a Chrome driver.
 *
 * <p>Downloads {@code http://huaban.com/} 100 times, printing the links under
 * {@code #waterfall} that match {@code .*pins.*} each time, and finally prints the
 * total elapsed milliseconds.
 */
@Ignore("need chrome driver")
@Test
public void test() {
	SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath);
	long startMillis = System.currentTimeMillis();
	int attempt = 0;
	while (attempt < 100) {
		// A fresh request and task are created for every download.
		Request request = new Request("http://huaban.com/");
		Task task = new Task() {
			@Override
			public String getUUID() {
				return "huaban.com";
			}

			@Override
			public Site getSite() {
				return Site.me();
			}
		};
		Page page = seleniumDownloader.download(request, task);
		System.out.println(page.getHtml().$("#waterfall").links().regex(".*pins.*").all());
		attempt++;
	}
	long elapsedMillis = System.currentTimeMillis() - startMillis;
	System.out.println(elapsedMillis);
}
 
开发者ID:yuany,项目名称:en-webmagic,代码行数:22,代码来源:SeleniumDownloaderTest.java


注:本文中的us.codecraft.webmagic.Page类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。