当前位置: 首页>>代码示例>>Java>>正文


Java Request类代码示例

本文整理汇总了 Java 中 us.codecraft.webmagic.Request 的典型用法代码示例。如果您正苦于以下问题：Java Request 类的具体用法？Java Request 怎么用？Java Request 使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。


Request 类属于 us.codecraft.webmagic 包，在下文中一共展示了 Request 类的 15 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Java 代码示例。

示例1: onSuccess

import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Called after a request has been processed successfully; stops the spider once
 * one of the configured page limits has been hit.
 *
 * <p>Two stop conditions are evaluated, both only when a positive maximum page
 * count is configured (a non-positive maximum is treated as "no limit"):
 * <ul>
 *   <li>{@code reachMax}: the task's count (reported as "valid pages" in the log
 *       message) has reached the maximum;</li>
 *   <li>{@code exceedRatio}: the number of processed pages exceeds the maximum
 *       multiplied by the configured crawl ratio, even though the maximum number
 *       of valid pages has not been reached.</li>
 * </ul>
 * The spider is only stopped while its status is {@code Status.Running}.
 *
 * @param request the request that was just processed successfully
 */
@Override
protected void onSuccess(Request request) {
    super.onSuccess(request);
    Task task = taskManager.getTaskById(this.getUUID());

    // Without this guard a maximum of 0 makes the ratio threshold 0 as well, so the
    // spider would stop right after its first page.
    boolean limited = SPIDER_INFO.getMaxPageGather() > 0;

    // Enough valid pages gathered.
    boolean reachMax = limited && task.getCount() >= SPIDER_INFO.getMaxPageGather();
    // Too many pages processed without reaching the maximum; only evaluated when
    // reachMax is false, mirroring the original short-circuit.
    boolean exceedRatio = !reachMax
            && limited
            && this.getPageCount() > SPIDER_INFO.getMaxPageGather() * staticValue.getCommonsWebpageCrawlRatio();

    if ((reachMax || exceedRatio) && this.getStatus() == Status.Running) {
        LOG.info("爬虫ID{}已处理{}个页面,有效页面{}个,最大抓取页数{},reachMax={},exceedRatio={},退出.", this.getUUID(), this.getPageCount(), task.getCount(), SPIDER_INFO.getMaxPageGather(), reachMax, exceedRatio);
        task.setDescription("爬虫ID%s已处理%s个页面,有效页面%s个,达到最大抓取页数%s,reachMax=%s,exceedRatio=%s,退出.", this.getUUID(), this.getPageCount(), task.getCount(), SPIDER_INFO.getMaxPageGather(), reachMax, exceedRatio);
        this.stop();
    }
}
 
开发者ID:bruceq,项目名称:Gather-Platform,代码行数:20,代码来源:CommonSpider.java

示例2: download

import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Fetches the page for {@code request} through the CasperJS gateway and wraps the
 * returned HTML in a {@link Page}.
 *
 * <p>If the gateway call fails with an {@link IOException}, the exception is stored
 * on the request under the {@code "EXCEPTION"} extra, {@code onError} is invoked and
 * {@code null} is returned.
 *
 * @param request the request to download
 * @param task    the owning task (not used by this implementation)
 * @return the downloaded page, or {@code null} when the download failed
 */
@Override
public Page download(Request request, Task task) {
	final String rendered;
	try {
		cn.nest.spider.entity.commons.Request gatherRequest =
				new cn.nest.spider.entity.commons.Request(request.getUrl(), true);
		rendered = casperjs.gatherHtml(gatherRequest);
	} catch (IOException fetchFailure) {
		// Keep the cause on the request so error handlers can inspect it.
		request.putExtra("EXCEPTION", fetchFailure);
		onError(request);
		return null;
	}

	Page result = new Page();
	result.setRawText(rendered);
	result.setRequest(request);
	result.setUrl(new PlainText(request.getUrl()));
	onSuccess(request);
	return result;
}
 
开发者ID:TransientBuckwheat,项目名称:nest-spider,代码行数:17,代码来源:CasperjsDownloader.java

示例3: covertRequest

import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Converts a WebMagic {@link Request} into a vscrawler {@link Seed}.
 *
 * <p>Only requests with an empty method or the GET method (case-insensitive) are
 * converted; any other method is logged and rejected. The request's extras, when
 * present, are copied into the seed's ext map: {@code String} values are kept as-is
 * and every other value is serialized to JSON.
 *
 * @param request the WebMagic request to convert
 * @return the converted seed, or {@code null} when the request uses a non-GET method
 */
public static Seed covertRequest(Request request) {
    String method = request.getMethod();
    if (StringUtils.isNotEmpty(method) && !StringUtils.equalsIgnoreCase(method, "get")) {
        // The branch rejects non-GET requests, so the message must say GET is the
        // only supported method (the previous wording claimed the opposite).
        log.warn("vscrawler only supports webmagic GET requests, this request {} will be ignored", request.getUrl());
        return null;
    }

    Seed seed = new Seed(request.getUrl());
    // A request without extras reports null, which Maps.transformEntries rejects
    // with a NullPointerException; leave the seed's ext untouched in that case.
    if (request.getExtras() != null) {
        seed.setExt(Maps.transformEntries(request.getExtras(), new Maps.EntryTransformer<String, Object, String>() {
            @Override
            public String transformEntry(String key, Object value) {
                if (value instanceof String) {
                    return (String) value;
                }
                return JSONObject.toJSONString(value);
            }
        }));
    }

    return seed;
}
 
开发者ID:virjar,项目名称:vscrawler,代码行数:19,代码来源:CovertUtil.java

示例4: parse

import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Feeds a downloaded document through the delegated WebMagic page processor and
 * copies its output into {@code crawlResult}.
 *
 * <p>A {@code null} download result marks the seed for retry. Otherwise the text is
 * wrapped in a {@code JsoupXpathPage} with status code 200, processed, and then:
 * every target request the processor produced is converted to a seed and added to
 * the crawl result, and the result items are added as a JSON string unless the
 * processor flagged them as skipped.
 *
 * @param seed        the seed that was downloaded
 * @param result      the downloaded text, or {@code null} if the download failed
 * @param crawlResult receives the newly discovered seeds and the extracted result
 */
@Override
protected void parse(Seed seed, String result, CrawlResult crawlResult) {
    if (result == null) {
        seed.retry();
        return;
    }
    JsoupXpathPage jsoupXpathPage = new JsoupXpathPage();
    jsoupXpathPage.setRawText(result);
    jsoupXpathPage.setUrl(new PlainText(seed.getData()));
    jsoupXpathPage.setRequest(CovertUtil.convertSeed(seed));
    jsoupXpathPage.setStatusCode(200);
    pageProcessor.process(jsoupXpathPage);

    // Newly discovered URLs.
    List<Request> targetRequests = jsoupXpathPage.getTargetRequests();
    for (Request request : targetRequests) {
        Seed discovered = CovertUtil.covertRequest(request);
        // covertRequest returns null for requests it cannot represent (non-GET);
        // those are dropped instead of handing a null seed to the crawl result.
        if (discovered != null) {
            crawlResult.addSeed(discovered);
        }
    }

    ResultItems resultItems = jsoupXpathPage.getResultItems();
    if (!resultItems.isSkip()) {
        crawlResult.addResult(JSONObject.toJSONString(resultItems.getAll()));
    }
}
 
开发者ID:virjar,项目名称:vscrawler,代码行数:25,代码来源:WebMagicProcessorDelegator.java

示例5: onSuccess

import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Success hook: after delegating to the superclass, checks whether the spider has
 * hit one of its page limits and stops it if so.
 *
 * <p>{@code reachMax} is set when a positive maximum is configured and the task's
 * count has reached it. {@code exceedRatio} is only evaluated when {@code reachMax}
 * is false, and is set when the processed page count exceeds the maximum multiplied
 * by the configured crawl ratio while a positive maximum is configured. The spider
 * is stopped only if its status is {@code Status.Running}.
 *
 * @param request the request that completed successfully
 */
@Override
protected void onSuccess(Request request) {
    super.onSuccess(request);
    Task task = taskManager.getTaskById(this.getUUID());

    // Valid-page count has reached the configured maximum.
    boolean reachMax = SPIDER_INFO.getMaxPageGather() > 0
            && task.getCount() >= SPIDER_INFO.getMaxPageGather();

    // Processed pages exceed maximum * ratio without the maximum having been reached.
    boolean exceedRatio = false;
    if (!reachMax) {
        exceedRatio = this.getPageCount() > SPIDER_INFO.getMaxPageGather() * staticValue.getCommonsWebpageCrawlRatio()
                && SPIDER_INFO.getMaxPageGather() > 0;
    }

    if (!reachMax && !exceedRatio) {
        return;
    }
    if (this.getStatus() != Status.Running) {
        return;
    }

    LOG.info("爬虫ID{}已处理{}个页面,有效页面{}个,最大抓取页数{},reachMax={},exceedRatio={},退出.", this.getUUID(), this.getPageCount(), task.getCount(), SPIDER_INFO.getMaxPageGather(), reachMax, exceedRatio);
    task.setDescription("爬虫ID%s已处理%s个页面,有效页面%s个,达到最大抓取页数%s,reachMax=%s,exceedRatio=%s,退出.", this.getUUID(), this.getPageCount(), task.getCount(), SPIDER_INFO.getMaxPageGather(), reachMax, exceedRatio);
    this.stop();
}
 
开发者ID:gsh199449,项目名称:spider,代码行数:20,代码来源:CommonSpider.java

示例6: download

import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Downloads {@code request} through the CasperJS gateway and wraps the returned
 * HTML in a {@link Page}.
 *
 * <p>On any exception: if the task's site is available and allows cycle retries,
 * the result of {@code addToCycleRetry} is returned; otherwise the exception is
 * stored on the request under the {@code "EXCEPTION"} extra, {@code onError} is
 * invoked and {@code null} is returned.
 *
 * @param request the request to download
 * @param task    the owning task; may be {@code null}, in which case no site
 *                settings are consulted
 * @return the downloaded page, the cycle-retry result, or {@code null} on failure
 */
@Override
public Page download(Request request, Task task) {
    String html = null;
    Site site = null;
    if (task != null) {
        site = task.getSite();
    }
    try {
        html = casperjs.gatherHtml(new com.gs.spider.model.commons.Request(request.getUrl(), true));
    } catch (Exception e) {
        // site stays null when no task was supplied; dereferencing it here would
        // replace the real download failure with a NullPointerException.
        if (site != null && site.getCycleRetryTimes() > 0) {
            return addToCycleRetry(request, site);
        }
        request.putExtra("EXCEPTION", e);
        onError(request);
        return null;
    }
    Page page = new Page();
    page.setRawText(html);
    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    onSuccess(request);
    return page;
}
 
开发者ID:gsh199449,项目名称:spider,代码行数:25,代码来源:CasperjsDownloader.java

示例7: test

import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Manual smoke test (ignored by default because it needs a Chrome driver):
 * downloads the huaban.com home page 100 times through {@code SeleniumDownloader},
 * printing the links under {@code #waterfall} that match {@code .*pins.*} for each
 * download, then prints the total elapsed milliseconds.
 */
@Ignore("need chrome driver")
@Test
public void test() {
	SeleniumDownloader downloader = new SeleniumDownloader(chromeDriverPath);
	final long startedAt = System.currentTimeMillis();
	int round = 0;
	while (round < 100) {
		Request homeRequest = new Request("http://huaban.com/");
		// Minimal task: a fixed UUID and a default site configuration.
		Task huabanTask = new Task() {
			@Override
			public String getUUID() {
				return "huaban.com";
			}

			@Override
			public Site getSite() {
				return Site.me();
			}
		};
		Page fetched = downloader.download(homeRequest, huabanTask);
		System.out.println(fetched.getHtml().$("#waterfall").links().regex(".*pins.*").all());
		round++;
	}
	long elapsedMillis = System.currentTimeMillis() - startedAt;
	System.out.println(elapsedMillis);
}
 
开发者ID:yuany,项目名称:en-webmagic,代码行数:22,代码来源:SeleniumDownloaderTest.java

示例8: testBaiduWenku

import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Manual smoke test (ignored by default): downloads a Baidu Wenku document page
 * through {@code SeleniumDownloader} with a 10000 ms sleep time and prints the
 * content of {@code div.inner} after removing markup tags and non-breaking-space
 * entities.
 */
@Ignore
@Test
public void testBaiduWenku() {
	SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath);
	seleniumDownloader.setSleepTime(10000);
	Page page = seleniumDownloader.download(new Request("http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"), new Task() {
		@Override
		public String getUUID() {
			return "huaban.com";
		}

		@Override
		public Site getSite() {
			return Site.me();
		}
	});
	// The HTML entity is "&nbsp;"; the previous "&nsbp;" spelling never matched, so
	// the entities were left in the printed text.
	System.out.println(page.getHtml().$("div.inner").replace("<[^<>]+>","").replace("&nbsp;","").all());
}
 
开发者ID:yuany,项目名称:en-webmagic,代码行数:20,代码来源:SeleniumDownloaderTest.java

示例9: extractLinks

import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Collects candidate links from {@code page} and queues every one that matches a
 * target pattern as a new request.
 *
 * <p>Candidates are all links of the page's HTML when no region selector is given,
 * otherwise whatever the selector yields for the page's HTML text. For each
 * candidate and each pattern that finds a match, capture group 1 of the match is
 * added as a target request, so every pattern must define at least one capturing
 * group, and a link matched by several patterns is added once per pattern.
 *
 * @param page              the page to read links from and add target requests to
 * @param urlRegionSelector selector restricting where links are taken from, or
 *                          {@code null} to use all links on the page
 * @param urlPatterns       patterns a link must match to be followed
 */
private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
    final List<String> candidates = (urlRegionSelector == null)
            ? page.getHtml().links().all()
            : urlRegionSelector.selectList(page.getHtml().toString());

    for (String candidate : candidates) {
        for (Pattern urlPattern : urlPatterns) {
            Matcher match = urlPattern.matcher(candidate);
            if (!match.find()) {
                continue;
            }
            page.addTargetRequest(new Request(match.group(1)));
        }
    }
}
 
开发者ID:yuany,项目名称:en-webmagic,代码行数:17,代码来源:ModelPageProcessor.java

示例10: testMissHit

import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Long-running check (ignored by default) of the bloom-filter duplicate remover
 * built for 5,000,000 insertions with a 0.01 rate argument.
 *
 * <p>Each distinct key is submitted twice. The first submission counts as "wrong"
 * when it is reported as a duplicate and as "right" otherwise; the second counts
 * as a "miss check" when it is not reported as a duplicate. The three counters are
 * printed at the end.
 */
@Ignore("long time")
@Test
public void testMissHit() throws Exception {
    final int total = 5000000;
    DuplicateRemover remover = new BloomFilterDuplicateRemover(total, 0.01);
    int right = 0;
    int wrong = 0;
    int missCheck = 0;
    for (int n = 0; n < total; n++) {
        // First sighting of this key: it should not be reported as a duplicate.
        boolean flaggedOnFirstSight = remover.isDuplicate(new Request(String.valueOf(n)), null);
        if (flaggedOnFirstSight) {
            wrong++;
        } else {
            right++;
        }
        // Second sighting of the same key: it should be reported as a duplicate.
        boolean flaggedOnSecondSight = remover.isDuplicate(new Request(String.valueOf(n)), null);
        if (!flaggedOnSecondSight) {
            missCheck++;
        }
    }

    System.out.println("Right count: " + right + " Wrong count: " + wrong + " Miss check: " + missCheck);
}
 
开发者ID:code4craft,项目名称:webmagic,代码行数:24,代码来源:BloomFilterDuplicateRemoverTest.java

示例11: testMemory

import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Long-running comparison (ignored by default) of the bloom-filter and hash-set
 * duplicate removers.
 *
 * <p>For each implementation, 5,000,000 distinct requests are submitted and the
 * elapsed milliseconds plus the drop in {@code Runtime.freeMemory()} are printed.
 * A garbage collection is requested before the hash-set baseline is taken.
 */
@Ignore("long time")
@Test
public void testMemory() throws Exception {
    final int requestCount = 5000000;

    // Bloom-filter run.
    DuplicateRemover remover = new BloomFilterDuplicateRemover(requestCount, 0.005);
    long freeBefore = Runtime.getRuntime().freeMemory();
    long startedAt = System.currentTimeMillis();
    for (int n = 0; n < requestCount; n++) {
        remover.isDuplicate(new Request(String.valueOf(n)), null);
    }
    System.out.println("Time used by bloomfilter:" + (System.currentTimeMillis() - startedAt));
    System.out.println("Memory used by bloomfilter:" + (freeBefore - Runtime.getRuntime().freeMemory()));

    // Hash-set run; request a GC first, before the new free-memory baseline is taken.
    remover = new HashSetDuplicateRemover();
    System.gc();
    freeBefore = Runtime.getRuntime().freeMemory();
    startedAt = System.currentTimeMillis();
    for (int n = 0; n < requestCount; n++) {
        remover.isDuplicate(new Request(String.valueOf(n)), null);
    }
    System.out.println("Time used by hashset:" + (System.currentTimeMillis() - startedAt));
    System.out.println("Memory used by hashset:" + (freeBefore - Runtime.getRuntime().freeMemory()));
}
 
开发者ID:code4craft,项目名称:webmagic,代码行数:24,代码来源:BloomFilterDuplicateRemoverTest.java

示例12: push

import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Enqueues {@code request} for {@code task} in Redis unless its URL has already
 * been seen.
 *
 * <p>A per-task set is used for URL de-duplication and a per-task list as the
 * queue. When the request carries extras, the whole request is additionally stored
 * as JSON in a per-task hash under the SHA hex digest of its URL. The Jedis
 * connection is always handed back to the pool.
 *
 * @param request the request to enqueue
 * @param task    the task whose UUID namespaces the Redis keys
 */
@Override
public synchronized void push(Request request, Task task) {
    Jedis redis = pool.getResource();
    try {
        // De-duplicate by URL using the per-task set.
        boolean alreadySeen = redis.sismember(SET_PREFIX + task.getUUID(), request.getUrl());
        if (alreadySeen) {
            return;
        }
        // The per-task list is the actual queue.
        redis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl());
        redis.sadd(SET_PREFIX + task.getUUID(), request.getUrl());

        if (request.getExtras() == null) {
            return;
        }
        // Requests with extras are stored in full so they can be rebuilt later.
        String urlDigest = DigestUtils.shaHex(request.getUrl());
        String serialized = JSON.toJSONString(request);
        redis.hset((ITEM_PREFIX + task.getUUID()), urlDigest, serialized);
    } finally {
        pool.returnResource(redis);
    }
}
 
开发者ID:yuany,项目名称:en-webmagic,代码行数:20,代码来源:RedisScheduler.java

示例13: poll

import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Takes the next queued URL for {@code task} from Redis and turns it into a
 * {@link Request}.
 *
 * <p>If a serialized request is stored in the per-task hash under the SHA hex
 * digest of the URL, it is deserialized from JSON and returned; otherwise a plain
 * request for the URL is created. The Jedis connection is always handed back to
 * the pool.
 *
 * @param task the task whose UUID namespaces the Redis keys
 * @return the next request, or {@code null} when the queue is empty
 */
@Override
public synchronized Request poll(Task task) {
    Jedis jedis = pool.getResource();
    try {
        String url = jedis.lpop(QUEUE_PREFIX + task.getUUID());
        if (url == null) {
            return null;
        }
        String key = ITEM_PREFIX + task.getUUID();
        String field = DigestUtils.shaHex(url);
        // Use the String overload so key, field and value go through the same
        // encoding Jedis applies when the entry is written with hset(String, ...).
        // The previous getBytes()/new String(bytes) round trip used the platform
        // default charset and could miss the key or garble non-ASCII JSON.
        String json = jedis.hget(key, field);
        if (json != null) {
            return JSON.parseObject(json, Request.class);
        }
        return new Request(url);
    } finally {
        pool.returnResource(jedis);
    }
}
 
开发者ID:yuany,项目名称:en-webmagic,代码行数:22,代码来源:RedisScheduler.java

示例14: test

import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Environment-dependent round trip (ignored by default): pushes one request
 * carrying an extra into the Redis scheduler, polls it back and prints it.
 */
@Ignore("environment depended")
@Test
public void test() {
    // Stub task: fixed UUID "1", no site, no-op cron.
    Task stubTask = new Task() {
        @Override
        public String getUUID() {
            return "1";
        }

        @Override
        public Site getSite() {
            return null;
        }

        @Override
        public void cron(String expr) {
        }
    };

    Request outgoing = new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/");
    outgoing.putExtra("1", "2");
    redisScheduler.push(outgoing, stubTask);

    Request polled = redisScheduler.poll(stubTask);
    System.out.println(polled);
}
 
开发者ID:yuany,项目名称:en-webmagic,代码行数:26,代码来源:RedisSchedulerTest.java

示例15: handleResponse

import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Builds a {@link Page} from an HTTP response.
 *
 * <p>The entity body is read fully into memory and stored on the page. Unless the
 * request is flagged as binary content, the body is also decoded to text, using
 * {@code charset} when given and otherwise the charset determined by
 * {@code getHtmlCharset} from the content type and the bytes. URL, request, status
 * code and the download-success flag are set on the page; response headers are
 * copied only when {@code responseHeader} is enabled.
 *
 * @param request      the request that produced the response
 * @param charset      charset to decode the body with, or {@code null} to detect it
 * @param httpResponse the HTTP response to convert
 * @param task         the owning task (not used by this method)
 * @return the populated page
 * @throws IOException if reading the body or decoding it with the charset fails
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    byte[] body = IOUtils.toByteArray(httpResponse.getEntity().getContent());

    String contentType;
    if (httpResponse.getEntity().getContentType() == null) {
        contentType = "";
    } else {
        contentType = httpResponse.getEntity().getContentType().getValue();
    }

    Page page = new Page();
    page.setBytes(body);

    boolean textual = !request.isBinaryContent();
    if (textual) {
        // An explicitly supplied charset wins; otherwise detect it from the response.
        String effectiveCharset = (charset != null) ? charset : getHtmlCharset(contentType, body);
        page.setCharset(effectiveCharset);
        page.setRawText(new String(body, effectiveCharset));
    }

    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
    page.setDownloadSuccess(true);
    if (responseHeader) {
        page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
    }
    return page;
}
 
开发者ID:code4craft,项目名称:webmagic,代码行数:22,代码来源:HttpClientDownloader.java


注:本文中的us.codecraft.webmagic.Request类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。