本文整理汇总了Java中us.codecraft.webmagic.Request类的典型用法代码示例。如果您正苦于以下问题:Java Request类的具体用法?Java Request怎么用?Java Request使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
Request类属于us.codecraft.webmagic包,在下文中一共展示了Request类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: onSuccess
import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Called after a request succeeds; stops the spider once a crawl limit is hit.
 *
 * <p>Two limits are checked, both only when {@code SPIDER_INFO.getMaxPageGather()}
 * is positive:
 * <ul>
 *   <li>{@code reachMax}: the task's count has reached the configured maximum;</li>
 *   <li>{@code exceedRatio}: the number of processed pages exceeds the maximum
 *       multiplied by {@code staticValue.getCommonsWebpageCrawlRatio()}.</li>
 * </ul>
 * Without the positive-maximum guard on the ratio check, a maximum of zero would
 * make the product zero and stop the spider after its very first page.
 *
 * @param request the request that was processed successfully
 */
@Override
protected void onSuccess(Request request) {
    super.onSuccess(request);
    Task task = taskManager.getTaskById(this.getUUID());
    boolean reachMax = false, exceedRatio = false;
    if (
            (
                    // The gathered count has reached the maximum page count: quit.
                    (reachMax = (SPIDER_INFO.getMaxPageGather() > 0 && task.getCount() >= SPIDER_INFO.getMaxPageGather()))
                            ||
                            // The processed page count exceeds ratio times the maximum without
                            // the maximum being reached: quit as well. Only meaningful when a
                            // positive maximum is configured.
                            (exceedRatio = (SPIDER_INFO.getMaxPageGather() > 0 && this.getPageCount() > SPIDER_INFO.getMaxPageGather() * staticValue.getCommonsWebpageCrawlRatio()))
            )
                    // Only a running spider is stopped.
                    && this.getStatus() == Status.Running) {
        LOG.info("爬虫ID{}已处理{}个页面,有效页面{}个,最大抓取页数{},reachMax={},exceedRatio={},退出.", this.getUUID(), this.getPageCount(), task.getCount(), SPIDER_INFO.getMaxPageGather(), reachMax, exceedRatio);
        task.setDescription("爬虫ID%s已处理%s个页面,有效页面%s个,达到最大抓取页数%s,reachMax=%s,exceedRatio=%s,退出.", this.getUUID(), this.getPageCount(), task.getCount(), SPIDER_INFO.getMaxPageGather(), reachMax, exceedRatio);
        this.stop();
    }
}
示例2: download
import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Downloads the page for {@code request} by delegating to {@code casperjs.gatherHtml}.
 *
 * <p>On an {@link IOException} the exception is stored in the request extras under
 * {@code "EXCEPTION"}, {@code onError} is invoked and {@code null} is returned.
 * Otherwise a {@link Page} carrying the gathered text, the request and its URL is
 * returned after {@code onSuccess} has been invoked.
 *
 * @param request the request to download
 * @param task    the owning task (not used by this method)
 * @return the populated page, or {@code null} when gathering failed
 */
@Override
public Page download(Request request, Task task) {
    String rawHtml;
    try {
        cn.nest.spider.entity.commons.Request gatherRequest =
                new cn.nest.spider.entity.commons.Request(request.getUrl(), true);
        rawHtml = casperjs.gatherHtml(gatherRequest);
    } catch (IOException failure) {
        // Record the failure on the request so error handlers can inspect it.
        request.putExtra("EXCEPTION", failure);
        onError(request);
        return null;
    }

    Page downloaded = new Page().setRawText(rawHtml);
    downloaded.setRequest(request);
    downloaded.setUrl(new PlainText(request.getUrl()));

    onSuccess(request);
    return downloaded;
}
示例3: covertRequest
import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Converts a webmagic {@link Request} into a {@code Seed}.
 *
 * <p>Only requests whose method is empty or {@code get} (case-insensitive) are
 * converted; any other method is logged and dropped. Extras are copied into the
 * seed's ext map: string values are kept as-is, everything else is serialized
 * with {@code JSONObject.toJSONString}.
 *
 * @param request the webmagic request to convert
 * @return the seed for the request URL, or {@code null} when the request method
 *         is not supported
 */
public static Seed covertRequest(Request request) {
    if (StringUtils.isNotEmpty(request.getMethod()) && !StringUtils.equalsIgnoreCase(request.getMethod(), "get")) {
        log.warn("vscrawler only supports the webmagic get method, this request {} will be ignored", request.getUrl());
        return null;
    }
    Seed seed = new Seed(request.getUrl());
    // Extras may be absent; Maps.transformEntries rejects a null map, so only
    // transform when there is something to copy.
    if (request.getExtras() != null) {
        seed.setExt(Maps.transformEntries(request.getExtras(), new Maps.EntryTransformer<String, Object, String>() {
            @Override
            public String transformEntry(String key, Object value) {
                if (value instanceof String) {
                    return (String) value;
                }
                return JSONObject.toJSONString(value);
            }
        }));
    }
    return seed;
}
示例4: parse
import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Feeds a fetched result through the webmagic {@code pageProcessor} and collects
 * the outcome into {@code crawlResult}.
 *
 * <p>A {@code null} result schedules a retry of the seed and returns. Otherwise a
 * {@code JsoupXpathPage} is built from the result, processed, and every target
 * request it produced is converted into a seed. Result items are added as JSON
 * unless they are flagged to be skipped.
 *
 * @param seed        the seed that produced {@code result}
 * @param result      the fetched text, or {@code null} when the fetch failed
 * @param crawlResult the collector for new seeds and extracted results
 */
@Override
protected void parse(Seed seed, String result, CrawlResult crawlResult) {
    if (result == null) {
        seed.retry();
        return;
    }
    JsoupXpathPage jsoupXpathPage = new JsoupXpathPage();
    jsoupXpathPage.setRawText(result);
    jsoupXpathPage.setUrl(new PlainText(seed.getData()));
    jsoupXpathPage.setRequest(CovertUtil.convertSeed(seed));
    jsoupXpathPage.setStatusCode(200);
    pageProcessor.process(jsoupXpathPage);

    // Newly discovered URLs.
    List<Request> targetRequests = jsoupXpathPage.getTargetRequests();
    for (Request request : targetRequests) {
        Seed newSeed = CovertUtil.covertRequest(request);
        // The conversion can yield null (request not convertible); never hand a
        // null seed to the crawl result.
        if (newSeed != null) {
            crawlResult.addSeed(newSeed);
        }
    }

    ResultItems resultItems = jsoupXpathPage.getResultItems();
    if (!resultItems.isSkip()) {
        crawlResult.addResult(JSONObject.toJSONString(resultItems.getAll()));
    }
}
示例5: onSuccess
import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Called after a request succeeds; stops the spider when a crawl limit is hit.
 *
 * <p>{@code reachMax} is set when a positive maximum page count is configured and
 * the task's count has reached it. {@code exceedRatio} is only evaluated when
 * {@code reachMax} is false, and is set when the processed page count exceeds the
 * maximum multiplied by the configured crawl ratio (again only for a positive
 * maximum). If either flag is set and the spider is running, the event is
 * logged, recorded in the task description, and the spider is stopped.
 *
 * @param request the request that was processed successfully
 */
@Override
protected void onSuccess(Request request) {
    super.onSuccess(request);
    Task task = taskManager.getTaskById(this.getUUID());

    // Gathered count has reached the maximum page count.
    boolean reachMax = SPIDER_INFO.getMaxPageGather() > 0
            && task.getCount() >= SPIDER_INFO.getMaxPageGather();

    // Processed pages exceed ratio times the maximum without reaching the maximum.
    boolean exceedRatio = false;
    if (!reachMax) {
        exceedRatio = this.getPageCount() > SPIDER_INFO.getMaxPageGather() * staticValue.getCommonsWebpageCrawlRatio()
                && SPIDER_INFO.getMaxPageGather() > 0;
    }

    boolean limitHit = reachMax || exceedRatio;
    if (!limitHit || this.getStatus() != Status.Running) {
        return;
    }

    LOG.info("爬虫ID{}已处理{}个页面,有效页面{}个,最大抓取页数{},reachMax={},exceedRatio={},退出.", this.getUUID(), this.getPageCount(), task.getCount(), SPIDER_INFO.getMaxPageGather(), reachMax, exceedRatio);
    task.setDescription("爬虫ID%s已处理%s个页面,有效页面%s个,达到最大抓取页数%s,reachMax=%s,exceedRatio=%s,退出.", this.getUUID(), this.getPageCount(), task.getCount(), SPIDER_INFO.getMaxPageGather(), reachMax, exceedRatio);
    this.stop();
}
示例6: download
import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Downloads the page for {@code request} by delegating to {@code casperjs.gatherHtml}.
 *
 * <p>When gathering throws, the request is handed to {@code addToCycleRetry} if a
 * site is available and its cycle retry count is positive; otherwise the
 * exception is stored in the request extras under {@code "EXCEPTION"},
 * {@code onError} is invoked and {@code null} is returned.
 *
 * @param request the request to download
 * @param task    the owning task; may be {@code null}, in which case no site
 *                settings are consulted
 * @return the populated page, the result of {@code addToCycleRetry}, or
 *         {@code null} when gathering failed without a retry
 */
@Override
public Page download(Request request, Task task) {
    String html = null;
    Site site = null;
    if (task != null) {
        site = task.getSite();
    }
    try {
        html = casperjs.gatherHtml(new com.gs.spider.model.commons.Request(request.getUrl(), true));
    } catch (Exception e) {
        // site is null when no task (or no site) was supplied; guard so the
        // original failure is reported instead of a NullPointerException.
        if (site != null && site.getCycleRetryTimes() > 0) {
            return addToCycleRetry(request, site);
        }
        request.putExtra("EXCEPTION", e);
        onError(request);
        return null;
    }
    Page page = new Page();
    page.setRawText(html);
    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    onSuccess(request);
    return page;
}
示例7: test
import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Downloads the same page 100 times with a {@code SeleniumDownloader}, printing
 * the matched links of each download and finally the total elapsed milliseconds.
 * Ignored by default because it needs a chrome driver.
 */
@Ignore("need chrome driver")
@Test
public void test() {
    SeleniumDownloader downloader = new SeleniumDownloader(chromeDriverPath);
    long startMillis = System.currentTimeMillis();
    int round = 0;
    while (round < 100) {
        // A fresh task is created for every download.
        Task huabanTask = new Task() {
            @Override
            public String getUUID() {
                return "huaban.com";
            }

            @Override
            public Site getSite() {
                return Site.me();
            }
        };
        Page fetched = downloader.download(new Request("http://huaban.com/"), huabanTask);
        System.out.println(fetched.getHtml().$("#waterfall").links().regex(".*pins.*").all());
        round++;
    }
    System.out.println(System.currentTimeMillis() - startMillis);
}
示例8: testBaiduWenku
import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Downloads a Baidu Wenku document page with a {@code SeleniumDownloader} (10 s
 * sleep time) and prints the text of {@code div.inner} elements with markup tags
 * and non-breaking-space entities stripped. Ignored by default.
 */
@Ignore
@Test
public void testBaiduWenku() {
    SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath);
    seleniumDownloader.setSleepTime(10000);
    Page page = seleniumDownloader.download(new Request("http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"), new Task() {
        @Override
        public String getUUID() {
            return "huaban.com";
        }

        @Override
        public Site getSite() {
            return Site.me();
        }
    });
    // Strip tags first, then the HTML non-breaking-space entity ("&nbsp;").
    System.out.println(page.getHtml().$("div.inner").replace("<[^<>]+>", "").replace("&nbsp;", "").all());
}
示例9: extractLinks
import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Adds target requests to {@code page} for every link that matches one of the
 * given patterns.
 *
 * <p>Links come from all links of the page HTML when {@code urlRegionSelector} is
 * {@code null}, otherwise from applying the selector to the page HTML text. For
 * each pattern that finds a match in a link, capture group 1 of that match is
 * queued as a new request.
 *
 * @param page              the page to read links from and add requests to
 * @param urlRegionSelector selector restricting where links are taken from, or
 *                          {@code null} for the whole page
 * @param urlPatterns       patterns to test each link against; group 1 is used
 */
private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
    List<String> candidates = (urlRegionSelector == null)
            ? page.getHtml().links().all()
            : urlRegionSelector.selectList(page.getHtml().toString());

    for (String candidate : candidates) {
        for (Pattern pattern : urlPatterns) {
            Matcher found = pattern.matcher(candidate);
            if (!found.find()) {
                continue;
            }
            page.addTargetRequest(new Request(found.group(1)));
        }
    }
}
示例10: testMissHit
import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Checks every number below 5,000,000 twice against a
 * {@code BloomFilterDuplicateRemover} and prints three tallies: first checks
 * reported as not duplicate ("Right"), first checks reported as duplicate
 * ("Wrong"), and second checks not reported as duplicate ("Miss check").
 * Ignored by default because it takes a long time.
 */
@Ignore("long time")
@Test
public void testMissHit() throws Exception {
    final int total = 5000000;
    DuplicateRemover remover = new BloomFilterDuplicateRemover(total, 0.01);
    int right = 0;
    int wrong = 0;
    int missCheck = 0;
    int i = 0;
    while (i < total) {
        String url = String.valueOf(i);

        // First sighting: a "duplicate" answer is counted as wrong.
        if (remover.isDuplicate(new Request(url), null)) {
            wrong++;
        } else {
            right++;
        }

        // Second sighting: a "not duplicate" answer is counted as a miss.
        if (!remover.isDuplicate(new Request(url), null)) {
            missCheck++;
        }
        i++;
    }
    System.out.println("Right count: " + right + " Wrong count: " + wrong + " Miss check: " + missCheck);
}
示例11: testMemory
import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Runs 5,000,000 duplicate checks first through a
 * {@code BloomFilterDuplicateRemover} and then through a
 * {@code HashSetDuplicateRemover}, printing for each the elapsed milliseconds and
 * the drop in {@code Runtime.freeMemory()} over the run. Ignored by default
 * because it takes a long time.
 */
@Ignore("long time")
@Test
public void testMemory() throws Exception {
    final int total = 5000000;

    // Bloom filter run.
    DuplicateRemover remover = new BloomFilterDuplicateRemover(total, 0.005);
    long freeBefore = Runtime.getRuntime().freeMemory();
    long startedAt = System.currentTimeMillis();
    int i = 0;
    while (i < total) {
        remover.isDuplicate(new Request(String.valueOf(i)), null);
        i++;
    }
    System.out.println("Time used by bloomfilter:" + (System.currentTimeMillis() - startedAt));
    System.out.println("Memory used by bloomfilter:" + (freeBefore - Runtime.getRuntime().freeMemory()));

    // Hash set run; the bloom filter is dropped and a GC requested before measuring.
    remover = new HashSetDuplicateRemover();
    System.gc();
    freeBefore = Runtime.getRuntime().freeMemory();
    startedAt = System.currentTimeMillis();
    i = 0;
    while (i < total) {
        remover.isDuplicate(new Request(String.valueOf(i)), null);
        i++;
    }
    System.out.println("Time used by hashset:" + (System.currentTimeMillis() - startedAt));
    System.out.println("Memory used by hashset:" + (freeBefore - Runtime.getRuntime().freeMemory()));
}
示例12: push
import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Queues {@code request} for {@code task} in Redis unless its URL was already seen.
 *
 * <p>A Redis set keyed by {@code SET_PREFIX + uuid} is used for URL
 * de-duplication and a list keyed by {@code QUEUE_PREFIX + uuid} holds the queue.
 * When the request has extras, the whole request is additionally stored as JSON
 * in the hash {@code ITEM_PREFIX + uuid} under the SHA hex digest of its URL.
 * The Jedis resource is always returned to the pool.
 *
 * @param request the request to enqueue
 * @param task    the task whose UUID namespaces the Redis keys
 */
@Override
public synchronized void push(Request request, Task task) {
    Jedis jedis = pool.getResource();
    try {
        String seenKey = SET_PREFIX + task.getUUID();
        String url = request.getUrl();

        // The set provides URL de-duplication.
        if (jedis.sismember(seenKey, url)) {
            return;
        }

        // The list holds the queue itself.
        jedis.rpush(QUEUE_PREFIX + task.getUUID(), url);
        jedis.sadd(seenKey, url);

        if (request.getExtras() == null) {
            return;
        }
        String digest = DigestUtils.shaHex(url);
        String serialized = JSON.toJSONString(request);
        jedis.hset(ITEM_PREFIX + task.getUUID(), digest, serialized);
    } finally {
        pool.returnResource(jedis);
    }
}
示例13: poll
import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Takes the next request for {@code task} from the Redis queue.
 *
 * <p>The URL is popped from the list {@code QUEUE_PREFIX + uuid}. If the hash
 * {@code ITEM_PREFIX + uuid} holds a JSON entry under the SHA hex digest of that
 * URL, the stored request is deserialized and returned; otherwise a plain request
 * for the URL is returned. The Jedis resource is always returned to the pool.
 *
 * @param task the task whose UUID namespaces the Redis keys
 * @return the next request, or {@code null} when the queue is empty
 */
@Override
public synchronized Request poll(Task task) {
    Jedis jedis = pool.getResource();
    try {
        String url = jedis.lpop(QUEUE_PREFIX + task.getUUID());
        if (url == null) {
            return null;
        }
        String key = ITEM_PREFIX + task.getUUID();
        String field = DigestUtils.shaHex(url);
        // Use the String overload so decoding is done by Jedis rather than with
        // the JVM's platform-default charset via getBytes()/new String(bytes).
        String json = jedis.hget(key, field);
        if (json != null) {
            return JSON.parseObject(json, Request.class);
        }
        return new Request(url);
    } finally {
        pool.returnResource(jedis);
    }
}
示例14: test
import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Pushes one request carrying an extra through {@code redisScheduler}, polls it
 * back and prints the polled request. Ignored by default because it depends on
 * the environment.
 */
@Ignore("environment depended")
@Test
public void test() {
    // Minimal task: fixed UUID "1", no site, cron is a no-op.
    Task stubTask = new Task() {
        @Override
        public String getUUID() {
            return "1";
        }

        @Override
        public Site getSite() {
            return null;
        }

        @Override
        public void cron(String expr) {
        }
    };

    Request pushed = new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/");
    pushed.putExtra("1", "2");
    redisScheduler.push(pushed, stubTask);

    Request polled = redisScheduler.poll(stubTask);
    System.out.println(polled);
}
示例15: handleResponse
import us.codecraft.webmagic.Request; //导入依赖的package包/类
/**
 * Builds a {@link Page} from an HTTP response.
 *
 * <p>The response body is read fully into the page bytes. Unless the request is
 * marked as binary content, the body is also decoded to text using
 * {@code charset}, or the result of {@code getHtmlCharset} when {@code charset}
 * is {@code null}. URL, request, status code and the download-success flag are
 * set, and response headers are attached when {@code responseHeader} is enabled.
 *
 * @param request      the request that produced the response
 * @param charset      the charset name to decode with, or {@code null} to detect it
 * @param httpResponse the response to convert
 * @param task         the owning task (not used by this method)
 * @return the populated page
 * @throws IOException if reading the body or resolving the charset fails
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    byte[] body = IOUtils.toByteArray(httpResponse.getEntity().getContent());

    String contentType = "";
    if (httpResponse.getEntity().getContentType() != null) {
        contentType = httpResponse.getEntity().getContentType().getValue();
    }

    Page result = new Page();
    result.setBytes(body);

    boolean textual = !request.isBinaryContent();
    if (textual) {
        // Fall back to detection only when the caller supplied no charset.
        String effectiveCharset = (charset != null) ? charset : getHtmlCharset(contentType, body);
        result.setCharset(effectiveCharset);
        result.setRawText(new String(body, effectiveCharset));
    }

    result.setUrl(new PlainText(request.getUrl()));
    result.setRequest(request);
    result.setStatusCode(httpResponse.getStatusLine().getStatusCode());
    result.setDownloadSuccess(true);

    if (responseHeader) {
        result.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
    }
    return result;
}