本文整理汇总了 Java 中 us.codecraft.webmagic.Task 类的典型用法代码示例。如果您正苦于以下问题:Java Task 类的具体用法?Java Task 怎么用?Java Task 使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。
Task 类属于 us.codecraft.webmagic 包,下文一共展示了 Task 类的 15 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Java 代码示例。
示例1: process
import us.codecraft.webmagic.Task; //导入依赖的package包/类
/**
 * Converts the result items to a {@code Webpage} and indexes it as JSON through {@code client}.
 *
 * <p>The document id is the SHA-256 hash of the page URL, so indexing the same URL again
 * targets the same id. Any failure during indexing is logged and not rethrown.
 *
 * @param resultItems the extracted results for one page
 * @param task        the task that produced the results (not used here)
 */
@Override
public void process(ResultItems resultItems, Task task) {
    Webpage page = convertToWebpage(resultItems);
    /*
     * Guava 22.0 no longer supports md5():
     * use sha256() for a stronger hash,
     * or goodFastHash() for a faster one.
     */
    try {
        client.prepareIndex(INDEX_NAME, TYPE_NAME)
                .setId(Hashing.sha256().hashString(page.getUrl(), Charset.forName("utf-8")).toString())
                .setSource(GSON.toJson(page), XContentType.JSON)
                .get();
    } catch (Exception e) {
        // Pass the exception itself so the stack trace is kept, not only the message text.
        LOG.error("索引Webpage出错, 由于 " + e.getLocalizedMessage(), e);
    }
}
示例2: process
import us.codecraft.webmagic.Task; //导入依赖的package包/类
/**
 * Writes every value of the result items to its own JSON file.
 *
 * <p>Each value is expected to be a {@code Map}. The file name starts with the map's
 * {@code itemId}; when the map has no {@code raw_title}, a suffix is appended:
 * {@code _tmall_comment} if {@code rateList} is present, otherwise {@code _taobao_comment},
 * followed by {@code _} and the value of {@code currentPage}.
 *
 * <p>An {@link IOException} stops the loop and is printed; it is not rethrown.
 *
 * @param resultItems the extracted results for one page
 * @param task        the task that produced the results (not used here)
 */
@Override
public void process(ResultItems resultItems, Task task) {
    try {
        for (Object value : resultItems.getAll().values()) {
            Map<?, ?> map = (Map<?, ?>) value;
            String name = map.get("itemId").toString();
            if (map.get("raw_title") == null) {
                if (map.get("rateList") != null) {
                    name += "_tmall_comment";
                } else {
                    name += "_taobao_comment";
                }
                name += "_" + map.get("currentPage");
            }
            // try-with-resources closes the writer even when serialisation or writing fails.
            try (PrintWriter printWriter = new PrintWriter(new FileWriter(this.getFile(path + name + ".json")))) {
                printWriter.write(JSON.toJSONString(map));
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
示例3: scratch
import us.codecraft.webmagic.Task; //导入依赖的package包/类
/**
 * Runs a five-thread spider over the configured start URL and target URL pattern.
 *
 * <p>Each page's {@code "html"} result is parsed with Jsoup and handed to the persist
 * registered for the configured host and category. When no persist is registered,
 * a warning is logged and the page is dropped.
 */
public void scratch(){
    us.codecraft.webmagic.Spider crawler = us.codecraft.webmagic.Spider.create(
            new SimplePageProcessor(config.getStartUrl(), config.getTargetUrlPattern()));

    Pipeline persistPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            Document parsed = Jsoup.parse(resultItems.get("html"));
            IPersist target = PersistManager.getInstance().getPersist(config.getHost(), config.getCategory());
            if (target != null) {
                target.persist(resultItems.getRequest().getUrl(), config, parsed);
                return;
            }
            logger.warn("persistNotExists: host={}, category={}", config.getHost(), config.getCategory());
        }
    };

    crawler.addPipeline(persistPipeline)
            .thread(5)
            .run();
}
示例4: process
import us.codecraft.webmagic.Task; //导入依赖的package包/类
/**
 * Stores every {@code IndustryInfo} found under the {@code "industryInfos"} key.
 *
 * <p>Items are added one at a time so that a failure on one does not stop the rest.
 *
 * @param resultItems the extracted results for one page
 * @param task        the task that produced the results (not used here)
 */
@Override
public void process(ResultItems resultItems, Task task) {
    List<IndustryInfo> industryInfos = resultItems.get("industryInfos");
    if (industryInfos == null || industryInfos.isEmpty()) {
        return;
    }
    for (IndustryInfo industryInfo : industryInfos) {
        try {
            industryInfoDao.add(industryInfo);
        } catch (DataIntegrityViolationException ignored) {
            // Deliberately ignored, as in the original code.
            // NOTE(review): presumably this means the row already exists - confirm.
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
示例5: process
import us.codecraft.webmagic.Task; //导入依赖的package包/类
@Override
public void process(ResultItems resultItems, Task task) {
SpiderInfo spiderInfo = resultItems.get("spiderInfo");
Webpage webpage = convertResultItems2Webpage(resultItems);
SearchRequestBuilder searchRequestBuilder = client.prepareSearch(INDEX_NAME)
.setTypes(TYPE_NAME)
.setQuery(QueryBuilders.matchQuery("url", webpage.getUrl()));
SearchResponse response = searchRequestBuilder.execute().actionGet();
if (response.getHits().totalHits() == 0) {
try {
client.prepareIndex(INDEX_NAME, TYPE_NAME)
.setId(Hashing.md5().hashString(webpage.getUrl(), Charset.forName("utf-8")).toString())
.setSource(gson.toJson(webpage))
.get();
} catch (Exception e) {
LOG.error("索引 Webpage 出错," + e.getLocalizedMessage());
}
}
}
示例6: download
import us.codecraft.webmagic.Task; //导入依赖的package包/类
/**
 * Fetches the HTML for the request URL through {@code casperjs} and wraps it in a {@code Page}.
 *
 * @param request the request to download
 * @param task    the owning task (not used here)
 * @return the populated page, or {@code null} when gathering the HTML throws an
 *         {@link IOException}; in that case the exception is stored on the request under
 *         {@code "EXCEPTION"} and {@code onError} is called
 */
@Override
public Page download(Request request, Task task) {
    final String rawHtml;
    try {
        rawHtml = casperjs.gatherHtml(new cn.nest.spider.entity.commons.Request(request.getUrl(), true));
    } catch (IOException failure) {
        request.putExtra("EXCEPTION", failure);
        onError(request);
        return null;
    }

    Page result = new Page();
    result.setRawText(rawHtml);
    result.setRequest(request);
    result.setUrl(new PlainText(request.getUrl()));
    onSuccess(request);
    return result;
}
示例7: download
import us.codecraft.webmagic.Task; //导入依赖的package包/类
/**
 * Fetches the HTML for the request URL through {@code casperjs} and wraps it in a {@code Page}.
 *
 * <p>When gathering fails and the task's site allows cycle retries, the request is handed
 * to {@code addToCycleRetry}. Otherwise the exception is stored on the request under
 * {@code "EXCEPTION"}, {@code onError} is called and {@code null} is returned.
 *
 * @param request the request to download
 * @param task    the owning task; may be {@code null}, in which case no cycle retry is attempted
 * @return the populated page, the result of {@code addToCycleRetry}, or {@code null} on failure
 */
@Override
public Page download(Request request, Task task) {
    String html = null;
    Site site = null;
    if (task != null) {
        site = task.getSite();
    }
    try {
        html = casperjs.gatherHtml(new com.gs.spider.model.commons.Request(request.getUrl(), true));
    } catch (Exception e) {
        // site is null when task is null (or has no site); guard it so the original
        // failure is reported instead of being masked by a NullPointerException.
        if (site != null && site.getCycleRetryTimes() > 0) {
            return addToCycleRetry(request, site);
        }
        request.putExtra("EXCEPTION", e);
        onError(request);
        return null;
    }
    Page page = new Page();
    page.setRawText(html);
    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    onSuccess(request);
    return page;
}
示例8: process
import us.codecraft.webmagic.Task; //导入依赖的package包/类
/**
 * Prints the request URL and every result entry to {@code printWriter}, then flushes it.
 *
 * <p>An {@link Iterable} value is printed as the key followed by one element per line;
 * any other value is printed on the same line as its key, separated by a tab.
 *
 * @param resultItems the extracted results for one page
 * @param task        the task that produced the results (not used here)
 */
@Override
public synchronized void process(ResultItems resultItems, Task task) {
    printWriter.println("url:\t" + resultItems.getRequest().getUrl());
    for (Map.Entry<String, Object> field : resultItems.getAll().entrySet()) {
        String key = field.getKey();
        Object value = field.getValue();
        if (!(value instanceof Iterable)) {
            printWriter.println(key + ":\t" + value);
            continue;
        }
        printWriter.println(key + ":");
        for (Object element : (Iterable<?>) value) {
            printWriter.println(element);
        }
    }
    printWriter.flush();
}
示例9: process
import us.codecraft.webmagic.Task; //导入依赖的package包/类
/**
 * Builds an {@code Article} from the result items and saves it through {@code articleDao}.
 *
 * <p>The publication date is taken from the {@code "create"} value: the first
 * {@code yyyy-MM-dd HH:mm} substring is used when one is found, otherwise the whole
 * value is parsed. If the value is missing or cannot be parsed, the article is saved
 * without a publication date.
 *
 * @param resultItems the extracted results for one page; nothing is saved when it is
 *                    {@code null} or empty
 * @param task        the task that produced the results (not used here)
 */
public void process(ResultItems resultItems, Task task) {
    // Check for null before the first dereference; the original checked afterwards.
    if (resultItems == null) {
        return;
    }
    Map<String,Object> items = resultItems.getAll();
    if (items == null || items.isEmpty()) {
        return;
    }

    Article article = new Article();
    article.setTitle((String) items.get("title"));
    article.setContent((String) items.get("content"));
    article.setSource((String) items.get("source"));
    article.setAuthor((String) items.get("author"));
    article.setUrl((String) items.get("url"));

    String dataStr = (String) items.get("create");
    // A missing "create" value used to make Pattern.matcher(null) throw; skip the date instead.
    if (dataStr != null) {
        Pattern pattern = Pattern.compile("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}");
        Matcher matcher = pattern.matcher(dataStr);
        if (matcher.find()) {
            dataStr = matcher.group(0);
        }
        try {
            article.setPubdate(new SimpleDateFormat("yyyy-MM-dd HH:mm").parse(dataStr));
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }
    articleDao.save(article);
}
示例10: test
import us.codecraft.webmagic.Task; //导入依赖的package包/类
/**
 * Downloads http://huaban.com/ one hundred times with a {@code SeleniumDownloader},
 * printing the matching "pins" links found under {@code #waterfall} for each download
 * and the total elapsed milliseconds at the end.
 */
@Ignore("need chrome driver")
@Test
public void test() {
    SeleniumDownloader downloader = new SeleniumDownloader(chromeDriverPath);
    long startedAt = System.currentTimeMillis();
    int round = 0;
    while (round < 100) {
        Request target = new Request("http://huaban.com/");
        Task huabanTask = new Task() {
            @Override
            public String getUUID() {
                return "huaban.com";
            }

            @Override
            public Site getSite() {
                return Site.me();
            }
        };
        Page fetched = downloader.download(target, huabanTask);
        System.out.println(fetched.getHtml().$("#waterfall").links().regex(".*pins.*").all());
        round++;
    }
    long elapsedMillis = System.currentTimeMillis() - startedAt;
    System.out.println(elapsedMillis);
}
示例11: testBaiduWenku
import us.codecraft.webmagic.Task; //导入依赖的package包/类
/**
 * Downloads one Baidu Wenku document page with a {@code SeleniumDownloader} (sleep time
 * set to 10000) and prints the content of {@code div.inner} with tags and
 * {@code &nbsp;} entities removed.
 */
@Ignore
@Test
public void testBaiduWenku() {
    SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath);
    seleniumDownloader.setSleepTime(10000);
    Page page = seleniumDownloader.download(new Request("http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"), new Task() {
        @Override
        public String getUUID() {
            return "huaban.com";
        }

        @Override
        public Site getSite() {
            return Site.me();
        }
    });
    // The original pattern was the misspelt "&nsbp;", which never matched; the
    // non-breaking-space entity is "&nbsp;".
    System.out.println(page.getHtml().$("div.inner").replace("<[^<>]+>","").replace("&nbsp;","").all());
}
示例12: process
import us.codecraft.webmagic.Task; //导入依赖的package包/类
/**
 * Writes the request URL and every result entry to a file in the task's own directory.
 *
 * <p>The directory is {@code this.path} joined with the task UUID; the file name is the
 * MD5 hex of the request URL plus {@code .html}. An {@link Iterable} value is written as
 * the key followed by one element per line; any other value shares a line with its key,
 * separated by a tab. An {@link IOException} is logged as a warning and not rethrown.
 *
 * @param resultItems the extracted results for one page
 * @param task        the task whose UUID names the output directory
 */
@Override
public void process(ResultItems resultItems, Task task) {
    String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
    String fileName = path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html";
    // try-with-resources closes the file even when printing an entry throws.
    try (PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(fileName)))) {
        printWriter.println("url:\t" + resultItems.getRequest().getUrl());
        for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
            if (entry.getValue() instanceof Iterable) {
                Iterable<?> value = (Iterable<?>) entry.getValue();
                printWriter.println(entry.getKey() + ":");
                for (Object o : value) {
                    printWriter.println(o);
                }
            } else {
                printWriter.println(entry.getKey() + ":\t" + entry.getValue());
            }
        }
    } catch (IOException e) {
        logger.warn("write file error", e);
    }
}
示例13: handleResponse
import us.codecraft.webmagic.Task; //导入依赖的package包/类
/**
 * Builds a {@code Page} from an HTTP response.
 *
 * <p>The response body is read with the given charset, its relative hrefs are fixed
 * against the request URL, and the status code plus every response header are recorded
 * on the page.
 *
 * @param request      the request that was executed
 * @param charset      the charset name used to decode the response body
 * @param httpResponse the response to convert
 * @param task         the owning task (not used here)
 * @return the populated page
 * @throws IOException if reading the response body fails
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task)
        throws IOException {
    String body = IOUtils.toString(httpResponse.getEntity().getContent(), charset);
    Page result = new Page();
    String requestUrl = request.getUrl();
    String fixedBody = UrlUtils.fixAllRelativeHrefs(body, requestUrl);
    result.setHtml(new Html(fixedBody));
    result.setUrl(new PlainText(request.getUrl()));
    result.setRequest(request);

    // Record the status code first, then copy every response header onto the page.
    int statusCode = httpResponse.getStatusLine().getStatusCode();
    result.putHttpResponse(Constant.STATUS_CODE, String.valueOf(statusCode));
    for (Header responseHeader : httpResponse.getAllHeaders()) {
        result.putHttpResponse(responseHeader.getName(), responseHeader.getValue());
    }
    return result;
}
示例14: process
import us.codecraft.webmagic.Task; //导入依赖的package包/类
/**
 * Serialises the object to JSON and writes it to a file in the task's own directory.
 *
 * <p>The directory is {@code this.path} joined with the task UUID. The file name is the
 * object's {@code key()} when it implements {@code HasKey}, otherwise the MD5 hex of its
 * reflective string form; both get a {@code .json} suffix. An {@link IOException} is
 * logged as a warning and not rethrown.
 *
 * @param o    the object to persist
 * @param task the task whose UUID names the output directory
 */
@Override
public void process(Object o, Task task) {
    String path = this.path + "/" + task.getUUID() + "/";
    String filename;
    if (o instanceof HasKey) {
        filename = path + ((HasKey) o).key() + ".json";
    } else {
        filename = path + DigestUtils.md5Hex(ToStringBuilder.reflectionToString(o)) + ".json";
    }
    // try-with-resources closes the file even when JSON serialisation throws.
    try (PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(filename)))) {
        printWriter.write(JSON.toJSONString(o));
    } catch (IOException e) {
        logger.warn("write file error", e);
    }
}
示例15: push
import us.codecraft.webmagic.Task; //导入依赖的package包/类
/**
 * Enqueues the request URL for the task unless that URL has been seen before.
 *
 * <p>A set keyed by the task UUID records seen URLs and a list keyed by the task UUID
 * holds the queue. When the request carries extras, its JSON form is also stored in a
 * hash under the SHA hex of the URL. The Jedis connection is always returned to the pool.
 *
 * @param request the request to enqueue
 * @param task    the task whose UUID namespaces the keys
 */
@Override
public synchronized void push(Request request, Task task) {
    Jedis jedis = pool.getResource();
    try {
        String uuid = task.getUUID();
        String url = request.getUrl();
        String seenKey = SET_PREFIX + uuid;

        // Deduplicate URLs with a set.
        if (jedis.sismember(seenKey, url)) {
            return;
        }

        // Keep the queue itself in a list.
        jedis.rpush(QUEUE_PREFIX + uuid, url);
        jedis.sadd(seenKey, url);

        if (request.getExtras() == null) {
            return;
        }
        String field = DigestUtils.shaHex(url);
        String value = JSON.toJSONString(request);
        jedis.hset(ITEM_PREFIX + uuid, field, value);
    } finally {
        pool.returnResource(jedis);
    }
}