本文整理汇总了Java中us.codecraft.webmagic.ResultItems类的典型用法代码示例。如果您正苦于以下问题:Java ResultItems类的具体用法?Java ResultItems怎么用?Java ResultItems使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。
ResultItems类属于us.codecraft.webmagic包,在下文中一共展示了ResultItems类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: process
import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
/**
 * Writes every value held by {@code resultItems} to its own JSON file under {@code path}.
 *
 * <p>Each value is cast to a {@link Map}; its {@code itemId} entry becomes the base file
 * name. When the map has no {@code raw_title} entry the name is extended with
 * {@code _tmall_comment} (if {@code rateList} is present) or {@code _taobao_comment},
 * followed by {@code _} and the {@code currentPage} entry.
 *
 * <p>An {@link IOException} stops processing of the remaining values and is printed to
 * standard error.
 *
 * @param resultItems extracted results of one page
 * @param task        the task that produced the results (not used here)
 */
@Override
public void process(ResultItems resultItems, Task task) {
    try {
        for (Object value : resultItems.getAll().values()) {
            Map<?, ?> map = (Map<?, ?>) value;
            String name = map.get("itemId").toString();
            if (map.get("raw_title") == null) {
                if (map.get("rateList") != null) {
                    name += "_tmall_comment";
                } else {
                    name += "_taobao_comment";
                }
                name += "_" + map.get("currentPage");
            }
            PrintWriter printWriter = new PrintWriter(new FileWriter(this.getFile(path + name + ".json")));
            try {
                printWriter.write(JSON.toJSONString(map));
            } finally {
                // Release the file handle even when serialization throws.
                printWriter.close();
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
示例2: scratch
import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
/**
 * Runs a five-thread spider over the configured start URL and target URL pattern.
 *
 * <p>For each result, the {@code html} field is parsed with Jsoup and passed to the
 * {@code IPersist} looked up for the configured host and category. When no persist is
 * registered a warning is logged and the page is dropped.
 */
public void scratch() {
    SimplePageProcessor pageProcessor =
            new SimplePageProcessor(config.getStartUrl(), config.getTargetUrlPattern());

    Pipeline persistPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            String html = resultItems.get("html");
            Document doc = Jsoup.parse(html);
            IPersist persist = PersistManager.getInstance().getPersist(config.getHost(), config.getCategory());
            if (persist == null) {
                logger.warn("persistNotExists: host={}, category={}", config.getHost(), config.getCategory());
                return;
            }
            persist.persist(resultItems.getRequest().getUrl(), config, doc);
        }
    };

    us.codecraft.webmagic.Spider.create(pageProcessor)
            .addPipeline(persistPipeline)
            .thread(5)
            .run();
}
示例3: process
import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
/**
 * Stores every {@code IndustryInfo} found under the {@code industryInfos} field.
 *
 * <p>Each record is added on its own, so one failure does not stop the rest.
 *
 * @param resultItems extracted results of one page
 * @param task        the task that produced the results (not used here)
 */
@Override
public void process(ResultItems resultItems, Task task) {
    List<IndustryInfo> industryInfos = resultItems.get("industryInfos");
    if (industryInfos == null || industryInfos.isEmpty()) {
        return;
    }
    for (IndustryInfo info : industryInfos) {
        try {
            industryInfoDao.add(info);
        } catch (DataIntegrityViolationException ignored) {
            // Deliberately ignored; presumably the record already exists - TODO confirm.
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
示例4: convertResultItems2Webpage
import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
/**
 * Converts a WebMagic {@link ResultItems} into a {@link Webpage}.
 *
 * <p>Every property is copied from the result field of the matching key. The page id is
 * the MD5 hash of the page URL, encoded as UTF-8.
 *
 * @param resultItems extracted fields of one crawled page
 * @return a new {@code Webpage} populated from {@code resultItems}
 */
public static Webpage convertResultItems2Webpage(ResultItems resultItems) {
    Webpage page = new Webpage();

    page.setContent(resultItems.get("content"));
    page.setTitle(resultItems.get("title"));
    page.setUrl(resultItems.get("url"));

    // The id is derived from the URL that was just stored on the page.
    Charset utf8 = Charset.forName("utf-8");
    String id = Hashing.md5().hashString(page.getUrl(), utf8).toString();
    page.setId(id);

    page.setDomain(resultItems.get("domain"));
    page.setSpiderInfoId(resultItems.get("spiderInfoId"));
    page.setGathertime(resultItems.get("gatherTime"));
    page.setSpiderUUID(resultItems.get("spiderUUID"));
    page.setKeywords(resultItems.get("keywords"));
    page.setSummary(resultItems.get("summary"));
    page.setNamedEntity(resultItems.get("namedEntity"));
    page.setPublishTime(resultItems.get("publishTime"));
    page.setCategory(resultItems.get("category"));
    page.setRawHTML(resultItems.get("rawHTML"));
    page.setDynamicFields(resultItems.get(DYNAMIC_FIELD));
    page.setStaticFields(resultItems.get("staticField"));
    page.setAttachmentList(resultItems.get("attachmentList"));
    page.setImageList(resultItems.get("imageList"));
    page.setProcessTime(resultItems.get("processTime"));

    return page;
}
示例5: process
import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
/**
 * Indexes the page described by {@code resultItems} unless a search for its URL already
 * returns at least one hit.
 *
 * <p>The document id is the MD5 hash of the page URL. Indexing failures are logged and
 * not rethrown.
 *
 * @param resultItems extracted fields of one crawled page
 * @param task        the task that produced the results (not used here)
 */
@Override
public void process(ResultItems resultItems, Task task) {
    Webpage webpage = convertResultItems2Webpage(resultItems);
    SearchResponse response = client.prepareSearch(INDEX_NAME)
            .setTypes(TYPE_NAME)
            .setQuery(QueryBuilders.matchQuery("url", webpage.getUrl()))
            .execute()
            .actionGet();
    if (response.getHits().totalHits() != 0) {
        // A document for this URL is already indexed.
        return;
    }
    try {
        client.prepareIndex(INDEX_NAME, TYPE_NAME)
                .setId(Hashing.md5().hashString(webpage.getUrl(), Charset.forName("utf-8")).toString())
                .setSource(gson.toJson(webpage))
                .get();
    } catch (Exception e) {
        // Pass the exception itself so the stack trace is kept in the log.
        LOG.error("索引 Webpage 出错," + e.getLocalizedMessage(), e);
    }
}
示例6: convertToWebpage
import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
/**
 * Converts a WebMagic {@link ResultItems} into a {@link Webpage}.
 *
 * <p>The page id is the SHA-256 hash of the page URL, encoded as UTF-8.
 *
 * @param items extracted fields of one crawled page
 * @return the populated {@code Webpage}, or {@code null} when {@code items} holds no fields
 */
public static Webpage convertToWebpage(ResultItems items) {
    // Inspect the field map directly. Matching on toString() with indexOf(...) > 0
    // cannot detect a prefix, because a match at the start of the string yields 0.
    if (items.getAll().isEmpty()) {
        return null;
    }
    Webpage page = new Webpage().setContent(items.get("content")).setTitle(items.get("title"))
            .setUrl(items.get("url")).setDomain(items.get("domain")).setSpiderInfoId(items.get("spiderInfoId"))
            .setGathertime(items.get("gathertime")).setSpiderUUID(items.get("spiderUUID"))
            .setKeywords(items.get("keywords")).setSummary(items.get("summary"))
            .setNamedEntity(items.get("namedEntity")).setPublishTime(items.get("publishTime"))
            .setCategory(items.get("category")).setRawHTML(items.get("rawHTML"))
            .setDynamicFields(items.get(DYNAMIC_FIELD)).setStaticFields(items.get("staticField"))
            .setAttachmentList(items.get("attachmentList")).setImageList(items.get("imageList"))
            .setProcessTime(items.get("processTime"));
    return page.setId(Hashing.sha256().hashString(page.getUrl(), Charset.forName("utf-8")).toString());
}
示例7: process
import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
@Override
public void process(ResultItems resultItems, Task task) {
SpiderInfo info = resultItems.get("spiderInfo");
Webpage page = convertToWebpage(resultItems);
/*
* guava22.0不再对MD5()提供支持,
* 如果想更安全,使用sha256(),
* 如果想更快,使用goodFastHash()
* */
try {
client.prepareIndex(INDEX_NAME, TYPE_NAME)
.setId(Hashing.sha256().hashString(page.getUrl(), Charset.forName("utf-8")).toString())
.setSource(GSON.toJson(page), XContentType.JSON)
.get();
} catch(Exception e) {
LOG.error("索引Webpage出错, 由于 " + e.getLocalizedMessage());
}
}
示例8: parse
import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
/**
 * Runs the page processor over a downloaded body and records its output.
 *
 * <p>A {@code null} body schedules the seed for retry. Otherwise every target request
 * found by the processor is added as a new seed, and the extracted fields are added to
 * {@code crawlResult} as a JSON string unless the result is flagged as skipped.
 *
 * @param seed        the seed that was fetched
 * @param result      the downloaded body, or {@code null}
 * @param crawlResult collector for new seeds and extracted results
 */
@Override
protected void parse(Seed seed, String result, CrawlResult crawlResult) {
    if (result == null) {
        seed.retry();
        return;
    }

    JsoupXpathPage page = new JsoupXpathPage();
    page.setRawText(result);
    page.setUrl(new PlainText(seed.getData()));
    page.setRequest(CovertUtil.convertSeed(seed));
    page.setStatusCode(200);
    pageProcessor.process(page);

    // Newly discovered URLs become seeds.
    for (Request followUp : page.getTargetRequests()) {
        crawlResult.addSeed(CovertUtil.covertRequest(followUp));
    }

    ResultItems resultItems = page.getResultItems();
    if (resultItems.isSkip()) {
        return;
    }
    crawlResult.addResult(JSONObject.toJSONString(resultItems.getAll()));
}
示例9: process
import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
/**
 * Prints the request URL and every extracted field to {@code printWriter}, then flushes.
 *
 * <p>Fields whose value is an {@link Iterable} are printed as a key line followed by one
 * line per element; all other fields are printed as {@code key:<TAB>value}.
 *
 * @param resultItems extracted results of one page
 * @param task        the task that produced the results (not used here)
 */
@Override
public synchronized void process(ResultItems resultItems, Task task) {
    printWriter.println("url:\t" + resultItems.getRequest().getUrl());
    for (Map.Entry<String, Object> field : resultItems.getAll().entrySet()) {
        String key = field.getKey();
        Object value = field.getValue();
        if (!(value instanceof Iterable)) {
            printWriter.println(key + ":\t" + value);
            continue;
        }
        printWriter.println(key + ":");
        for (Object element : (Iterable<?>) value) {
            printWriter.println(element);
        }
    }
    printWriter.flush();
}
示例10: process
import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
/**
 * Builds an {@code Article} from the extracted fields and saves it.
 *
 * <p>Nothing is saved when {@code resultItems} is {@code null} or has no fields. The
 * publication date is read from the {@code create} field: the first
 * {@code yyyy-MM-dd HH:mm} substring is parsed if there is one, otherwise the whole value
 * is handed to the parser. A missing or unparseable date leaves the publication date
 * unset; the article is still saved.
 *
 * @param resultItems extracted results of one page, may be {@code null}
 * @param task        the task that produced the results (not used here)
 */
public void process(ResultItems resultItems, Task task) {
    // The null check has to come before the first dereference to be of any use.
    if (resultItems == null) {
        return;
    }
    Map<String, Object> items = resultItems.getAll();
    if (items == null || items.isEmpty()) {
        return;
    }

    Article article = new Article();
    article.setTitle((String) items.get("title"));
    article.setContent((String) items.get("content"));
    article.setSource((String) items.get("source"));
    article.setAuthor((String) items.get("author"));
    article.setUrl((String) items.get("url"));

    String dateStr = (String) items.get("create");
    if (dateStr != null) {
        // Pattern.matcher(null) throws, so only look for a date when the field is present.
        Matcher matcher = Pattern.compile("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}").matcher(dateStr);
        if (matcher.find()) {
            dateStr = matcher.group(0);
        }
        try {
            article.setPubdate(new SimpleDateFormat("yyyy-MM-dd HH:mm").parse(dateStr));
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }
    articleDao.save(article);
}
示例11: process
import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
/**
 * Writes the request URL and every extracted field to
 * {@code <path>/<task UUID>/<md5 of URL>.html}.
 *
 * <p>Fields whose value is an {@link Iterable} are written as a key line followed by one
 * line per element; all other fields are written as {@code key:<TAB>value}. A failure to
 * open the file is logged as a warning and not rethrown.
 *
 * @param resultItems extracted results of one page
 * @param task        the task that produced the results; its UUID names the sub-directory
 */
@Override
public void process(ResultItems resultItems, Task task) {
    String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
    try {
        // NOTE: FileWriter encodes with the platform default charset.
        PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")));
        try {
            printWriter.println("url:\t" + resultItems.getRequest().getUrl());
            for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
                if (entry.getValue() instanceof Iterable) {
                    Iterable<?> value = (Iterable<?>) entry.getValue();
                    printWriter.println(entry.getKey() + ":");
                    for (Object o : value) {
                        printWriter.println(o);
                    }
                } else {
                    printWriter.println(entry.getKey() + ":\t" + entry.getValue());
                }
            }
        } finally {
            // Release the file handle even when writing a field throws.
            printWriter.close();
        }
    } catch (IOException e) {
        logger.warn("write file error", e);
    }
}
示例12: process
import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
@Override
public void process(ResultItems resultItems, Task task) {
for (Map.Entry<Class, PageModelPipeline> classPageModelPipelineEntry : pageModelPipelines.entrySet()) {
Object o = resultItems.get(classPageModelPipelineEntry.getKey().getCanonicalName());
if (o != null) {
Annotation annotation = classPageModelPipelineEntry.getKey().getAnnotation(ExtractBy.class);
if (annotation == null || !((ExtractBy) annotation).multi()) {
classPageModelPipelineEntry.getValue().process(o, task);
} else {
List<Object> list = (List<Object>) o;
for (Object o1 : list) {
classPageModelPipelineEntry.getValue().process(o1, task);
}
}
}
}
}
示例13: process
import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
@Override
public void process(ResultItems resultItems, Task task) {
if (resultItems.isSkip()){
return;
}
Document doc = new Document();
Map<String,Object> all = resultItems.getAll();
if (all==null){
return;
}
for (Map.Entry<String, Object> objectEntry : all.entrySet()) {
doc.add(new Field(objectEntry.getKey(), objectEntry.getValue().toString(), TextField.TYPE_STORED));
}
try {
IndexWriter indexWriter = new IndexWriter(directory, config);
indexWriter.addDocument(doc);
indexWriter.close();
} catch (IOException e) {
e.printStackTrace();
}
}
示例14: process
import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
@Override
public void process(ResultItems resultItems, Task task) {
String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
try {
PrintWriter printWriter = new PrintWriter(new OutputStreamWriter(new FileOutputStream(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")),"UTF-8"));
printWriter.println("url:\t" + resultItems.getRequest().getUrl());
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
if (entry.getValue() instanceof Iterable) {
Iterable value = (Iterable) entry.getValue();
printWriter.println(entry.getKey() + ":");
for (Object o : value) {
printWriter.println(o);
}
} else {
printWriter.println(entry.getKey() + ":\t" + entry.getValue());
}
}
printWriter.close();
} catch (IOException e) {
logger.warn("write file error", e);
}
}
示例15: main
import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
/**
 * Demonstrates synchronous use of a spider: fetches one Baidu Baike search page and
 * prints its result, then fetches a list of pages and prints the fields of each result.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";
    Spider spider = Spider.create(new BaiduBaikePageProcessor()).thread(2);

    // Single download.
    ResultItems single = spider.<ResultItems>get(String.format(urlTemplate, "水力发电"));
    System.out.println(single);

    // Multiple downloads.
    String[] keywords = {"风力发电", "太阳能", "地热发电", "地热发电"};
    List<String> urls = new ArrayList<String>();
    for (String keyword : keywords) {
        urls.add(String.format(urlTemplate, keyword));
    }
    List<ResultItems> results = spider.<ResultItems>getAll(urls);
    for (ResultItems items : results) {
        System.out.println(items.getAll());
    }

    spider.close();
}