当前位置: 首页>>代码示例>>Java>>正文


Java ResultItems类代码示例

本文整理汇总了Java中us.codecraft.webmagic.ResultItems的典型用法代码示例。如果您正苦于以下问题:Java ResultItems类的具体用法?Java ResultItems怎么用?Java ResultItems使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。


ResultItems类属于us.codecraft.webmagic包,在下文中一共展示了ResultItems类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: process

import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
/**
 * Writes every value held by {@code resultItems} to its own JSON file under {@code path}.
 *
 * <p>Each value is cast to a {@link Map}. The file name starts with the map's
 * {@code "itemId"} entry. When the map has no {@code "raw_title"} entry the name is
 * extended with {@code "_tmall_comment"} (if a {@code "rateList"} entry is present) or
 * {@code "_taobao_comment"}, followed by {@code "_"} and the {@code "currentPage"} entry.
 *
 * <p>An {@link IOException} stops processing of the remaining values and is printed to
 * standard error.
 *
 * @param resultItems extracted results whose values are written out
 * @param task        not used by this method
 */
@Override
public void process(ResultItems resultItems, Task task) {
    try {
        for (Object value : resultItems.getAll().values()) {
            Map<?, ?> map = (Map<?, ?>) value;
            String name = map.get("itemId").toString();
            if (map.get("raw_title") == null) {
                if (map.get("rateList") != null) {
                    name += "_tmall_comment";
                } else {
                    name += "_taobao_comment";
                }
                name += "_" + map.get("currentPage");
            }
            // try-with-resources releases the file handle even when serialisation throws.
            try (PrintWriter printWriter = new PrintWriter(new FileWriter(this.getFile(path + name + ".json")))) {
                printWriter.write(JSON.toJSONString(map));
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
 
开发者ID:CieloSun,项目名称:FashionSpider,代码行数:22,代码来源:MyTBJsonPipeline.java

示例2: scratch

import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
/**
 * Creates a WebMagic spider from the configured start URL and target URL pattern,
 * attaches a pipeline and runs it with 5 threads.
 *
 * <p>The pipeline parses the {@code "html"} result field with Jsoup and passes the
 * document to the persist registered for the configured host and category; when no
 * persist is registered a warning is logged instead.
 */
public void scratch(){
    SimplePageProcessor pageProcessor =
            new SimplePageProcessor(config.getStartUrl(), config.getTargetUrlPattern());
    us.codecraft.webmagic.Spider crawler = us.codecraft.webmagic.Spider.create(pageProcessor);
    crawler.addPipeline(new Pipeline() {
                @Override
                public void process(ResultItems resultItems, Task task) {
                    String html = resultItems.get("html");
                    Document doc = Jsoup.parse(html);
                    IPersist persist = PersistManager.getInstance().getPersist(config.getHost(), config.getCategory());
                    if (persist != null) {
                        persist.persist(resultItems.getRequest().getUrl(), config, doc);
                        return;
                    }
                    logger.warn("persistNotExists: host={}, category={}", config.getHost(), config.getCategory());
                }
            })
            .thread(5)
            .run();
}
 
开发者ID:wangdamu,项目名称:SpiderApplication,代码行数:17,代码来源:Spider.java

示例3: process

import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
/**
 * Stores every {@code IndustryInfo} found under the {@code "industryInfos"} result field.
 *
 * <p>Each element is added through {@code industryInfoDao} independently: a failure on
 * one element does not stop the others.
 *
 * @param resultItems extracted results; nothing happens when the field is absent or empty
 * @param task        not used by this method
 */
@Override
public void process(ResultItems resultItems, Task task) {
    List<IndustryInfo> infos = resultItems.get("industryInfos");
    if (infos == null || infos.isEmpty()) {
        return;
    }
    for (IndustryInfo info : infos) {
        try {
            industryInfoDao.add(info);
        } catch (DataIntegrityViolationException ignored) {
            // Intentionally skipped without reporting (presumably an already-stored row).
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
 
开发者ID:leon66666,项目名称:financehelper,代码行数:18,代码来源:IndustryInfoPipeline.java

示例4: convertResultItems2Webpage

import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
/**
 * Builds a {@code Webpage} from the fields of a WebMagic {@code ResultItems}.
 *
 * <p>Every property is read from the result field with the matching key. The id is
 * the MD5 hash (UTF-8) of the URL that was just stored on the page.
 *
 * @param resultItems extracted fields to copy from
 * @return the populated webpage
 */
public static Webpage convertResultItems2Webpage(ResultItems resultItems) {
    final Webpage page = new Webpage();

    // Core content and identity; the id is derived from the URL, so the URL is set first.
    page.setContent(resultItems.get("content"));
    page.setTitle(resultItems.get("title"));
    page.setUrl(resultItems.get("url"));
    page.setId(Hashing.md5().hashString(page.getUrl(), Charset.forName("utf-8")).toString());

    // Crawl bookkeeping.
    page.setDomain(resultItems.get("domain"));
    page.setSpiderInfoId(resultItems.get("spiderInfoId"));
    page.setGathertime(resultItems.get("gatherTime"));
    page.setSpiderUUID(resultItems.get("spiderUUID"));

    // Derived text attributes.
    page.setKeywords(resultItems.get("keywords"));
    page.setSummary(resultItems.get("summary"));
    page.setNamedEntity(resultItems.get("namedEntity"));
    page.setPublishTime(resultItems.get("publishTime"));
    page.setCategory(resultItems.get("category"));

    // Raw payload, custom fields and attachments.
    page.setRawHTML(resultItems.get("rawHTML"));
    page.setDynamicFields(resultItems.get(DYNAMIC_FIELD));
    page.setStaticFields(resultItems.get("staticField"));
    page.setAttachmentList(resultItems.get("attachmentList"));
    page.setImageList(resultItems.get("imageList"));
    page.setProcessTime(resultItems.get("processTime"));

    return page;
}
 
开发者ID:bruceq,项目名称:Gather-Platform,代码行数:30,代码来源:CommonWebpagePipeline.java

示例5: process

import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
/**
 * Indexes the page described by {@code resultItems} unless a search for its URL
 * already returns hits.
 *
 * <p>The document id is the MD5 hash (UTF-8) of the page URL and the source is the
 * page serialised to JSON. Indexing failures are logged and not propagated.
 *
 * @param resultItems extracted fields, converted with {@code convertResultItems2Webpage}
 * @param task        not used by this method
 */
@Override
public void process(ResultItems resultItems, Task task) {
    Webpage webpage = convertResultItems2Webpage(resultItems);
    SearchResponse response = client.prepareSearch(INDEX_NAME)
            .setTypes(TYPE_NAME)
            .setQuery(QueryBuilders.matchQuery("url", webpage.getUrl()))
            .execute()
            .actionGet();
    if (response.getHits().totalHits() != 0) {
        // The URL query matched at least one stored document; do not index again.
        return;
    }
    try {
        client.prepareIndex(INDEX_NAME, TYPE_NAME)
                .setId(Hashing.md5().hashString(webpage.getUrl(), Charset.forName("utf-8")).toString())
                .setSource(gson.toJson(webpage))
                .get();
    } catch (Exception e) {
        // Pass the exception itself so the stack trace is not lost.
        LOG.error("索引 Webpage 出错," + e.getLocalizedMessage(), e);
    }
}
 
开发者ID:bruceq,项目名称:Gather-Platform,代码行数:20,代码来源:CommonWebpagePipeline.java

示例6: convertToWebpage

import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
/**
 * Converts a WebMagic {@code ResultItems} into a {@code Webpage}.
 *
 * <p>Every property is read from the result field with the matching key; the id is
 * the SHA-256 hash (UTF-8) of the page URL.
 *
 * @param items extracted fields to copy from
 * @return the populated webpage, or {@code null} when {@code items} holds no fields
 */
public static Webpage convertToWebpage(ResultItems items) {
	// Ask the field map directly. The earlier test,
	// items.toString().indexOf("ResultItems{fields={}") > 0, cannot succeed when the
	// marker sits at index 0, which is where a toString() that begins with the class
	// name puts it, so empty results were not filtered out.
	if (items.getAll().isEmpty()) {
		return null;
	}
	Webpage page = new Webpage().setContent(items.get("content")).setTitle(items.get("title"))
			.setUrl(items.get("url")).setDomain(items.get("domain")).setSpiderInfoId(items.get("spiderInfoId"))
			.setGathertime(items.get("gathertime")).setSpiderUUID(items.get("spiderUUID"))
			.setKeywords(items.get("keywords")).setSummary(items.get("summary"))
			.setNamedEntity(items.get("namedEntity")).setPublishTime(items.get("publishTime"))
			.setCategory(items.get("category")).setRawHTML(items.get("rawHTML"))
			.setDynamicFields(items.get(DYNAMIC_FIELD)).setStaticFields(items.get("staticField"))
			.setAttachmentList(items.get("attachmentList")).setImageList(items.get("imageList"))
			.setProcessTime(items.get("processTime"));
	return page.setId(Hashing.sha256().hashString(page.getUrl(), Charset.forName("utf-8")).toString());
}
 
开发者ID:TransientBuckwheat,项目名称:nest-spider,代码行数:21,代码来源:WebpagePipeline.java

示例7: process

import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
/**
 * Indexes the page described by {@code resultItems}.
 *
 * <p>The document id is the SHA-256 hash (UTF-8) of the page URL and the source is
 * the page serialised to JSON. Indexing failures are logged and not propagated.
 *
 * @param resultItems extracted fields, converted with {@code convertToWebpage}
 * @param task        not used by this method
 */
@Override
public void process(ResultItems resultItems, Task task) {
	Webpage page = convertToWebpage(resultItems);
	if (page == null) {
		// convertToWebpage may return null; there is nothing to index in that case.
		return;
	}
	/*
	 * Guava 22.0 no longer supports md5().
	 * For better security use sha256();
	 * for more speed use goodFastHash().
	 */
	try {
		client.prepareIndex(INDEX_NAME, TYPE_NAME)
		   .setId(Hashing.sha256().hashString(page.getUrl(), Charset.forName("utf-8")).toString())
		   .setSource(GSON.toJson(page), XContentType.JSON)
		   .get();
	} catch(Exception e) {
		// Pass the exception itself so the stack trace is not lost.
		LOG.error("索引Webpage出错, 由于 " + e.getLocalizedMessage(), e);
	}
}
 
开发者ID:TransientBuckwheat,项目名称:nest-spider,代码行数:19,代码来源:WebpagePipeline.java

示例8: parse

import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
/**
 * Runs the wrapped WebMagic page processor on a downloaded response and copies its
 * output into {@code crawlResult}.
 *
 * <p>A {@code null} response makes the seed retry. Otherwise every target request
 * the processor produced is added as a new seed and, unless the result items are
 * marked as skipped, their fields are added as one JSON string.
 *
 * @param seed        seed that was fetched; its data is used as the page URL
 * @param result      response body, or {@code null} when nothing was downloaded
 * @param crawlResult receiver of new seeds and of the serialised result
 */
@Override
protected void parse(Seed seed, String result, CrawlResult crawlResult) {
    if (result == null) {
        seed.retry();
        return;
    }

    JsoupXpathPage page = new JsoupXpathPage();
    page.setRawText(result);
    page.setUrl(new PlainText(seed.getData()));
    page.setRequest(CovertUtil.convertSeed(seed));
    page.setStatusCode(200);
    pageProcessor.process(page);

    // Newly discovered URLs become seeds.
    for (Request followUp : page.getTargetRequests()) {
        crawlResult.addSeed(CovertUtil.covertRequest(followUp));
    }

    ResultItems resultItems = page.getResultItems();
    if (resultItems.isSkip()) {
        return;
    }
    crawlResult.addResult(JSONObject.toJSONString(resultItems.getAll()));
}
 
开发者ID:virjar,项目名称:vscrawler,代码行数:25,代码来源:WebMagicProcessorDelegator.java

示例9: process

import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
/**
 * Appends the request URL and every result field to the shared {@code printWriter},
 * then flushes it.
 *
 * <p>A field whose value is an {@link Iterable} is written as the key on one line
 * followed by one line per element; any other field is written as
 * {@code key:<TAB>value} on a single line.
 *
 * @param resultItems extracted results to write
 * @param task        not used by this method
 */
@Override
public synchronized void process(ResultItems resultItems, Task task) {
    printWriter.println("url:\t" + resultItems.getRequest().getUrl());
    for (Map.Entry<String, Object> field : resultItems.getAll().entrySet()) {
        String key = field.getKey();
        Object value = field.getValue();
        if (!(value instanceof Iterable)) {
            printWriter.println(key + ":\t" + value);
            continue;
        }
        printWriter.println(key + ":");
        for (Object element : (Iterable<?>) value) {
            printWriter.println(element);
        }
    }
    printWriter.flush();
}
 
开发者ID:mikeqian,项目名称:house,代码行数:17,代码来源:OneFilePipeline.java

示例10: process

import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
/**
 * Builds an {@code Article} from the result fields and saves it through
 * {@code articleDao}.
 *
 * <p>The publication date is taken from the first {@code yyyy-MM-dd HH:mm} match in
 * the {@code "create"} field. When that field is missing or cannot be parsed, the
 * article is saved without a publication date.
 *
 * @param resultItems extracted results; nothing happens when {@code null} or empty
 * @param task        not used by this method
 */
public void process(ResultItems resultItems, Task task) {
    // The null check must come before the first dereference to have any effect.
    if (resultItems == null) {
        return;
    }
    Map<String,Object> items = resultItems.getAll();
    if (items.isEmpty()) {
        return;
    }

    Article article = new Article();
    article.setTitle((String) items.get("title"));
    article.setContent((String) items.get("content"));
    article.setSource((String) items.get("source"));
    article.setAuthor((String) items.get("author"));
    article.setUrl((String) items.get("url"));

    String dataStr = (String) items.get("create");
    if (dataStr != null) {
        // Pattern.matcher(null) would throw, so only parse when the field is present.
        Pattern pattern = Pattern.compile("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}");
        Matcher matcher = pattern.matcher(dataStr);
        if (matcher.find()) {
            dataStr = matcher.group(0);
        }
        try {
            article.setPubdate(new SimpleDateFormat("yyyy-MM-dd HH:mm").parse(dataStr));
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }
    articleDao.save(article);
}
 
开发者ID:ameizi,项目名称:elasticsearch-jest-example,代码行数:24,代码来源:JdbcPipeline.java

示例11: process

import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
/**
 * Writes the request URL and every result field to a file named after the MD5 hex
 * digest of the request URL, inside a directory named after the task UUID.
 *
 * <p>A field whose value is an {@link Iterable} is written as the key on one line
 * followed by one line per element; any other field is written as
 * {@code key:<TAB>value}. An {@link IOException} is logged as a warning.
 *
 * @param resultItems extracted results to write
 * @param task        task whose UUID selects the output directory
 */
@Override
public void process(ResultItems resultItems, Task task) {
    String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
    try {
        // NOTE(review): FileWriter encodes with the platform default charset; confirm
        // whether an explicit UTF-8 writer is wanted here.
        PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")));
        try {
            printWriter.println("url:\t" + resultItems.getRequest().getUrl());
            for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
                if (entry.getValue() instanceof Iterable) {
                    Iterable<?> value = (Iterable<?>) entry.getValue();
                    printWriter.println(entry.getKey() + ":");
                    for (Object o : value) {
                        printWriter.println(o);
                    }
                } else {
                    printWriter.println(entry.getKey() + ":\t" + entry.getValue());
                }
            }
        } finally {
            // Release the file handle even when writing fails with an unchecked exception.
            printWriter.close();
        }
    } catch (IOException e) {
        logger.warn("write file error", e);
    }
}
 
开发者ID:yuany,项目名称:en-webmagic,代码行数:23,代码来源:FilePipeline.java

示例12: process

import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
/**
 * Dispatches extracted page models to the pipeline registered for their class.
 *
 * <p>For every registered class the result field keyed by the class's canonical
 * name is looked up. When the class carries {@code ExtractBy} with {@code multi()}
 * set, the value is treated as a list and each element is processed separately;
 * otherwise the value is processed as a single object.
 *
 * @param resultItems extracted results keyed by model class name
 * @param task        task passed through to each model pipeline
 */
@Override
public void process(ResultItems resultItems, Task task) {
    for (Map.Entry<Class, PageModelPipeline> registration : pageModelPipelines.entrySet()) {
        Class modelClass = registration.getKey();
        PageModelPipeline modelPipeline = registration.getValue();

        Object extracted = resultItems.get(modelClass.getCanonicalName());
        if (extracted == null) {
            continue;
        }

        Annotation annotation = modelClass.getAnnotation(ExtractBy.class);
        boolean multi = annotation != null && ((ExtractBy) annotation).multi();
        if (multi) {
            for (Object element : (List<Object>) extracted) {
                modelPipeline.process(element, task);
            }
        } else {
            modelPipeline.process(extracted, task);
        }
    }
}
 
开发者ID:yuany,项目名称:en-webmagic,代码行数:18,代码来源:ModelPipeline.java

示例13: process

import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
/**
 * Adds one Lucene document holding every non-null result field as a stored text
 * field.
 *
 * <p>Nothing is written when the result items are marked as skipped or expose no
 * field map. An {@link IOException} is printed to standard error.
 *
 * @param resultItems extracted results to index
 * @param task        not used by this method
 */
@Override
public void process(ResultItems resultItems, Task task) {
    if (resultItems.isSkip()){
        return;
    }
    Document doc = new Document();
    Map<String,Object> all = resultItems.getAll();
    if (all==null){
        return;
    }
    for (Map.Entry<String, Object> objectEntry : all.entrySet()) {
        Object value = objectEntry.getValue();
        if (value == null) {
            // A null value has no text to index; calling toString() on it would throw.
            continue;
        }
        doc.add(new Field(objectEntry.getKey(), value.toString(), TextField.TYPE_STORED));
    }
    try {
        // NOTE(review): the same config instance is passed to every new IndexWriter;
        // confirm the Lucene version in use allows that.
        IndexWriter indexWriter = new IndexWriter(directory, config);
        try {
            indexWriter.addDocument(doc);
        } finally {
            // Always close so a failed add does not leave the writer open.
            indexWriter.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
 
开发者ID:yuany,项目名称:en-webmagic,代码行数:22,代码来源:LucenePipeline.java

示例14: process

import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
/**
 * Writes the request URL and every result field, encoded as UTF-8, to a file named
 * after the MD5 hex digest of the request URL, inside a directory named after the
 * task UUID.
 *
 * <p>A field whose value is an {@link Iterable} is written as the key on one line
 * followed by one line per element; any other field is written as
 * {@code key:<TAB>value}. An {@link IOException} is logged as a warning.
 *
 * @param resultItems extracted results to write
 * @param task        task whose UUID selects the output directory
 */
@Override
public void process(ResultItems resultItems, Task task) {
    String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
    try {
        PrintWriter printWriter = new PrintWriter(new OutputStreamWriter(new FileOutputStream(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")),"UTF-8"));
        try {
            printWriter.println("url:\t" + resultItems.getRequest().getUrl());
            for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
                if (entry.getValue() instanceof Iterable) {
                    Iterable<?> value = (Iterable<?>) entry.getValue();
                    printWriter.println(entry.getKey() + ":");
                    for (Object o : value) {
                        printWriter.println(o);
                    }
                } else {
                    printWriter.println(entry.getKey() + ":\t" + entry.getValue());
                }
            }
        } finally {
            // Release the file handle even when writing fails with an unchecked exception.
            printWriter.close();
        }
    } catch (IOException e) {
        logger.warn("write file error", e);
    }
}
 
开发者ID:code4craft,项目名称:webmagic,代码行数:23,代码来源:FilePipeline.java

示例15: main

import us.codecraft.webmagic.ResultItems; //导入依赖的package包/类
/**
 * Demonstrates synchronous use of a spider: one single-URL download followed by a
 * batch download, printing the results of each, then closing the spider.
 *
 * @param args not used
 */
public static void main(String[] args) {
    Spider spider = Spider.create(new BaiduBaikePageProcessor()).thread(2);
    String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";

    // Single download.
    ResultItems single = spider.<ResultItems>get(String.format(urlTemplate, "水力发电"));
    System.out.println(single);

    // Batch download; the last keyword appears twice, exactly as in the URL list.
    String[] keywords = {"风力发电", "太阳能", "地热发电", "地热发电"};
    List<String> urls = new ArrayList<String>();
    for (String keyword : keywords) {
        urls.add(String.format(urlTemplate, keyword));
    }
    List<ResultItems> batch = spider.<ResultItems>getAll(urls);
    for (ResultItems items : batch) {
        System.out.println(items.getAll());
    }

    spider.close();
}
 
开发者ID:code4craft,项目名称:webmagic,代码行数:20,代码来源:BaiduBaikePageProcessor.java


注:本文中的us.codecraft.webmagic.ResultItems类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。