当前位置: 首页>>代码示例>>Java>>正文


Java Spider类代码示例

本文整理汇总了Java中us.codecraft.webmagic.Spider的典型用法代码示例。如果您正苦于以下问题:Java Spider类的具体用法?Java Spider怎么用?Java Spider使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。


Spider类属于us.codecraft.webmagic包,在下文中一共展示了Spider类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: main

import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Demonstrates synchronous use of a {@link Spider}: downloads a single page
 * with {@code get}, then a batch of pages with {@code getAll}, printing the
 * extracted results to standard output.
 *
 * <p>The spider is closed in a {@code finally} block so that it is released
 * even when a download or a print statement throws.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    Spider spider = Spider.create(new BaiduBaikePageProcessor()).thread(2);
    try {
        String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";

        // Single download
        ResultItems resultItems = spider.<ResultItems>get(String.format(urlTemplate, "水力发电"));
        System.out.println(resultItems);

        // Batch download
        List<String> list = new ArrayList<String>();
        list.add(String.format(urlTemplate, "风力发电"));
        list.add(String.format(urlTemplate, "太阳能"));
        list.add(String.format(urlTemplate, "地热发电"));
        // NOTE(review): the same keyword is queued twice, as in the original — confirm it is intentional
        list.add(String.format(urlTemplate, "地热发电"));
        List<ResultItems> resultItemses = spider.<ResultItems>getAll(list);
        for (ResultItems items : resultItemses) {
            System.out.println(items.getAll());
        }
    } finally {
        // Always release the spider, including on failure paths
        spider.close();
    }
}
 
开发者ID:code4craft,项目名称:webmagic,代码行数:20,代码来源:BaiduBaikePageProcessor.java

示例2: register

import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Attaches a fresh {@code MonitorSpiderListener} to each given spider and
 * registers a status MBean for it.
 *
 * <p>If a spider has no listener list yet, a new list holding the monitor
 * listener is installed; otherwise the listener is appended to the existing
 * list.
 *
 * @param spiders spiders to monitor
 * @return this monitor
 * @throws JMException declared for failures of the MBean registration
 */
public synchronized SpiderMonitor register(Spider... spiders) throws JMException {
    for (Spider target : spiders) {
        MonitorSpiderListener listener = new MonitorSpiderListener();
        List<SpiderListener> existing = target.getSpiderListeners();
        if (existing != null) {
            existing.add(listener);
        } else {
            List<SpiderListener> fresh = new ArrayList<SpiderListener>();
            fresh.add(listener);
            target.setSpiderListeners(fresh);
        }
        SpiderStatusMXBean statusBean = getSpiderStatusMBean(target, listener);
        registerMBean(statusBean);
        spiderStatuses.add(statusBean);
    }
    return this;
}
 
开发者ID:code4craft,项目名称:webmagic,代码行数:24,代码来源:SpiderMonitor.java

示例3: testInherit

import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Registers two spiders with a {@code SpiderMonitor} subclass whose
 * {@code getSpiderStatusMBean} returns a {@code CustomSpiderStatus}.
 * The test makes no assertions; it passes when registration does not throw.
 */
@Test
public void testInherit() throws Exception {
    // Anonymous subclass that swaps in the custom status bean
    SpiderMonitor monitor = new SpiderMonitor() {
        @Override
        protected SpiderStatusMXBean getSpiderStatusMBean(Spider spider, MonitorSpiderListener monitorSpiderListener) {
            return new CustomSpiderStatus(spider, monitorSpiderListener);
        }
    };

    Spider first = Spider.create(new ZhihuPageProcessor())
            .addUrl("http://my.oschina.net/flashsword/blog")
            .thread(2);
    Spider second = Spider.create(new GithubRepoPageProcessor())
            .addUrl("https://github.com/code4craft");

    monitor.register(first, second);
}
 
开发者ID:code4craft,项目名称:webmagic,代码行数:18,代码来源:SpiderMonitorTest.java

示例4: test

import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Runs a {@code ConfigurablePageProcessor} with two XPath extraction rules
 * ("title" and "star") against a mocked GitHub download and checks both
 * extracted values.
 */
@Test
public void test() throws Exception {
    // Rule 1: the page title element
    ExtractRule titleRule = new ExtractRule();
    titleRule.setExpressionType(ExpressionType.XPath);
    titleRule.setExpressionValue("//title");
    titleRule.setFieldName("title");

    // Rule 2: the text of the star counter link
    ExtractRule starRule = new ExtractRule();
    starRule.setExpressionType(ExpressionType.XPath);
    starRule.setExpressionValue("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()");
    starRule.setFieldName("star");

    List<ExtractRule> rules = new ArrayList<ExtractRule>();
    rules.add(titleRule);
    rules.add(starRule);

    Spider spider = Spider.create(new ConfigurablePageProcessor(Site.me(), rules))
            .setDownloader(new MockGithubDownloader());
    ResultItems extracted = spider.get("https://github.com/code4craft/webmagic");

    assertThat(extracted.getAll()).containsEntry("title", "<title>code4craft/webmagic · GitHub</title>");
    assertThat(extracted.getAll()).containsEntry("star", " 86 ");
}
 
开发者ID:code4craft,项目名称:webmagic,代码行数:20,代码来源:ConfigurablePageProcessorTest.java

示例5: startSpider

import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Builds a script-driven spider from the parsed command-line parameters and
 * runs it over the supplied URLs.
 *
 * <p>The URL list is validated first, so a missing-argument invocation prints
 * the usage text and exits before any script file is loaded or spider built.
 *
 * @param params parsed command-line parameters (language, script file,
 *               thread count, sleep time and start URLs)
 */
private static void startSpider(Params params) {
    // Fail fast: without start URLs there is nothing to crawl
    if (params.getUrls() == null || params.getUrls().isEmpty()) {
        System.err.println("Need at least one argument");
        System.out.println("Usage: java -jar webmagic.jar [-l language] -f script file [-t threadnum] [-s sleep time] url1 [url2 url3]");
        System.exit(-1);
    }

    ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom()
            .language(params.getLanguage())
            .scriptFromFile(params.getScriptFileName())
            .thread(params.getThread())
            .build();
    pageProcessor.getSite().setSleepTime(params.getSleepTime());
    pageProcessor.getSite().setRetryTimes(3);
    pageProcessor.getSite().setAcceptStatCode(WMCollections.<Integer>newHashSet(200, 404, 403, 500, 502));

    Spider spider = Spider.create(pageProcessor).thread(params.getThread());
    // Replace the configured pipelines with one that does nothing with the results
    spider.clearPipeline().addPipeline(new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            // intentionally empty
        }
    });

    for (String url : params.getUrls()) {
        spider.addUrl(url);
    }
    spider.run();
}
 
开发者ID:code4craft,项目名称:webmagic,代码行数:24,代码来源:ScriptConsole.java

示例6: main

import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Crawls starting from the code4craft GitHub profile using five threads.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    Spider spider = Spider.create(new GithubRepoPageProcessorSamples())
            // Start crawling from "https://github.com/code4craft"
            .addUrl("https://github.com/code4craft")
            // Crawl with 5 threads
            .thread(5);
    // Launch the crawler
    spider.run();
}
 
开发者ID:leon66666,项目名称:financehelper,代码行数:11,代码来源:GithubRepoPageProcessorSamples.java

示例7: main

import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Initialises logging, then runs a single-threaded {@code WarmUp} crawl
 * seeded with one Wikipedia page.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    InitLogger.init();
    Spider spider = Spider.create(new WarmUp())
            .addUrl("https://en.wikipedia.org/wiki/Wiki")
            .thread(1);
    spider.run();
}
 
开发者ID:xiongbeer,项目名称:Cobweb,代码行数:9,代码来源:WarmUp.java

示例8: main

import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Crawls two result pages of a Baidu image search for a fixed keyword,
 * collects the picture URLs and names gathered by each crawl, and hands
 * them to {@code downloadPicture}.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // Query parameters of the search endpoint:
    //   tn     : resultjsonavatarnew
    //   ie     : utf-8, character encoding (ie = input, oe = output)
    //   word   : search keyword
    //   pn     : 60, start offset
    //   rn     : 30, number of results returned
    //   z      : 0, size (0 all sizes, 9 extra large, 3 large, 2 medium, 1 small)
    //   width  : 1024, custom size - width
    //   height : 768, custom size - height
    //   ic     : 0, color (0 all, 1 red, 2 yellow, 4 green, 8 cyan, 16 blue, 32 purple,
    //            64 pink, 128 brown, 256 orange, 512 black, 1024 white, 2048 black-and-white)
    //   s      : 0; 3 = avatar pictures
    //   face   : 0; 1 = face close-up
    //   st     : -1; -1 all types, 1 cartoon, 2 line drawing
    //   lm     : -1 (6 animated pictures, 7 static pictures)
    //   gsm    : 3c, the pn value in hexadecimal

    String key = "海贼王";    // Baidu image search keyword
    DownloadPicture downloadPicture = new DownloadPicture();
    ArrayList<String> nameList = new ArrayList<>();
    ArrayList<String> urlList = new ArrayList<>();

    // Everything in the URL up to the page offset is the same for every page
    String urlHead = "http://image.baidu.com/search/avatarjson?tn=resultjsonavatarnew&ie=utf-8&word=" + key + "&pn=";

    // The loop bound controls how many pages are crawled; one page holds 30 pictures
    for (int page = 0; page < 2; page++) {
        String url = urlHead + page * 3 + "0&rn=30&z=3&ic=0&s=0&face=0&st=-1&lm=-1";
        Spider pageSpider = Spider.create(new DownloadPicture()).addUrl(url);
        pageSpider.run();
        urlList.addAll(urls);
        nameList.addAll(names);
    }

    downloadPicture.downloadPicture(urlList, nameList, key);
}
 
开发者ID:bruceq,项目名称:Gather-Platform,代码行数:31,代码来源:DownloadPicture.java

示例9: deleteAll

import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Deletes all spiders whose status is {@code Spider.Status.Stopped}.
 *
 * <p>For each such spider the task is deleted by id and the spider is removed
 * from {@code spiderMap}. A failure for one id is logged, with its stack
 * trace, and does not stop the remaining ids from being processed. Finally,
 * tasks in state {@code State.STOP} are deleted from the task manager.
 */
public void deleteAll() {
    // Collect the ids first so spiderMap is not modified while it is being streamed
    List<String> spiderUUID2BeRemoved = spiderMap.entrySet().stream().filter(
            spiderEntry -> spiderEntry.getValue().getStatus() == Spider.Status.Stopped
    ).map(Map.Entry::getKey).collect(Collectors.toList());
    for (String uuid : spiderUUID2BeRemoved) {
        try {
            deleteTaskById(uuid);
            spiderMap.remove(uuid);
        } catch (Exception e) {
            // Trailing throwable argument makes the logger record the stack trace too
            LOG.error("删除任务ID:{}出错,{}", uuid, e.getLocalizedMessage(), e);
        }
    }
    taskManager.deleteTasksByState(State.STOP);
}
 
开发者ID:bruceq,项目名称:Gather-Platform,代码行数:18,代码来源:CommonSpider.java

示例10: main

import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Starts the iQiyi crawler with 16 threads per available processor, writing
 * results through a {@code FilePipeline} and registering the spider with the
 * {@code SpiderMonitor} before starting it.
 *
 * @param args command-line arguments (unused)
 * @throws JMException if registering the spider with the monitor fails
 */
public static void main(String[] args) throws JMException {
	int processors = Runtime.getRuntime().availableProcessors();
	Spider crawler = Spider.create(new Crawler())
			.addUrl("http://list.iqiyi.com/www/1/----------------iqiyi--.html")
			.addPipeline(new FilePipeline("/root/iqiyi"))
			.thread(processors * 16);
	SpiderMonitor.instance().register(crawler);
	crawler.start();
}
 
开发者ID:viixv,项目名称:iqiyi-crawler,代码行数:8,代码来源:Main.java

示例11: test

import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Runs a ten-thread crawl of cnblogs.com with a file-cache scheduler and a
 * MySQL pipeline. The test makes no assertions.
 */
@Test
public void test() throws Exception {
    Spider spider = Spider.create(new MySqlPageProcessor(htmlService))
            .scheduler(new FileCacheQueueScheduler("F:\\webmagic\\cache\\"))
            .addUrl("http://www.cnblogs.com/")
            .addPipeline(new MySqlPipeline(htmlService))
            .thread(10);
    spider.run();
}
 
开发者ID:Lzw2016,项目名称:study,代码行数:10,代码来源:HtmlTest.java

示例12: main

import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Runs a five-thread crawl with {@code GithubRepoPageProcessor}, seeded with
 * a jd.com URL, and writes the results through a {@code JsonFilePipeline}.
 */
@Test
public void main() {
    Spider spider = Spider.create(new GithubRepoPageProcessor())
            // Seed URL for the crawl
            .addUrl("http://re.jd.com/index/standard")
            // Store results under the webmagic directory on drive F
            .addPipeline(new JsonFilePipeline("F:\\webmagic\\"))
            // Crawl with 5 threads
            .thread(5);
    // Launch the crawler
    spider.run();
}
 
开发者ID:Lzw2016,项目名称:study,代码行数:14,代码来源:GithubRepoPageProcessor.java

示例13: index

import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Handles {@code GET main/index}: runs a crawl seeded with the first
 * house-listing URL and then replies with a fixed confirmation string.
 *
 * @return the literal {@code "done!"}
 */
@RequestMapping(value = "main/index", method = RequestMethod.GET)
@ResponseBody
public String index() {
    // Fill the seed URL template with the argument 1
    String seedUrl = String.format(Constants.House_SEED, 1);

    // NOTE(review): run() is invoked inside the request handler; presumably it blocks until the crawl ends — confirm
    Spider spider = Spider.create(processor)
            .addUrl(seedUrl)
            .addPipeline(pipeline);
    spider.run();

    return "done!";
}
 
开发者ID:mikeqian,项目名称:house,代码行数:10,代码来源:MainController.java

示例14: main

import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Runs a five-thread crawl of mama.cn photo pages with a file-cache scheduler,
 * writing results through a {@code OneFilePipeline}.
 *
 * @param args command-line arguments (unused)
 * @throws FileNotFoundException declared by the original signature
 * @throws UnsupportedEncodingException declared by the original signature
 */
public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
    Spider spider = Spider.create(new MamacnPageProcessor())
            .setScheduler(new FileCacheQueueScheduler("/data/webmagic/mamacn"))
            .addUrl("http://www.mama.cn/photo/t1-p1.html")
            .addPipeline(new OneFilePipeline("/data/webmagic/mamacn/data"))
            .thread(5);
    spider.run();
}
 
开发者ID:mikeqian,项目名称:house,代码行数:9,代码来源:MamacnPageProcessor.java

示例15: main

import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Runs a five-thread crawl of a Zhihu question search for "java", writing
 * results through a {@code FilePipeline}.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    Spider spider = Spider.create(new ZhihuPageProcessor())
            .addUrl("http://www.zhihu.com/search?type=question&q=java")
            .addPipeline(new FilePipeline("D:\\webmagic\\"))
            .thread(5);
    spider.run();
}
 
开发者ID:mikeqian,项目名称:house,代码行数:8,代码来源:ZhihuPageProcessor.java


注:本文中的us.codecraft.webmagic.Spider类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。