本文整理汇总了Java中us.codecraft.webmagic.Spider类的典型用法代码示例。如果您正苦于以下问题:Java Spider类的具体用法?Java Spider怎么用?Java Spider使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。
Spider类属于us.codecraft.webmagic包,在下文中一共展示了Spider类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: main
import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Demonstrates synchronous use of a {@link Spider}: one page is fetched with
 * {@code get}, then a list of pages is fetched with {@code getAll}, and the
 * results are printed to standard output.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    Spider spider = Spider.create(new BaiduBaikePageProcessor()).thread(2);
    String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";
    try {
        // single download
        ResultItems resultItems = spider.<ResultItems>get(String.format(urlTemplate, "水力发电"));
        System.out.println(resultItems);

        // multi download
        List<String> list = new ArrayList<String>();
        list.add(String.format(urlTemplate, "风力发电"));
        list.add(String.format(urlTemplate, "太阳能"));
        list.add(String.format(urlTemplate, "地热发电"));
        // NOTE(review): the same keyword is queued twice; confirm whether the duplicate is intentional.
        list.add(String.format(urlTemplate, "地热发电"));
        List<ResultItems> allResults = spider.<ResultItems>getAll(list);
        for (ResultItems items : allResults) {
            System.out.println(items.getAll());
        }
    } finally {
        // Close the spider even when one of the downloads above throws.
        spider.close();
    }
}
示例2: register
import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Attaches a fresh {@code MonitorSpiderListener} to each given spider and
 * registers a status MBean for it.
 *
 * @param spiders the spiders to monitor
 * @return this monitor, to allow call chaining
 * @throws JMException propagated from the status-bean helper calls
 */
public synchronized SpiderMonitor register(Spider... spiders) throws JMException {
    for (Spider target : spiders) {
        MonitorSpiderListener listener = new MonitorSpiderListener();

        List<SpiderListener> existing = target.getSpiderListeners();
        if (existing != null) {
            existing.add(listener);
        } else {
            // The spider has no listener list yet: create one that holds only our listener.
            List<SpiderListener> created = new ArrayList<SpiderListener>();
            created.add(listener);
            target.setSpiderListeners(created);
        }

        SpiderStatusMXBean statusBean = getSpiderStatusMBean(target, listener);
        registerMBean(statusBean);
        spiderStatuses.add(statusBean);
    }
    return this;
}
示例3: testInherit
import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Registers two spiders with a {@code SpiderMonitor} subclass that overrides
 * {@code getSpiderStatusMBean} to return a {@code CustomSpiderStatus}.
 */
@Test
public void testInherit() throws Exception {
    SpiderMonitor customMonitor = new SpiderMonitor() {
        @Override
        protected SpiderStatusMXBean getSpiderStatusMBean(Spider spider, MonitorSpiderListener monitorSpiderListener) {
            // Swap in the custom status bean implementation.
            return new CustomSpiderStatus(spider, monitorSpiderListener);
        }
    };

    String blogUrl = "http://my.oschina.net/flashsword/blog";
    Spider blogSpider = Spider.create(new ZhihuPageProcessor())
            .addUrl(blogUrl)
            .thread(2);

    String profileUrl = "https://github.com/code4craft";
    Spider profileSpider = Spider.create(new GithubRepoPageProcessor())
            .addUrl(profileUrl);

    customMonitor.register(blogSpider, profileSpider);
}
示例4: test
import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Builds two XPath extract rules ("title" and "star"), runs a
 * {@code ConfigurablePageProcessor} against a mock downloader, and checks
 * the extracted fields.
 */
@Test
public void test() throws Exception {
    // Rule 1: the page's <title> element.
    ExtractRule titleRule = new ExtractRule();
    titleRule.setExpressionType(ExpressionType.XPath);
    titleRule.setExpressionValue("//title");
    titleRule.setFieldName("title");

    // Rule 2: the text of the social-count link in the first pagehead action.
    ExtractRule starRule = new ExtractRule();
    starRule.setExpressionType(ExpressionType.XPath);
    starRule.setExpressionValue("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()");
    starRule.setFieldName("star");

    List<ExtractRule> rules = new ArrayList<ExtractRule>();
    rules.add(titleRule);
    rules.add(starRule);

    ConfigurablePageProcessor processor = new ConfigurablePageProcessor(Site.me(), rules);
    ResultItems extracted = Spider.create(processor)
            .setDownloader(new MockGithubDownloader())
            .get("https://github.com/code4craft/webmagic");

    assertThat(extracted.getAll()).containsEntry("title", "<title>code4craft/webmagic · GitHub</title>");
    assertThat(extracted.getAll()).containsEntry("star", " 86 ");
}
示例5: startSpider
import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Builds a script-driven page processor from {@code params}, wires it into a
 * spider with a single no-op pipeline, and runs the spider over the given URLs.
 * Prints a usage message and exits with status -1 when no URL is supplied.
 *
 * @param params parsed command-line options
 */
private static void startSpider(Params params) {
    ScriptProcessor scriptProcessor = ScriptProcessorBuilder.custom()
            .language(params.getLanguage())
            .scriptFromFile(params.getScriptFileName())
            .thread(params.getThread())
            .build();
    scriptProcessor.getSite().setSleepTime(params.getSleepTime());
    scriptProcessor.getSite().setRetryTimes(3);
    scriptProcessor.getSite().setAcceptStatCode(WMCollections.<Integer>newHashSet(200, 404, 403, 500, 502));

    Spider crawler = Spider.create(scriptProcessor).thread(params.getThread());
    // Replace whatever pipelines are configured with one that discards results.
    crawler.clearPipeline().addPipeline(new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            // intentionally empty
        }
    });

    boolean noUrls = params.getUrls() == null || params.getUrls().isEmpty();
    if (noUrls) {
        System.err.println("Need at least one argument");
        System.out.println("Usage: java -jar webmagic.jar [-l language] -f script file [-t threadnum] [-s sleep time] url1 [url2 url3]");
        System.exit(-1);
    }

    for (String seed : params.getUrls()) {
        crawler.addUrl(seed);
    }
    crawler.run();
}
示例6: main
import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Runs a spider with {@code GithubRepoPageProcessorSamples} as its page processor.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // Start crawling from "https://github.com/code4craft", using 5 threads.
    Spider githubSpider = Spider.create(new GithubRepoPageProcessorSamples())
            .addUrl("https://github.com/code4craft")
            .thread(5);
    // Launch the spider.
    githubSpider.run();
}
示例7: main
import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Initializes logging, then runs a single-threaded spider with a
 * {@code WarmUp} processor seeded with one Wikipedia URL.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    InitLogger.init();

    Spider warmUpSpider = Spider.create(new WarmUp())
            .addUrl("https://en.wikipedia.org/wiki/Wiki")
            .thread(1);
    warmUpSpider.run();
}
示例8: main
import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Crawls two result pages of a Baidu image search for a fixed keyword,
 * collects the image URLs and names after each crawl, and hands them to
 * {@code DownloadPicture.downloadPicture}.
 *
 * Query-string parameters, as noted by the original author:
 *   tn     : resultjsonavatarnew
 *   ie     : utf-8, character encoding (ie = input, oe = output)
 *   word   : search keyword
 *   pn     : 60, start offset
 *   rn     : 30, number of results shown
 *   z      : 0, size (0 all sizes, 9 extra large, 3 large, 2 medium, 1 small)
 *   width  : 1024, custom size - width
 *   height : 768, custom size - height
 *   ic     : 0, color (0 all, 1 red, 2 yellow, 4 green, 8 cyan, 16 blue, 32 purple,
 *            64 pink, 128 brown, 256 orange, 512 black, 1024 white, 2048 black-and-white)
 *   s      : 0; 3 = avatar images
 *   face   : 0; 1 = face close-up
 *   st     : -1; -1 all types, 1 cartoon, 2 line drawing
 *   lm     : -1; (6 animated images, 7 static images)
 *   gsm    : 3c, hexadecimal value of pn
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    String key = "海贼王"; // Baidu image search keyword
    DownloadPicture downloadPicture = new DownloadPicture();
    ArrayList<String> nameList = new ArrayList<>();
    ArrayList<String> urlList = new ArrayList<>();

    // Fixed parts of the request URL; only the page offset between them varies.
    String urlHead = "http://image.baidu.com/search/avatarjson?tn=resultjsonavatarnew&ie=utf-8&word=" + key + "&pn=";
    String urlTail = "0&rn=30&z=3&ic=0&s=0&face=0&st=-1&lm=-1";

    // Number of pages to crawl; 30 images per page.
    int page = 0;
    while (page < 2) {
        // page * 3 followed by the leading "0" of urlTail forms the pn offset.
        String url = urlHead + page * 3 + urlTail;
        Spider pageSpider = Spider.create(new DownloadPicture()).addUrl(url);
        pageSpider.run();
        // NOTE(review): urls and names are not defined in the visible code;
        // presumably they are filled by the DownloadPicture processor - confirm.
        urlList.addAll(urls);
        nameList.addAll(names);
        page++;
    }

    downloadPicture.downloadPicture(urlList, nameList, key);
}
示例9: deleteAll
import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Removes every spider whose status is {@link Spider.Status#Stopped}.
 *
 * <p>Despite the name, spiders in any other status are left untouched. For
 * each stopped spider, its task is deleted via {@code deleteTaskById} and
 * its entry is removed from {@code spiderMap}. A failure for one spider is
 * logged and does not prevent the remaining ones from being processed.
 * Finally, {@code taskManager.deleteTasksByState(State.STOP)} is called.
 */
public void deleteAll() {
    // Collect the keys first so spiderMap is not modified while it is being streamed.
    List<String> spiderUUID2BeRemoved = spiderMap.entrySet().stream().filter(
            spiderEntry -> spiderEntry.getValue().getStatus() == Spider.Status.Stopped
    ).map(Map.Entry::getKey).collect(Collectors.toList());
    for (String uuid : spiderUUID2BeRemoved) {
        try {
            deleteTaskById(uuid);
            spiderMap.remove(uuid);
        } catch (Exception e) {
            // Pass the throwable as the trailing argument so the stack trace is logged too.
            LOG.error("删除任务ID:{}出错,{}", uuid, e.getLocalizedMessage(), e);
        }
    }
    taskManager.deleteTasksByState(State.STOP);
}
示例10: main
import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Creates a spider with a {@code Crawler} processor and a file pipeline,
 * registers it with the shared {@code SpiderMonitor}, and starts it.
 *
 * @param args command-line arguments (unused)
 * @throws JMException declared because {@code SpiderMonitor.register} is called
 */
public static void main(String[] args) throws JMException {
    // Thread count is 16 times the number of available processors.
    int processorCount = Runtime.getRuntime().availableProcessors();

    Spider iqiyiSpider = Spider.create(new Crawler())
            .addUrl("http://list.iqiyi.com/www/1/----------------iqiyi--.html")
            .addPipeline(new FilePipeline("/root/iqiyi"))
            .thread(processorCount * 16);

    SpiderMonitor monitor = SpiderMonitor.instance();
    monitor.register(iqiyiSpider);
    iqiyiSpider.start();
}
示例11: test
import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Runs a 10-thread spider that uses a file-cache queue scheduler and sends
 * results to a {@code MySqlPipeline}; both the processor and the pipeline
 * share {@code htmlService}.
 */
@Test
public void test() throws Exception {
    Spider blogSpider = Spider.create(new MySqlPageProcessor(htmlService))
            .scheduler(new FileCacheQueueScheduler("F:\\webmagic\\cache\\"))
            .addUrl("http://www.cnblogs.com/")
            .addPipeline(new MySqlPipeline(htmlService))
            .thread(10);
    blogSpider.run();
}
示例12: main
import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Runs a 5-thread spider with a {@code GithubRepoPageProcessor} and a
 * {@code JsonFilePipeline}.
 */
@Test
public void main() {
    Spider spider = Spider.create(new GithubRepoPageProcessor())
            // Seed URL the crawl starts from.
            .addUrl("http://re.jd.com/index/standard")
            // Results go to the webmagic directory on drive F.
            .addPipeline(new JsonFilePipeline("F:\\webmagic\\"))
            // Crawl with 5 threads.
            .thread(5);
    // Launch the spider.
    spider.run();
}
示例13: index
import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Handles GET {@code main/index}: builds the seed URL from
 * {@code Constants.House_SEED} with argument 1, runs a spider over it, and
 * returns a fixed string once {@code run()} has returned.
 *
 * @return the literal {@code "done!"}
 */
@RequestMapping(value = "main/index", method = RequestMethod.GET)
@ResponseBody
public String index() {
    String seedUrl = String.format(Constants.House_SEED, 1);

    Spider houseSpider = Spider.create(processor)
            .addUrl(seedUrl)
            .addPipeline(pipeline);
    // run() is invoked on this request-handling path, before the response is produced.
    houseSpider.run();

    return "done!";
}
示例14: main
import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Runs a 5-thread spider with a {@code MamacnPageProcessor}, a file-cache
 * queue scheduler, and a {@code OneFilePipeline}.
 *
 * @param args command-line arguments (unused)
 * @throws FileNotFoundException declared by this method; the visible code does not show which call raises it
 * @throws UnsupportedEncodingException declared by this method; the visible code does not show which call raises it
 */
public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
    Spider mamacnSpider = Spider.create(new MamacnPageProcessor())
            .setScheduler(new FileCacheQueueScheduler("/data/webmagic/mamacn"))
            .addUrl("http://www.mama.cn/photo/t1-p1.html")
            .addPipeline(new OneFilePipeline("/data/webmagic/mamacn/data"))
            .thread(5);
    mamacnSpider.run();
}
示例15: main
import us.codecraft.webmagic.Spider; //导入依赖的package包/类
/**
 * Runs a 5-thread spider with a {@code ZhihuPageProcessor}, seeded with a
 * Zhihu question search URL, with a {@code FilePipeline} attached.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    Spider zhihuSpider = Spider.create(new ZhihuPageProcessor())
            .addUrl("http://www.zhihu.com/search?type=question&q=java")
            .addPipeline(new FilePipeline("D:\\webmagic\\"))
            .thread(5);
    zhihuSpider.run();
}