本文整理汇总了Java中us.codecraft.webmagic.utils.UrlUtils类的典型用法代码示例。如果您正苦于以下问题:Java UrlUtils类的具体用法?Java UrlUtils怎么用?Java UrlUtils使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
UrlUtils类属于us.codecraft.webmagic.utils包,在下文中一共展示了UrlUtils类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: getContent
import us.codecraft.webmagic.utils.UrlUtils; //导入依赖的package包/类
/**
 * Reads the response body and decodes it into a string.
 *
 * <p>When {@code charset} is given, the body is decoded with it directly. Otherwise the
 * response is rejected if its {@code Content-Type} header is present and does not contain
 * {@code text/html}, or if its content length exceeds {@code value.getMaxDownloadLength()};
 * the charset is then looked up from the entity's content type via
 * {@code UrlUtils.getCharset}, falling back to the platform default charset.
 *
 * @param charset  charset name to decode with, or {@code null} to detect it from the response
 * @param response HTTP response whose entity is read
 * @return the decoded body
 * @throws IOException              if reading the entity content fails
 * @throws IllegalArgumentException if no charset was given and the content is not HTML
 *                                  or is longer than the configured maximum
 */
protected String getContent(String charset, HttpResponse response) throws IOException {
    if (charset != null) {
        return IOUtils.toString(response.getEntity().getContent(), charset);
    }

    long contentLength = response.getEntity().getContentLength();
    if (response.getFirstHeader("Content-Type") != null
            && !response.getFirstHeader("Content-Type").getValue().toLowerCase().contains("text/html")) {
        throw new IllegalArgumentException("此链接为非html内容,不下载,内容类型:" + response.getFirstHeader("Content-Type"));
    } else if (contentLength > value.getMaxDownloadLength()) {
        throw new IllegalArgumentException("网页内容长度超过最大限制,要求最大长度:" + value.getMaxDownloadLength() + ",实际长度:" + contentLength);
    }

    byte[] contentBytes = IOUtils.toByteArray(response.getEntity().getContent());

    // The entity's content type may be absent (the header check above tolerates that),
    // so guard the dereference instead of failing with a NullPointerException.
    String htmlCharset = response.getEntity().getContentType() == null
            ? null
            : UrlUtils.getCharset(response.getEntity().getContentType().getValue());
    if (htmlCharset != null) {
        return new String(contentBytes, htmlCharset);
    }

    LOG.warn("自动探测字符集失败, 使用 {} 作为字符集。请在Site.setCharset()指定字符集", Charset.defaultCharset());
    // Decode with the same charset that the warning reports.
    return new String(contentBytes, Charset.defaultCharset());
}
示例2: handleResponse
import us.codecraft.webmagic.utils.UrlUtils; //导入依赖的package包/类
/**
 * Builds a {@code Page} from an HTTP response.
 *
 * <p>The body is decoded with {@code charset}, relative hrefs are fixed against the request
 * URL, and the status code plus every response header are recorded on the page.
 *
 * @param request      the request that produced the response
 * @param charset      charset name used to decode the body
 * @param httpResponse the HTTP response to convert
 * @param task         the owning task (not used by this method)
 * @return the populated page
 * @throws IOException if reading the response body fails
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task)
        throws IOException {
    String body = IOUtils.toString(httpResponse.getEntity().getContent(), charset);

    Page result = new Page();
    String requestUrl = request.getUrl();
    result.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(body, requestUrl)));
    result.setUrl(new PlainText(requestUrl));
    result.setRequest(request);

    // Expose the status code and all response headers on the page.
    int statusCode = httpResponse.getStatusLine().getStatusCode();
    result.putHttpResponse(Constant.STATUS_CODE, String.valueOf(statusCode));
    for (Header responseHeader : httpResponse.getAllHeaders()) {
        result.putHttpResponse(responseHeader.getName(), responseHeader.getValue());
    }
    return result;
}
示例3: getAll
import us.codecraft.webmagic.utils.UrlUtils; //导入依赖的package包/类
/**
 * Downloads the given urls synchronously and returns what the collector pipeline gathered.
 *
 * <p>While running, {@code destroyWhenExit} and {@code spawnUrl} are switched off; both are
 * switched back on after {@code run()} returns. Any existing start requests are cleared
 * before the new ones are added.
 *
 * @param urls urls to download
 * @param <T>  type of process result
 * @return the results collected by the collector pipeline
 */
public <T> List<T> getAll(Collection<String> urls) {
    destroyWhenExit = false;
    spawnUrl = false;

    if (startRequests != null) {
        startRequests.clear();
    }
    for (Request pending : UrlUtils.convertToRequests(urls)) {
        addRequest(pending);
    }

    CollectorPipeline collector = getCollectorPipeline();
    pipelines.add(collector);

    run();

    spawnUrl = true;
    destroyWhenExit = true;
    return collector.getCollected();
}
示例4: convertHttpClientContext
import us.codecraft.webmagic.utils.UrlUtils; //导入依赖的package包/类
/**
 * Creates the {@code HttpClientContext} for a request.
 *
 * <p>If the proxy has a username, proxy authentication state is attached. If the request
 * carries cookies, each one is added to a fresh cookie store with its domain set to the
 * request URL's domain without the port.
 *
 * @param request request whose cookies and URL are used
 * @param site    site configuration (not used by this method)
 * @param proxy   proxy to authenticate against, may be {@code null}
 * @return the configured context
 */
private HttpClientContext convertHttpClientContext(Request request, Site site, Proxy proxy) {
    HttpClientContext context = new HttpClientContext();

    boolean proxyNeedsAuth = proxy != null && proxy.getUsername() != null;
    if (proxyNeedsAuth) {
        AuthState proxyAuth = new AuthState();
        proxyAuth.update(
                new BasicScheme(ChallengeState.PROXY),
                new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()));
        context.setAttribute(HttpClientContext.PROXY_AUTH_STATE, proxyAuth);
    }

    Map<String, String> requestCookies = request.getCookies();
    if (requestCookies != null && !requestCookies.isEmpty()) {
        CookieStore store = new BasicCookieStore();
        for (Map.Entry<String, String> entry : requestCookies.entrySet()) {
            BasicClientCookie cookie = new BasicClientCookie(entry.getKey(), entry.getValue());
            String domainWithPort = UrlUtils.getDomain(request.getUrl());
            cookie.setDomain(UrlUtils.removePort(domainWithPort));
            store.addCookie(cookie);
        }
        context.setCookieStore(store);
    }

    return context;
}
示例5: getAll
import us.codecraft.webmagic.utils.UrlUtils; //导入依赖的package包/类
/**
 * Downloads the given urls synchronously and returns what the collector pipeline gathered.
 *
 * <p>While running, {@code destroyWhenExit} and {@code spawnUrl} are switched off; both are
 * switched back on after {@code run()} returns.
 *
 * @param urls urls to download
 * @param <T>  type of process result
 * @return the results collected by the collector pipeline
 */
public <T> List<T> getAll(Collection<String> urls) {
    destroyWhenExit = false;
    spawnUrl = false;
    // startRequests may not have been initialised yet; only clear it when it exists.
    if (startRequests != null) {
        startRequests.clear();
    }
    for (Request request : UrlUtils.convertToRequests(urls)) {
        addRequest(request);
    }
    CollectorPipeline collectorPipeline = getCollectorPipeline();
    pipelines.add(collectorPipeline);
    run();
    spawnUrl = true;
    destroyWhenExit = true;
    return collectorPipeline.getCollected();
}
示例6: addTargetRequests
import us.codecraft.webmagic.utils.UrlUtils; //导入依赖的package包/类
/**
 * Adds links to the list of requests to be fetched.
 *
 * <p>Blank links, the bare {@code "#"} anchor and {@code javascript:} pseudo-links are
 * skipped. Every other link is canonicalized against this page's url before being queued.
 * The whole batch is added while holding the lock on {@code targetRequests}.
 *
 * @param requests links to fetch
 */
public void addTargetRequests(List<String> requests) {
    synchronized (targetRequests) {
        for (String s : requests) {
            if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
                // Skip only this link; the remaining links must still be queued.
                continue;
            }
            s = UrlUtils.canonicalizeUrl(s, url.toString());
            targetRequests.add(new Request(s));
        }
    }
}
示例7: addTargetRequest
import us.codecraft.webmagic.utils.UrlUtils; //导入依赖的package包/类
/**
 * Adds a single link to the list of requests to be fetched.
 *
 * <p>Blank links, the bare {@code "#"} anchor and {@code javascript:} pseudo-links are
 * ignored, matching the filtering applied when adding a list of links. Any other link is
 * canonicalized against this page's url and queued while holding the lock on
 * {@code targetRequests}.
 *
 * @param requestString link to fetch
 */
public void addTargetRequest(String requestString) {
    if (StringUtils.isBlank(requestString) || requestString.equals("#")
            || requestString.startsWith("javascript:")) {
        return;
    }
    synchronized (targetRequests) {
        requestString = UrlUtils.canonicalizeUrl(requestString, url.toString());
        targetRequests.add(new Request(requestString));
    }
}
示例8: getDomain
import us.codecraft.webmagic.utils.UrlUtils; //导入依赖的package包/类
/**
 * Returns the configured domain.
 *
 * <p>If no domain has been set and at least one start url exists, the domain is derived
 * from the first start url and stored for later calls.
 *
 * @return the configured domain, or {@code null} if none is set and there are no start urls
 */
public String getDomain() {
    if (domain != null) {
        return domain;
    }
    if (!startUrls.isEmpty()) {
        domain = UrlUtils.getDomain(startUrls.get(0));
    }
    return domain;
}
示例9: SimplePageProcessor
import us.codecraft.webmagic.utils.UrlUtils; //导入依赖的package包/类
public SimplePageProcessor(String startUrl, String urlPattern) {
this.site = Site.me().addStartUrl(startUrl).
setDomain(UrlUtils.getDomain(startUrl)).setUserAgent(UA);
//compile "*" expression to regex
this.urlPattern = "("+urlPattern.replace(".","\\.").replace("*","[^\"'#]*")+")";
}
示例10: addTargetRequests
import us.codecraft.webmagic.utils.UrlUtils; //导入依赖的package包/类
/**
 * Adds urls to fetch.
 *
 * <p>Blank entries, the bare {@code "#"} anchor and {@code javascript:} pseudo-links are
 * skipped; every other entry is canonicalized against this page's url and queued.
 *
 * @param requests urls to fetch
 */
public void addTargetRequests(List<String> requests) {
    for (String candidate : requests) {
        boolean ignorable = StringUtils.isBlank(candidate)
                || candidate.equals("#")
                || candidate.startsWith("javascript:");
        if (!ignorable) {
            String canonical = UrlUtils.canonicalizeUrl(candidate, url.toString());
            targetRequests.add(new Request(canonical));
        }
    }
}
示例11: addTargetRequest
import us.codecraft.webmagic.utils.UrlUtils; //导入依赖的package包/类
/**
 * Adds a single url to fetch.
 *
 * <p>Blank values, the bare {@code "#"} anchor and {@code javascript:} pseudo-links are
 * ignored, matching the filtering applied when adding a list of urls. Any other value is
 * canonicalized against this page's url and queued.
 *
 * @param requestString url to fetch
 */
public void addTargetRequest(String requestString) {
    if (StringUtils.isBlank(requestString) || requestString.equals("#")
            || requestString.startsWith("javascript:")) {
        return;
    }
    requestString = UrlUtils.canonicalizeUrl(requestString, url.toString());
    targetRequests.add(new Request(requestString));
}
示例12: convertHttpUriRequest
import us.codecraft.webmagic.utils.UrlUtils; //导入依赖的package包/类
/**
 * Converts a request into an {@code HttpUriRequest}.
 *
 * <p>The request method comes from {@code selectRequestMethod}, and the URI is the request
 * url passed through {@code UrlUtils.fixIllegalCharacterInUrl}. When a site is given, its
 * headers and timeouts are applied; when a proxy is given, it is set on the request config.
 * Headers carried by the request itself are added last.
 *
 * @param request request to convert
 * @param site    site configuration supplying headers and timeouts, may be {@code null}
 * @param proxy   proxy to route through, may be {@code null}
 * @return the built HTTP request
 */
private HttpUriRequest convertHttpUriRequest(Request request, Site site, Proxy proxy) {
    RequestBuilder requestBuilder = selectRequestMethod(request).setUri(UrlUtils.fixIllegalCharacterInUrl(request.getUrl()));
    // site is treated as optional below, so guard it here as well before reading its headers.
    if (site != null && site.getHeaders() != null) {
        for (Map.Entry<String, String> headerEntry : site.getHeaders().entrySet()) {
            requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
        }
    }
    RequestConfig.Builder requestConfigBuilder = RequestConfig.custom();
    if (site != null) {
        // The single site timeout is used for connection request, socket and connect timeouts.
        requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut())
                .setSocketTimeout(site.getTimeOut())
                .setConnectTimeout(site.getTimeOut())
                .setCookieSpec(CookieSpecs.STANDARD);
    }
    if (proxy != null) {
        requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort()));
    }
    requestBuilder.setConfig(requestConfigBuilder.build());
    HttpUriRequest httpUriRequest = requestBuilder.build();
    if (request.getHeaders() != null && !request.getHeaders().isEmpty()) {
        for (Map.Entry<String, String> header : request.getHeaders().entrySet()) {
            httpUriRequest.addHeader(header.getKey(), header.getValue());
        }
    }
    return httpUriRequest;
}
示例13: addRequest
import us.codecraft.webmagic.utils.UrlUtils; //导入依赖的package包/类
/**
 * Pushes a request to the scheduler.
 *
 * <p>If the site has no domain yet and the request has a url, the site's domain is first
 * set from that url.
 *
 * @param request request to schedule
 */
private void addRequest(Request request) {
    if (site.getDomain() == null && request != null) {
        String requestUrl = request.getUrl();
        if (requestUrl != null) {
            site.setDomain(UrlUtils.getDomain(requestUrl));
        }
    }
    scheduler.push(request, this);
}
示例14: test_illegal_uri_correct
import us.codecraft.webmagic.utils.UrlUtils; //导入依赖的package包/类
/**
 * Checks that a url ending in {@code "##"}, once passed through
 * {@code UrlUtils.fixIllegalCharacterInUrl}, is converted to a request whose URI ends in a
 * single {@code "#"}.
 */
@Test
public void test_illegal_uri_correct() throws Exception {
    HttpUriRequestConverter converter = new HttpUriRequestConverter();

    String fixedUrl = UrlUtils.fixIllegalCharacterInUrl("http://bj.zhongkao.com/beikao/yimo/##");
    HttpClientRequestContext converted = converter.convert(new Request(fixedUrl), Site.me(), null);

    URI actualUri = converted.getHttpUriRequest().getURI();
    assertThat(actualUri).isEqualTo(new URI("http://bj.zhongkao.com/beikao/yimo/#"));
}
示例15: registerMBean
import us.codecraft.webmagic.utils.UrlUtils; //导入依赖的package包/类
/**
 * Registers a spider status bean with the MBean server.
 *
 * <p>The object name is {@code jmxServerName + ":name=" + name}, where {@code name} is the
 * spider status name passed through {@code UrlUtils.removePort}.
 *
 * @param spiderStatus the status bean to register
 * @throws MalformedObjectNameException   if the resulting object name is not valid
 * @throws InstanceAlreadyExistsException if a bean is already registered under that name
 * @throws MBeanRegistrationException     if registration fails in the MBean server
 * @throws NotCompliantMBeanException     if the bean is not a compliant MBean
 */
protected void registerMBean(SpiderStatusMXBean spiderStatus)
        throws MalformedObjectNameException, InstanceAlreadyExistsException,
        MBeanRegistrationException, NotCompliantMBeanException {
    String beanName = UrlUtils.removePort(spiderStatus.getName());
    ObjectName objName = new ObjectName(jmxServerName + ":name=" + beanName);
    mbeanServer.registerMBean(spiderStatus, objName);
}