本文整理汇总了Java中crawlercommons.domains.PaidLevelDomain.getPLD方法的典型用法代码示例。如果您正苦于以下问题：Java PaidLevelDomain.getPLD方法的具体用法？Java PaidLevelDomain.getPLD怎么用？Java PaidLevelDomain.getPLD使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类crawlercommons.domains.PaidLevelDomain的用法示例。
在下文中一共展示了PaidLevelDomain.getPLD方法的4个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: getPolitenessKey
import crawlercommons.domains.PaidLevelDomain; //导入方法依赖的package包/类
/**
 * Computes the key under which a URL is grouped for politeness, according
 * to the configured <code>queueMode</code>: the resolved IP address, the
 * paid-level domain, or (for any other mode) the host name.
 *
 * @param u
 *            the URL to compute a key for
 * @return the lower-cased key, or <code>null</code> when the queue mode is
 *         IP and the host cannot be resolved
 */
private String getPolitenessKey(URL u) {
    String key;
    if (QUEUE_MODE_IP.equalsIgnoreCase(queueMode)) {
        try {
            final InetAddress addr = InetAddress.getByName(u.getHost());
            key = addr.getHostAddress();
        } catch (final UnknownHostException e) {
            // unable to resolve it, so don't fall back to host name
            LOG.warn("Unable to resolve: {}, skipping.", u.getHost());
            return null;
        }
    } else if (QUEUE_MODE_DOMAIN.equalsIgnoreCase(queueMode)) {
        key = PaidLevelDomain.getPLD(u.getHost());
        if (key == null) {
            LOG.warn("Unknown domain for url: {}, using hostname as key",
                    u.toExternalForm());
            key = u.getHost();
        }
    } else {
        key = u.getHost();
    }

    // Last-resort fallback shared by the domain and host modes: the domain
    // branch falls back to the host name, which may itself be missing, and
    // lower-casing a null key below would throw a NullPointerException.
    if (key == null) {
        LOG.warn("Unknown host for url: {}, using URL string as key",
                u.toExternalForm());
        key = u.toExternalForm();
    }
    return key.toLowerCase(Locale.ROOT);
}
示例2: create
import crawlercommons.domains.PaidLevelDomain; //导入方法依赖的package包/类
/**
 * Builds a {@link FetchItem} for a URL and the tuple it arrived in.
 * <p>
 * The queue id is the lower-cased key. A non-blank <code>key</code> field
 * already present in the tuple is reused as-is; otherwise the key is
 * derived from the URL according to <code>queueMode</code>: the resolved IP
 * address (host name if resolution fails), the paid-level domain (host
 * name if unknown), or the host name for any other mode. If no key can be
 * derived at all, the URL string itself is used.
 */
public static FetchItem create(URL u, Tuple t, String queueMode) {
    final String url = u.toExternalForm();

    // reuse any key that might have been given
    // be it the hostname, domain or IP
    String key = t.contains("key") ? t.getStringByField("key") : null;
    if (StringUtils.isNotBlank(key)) {
        return new FetchItem(url, u, t, key.toLowerCase(Locale.ROOT));
    }

    // No usable key supplied: derive one from the URL.
    final String host = u.getHost();
    if (FetchItemQueues.QUEUE_MODE_IP.equalsIgnoreCase(queueMode)) {
        try {
            key = InetAddress.getByName(host).getHostAddress();
        } catch (final UnknownHostException e) {
            LOG.warn(
                    "Unable to resolve IP for {}, using hostname as key.",
                    host);
            key = host;
        }
    } else if (FetchItemQueues.QUEUE_MODE_DOMAIN
            .equalsIgnoreCase(queueMode)) {
        key = PaidLevelDomain.getPLD(host);
        if (key == null) {
            LOG.warn(
                    "Unknown domain for url: {}, using hostname as key",
                    url);
            key = host;
        }
    } else {
        key = host;
    }

    // Every branch above may still end up without a key.
    if (key == null) {
        LOG.warn("Unknown host for url: {}, using URL string as key",
                url);
        key = url;
    }

    return new FetchItem(url, u, t, key.toLowerCase(Locale.ROOT));
}
示例3: getPartition
import crawlercommons.domains.PaidLevelDomain; //导入方法依赖的package包/类
/**
 * Computes the politeness partition key of a URL: its host name, its
 * paid-level domain or its IP address, as selected by the configured
 * partitioning mode.
 * <p>
 * In IP mode a non-blank <code>ip</code> value found in the metadata is
 * used directly, without parsing the URL or resolving the host.
 *
 * @return the partition key, or <code>null</code> if the URL is malformed,
 *         the IP cannot be resolved, or the mode matches none of the
 *         host / domain / IP modes
 **/
public String getPartition(String url, Metadata metadata) {
    final boolean ipMode = mode
            .equalsIgnoreCase(Constants.PARTITION_MODE_IP);

    // IP in metadata?
    String partitionKey = null;
    if (ipMode) {
        final String suppliedIp = metadata.getFirstValue("ip");
        if (StringUtils.isNotBlank(suppliedIp)) {
            partitionKey = suppliedIp;
        }
    }

    // The host is only needed when the metadata did not already give a key.
    String host = "";
    if (partitionKey == null) {
        try {
            host = new URL(url).getHost();
        } catch (MalformedURLException malformed) {
            LOG.warn("Invalid URL: {}", url);
            return null;
        }
    }

    if (mode.equalsIgnoreCase(Constants.PARTITION_MODE_HOST)) {
        // partition by hostname
        partitionKey = host;
    } else if (mode.equalsIgnoreCase(Constants.PARTITION_MODE_DOMAIN)) {
        // partition by domain : needs fixing
        partitionKey = PaidLevelDomain.getPLD(host);
    }

    // partition by IP, resolving the host when no IP was supplied
    if (ipMode && partitionKey == null) {
        try {
            final long startedAt = System.currentTimeMillis();
            partitionKey = InetAddress.getByName(host).getHostAddress();
            final long finishedAt = System.currentTimeMillis();
            LOG.debug("Resolved IP {} in {} msec for : {}", partitionKey,
                    finishedAt - startedAt, url);
        } catch (final Exception resolutionFailure) {
            LOG.warn("Unable to resolve IP for: {}", host);
            return null;
        }
    }

    LOG.debug("Partition Key for: {} > {}", url, partitionKey);
    return partitionKey;
}
示例4: filter
import crawlercommons.domains.PaidLevelDomain; //导入方法依赖的package包/类
/**
 * Keeps or discards an outlink depending on whether it stays on the same
 * host and/or the same paid-level domain as the page it was found on.
 *
 * @param sourceUrl
 *            URL of the page the link was found on; when <code>null</code>
 *            no filtering is applied
 * @param sourceMetadata
 *            metadata of the source page (not used by this filter)
 * @param urlToFilter
 *            the candidate URL
 * @return <code>urlToFilter</code> unchanged if it is accepted, or
 *         <code>null</code> if it is malformed, or leaves the source host
 *         (when <code>ignoreOutsideHost</code> is set) or the source
 *         domain (when <code>ignoreOutsideDomain</code> is set)
 */
@Override
public String filter(URL sourceUrl, Metadata sourceMetadata,
        String urlToFilter) {
    if (sourceUrl == null || (!ignoreOutsideHost && !ignoreOutsideDomain)) {
        return urlToFilter;
    }

    URL tURL;
    try {
        tURL = new URL(urlToFilter);
    } catch (MalformedURLException e1) {
        return null;
    }

    String fromHost;
    String fromDomain = null;
    // Using identity comparison because URL.equals performs poorly
    if (sourceUrl == previousSourceUrl) {
        // Same source object as the previous call: reuse the cached values.
        fromHost = previousSourceHost;
        if (ignoreOutsideDomain) {
            fromDomain = previousSourceDomain;
        }
    } else {
        fromHost = sourceUrl.getHost();
        if (ignoreOutsideDomain) {
            fromDomain = PaidLevelDomain.getPLD(fromHost);
        }
        previousSourceHost = fromHost;
        previousSourceDomain = fromDomain;
        previousSourceUrl = sourceUrl;
    }

    // resolve the hosts
    String toHost = tURL.getHost();

    if (ignoreOutsideHost) {
        if (toHost == null || !toHost.equalsIgnoreCase(fromHost)) {
            return null;
        }
    }

    if (ignoreOutsideDomain) {
        String toDomain = PaidLevelDomain.getPLD(toHost);
        // Compare case-insensitively, like the host check above: URL.getHost()
        // preserves the case used in the URL, and domain names that differ
        // only by case designate the same domain.
        if (toDomain == null || !toDomain.equalsIgnoreCase(fromDomain)) {
            return null;
        }
    }

    return urlToFilter;
}