本文整理汇总了Java中crawlercommons.domains.PaidLevelDomain类的典型用法代码示例。如果您正苦于以下问题:Java PaidLevelDomain类的具体用法?Java PaidLevelDomain怎么用?Java PaidLevelDomain使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。
PaidLevelDomain类属于crawlercommons.domains包,在下文中一共展示了PaidLevelDomain类的14个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: testIPv4
import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
@Test
public final void testIPv4() throws MalformedURLException {
    // An IPv4 literal is expected back unchanged, both for a bare host
    // string and for a URL that carries a port, path and query.
    final String ip = "1.2.3.4";
    assertEquals(ip, PaidLevelDomain.getPLD(ip));

    final URL withPortAndQuery = new URL("http://1.2.3.4:8080/a/b/c?_queue=1");
    assertEquals(ip, PaidLevelDomain.getPLD(withPortAndQuery));
}
示例2: testStandardDomains
import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
@Test
public final void testStandardDomains() throws MalformedURLException {
    // Every host below, with or without extra sub-domain labels, is
    // expected to yield the same paid-level domain.
    final String expected = "domain.com";
    final String[] hosts = { "domain.com", "www.domain.com", "www.zzz.domain.com" };
    for (final String host : hosts) {
        assertEquals(expected, PaidLevelDomain.getPLD(host));
    }

    // Same expectation when a full URL (scheme, port, path, query) is given.
    final URL url = new URL("https://www.zzz.domain.com:9000/a/b?c=d");
    assertEquals(expected, PaidLevelDomain.getPLD(url));
}
示例3: testPrivateDomains
import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
@Test
public final void testPrivateDomains() {
    /*
     * Public suffixes from the private section of the public suffix list
     * are not matched, so the "private" domain itself is expected as the
     * paid-level domain.
     */
    final String pld = PaidLevelDomain.getPLD("myblog.blogspot.com");
    assertEquals("blogspot.com", pld);
}
示例4: getPolitenessKey
import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
/**
 * Computes the key under which a URL is grouped for politeness, depending
 * on {@code queueMode}:
 * <ul>
 * <li>IP mode: the resolved IP address of the URL's host;</li>
 * <li>domain mode: the paid-level domain of the host, falling back to the
 * host name when no domain is found;</li>
 * <li>any other mode: the host name.</li>
 * </ul>
 * If no key could be derived because the URL has no host, the external form
 * of the URL is used instead, so that a missing host never causes a
 * {@link NullPointerException}.
 *
 * @param u
 *            URL to compute the key for
 * @return the lower-cased key, or {@code null} when IP mode is active and
 *         the host cannot be resolved
 */
private String getPolitenessKey(URL u) {
    final String host = u.getHost();
    String key;
    if (QUEUE_MODE_IP.equalsIgnoreCase(queueMode)) {
        try {
            final InetAddress addr = InetAddress.getByName(host);
            key = addr.getHostAddress();
        } catch (final UnknownHostException e) {
            // unable to resolve it, so don't fall back to host name
            LOG.warn("Unable to resolve: {}, skipping.", host);
            return null;
        }
    } else if (QUEUE_MODE_DOMAIN.equalsIgnoreCase(queueMode)) {
        // How getPLD treats a null host is not visible here, so only call
        // it when there is a host to inspect.
        key = (host == null) ? null : PaidLevelDomain.getPLD(host);
        if (key == null) {
            LOG.warn("Unknown domain for url: {}, using hostname as key",
                    u.toExternalForm());
            key = host;
        }
    } else {
        key = host;
    }
    // Shared last resort for every mode: the host itself may be null, in
    // which case the lower-casing below would otherwise fail.
    if (key == null) {
        LOG.warn("Unknown host for url: {}, using URL string as key",
                u.toExternalForm());
        key = u.toExternalForm();
    }
    return key.toLowerCase(Locale.ROOT);
}
示例5: create
import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
/**
 * Builds a {@link FetchItem} for a URL and its tuple.
 * <p>
 * The queue id is the lower-cased value of a key chosen as follows: a
 * non-blank <code>key</code> field of the tuple is reused as-is; otherwise
 * the key is derived from the URL according to <code>queueMode</code> (IP
 * address, paid-level domain, or host name). If nothing can be derived, the
 * external form of the URL is used.
 *
 * @param u
 *            parsed URL of the item
 * @param t
 *            tuple the item originates from; may carry a "key" field
 * @param queueMode
 *            mode deciding how the queue id is derived from the URL
 * @return the new item
 */
public static FetchItem create(URL u, Tuple t, String queueMode) {
    final String url = u.toExternalForm();

    // reuse any key that might have been given, be it the hostname,
    // domain or IP
    final String givenKey = t.contains("key") ? t.getStringByField("key") : null;
    if (StringUtils.isNotBlank(givenKey)) {
        return new FetchItem(url, u, t, givenKey.toLowerCase(Locale.ROOT));
    }

    String derivedKey;
    if (FetchItemQueues.QUEUE_MODE_IP.equalsIgnoreCase(queueMode)) {
        try {
            derivedKey = InetAddress.getByName(u.getHost()).getHostAddress();
        } catch (final UnknownHostException e) {
            LOG.warn(
                    "Unable to resolve IP for {}, using hostname as key.",
                    u.getHost());
            derivedKey = u.getHost();
        }
    } else if (FetchItemQueues.QUEUE_MODE_DOMAIN.equalsIgnoreCase(queueMode)) {
        derivedKey = PaidLevelDomain.getPLD(u.getHost());
        if (derivedKey == null) {
            LOG.warn(
                    "Unknown domain for url: {}, using hostname as key",
                    url);
            derivedKey = u.getHost();
        }
    } else {
        derivedKey = u.getHost();
    }

    // last resort when even the host name is missing
    if (derivedKey == null) {
        LOG.warn("Unknown host for url: {}, using URL string as key",
                url);
        derivedKey = u.toExternalForm();
    }

    return new FetchItem(url, u, t, derivedKey.toLowerCase(Locale.ROOT));
}
示例6: testInvalidFQDN
import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
@Test
public void testInvalidFQDN() {
    // Each of these inputs is expected to come back unchanged.
    final String[] inputs = { "blah", "1.2.3", "me.i" };
    for (final String input : inputs) {
        assertEquals(input, PaidLevelDomain.getPLD(input));
    }
}
示例7: testIPv6
import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
@Test
public final void testIPv6() throws MalformedURLException, UnknownHostException {
    // Build the URL from the textual form of a parsed IPv6 address; the
    // expected result is that address wrapped in square brackets.
    final String hostAddress = InetAddress.getByName("1080:0:0:0:8:800:200c:417a").getHostAddress();
    final URL ipv6Url = new URL("http", hostAddress, 8080, "a/b/c");

    final String pld = PaidLevelDomain.getPLD(ipv6Url);
    assertEquals("[1080:0:0:0:8:800:200c:417a]", pld);
}
示例8: testBizDomains
import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
@Test
public final void testBizDomains() {
    // Both the bare domain and its "www" host map to the same .biz domain.
    final String expected = "xxx.biz";
    for (final String host : new String[] { "xxx.biz", "www.xxx.biz" }) {
        assertEquals(expected, PaidLevelDomain.getPLD(host));
    }
}
示例9: testJapaneseDomains
import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
@Test
public final void testJapaneseDomains() {
    // Each row is { host, expected paid-level domain }.
    final String[][] cases = {
            { "xxx.co.jp", "xxx.co.jp" },
            { "www.xxx.co.jp", "xxx.co.jp" },
            { "www.xxx.ne.jp", "xxx.ne.jp" },
    };
    for (final String[] c : cases) {
        assertEquals(c[1], PaidLevelDomain.getPLD(c[0]));
    }
}
示例10: testGermanDomains
import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
@Test
public final void testGermanDomains() {
    // Hosts below "de.com" are expected to resolve to "de.com" itself.
    final String expected = "de.com";
    for (final String host : new String[] { "xxx.de.com", "www.xxx.de.com" }) {
        assertEquals(expected, PaidLevelDomain.getPLD(host));
    }
}
示例11: testItalianDomains
import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
@Test
public final void testItalianDomains() {
    // Each row is { host, expected paid-level domain }.
    final String[][] cases = {
            { "xxx.it", "xxx.it" },
            { "www.xxx.it", "xxx.it" },
            { "xxx.com.it", "com.it" },
    };
    for (final String[] c : cases) {
        assertEquals(c[1], PaidLevelDomain.getPLD(c[0]));
    }
}
示例12: testFinnishDomains
import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
@Test
public final void testFinnishDomains() {
    // "fi.com" is expected as the paid-level domain of its "www" host.
    final String pld = PaidLevelDomain.getPLD("www.fi.com");
    assertEquals("fi.com", pld);
}
示例13: getPartition
import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
/**
 * Returns the host, domain or IP of a URL so that it can be partitioned
 * for politeness, depending on the value of the config
 * <i>partition.url.mode</i>.
 * <p>
 * In IP mode a non-blank "ip" value found in the metadata is used directly
 * and the URL is not parsed; otherwise the host is resolved.
 *
 * @param url
 *            URL string to partition
 * @param metadata
 *            metadata of the URL, consulted for an "ip" value in IP mode
 * @return the partition key; {@code null} if the URL is malformed, if the
 *         IP cannot be resolved, or if no key was produced for the mode
 **/
public String getPartition(String url, Metadata metadata) {
    final boolean byIp = mode.equalsIgnoreCase(Constants.PARTITION_MODE_IP);

    // IP in metadata?
    String partitionKey = null;
    if (byIp) {
        final String providedIp = metadata.getFirstValue("ip");
        if (StringUtils.isNotBlank(providedIp)) {
            partitionKey = providedIp;
        }
    }

    // the host is only needed when no key is known yet
    String host = "";
    if (partitionKey == null) {
        try {
            host = new URL(url).getHost();
        } catch (MalformedURLException e) {
            LOG.warn("Invalid URL: {}", url);
            return null;
        }
    }

    if (mode.equalsIgnoreCase(Constants.PARTITION_MODE_HOST)) {
        // partition by hostname
        partitionKey = host;
    } else if (mode.equalsIgnoreCase(Constants.PARTITION_MODE_DOMAIN)) {
        // partition by domain : needs fixing
        partitionKey = PaidLevelDomain.getPLD(host);
    }

    // partition by IP
    if (byIp && partitionKey == null) {
        try {
            final long start = System.currentTimeMillis();
            partitionKey = InetAddress.getByName(host).getHostAddress();
            final long end = System.currentTimeMillis();
            LOG.debug("Resolved IP {} in {} msec for : {}", partitionKey,
                    end - start, url);
        } catch (final Exception e) {
            LOG.warn("Unable to resolve IP for: {}", host);
            return null;
        }
    }

    LOG.debug("Partition Key for: {} > {}", url, partitionKey);
    return partitionKey;
}
示例14: filter
import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
/**
 * Filters out URLs that leave the host and/or domain of the source URL,
 * depending on the <code>ignoreOutsideHost</code> and
 * <code>ignoreOutsideDomain</code> settings.
 * <p>
 * The host and domain of the most recent source URL are remembered so that
 * consecutive calls with the same source URL instance do not recompute
 * them.
 *
 * @param sourceUrl
 *            URL of the page the candidate was found on; may be null
 * @param sourceMetadata
 *            metadata of the source page (not used here)
 * @param urlToFilter
 *            candidate URL
 * @return <code>urlToFilter</code> if it is kept (or if there is nothing
 *         to check), <code>null</code> if it is malformed or rejected
 */
@Override
public String filter(URL sourceUrl, Metadata sourceMetadata,
        String urlToFilter) {
    // nothing to compare against, or neither restriction is enabled
    if (sourceUrl == null || !(ignoreOutsideHost || ignoreOutsideDomain)) {
        return urlToFilter;
    }

    final URL target;
    try {
        target = new URL(urlToFilter);
    } catch (MalformedURLException e) {
        return null;
    }

    final String sourceHost;
    final String sourceDomain;
    // Using identity comparison because URL.equals performs poorly
    if (sourceUrl != previousSourceUrl) {
        sourceHost = sourceUrl.getHost();
        sourceDomain = ignoreOutsideDomain ? PaidLevelDomain.getPLD(sourceHost) : null;
        // remember the values for the next call with the same source URL
        previousSourceHost = sourceHost;
        previousSourceDomain = sourceDomain;
        previousSourceUrl = sourceUrl;
    } else {
        sourceHost = previousSourceHost;
        sourceDomain = ignoreOutsideDomain ? previousSourceDomain : null;
    }

    // resolve the hosts
    final String targetHost = target.getHost();
    if (ignoreOutsideHost
            && (targetHost == null || !targetHost.equalsIgnoreCase(sourceHost))) {
        return null;
    }

    if (ignoreOutsideDomain) {
        final String targetDomain = PaidLevelDomain.getPLD(targetHost);
        if (targetDomain == null || !targetDomain.equals(sourceDomain)) {
            return null;
        }
    }

    return urlToFilter;
}