当前位置: 首页>>代码示例>>Java>>正文


Java PaidLevelDomain类代码示例

本文整理汇总了Java中crawlercommons.domains.PaidLevelDomain的典型用法代码示例。如果您正苦于以下问题：Java PaidLevelDomain类的具体用法？Java PaidLevelDomain怎么用？Java PaidLevelDomain使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。


PaidLevelDomain类属于crawlercommons.domains包,在下文中一共展示了PaidLevelDomain类的14个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: testIPv4

import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
/**
 * Asserts that {@code PaidLevelDomain.getPLD} gives back a dotted-quad
 * IPv4 address as-is, both for a bare host string and for a URL whose host
 * is that address (with port, path and query attached).
 */
@Test
public final void testIPv4() throws MalformedURLException {
    final String ipv4 = "1.2.3.4";
    assertEquals(ipv4, PaidLevelDomain.getPLD(ipv4));

    final URL urlWithPortAndQuery = new URL("http://1.2.3.4:8080/a/b/c?_queue=1");
    assertEquals(ipv4, PaidLevelDomain.getPLD(urlWithPortAndQuery));
}
 
开发者ID:crawler-commons,项目名称:crawler-commons,代码行数:8,代码来源:PaidLevelDomainTest.java

示例2: testStandardDomains

import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
/**
 * Asserts that {@code PaidLevelDomain.getPLD} yields {@code "domain.com"}
 * for the bare domain, for hosts with one or two extra leading labels, and
 * for a URL built on such a host.
 */
@Test
public final void testStandardDomains() throws MalformedURLException {
    final String expected = "domain.com";

    final String[] hosts = { "domain.com", "www.domain.com", "www.zzz.domain.com" };
    for (final String host : hosts) {
        assertEquals(expected, PaidLevelDomain.getPLD(host));
    }

    final URL url = new URL("https://www.zzz.domain.com:9000/a/b?c=d");
    assertEquals(expected, PaidLevelDomain.getPLD(url));
}
 
开发者ID:crawler-commons,项目名称:crawler-commons,代码行数:8,代码来源:PaidLevelDomainTest.java

示例3: testPrivateDomains

import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
/**
 * Asserts that a host below {@code blogspot.com} is reduced to
 * {@code "blogspot.com"}.
 */
@Test
public final void testPrivateDomains() {
    // "Private" domains must not be matched, i.e. suffixes that come from
    // the private section of the public suffix list are not treated as
    // suffixes here.
    final String host = "myblog.blogspot.com";
    final String pld = PaidLevelDomain.getPLD(host);
    assertEquals("blogspot.com", pld);
}
 
开发者ID:crawler-commons,项目名称:crawler-commons,代码行数:9,代码来源:PaidLevelDomainTest.java

示例4: getPolitenessKey

import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
/**
 * Derives the key under which a URL is grouped for politeness, depending
 * on {@code queueMode}:
 * <ul>
 * <li>IP mode: the textual address the host resolves to;</li>
 * <li>domain mode: the result of {@code PaidLevelDomain.getPLD} for the
 * host, or the host name when that result is {@code null};</li>
 * <li>any other mode: the host name.</li>
 * </ul>
 * Whenever the chosen value is still {@code null} (the URL has no host),
 * the external form of the URL is used instead, so that lower-casing the
 * key can never fail with a {@code NullPointerException}.
 *
 * @param u
 *            the URL to compute the key for
 * @return the key lower-cased with {@code Locale.ROOT}, or {@code null}
 *         when IP mode is active and the host cannot be resolved
 */
private String getPolitenessKey(URL u) {
    String key;
    if (QUEUE_MODE_IP.equalsIgnoreCase(queueMode)) {
        try {
            final InetAddress addr = InetAddress.getByName(u.getHost());
            key = addr.getHostAddress();
        } catch (final UnknownHostException e) {
            // unable to resolve it, so don't fall back to host name
            LOG.warn("Unable to resolve: {}, skipping.", u.getHost());
            return null;
        }
    } else if (QUEUE_MODE_DOMAIN.equalsIgnoreCase(queueMode)) {
        key = PaidLevelDomain.getPLD(u.getHost());
        if (key == null) {
            LOG.warn("Unknown domain for url: {}, using hostname as key",
                    u.toExternalForm());
            key = u.getHost();
        }
    } else {
        key = u.getHost();
    }

    // Applies to every mode: the domain-mode fallback above may itself
    // produce a null host, which previously reached toLowerCase() unchecked.
    if (key == null) {
        LOG.warn("Unknown host for url: {}, using URL string as key",
                u.toExternalForm());
        key = u.toExternalForm();
    }
    return key.toLowerCase(Locale.ROOT);
}
 
开发者ID:DigitalPebble,项目名称:storm-crawler,代码行数:29,代码来源:SimpleFetcherBolt.java

示例5: create

import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
/**
 * Builds a {@code FetchItem} for a URL and its tuple.
 * <p>
 * When the tuple carries a non-blank <code>key</code> field, that value
 * (lower-cased) is used as the queue id. Otherwise the queue id is derived
 * from the URL according to <code>queueMode</code>: the resolved IP
 * address, the paid-level domain, or the host name. If none of these
 * yields a value, the URL string itself is used.
 *
 * @param u
 *            the URL to fetch
 * @param t
 *            the tuple the URL came from; may provide a <code>key</code>
 *            field
 * @param queueMode
 *            compared case-insensitively against the IP and domain queue
 *            modes; any other value selects the host name
 * @return the new item, whose queue id is lower-cased with
 *         {@code Locale.ROOT}
 */
public static FetchItem create(URL u, Tuple t, String queueMode) {
    final String url = u.toExternalForm();

    // A key already attached to the tuple (hostname, domain or IP) wins
    // over anything computed from the URL.
    final String providedKey = t.contains("key") ? t.getStringByField("key") : null;
    if (StringUtils.isNotBlank(providedKey)) {
        return new FetchItem(url, u, t, providedKey.toLowerCase(Locale.ROOT));
    }

    String derivedKey;
    if (FetchItemQueues.QUEUE_MODE_IP.equalsIgnoreCase(queueMode)) {
        try {
            derivedKey = InetAddress.getByName(u.getHost()).getHostAddress();
        } catch (final UnknownHostException e) {
            LOG.warn(
                    "Unable to resolve IP for {}, using hostname as key.",
                    u.getHost());
            derivedKey = u.getHost();
        }
    } else if (FetchItemQueues.QUEUE_MODE_DOMAIN.equalsIgnoreCase(queueMode)) {
        derivedKey = PaidLevelDomain.getPLD(u.getHost());
        if (derivedKey == null) {
            LOG.warn(
                    "Unknown domain for url: {}, using hostname as key",
                    url);
            derivedKey = u.getHost();
        }
    } else {
        derivedKey = u.getHost();
    }

    // Last resort when no host-based key is available.
    if (derivedKey == null) {
        LOG.warn("Unknown host for url: {}, using URL string as key",
                url);
        derivedKey = u.toExternalForm();
    }

    return new FetchItem(url, u, t, derivedKey.toLowerCase(Locale.ROOT));
}
 
开发者ID:eorliac,项目名称:patent-crawler,代码行数:56,代码来源:FetcherBolt.java

示例6: testInvalidFQDN

import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
/**
 * Asserts that inputs which are not fully qualified domain names are
 * returned unchanged by {@code PaidLevelDomain.getPLD}.
 */
@Test
public void testInvalidFQDN() {
    // For each of these the expected result equals the input.
    final String[] inputs = { "blah", "1.2.3", "me.i" };
    for (final String input : inputs) {
        assertEquals(input, PaidLevelDomain.getPLD(input));
    }
}
 
开发者ID:crawler-commons,项目名称:crawler-commons,代码行数:7,代码来源:PaidLevelDomainTest.java

示例7: testIPv6

import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
/**
 * Asserts that for a URL whose host is an IPv6 literal,
 * {@code PaidLevelDomain.getPLD} yields the address enclosed in square
 * brackets.
 */
@Test
public final void testIPv6() throws MalformedURLException, UnknownHostException {
    final String hostAddress = InetAddress.getByName("1080:0:0:0:8:800:200c:417a").getHostAddress();
    final URL ipv6Url = new URL("http", hostAddress, 8080, "a/b/c");

    final String pld = PaidLevelDomain.getPLD(ipv6Url);

    assertEquals("[1080:0:0:0:8:800:200c:417a]", pld);
}
 
开发者ID:crawler-commons,项目名称:crawler-commons,代码行数:7,代码来源:PaidLevelDomainTest.java

示例8: testBizDomains

import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
/**
 * Asserts that {@code PaidLevelDomain.getPLD} yields {@code "xxx.biz"}
 * both for the bare domain and for its {@code www} host.
 */
@Test
public final void testBizDomains() {
    final String expected = "xxx.biz";
    for (final String host : new String[] { "xxx.biz", "www.xxx.biz" }) {
        assertEquals(expected, PaidLevelDomain.getPLD(host));
    }
}
 
开发者ID:crawler-commons,项目名称:crawler-commons,代码行数:6,代码来源:PaidLevelDomainTest.java

示例9: testJapaneseDomains

import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
/**
 * Asserts that hosts under the two-label suffixes {@code co.jp} and
 * {@code ne.jp} are reduced to three labels by
 * {@code PaidLevelDomain.getPLD}.
 */
@Test
public final void testJapaneseDomains() {
    final String coJp = "xxx.co.jp";
    assertEquals(coJp, PaidLevelDomain.getPLD(coJp));
    assertEquals(coJp, PaidLevelDomain.getPLD("www." + coJp));

    final String neJp = "xxx.ne.jp";
    assertEquals(neJp, PaidLevelDomain.getPLD("www." + neJp));
}
 
开发者ID:crawler-commons,项目名称:crawler-commons,代码行数:7,代码来源:PaidLevelDomainTest.java

示例10: testGermanDomains

import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
/**
 * Asserts that hosts below {@code de.com} are reduced to {@code "de.com"}
 * by {@code PaidLevelDomain.getPLD}.
 */
@Test
public final void testGermanDomains() {
    final String expected = "de.com";
    for (final String host : new String[] { "xxx.de.com", "www.xxx.de.com" }) {
        assertEquals(expected, PaidLevelDomain.getPLD(host));
    }
}
 
开发者ID:crawler-commons,项目名称:crawler-commons,代码行数:6,代码来源:PaidLevelDomainTest.java

示例11: testItalianDomains

import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
/**
 * Asserts the results of {@code PaidLevelDomain.getPLD} for {@code .it}
 * hosts: {@code "xxx.it"} for the domain and its {@code www} host, and
 * {@code "com.it"} for a host below {@code com.it}.
 */
@Test
public final void testItalianDomains() {
    final String domain = "xxx.it";
    assertEquals(domain, PaidLevelDomain.getPLD(domain));
    assertEquals(domain, PaidLevelDomain.getPLD("www." + domain));

    final String underComIt = "xxx.com.it";
    assertEquals("com.it", PaidLevelDomain.getPLD(underComIt));
}
 
开发者ID:crawler-commons,项目名称:crawler-commons,代码行数:7,代码来源:PaidLevelDomainTest.java

示例12: testFinnishDomains

import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
/**
 * Asserts that {@code PaidLevelDomain.getPLD} yields {@code "fi.com"} for
 * the host {@code www.fi.com}.
 */
@Test
public final void testFinnishDomains() {
    final String host = "www.fi.com";
    final String pld = PaidLevelDomain.getPLD(host);
    assertEquals("fi.com", pld);
}
 
开发者ID:crawler-commons,项目名称:crawler-commons,代码行数:5,代码来源:PaidLevelDomainTest.java

示例13: getPartition

import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
/**
 * Computes the partition key of a URL (host name, domain or IP address,
 * as selected by the config <i>partition.url.mode</i>) so that URLs can be
 * partitioned for politeness.
 *
 * @param url
 *            the URL string to partition
 * @param metadata
 *            metadata of the URL; in IP mode a non-blank <code>ip</code>
 *            value found here is used directly as the key
 * @return the partition key, or {@code null} when the URL is malformed or,
 *         in IP mode, the host cannot be resolved
 **/
public String getPartition(String url, Metadata metadata) {

    final boolean ipMode = mode.equalsIgnoreCase(Constants.PARTITION_MODE_IP);

    String partitionKey = null;
    String host = "";

    // In IP mode an address already present in the metadata saves the lookup.
    if (ipMode) {
        final String ipFromMetadata = metadata.getFirstValue("ip");
        if (StringUtils.isNotBlank(ipFromMetadata)) {
            partitionKey = ipFromMetadata;
        }
    }

    // No key yet: the host has to come from the URL itself.
    if (partitionKey == null) {
        try {
            host = new URL(url).getHost();
        } catch (MalformedURLException e1) {
            LOG.warn("Invalid URL: {}", url);
            return null;
        }
    }

    if (mode.equalsIgnoreCase(Constants.PARTITION_MODE_HOST)) {
        // partition by hostname
        partitionKey = host;
    } else if (mode.equalsIgnoreCase(Constants.PARTITION_MODE_DOMAIN)) {
        // partition by domain : needs fixing
        partitionKey = PaidLevelDomain.getPLD(host);
    }

    // partition by IP, resolving the host when the metadata had no address
    if (ipMode && partitionKey == null) {
        try {
            final long startMillis = System.currentTimeMillis();
            partitionKey = InetAddress.getByName(host).getHostAddress();
            final long endMillis = System.currentTimeMillis();
            LOG.debug("Resolved IP {} in {} msec for : {}", partitionKey,
                    endMillis - startMillis, url);
        } catch (final Exception e) {
            LOG.warn("Unable to resolve IP for: {}", host);
            return null;
        }
    }

    LOG.debug("Partition Key for: {} > {}", url, partitionKey);

    return partitionKey;
}
 
开发者ID:DigitalPebble,项目名称:storm-crawler,代码行数:59,代码来源:URLPartitioner.java

示例14: filter

import crawlercommons.domains.PaidLevelDomain; //导入依赖的package包/类
/**
 * Rejects URLs that leave the host and/or the domain of the page they were
 * found on, depending on the {@code ignoreOutsideHost} and
 * {@code ignoreOutsideDomain} settings.
 * <p>
 * The host and domain of the most recent source URL are remembered in the
 * {@code previousSource*} fields and reused when the very same
 * {@code URL} instance is passed again.
 *
 * @param sourceUrl
 *            URL of the page the link was found on
 * @param sourceMetadata
 *            metadata of the source page (not used here)
 * @param urlToFilter
 *            the outgoing URL to check
 * @return {@code urlToFilter} when it is kept (or when there is no source
 *         URL or both settings are off); {@code null} when it is malformed
 *         or points outside the allowed host or domain
 */
@Override
public String filter(URL sourceUrl, Metadata sourceMetadata,
        String urlToFilter) {
    // Nothing to compare against, or neither restriction is enabled.
    if (sourceUrl == null || !(ignoreOutsideHost || ignoreOutsideDomain)) {
        return urlToFilter;
    }

    final URL targetUrl;
    try {
        targetUrl = new URL(urlToFilter);
    } catch (MalformedURLException e1) {
        return null;
    }

    final String fromHost;
    final String fromDomain;
    // Identity comparison on purpose: URL.equals performs poorly
    if (sourceUrl != previousSourceUrl) {
        fromHost = sourceUrl.getHost();
        fromDomain = ignoreOutsideDomain ? PaidLevelDomain.getPLD(fromHost) : null;
        previousSourceHost = fromHost;
        previousSourceDomain = fromDomain;
        previousSourceUrl = sourceUrl;
    } else {
        fromHost = previousSourceHost;
        fromDomain = ignoreOutsideDomain ? previousSourceDomain : null;
    }

    final String toHost = targetUrl.getHost();

    if (ignoreOutsideHost
            && (toHost == null || !toHost.equalsIgnoreCase(fromHost))) {
        return null;
    }

    if (ignoreOutsideDomain) {
        final String toDomain = PaidLevelDomain.getPLD(toHost);
        if (toDomain == null || !toDomain.equals(fromDomain)) {
            return null;
        }
    }

    return urlToFilter;
}
 
开发者ID:DigitalPebble,项目名称:storm-crawler,代码行数:51,代码来源:HostURLFilter.java


注:本文中的crawlercommons.domains.PaidLevelDomain类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。