当前位置: 首页>>代码示例>>Java>>正文


Java PaidLevelDomain.getPLD方法代码示例

本文整理汇总了Java中crawlercommons.domains.PaidLevelDomain.getPLD方法的典型用法代码示例。如果您正苦于以下问题:Java PaidLevelDomain.getPLD方法的具体用法?Java PaidLevelDomain.getPLD怎么用?Java PaidLevelDomain.getPLD使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类crawlercommons.domains.PaidLevelDomain的用法示例。


在下文中一共展示了PaidLevelDomain.getPLD方法的4个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: getPolitenessKey

import crawlercommons.domains.PaidLevelDomain; //导入方法依赖的package包/类
/**
 * Builds the key under which the given URL is grouped, according to the
 * configured {@code queueMode}.
 * <ul>
 * <li>IP mode: the address string the host resolves to;</li>
 * <li>domain mode: the value returned by {@code PaidLevelDomain.getPLD}
 * for the host, or the host itself when that call returns
 * {@code null};</li>
 * <li>any other mode: the host, or the full URL string when the host is
 * {@code null}.</li>
 * </ul>
 *
 * @param u
 *            URL to compute the key for
 * @return the key lower-cased with {@link Locale#ROOT}, or {@code null}
 *         when IP mode is active and the host cannot be resolved
 */
private String getPolitenessKey(URL u) {
    final String hostName = u.getHost();

    // IP mode: a resolution failure means the URL is skipped altogether
    if (QUEUE_MODE_IP.equalsIgnoreCase(queueMode)) {
        final InetAddress resolved;
        try {
            resolved = InetAddress.getByName(hostName);
        } catch (final UnknownHostException e) {
            // deliberately no fallback to the host name here
            LOG.warn("Unable to resolve: {}, skipping.", hostName);
            return null;
        }
        return resolved.getHostAddress().toLowerCase(Locale.ROOT);
    }

    // domain mode: fall back to the host name when no domain is returned
    if (QUEUE_MODE_DOMAIN.equalsIgnoreCase(queueMode)) {
        final String domain = PaidLevelDomain.getPLD(hostName);
        if (domain != null) {
            return domain.toLowerCase(Locale.ROOT);
        }
        LOG.warn("Unknown domain for url: {}, using hostname as key",
                u.toExternalForm());
        return hostName.toLowerCase(Locale.ROOT);
    }

    // every other mode: key on the host name
    if (hostName != null) {
        return hostName.toLowerCase(Locale.ROOT);
    }
    final String externalForm = u.toExternalForm();
    LOG.warn("Unknown host for url: {}, using URL string as key",
            externalForm);
    return externalForm.toLowerCase(Locale.ROOT);
}
 
开发者ID:DigitalPebble,项目名称:storm-crawler,代码行数:29,代码来源:SimpleFetcherBolt.java

示例2: create

import crawlercommons.domains.PaidLevelDomain; //导入方法依赖的package包/类
/**
 * Creates a {@code FetchItem} for a URL together with the id of the queue
 * it belongs to.
 * <p>
 * A non-blank value in the tuple field {@code "key"} is reused as the
 * queue id. Otherwise the id is derived from <code>queueMode</code>: the
 * resolved IP address, the value returned by
 * {@code PaidLevelDomain.getPLD} for the host, or the hostname. When the
 * IP or domain cannot be determined the hostname is used, and when no key
 * is available at all the full URL string is used. The id is always
 * lower-cased with {@link Locale#ROOT}.
 *
 * @param u
 *            the URL to fetch
 * @param t
 *            the tuple the URL came from; may carry a {@code "key"} field
 * @param queueMode
 *            how URLs are grouped into queues
 * @return the item holding the URL string, the URL, the tuple and the
 *         queue id
 */

public static FetchItem create(URL u, Tuple t, String queueMode) {

    final String url = u.toExternalForm();

    // a key supplied upstream (hostname, domain or IP) takes precedence
    final String suppliedKey = t.contains("key") ? t
            .getStringByField("key") : null;
    if (StringUtils.isNotBlank(suppliedKey)) {
        return new FetchItem(url, u, t,
                suppliedKey.toLowerCase(Locale.ROOT));
    }

    // the hostname is both the default key and the fallback for the
    // IP and domain modes
    final String host = u.getHost();
    String key = host;

    if (FetchItemQueues.QUEUE_MODE_IP.equalsIgnoreCase(queueMode)) {
        try {
            key = InetAddress.getByName(host).getHostAddress();
        } catch (final UnknownHostException e) {
            // key keeps its hostname default
            LOG.warn(
                    "Unable to resolve IP for {}, using hostname as key.",
                    host);
        }
    } else if (FetchItemQueues.QUEUE_MODE_DOMAIN
            .equalsIgnoreCase(queueMode)) {
        final String domain = PaidLevelDomain.getPLD(host);
        if (domain == null) {
            // key keeps its hostname default
            LOG.warn(
                    "Unknown domain for url: {}, using hostname as key",
                    url);
        } else {
            key = domain;
        }
    }

    // last resort: the URL string itself
    if (key == null) {
        LOG.warn("Unknown host for url: {}, using URL string as key",
                url);
        key = url;
    }

    return new FetchItem(url, u, t, key.toLowerCase(Locale.ROOT));
}
 
开发者ID:eorliac,项目名称:patent-crawler,代码行数:56,代码来源:FetcherBolt.java

示例3: getPartition

import crawlercommons.domains.PaidLevelDomain; //导入方法依赖的package包/类
/**
 * Computes the partition key of a URL for the configured partition mode:
 * its host, the value returned by {@code PaidLevelDomain.getPLD} for that
 * host, or its IP address.
 * <p>
 * In IP mode a non-blank {@code "ip"} metadata value is used as is and no
 * lookup is done; otherwise the host is resolved.
 *
 * @param url
 *            the URL to partition
 * @param metadata
 *            metadata of the URL, read for the {@code "ip"} value in IP
 *            mode
 * @return the partition key, or {@code null} when the URL is malformed or
 *         the host cannot be resolved in IP mode
 **/
public String getPartition(String url, Metadata metadata) {

    final boolean byIp = mode.equalsIgnoreCase(Constants.PARTITION_MODE_IP);

    // an IP already present in the metadata saves the lookup
    String partitionKey = null;
    if (byIp) {
        final String ipFromMetadata = metadata.getFirstValue("ip");
        if (StringUtils.isNotBlank(ipFromMetadata)) {
            partitionKey = ipFromMetadata;
        }
    }

    // the host is only needed when no key has been found yet
    String host = "";
    if (partitionKey == null) {
        try {
            host = new URL(url).getHost();
        } catch (MalformedURLException e1) {
            LOG.warn("Invalid URL: {}", url);
            return null;
        }
    }

    if (mode.equalsIgnoreCase(Constants.PARTITION_MODE_HOST)) {
        // partition by hostname
        partitionKey = host;
    } else if (mode.equalsIgnoreCase(Constants.PARTITION_MODE_DOMAIN)) {
        // partition by domain : needs fixing
        partitionKey = PaidLevelDomain.getPLD(host);
    }

    // partition by IP, resolving the host when the metadata had none
    if (byIp && partitionKey == null) {
        try {
            final long startedAt = System.currentTimeMillis();
            partitionKey = InetAddress.getByName(host).getHostAddress();
            final long elapsed = System.currentTimeMillis() - startedAt;
            LOG.debug("Resolved IP {} in {} msec for : {}", partitionKey,
                    elapsed, url);
        } catch (final Exception e) {
            LOG.warn("Unable to resolve IP for: {}", host);
            return null;
        }
    }

    LOG.debug("Partition Key for: {} > {}", url, partitionKey);

    return partitionKey;
}
 
开发者ID:DigitalPebble,项目名称:storm-crawler,代码行数:59,代码来源:URLPartitioner.java

示例4: filter

import crawlercommons.domains.PaidLevelDomain; //导入方法依赖的package包/类
/**
 * Keeps a URL only if it stays on the same host and/or domain as the
 * source URL, depending on {@code ignoreOutsideHost} and
 * {@code ignoreOutsideDomain}.
 * <p>
 * The host and domain of the most recent source URL are remembered in the
 * {@code previousSource*} fields so that consecutive calls with the same
 * {@code URL} instance do not recompute them.
 *
 * @param sourceUrl
 *            URL of the page the link was found on
 * @param sourceMetadata
 *            metadata of the source page; not read by this method
 * @param urlToFilter
 *            the candidate URL
 * @return {@code urlToFilter} when it is accepted, or when
 *         {@code sourceUrl} is {@code null} or neither restriction is
 *         enabled; {@code null} when it is malformed or rejected
 */
@Override
public String filter(URL sourceUrl, Metadata sourceMetadata,
        String urlToFilter) {
    final boolean restrictionEnabled = ignoreOutsideHost
            || ignoreOutsideDomain;
    if (sourceUrl == null || !restrictionEnabled) {
        return urlToFilter;
    }

    final URL target;
    try {
        target = new URL(urlToFilter);
    } catch (MalformedURLException e1) {
        return null;
    }

    final String fromHost;
    final String fromDomain;
    // Using identity comparison because URL.equals performs poorly
    if (sourceUrl != previousSourceUrl) {
        // different source than last time: compute and remember
        fromHost = sourceUrl.getHost();
        fromDomain = ignoreOutsideDomain ? PaidLevelDomain
                .getPLD(fromHost) : null;
        previousSourceHost = fromHost;
        previousSourceDomain = fromDomain;
        previousSourceUrl = sourceUrl;
    } else {
        // same source as last time: reuse the remembered values
        fromHost = previousSourceHost;
        fromDomain = ignoreOutsideDomain ? previousSourceDomain : null;
    }

    final String toHost = target.getHost();

    // host restriction: case-insensitive match on the host names
    if (ignoreOutsideHost
            && (toHost == null || !toHost.equalsIgnoreCase(fromHost))) {
        return null;
    }

    // domain restriction: exact match on the getPLD results
    if (ignoreOutsideDomain) {
        final String toDomain = PaidLevelDomain.getPLD(toHost);
        final boolean sameDomain = toDomain != null
                && toDomain.equals(fromDomain);
        if (!sameDomain) {
            return null;
        }
    }

    return urlToFilter;
}
 
开发者ID:DigitalPebble,项目名称:storm-crawler,代码行数:51,代码来源:HostURLFilter.java


注:本文中的crawlercommons.domains.PaidLevelDomain.getPLD方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。