当前位置: 首页>>代码示例>>Python>>正文


Python tldextract.TLDExtract方法代码示例

本文整理汇总了 Python 中 tldextract.TLDExtract 方法的典型用法代码示例。如果您正苦于以下问题：Python tldextract.TLDExtract 方法的具体用法？Python tldextract.TLDExtract 怎么用？Python tldextract.TLDExtract 使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块 tldextract 的用法示例。


在下文中一共展示了 tldextract.TLDExtract 方法的 13 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。

示例1: setup

# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import TLDExtract [as 别名]
def setup(self, settings):
        '''
        Prepare the handler: create the domain extractor and the Redis client.

        ``settings`` must provide ``REDIS_HOST`` and ``REDIS_PORT``;
        ``REDIS_DB`` is optional and is passed as ``None`` when absent.
        The process exits with status 1 when the ``info()`` probe raises
        ``ConnectionError``.
        '''
        self.extract = tldextract.TLDExtract()

        redis_kwargs = {
            'host': settings['REDIS_HOST'],
            'port': settings['REDIS_PORT'],
            'db': settings.get('REDIS_DB'),
        }
        self.redis_conn = redis.Redis(**redis_kwargs)

        try:
            # probe the server once so an unreachable Redis is noticed now
            self.redis_conn.info()
            self.logger.debug("Connected to Redis in ActionHandler")
        except ConnectionError:
            # plugin is essential to functionality, so stop the process
            self.logger.error("Failed to connect to Redis in ActionHandler")
            sys.exit(1)
开发者ID:istresearch,项目名称:scrapy-cluster,代码行数:18,代码来源:action_handler.py

示例2: setup

# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import TLDExtract [as 别名]
def setup(self, settings):
        '''
        Create the tldextract extractor and connect this handler to Redis.

        Reads ``REDIS_HOST`` and ``REDIS_PORT`` (required) and ``REDIS_DB``
        (optional) from ``settings``. Calls ``sys.exit(1)`` if the
        ``info()`` probe raises ``ConnectionError``.
        '''
        self.extract = tldextract.TLDExtract()

        host = settings['REDIS_HOST']
        port = settings['REDIS_PORT']
        db = settings.get('REDIS_DB')
        self.redis_conn = redis.Redis(host=host, port=port, db=db)

        try:
            # a cheap command that fails when the server cannot be reached
            self.redis_conn.info()
            self.logger.debug("Connected to Redis in ZookeeperHandler")
        except ConnectionError:
            # plugin is essential to functionality, so stop the process
            self.logger.error("Failed to connect to Redis in ZookeeperHandler")
            sys.exit(1)
开发者ID:istresearch,项目名称:scrapy-cluster,代码行数:18,代码来源:zookeeper_handler.py

示例3: setup

# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import TLDExtract [as 别名]
def setup(self, settings):
        '''
        Set up the two collaborators of the handler: a tldextract
        extractor (``self.extract``) and a Redis client (``self.redis_conn``).

        The Redis host and port are mandatory ``settings`` entries; the
        database falls back to ``None`` when ``REDIS_DB`` is not set.
        A ``ConnectionError`` from the ``info()`` probe terminates the
        process with exit status 1.
        '''
        self.extract = tldextract.TLDExtract()

        connection = dict(
            host=settings['REDIS_HOST'],
            port=settings['REDIS_PORT'],
            db=settings.get('REDIS_DB'),
        )
        self.redis_conn = redis.Redis(**connection)

        try:
            # verify the connection before the handler is put to work
            self.redis_conn.info()
            self.logger.debug("Connected to Redis in ScraperHandler")
        except ConnectionError:
            # plugin is essential to functionality, so stop the process
            self.logger.error("Failed to connect to Redis in ScraperHandler")
            sys.exit(1)
开发者ID:istresearch,项目名称:scrapy-cluster,代码行数:18,代码来源:scraper_handler.py

示例4: extract

# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import TLDExtract [as 别名]
def extract(self):
        """
        Break the matched domain into its subdomain, domain and suffix.

        The extractor receives the ``public_suffix_list.dat`` path located
        under ``setting.data_storage_dir`` as its first positional argument.

        Example::

            d = Domain('www.example.com')
            d.extract()
            # ExtractResult(subdomain='www', domain='example', suffix='com')

        :return: the extractor result for ``self.match()``, or ``None`` when
                 ``self.match()`` yields a falsy value
        """
        suffix_list_path = setting.data_storage_dir.joinpath('public_suffix_list.dat')
        extractor = tldextract.TLDExtract(suffix_list_path)
        matched = self.match()
        if not matched:
            return None
        return extractor(matched)
开发者ID:shmilylty,项目名称:OneForAll,代码行数:21,代码来源:domain.py

示例5: __init__

# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import TLDExtract [as 别名]
def __init__(self, jconfig, vtapikey):
        """
        Load the DOMAINS_WHITELIST and setup the tld-extractor

        All state is stored on the ``UrlChecker`` class, not on the instance.

        :param jconfig: mapping that provides ``DOMAINS_WHITELIST`` (path of
            the pickled whitelist) and ``TOP_DOMAINS_CACHE`` (cache file
            handed to ``tldextract.TLDExtract``)
        :param vtapikey: API key kept on the class for later use
        :raises KeyError: if ``TOP_DOMAINS_CACHE`` is missing from ``jconfig``
        """
        try:
            with open(jconfig['DOMAINS_WHITELIST'], 'rb') as f_in:
                # NOTE(security): pickle.load can execute arbitrary code
                # embedded in the file, so the whitelist file must come from
                # a trusted location.
                UrlChecker.whitelist = pickle.load(f_in)
        except Exception:
            # Loading the whitelist is best effort: any failure (missing key,
            # unreadable file, corrupt pickle) falls back to an empty list.
            # `Exception` rather than a bare `except` so that
            # KeyboardInterrupt and SystemExit are not swallowed.
            log.error("URL whitelist loading error")
            UrlChecker.whitelist = list()
        cache_file = jconfig['TOP_DOMAINS_CACHE']
        UrlChecker.__tld = tldextract.TLDExtract(cache_file=cache_file)
        UrlChecker.__vtapikey = vtapikey
开发者ID:jimmy-sonny,项目名称:YaYaGen,代码行数:15,代码来源:url_checker.py

示例6: test_scrape_handler

# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import TLDExtract [as 别名]
def test_scrape_handler(self):
        """
        Check that ScraperHandler.handle() calls ``zadd`` on the Redis
        connection, and calls ``set`` once the request carries ``expires``.

        Each Redis call is replaced by a mock raising a tagged
        AssertionError; catching that error proves the call was reached.
        """
        valid = {
            "url": "www.stuff.com",
            "crawlid": "abc124",
            "appid": "testapp",
            "spiderid": "link",
            "priority": 5,
        }
        handler = ScraperHandler()
        handler.extract = tldextract.TLDExtract()
        handler.redis_conn = MagicMock()

        # check it is added to redis
        handler.redis_conn.zadd = MagicMock(side_effect=AssertionError("added"))
        try:
            handler.handle(valid)
        except AssertionError as e:
            self.assertEqual("added", str(e))
        else:
            # kept outside the try block so this failure is not caught by
            # the except clause above and reported as a message mismatch
            self.fail("Action not called")

        # check timeout is added
        handler.redis_conn.zadd = MagicMock()
        handler.redis_conn.set = MagicMock(side_effect=AssertionError("expires"))
        valid['expires'] = 124242
        try:
            handler.handle(valid)
        except AssertionError as e:
            self.assertEqual("expires", str(e))
        else:
            self.fail("Expires not called")
开发者ID:istresearch,项目名称:scrapy-cluster,代码行数:31,代码来源:test_plugins.py

示例7: __init__

# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import TLDExtract [as 别名]
def __init__(self, server, persist, update_int, timeout, retries, logger,
                 hits, window, mod, ip_refresh, add_type, add_ip, ip_regex,
                 backlog_blacklist, queue_timeout):
        '''
        Store the scheduler configuration, build the domain extractor,
        refresh the IP address and pick an identifier for this instance.
        '''
        # collaborators
        self.redis_conn = server
        self.logger = logger

        # queue bookkeeping
        self.persist = persist
        self.queue_dict = {}
        self.queue_timeout = queue_timeout
        self.backlog_blacklist = backlog_blacklist
        self.item_retires = retries
        self.rfp_timeout = timeout

        # throttle settings
        self.hits = hits
        self.window = window
        self.moderated = mod
        self.update_interval = update_int

        # queue key / ip address settings
        self.add_type = add_type
        self.add_ip = add_ip
        self.ip_update_interval = ip_refresh
        self.ip_regex = re.compile(ip_regex)

        # domain splitter
        self.extract = tldextract.TLDExtract()

        # every attribute is assigned before the address lookup runs
        self.update_ipaddress()

        # last group of a random uuid; change here if better ids are needed
        self.my_uuid = str(uuid.uuid4()).rsplit('-', 1)[-1]
开发者ID:istresearch,项目名称:scrapy-cluster,代码行数:32,代码来源:distributed_scheduler.py

示例8: tldextract_parser

# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import TLDExtract [as 别名]
def tldextract_parser(url):
    """
    Parse ``url`` with a ``TLDExtract`` built from ``tld_extract_dict``.

    Any exception raised while building or calling the extractor is logged
    through ``logging.error`` and swallowed.

    :param url: value handed to the extractor
    :return: the extraction result, or ``None`` when an error occurred
    """
    try:
        return TLDExtract(**tld_extract_dict)(url)
    except Exception as err:
        logging.error(err)
    return None
开发者ID:yeti-platform,项目名称:yeti,代码行数:11,代码来源:utils.py

示例9: split_hostname

# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import TLDExtract [as 别名]
def split_hostname(hostname):
    """
    Break a hostname into subdomain, domain and TLD.

    Expected results, for example::

        split_hostname("www.example.com")
        # ('www', 'example', 'com')
        split_hostname("some.subdomain.of.example.co.uk")
        # ('some.subdomain.of', 'example', 'co.uk')

    The extractor is created with ``fetch=False`` and is fed the hostname
    after it went through ``to_utf8``.

    :param hostname: Hostname to split.
    :type hostname: str

    :returns: Subdomain, domain and TLD.
    :rtype: tuple(str, str, str)
    """
    parts = TLDExtract(fetch=False)(to_utf8(hostname))
    return (parts.subdomain, parts.domain, parts.suffix)


#------------------------------------------------------------------------------ 
开发者ID:blackye,项目名称:luscan-devel,代码行数:30,代码来源:web_utils.py

示例10: get_domain

# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import TLDExtract [as 别名]
def get_domain(url):
    """
    Return ``"<domain>.<suffix>"`` for ``url``.

    The extractor is built with ``suffix_list_urls=None``.
    """
    extractor = TLDExtract(suffix_list_urls=None)
    parts = extractor(url)
    domain, suffix = parts.domain, parts.suffix
    return "{}.{}".format(domain, suffix)
开发者ID:hhursev,项目名称:recipe-scrapers,代码行数:6,代码来源:__init__.py

示例11: get_domain

# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import TLDExtract [as 别名]
def get_domain(self, url):
    """
    Store ``"<domain>.<suffix>"`` of ``url`` in ``self.domain``.

    The extractor is built with ``suffix_list_urls=None``; nothing is
    returned.
    """
    parts = tldextract.TLDExtract(suffix_list_urls=None)(url)
    domain, suffix = parts.domain, parts.suffix
    self.domain = "{}.{}".format(domain, suffix)
开发者ID:fanmatics,项目名称:metadoc,代码行数:6,代码来源:domaintools.py

示例12: __init__

# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import TLDExtract [as 别名]
def __init__(self, config=None):
        """
        Build the client: normalise ``config`` into a ConfigResolver,
        validate it, compute the effective domain and instantiate the
        provider.

        :param config: a ConfigResolver, a legacy configuration object, or a
            falsy value to use the non-interactive resolver
        """
        if not config:
            # nothing supplied: fall back to the non-interactive resolver
            self.config = non_interactive_config_resolver()
        elif isinstance(config, ConfigResolver):
            self.config = config
        else:
            # legacy situation: wrap the object to protect the Client API
            self.config = legacy_config_resolver(config)

        self._validate_config()

        # Reduce the configured domain to "<domain>.<suffix>", i.e. drop any
        # subdomain part.
        extractor = tldextract.TLDExtract(cache_file=TLDEXTRACT_CACHE_FILE,
                                          include_psl_private_domains=True)
        parts = extractor(self.config.resolve('lexicon:domain'))
        base_domain = '{0}.{1}'.format(parts.domain, parts.suffix)
        runtime_config = {'domain': base_domain}

        if self.config.resolve('lexicon:delegated'):
            # A delegated domain is prepended to the base domain.
            delegated = self.config.resolve('lexicon:delegated').rstrip('.')
            if delegated != base_domain:
                if delegated.endswith(base_domain):
                    # absolute form given: keep only the relative part
                    delegated = delegated[:-len(base_domain)].rstrip('.')
                runtime_config['domain'] = '{0}.{1}'.format(
                    delegated, base_domain)

        # Both values are resolved before the runtime source is registered.
        self.action = self.config.resolve('lexicon:action')
        self.provider_name = (self.config.resolve('lexicon:provider_name')
                              or self.config.resolve('lexicon:provider'))

        # Register the computed values at position 0 of the config sources.
        self.config.add_config_source(DictConfigSource(runtime_config), 0)

        module_path = 'lexicon.providers.' + self.provider_name
        provider_class = importlib.import_module(module_path).Provider
        self.provider = provider_class(self.config)
开发者ID:AnalogJ,项目名称:lexicon,代码行数:48,代码来源:client.py

示例13: extend_IOC

# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import TLDExtract [as 别名]
def extend_IOC(self, argument, observable_list):
        """
            Extending IOC from URL into URL + DOMAIN + IP

            The registered domain, the suffix and the dot-joined non-empty
            parts of the tldextract result are added to ``observable_list``.
            Unless ``config['offline']`` is set, the address returned by
            ``socket.gethostbyname`` for each of them is added as well.
            Missing or empty values and entries already present in the list
            are skipped.

            :param argument: URL or host name handed to tldextract
            :param observable_list: list extended in place
            :return: None
        """
        # Stays None when offline without a configured cache: in that case
        # nothing can be extracted and the list is left untouched.
        extract = None
        if config['offline']:
            # Cache search
            # TODO
            if "TLDE_cache" in config:
                cache_file = "%s%s" % (config['temporary_cache_path'], config['TLDE_cache'])
                cache_extract = tldextract.TLDExtract(cache_file=cache_file)
                extract = cache_extract(argument)
        else:
            # Live search
            no_cache_extract = tldextract.TLDExtract(cache_file=False)
            extract = no_cache_extract(argument)

        candidates = []
        if extract is not None:
            candidates.append(getattr(extract, 'registered_domain', None))
            candidates.append(getattr(extract, 'suffix', None))
            try:
                candidates.append('.'.join(part for part in extract if part))
            except TypeError:
                # the result object cannot be iterated as a sequence of
                # strings: no complete domain available
                pass
        # An empty string is not a usable observable, and resolving it could
        # yield a meaningless address, so drop it together with None.
        domains = [domain for domain in candidates if domain]

        IPs = []
        if not config["offline"]:
            for domain in domains:
                try:
                    IPs.append(socket.gethostbyname(domain))
                except (socket.error, ValueError):
                    # unresolvable or malformed host name: no IP to record
                    continue

        # Domains first, then IPs, never duplicating an existing observable.
        for observable in domains + IPs:
            if observable not in observable_list:
                observable_list.append(observable)
开发者ID:conix-security,项目名称:BTG,代码行数:47,代码来源:BTG.py


注:本文中的tldextract.TLDExtract方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。