本文整理汇总了Python中tldextract.TLDExtract类的典型用法代码示例。如果您正苦于以下问题：Python tldextract.TLDExtract类的具体用法？Python tldextract.TLDExtract怎么用？Python tldextract.TLDExtract使用的例子？那么，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块tldextract的用法示例。
在下文中一共展示了tldextract.TLDExtract类的13个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: setup
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import TLDExtract [as 别名]
def setup(self, settings):
    """
    Set up the tldextract extractor and the Redis connection.

    :param settings: mapping that provides ``REDIS_HOST`` and
        ``REDIS_PORT`` (both required) and ``REDIS_DB`` (read with
        ``.get``, so the key may be absent).

    Exits the process with status 1 when probing Redis raises
    ``ConnectionError``.
    """
    self.extract = tldextract.TLDExtract()
    self.redis_conn = redis.Redis(
        host=settings['REDIS_HOST'],
        port=settings['REDIS_PORT'],
        db=settings.get('REDIS_DB'),
    )

    try:
        # Probe the server up front so a bad connection is reported here.
        self.redis_conn.info()
        self.logger.debug("Connected to Redis in ActionHandler")
    except ConnectionError:
        # NOTE(review): this name has to resolve to the exception class the
        # redis client raises (presumably imported from redis.exceptions)
        # -- confirm against the file's imports.
        self.logger.error("Failed to connect to Redis in ActionHandler")
        # plugin is essential to functionality
        sys.exit(1)
示例2: setup
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import TLDExtract [as 别名]
def setup(self, settings):
    """
    Set up the tldextract extractor and the Redis connection.

    :param settings: mapping that provides ``REDIS_HOST`` and
        ``REDIS_PORT`` (both required) and ``REDIS_DB`` (read with
        ``.get``, so the key may be absent).

    Exits the process with status 1 when probing Redis raises
    ``ConnectionError``.
    """
    self.extract = tldextract.TLDExtract()
    self.redis_conn = redis.Redis(
        host=settings['REDIS_HOST'],
        port=settings['REDIS_PORT'],
        db=settings.get('REDIS_DB'),
    )

    try:
        # Probe the server up front so a bad connection is reported here.
        self.redis_conn.info()
        self.logger.debug("Connected to Redis in ZookeeperHandler")
    except ConnectionError:
        # NOTE(review): this name has to resolve to the exception class the
        # redis client raises (presumably imported from redis.exceptions)
        # -- confirm against the file's imports.
        self.logger.error("Failed to connect to Redis in ZookeeperHandler")
        # plugin is essential to functionality
        sys.exit(1)
示例3: setup
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import TLDExtract [as 别名]
def setup(self, settings):
    """
    Set up the tldextract extractor and the Redis connection.

    :param settings: mapping that provides ``REDIS_HOST`` and
        ``REDIS_PORT`` (both required) and ``REDIS_DB`` (read with
        ``.get``, so the key may be absent).

    Exits the process with status 1 when probing Redis raises
    ``ConnectionError``.
    """
    self.extract = tldextract.TLDExtract()
    self.redis_conn = redis.Redis(
        host=settings['REDIS_HOST'],
        port=settings['REDIS_PORT'],
        db=settings.get('REDIS_DB'),
    )

    try:
        # Probe the server up front so a bad connection is reported here.
        self.redis_conn.info()
        self.logger.debug("Connected to Redis in ScraperHandler")
    except ConnectionError:
        # NOTE(review): this name has to resolve to the exception class the
        # redis client raises (presumably imported from redis.exceptions)
        # -- confirm against the file's imports.
        self.logger.error("Failed to connect to Redis in ScraperHandler")
        # plugin is essential to functionality
        sys.exit(1)
示例4: extract
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import TLDExtract [as 别名]
def extract(self):
    """
    Extract the domain parts of the value returned by ``self.match()``.

    >>> d = Domain('www.example.com')
    >>> d.extract()
    ExtractResult(subdomain='www', domain='example', suffix='com')

    :return: extracted domain results, or None when ``self.match()``
        returns a falsy value
    """
    cache_path = setting.data_storage_dir.joinpath('public_suffix_list.dat')
    # NOTE(review): the path is passed positionally; the meaning of
    # TLDExtract's first parameter differs between tldextract releases
    # (cache file vs. cache directory) -- confirm against the pinned version.
    extractor = tldextract.TLDExtract(cache_path)
    matched = self.match()
    if not matched:
        return None
    return extractor(matched)
示例5: __init__
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import TLDExtract [as 别名]
def __init__(self, jconfig, vtapikey):
    """
    Load the DOMAINS_WHITELIST and set up the tld-extractor.

    :param jconfig: mapping providing ``DOMAINS_WHITELIST`` (path of a
        pickled whitelist) and ``TOP_DOMAINS_CACHE`` (cache file handed to
        ``tldextract.TLDExtract``).
    :param vtapikey: stored unchanged on ``UrlChecker`` (presumably a
        VirusTotal API key -- confirm with the caller).

    All state is written to class attributes of ``UrlChecker``, not to the
    instance. When loading the whitelist fails, an error is logged and an
    empty list is used instead.
    """
    try:
        with open(jconfig['DOMAINS_WHITELIST'], 'rb') as f_in:
            # SECURITY: pickle.load can execute arbitrary code; the
            # whitelist file must come from a trusted location.
            UrlChecker.whitelist = pickle.load(f_in)
    except Exception:
        # Best-effort load: any failure (missing key, unreadable file,
        # corrupt pickle) falls back to an empty whitelist. Catching
        # Exception instead of everything lets KeyboardInterrupt and
        # SystemExit propagate.
        log.error("URL whitelist loading error")
        UrlChecker.whitelist = list()

    cache_file = jconfig['TOP_DOMAINS_CACHE']
    UrlChecker.__tld = tldextract.TLDExtract(cache_file=cache_file)
    UrlChecker.__vtapikey = vtapikey
示例6: test_scrape_handler
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import TLDExtract [as 别名]
def test_scrape_handler(self):
    """
    Check that ScraperHandler.handle() calls ``zadd`` on the Redis
    connection, and calls ``set`` when the request carries ``expires``.

    Each Redis method under test is replaced by a MagicMock whose side
    effect raises an AssertionError with a marker message; seeing that
    marker proves the method was reached.
    """
    valid = {
        "url": "www.stuff.com",
        "crawlid": "abc124",
        "appid": "testapp",
        "spiderid": "link",
        "priority": 5,
    }
    handler = ScraperHandler()
    handler.extract = tldextract.TLDExtract()
    handler.redis_conn = MagicMock()

    # check it is added to redis
    handler.redis_conn.zadd = MagicMock(side_effect=AssertionError("added"))
    try:
        handler.handle(valid)
        # self.fail() also raises AssertionError, so a missing zadd call is
        # reported by the message comparison below.
        self.fail("Action not called")
    except AssertionError as e:
        # assertEqual: the assertEquals alias was removed in Python 3.12.
        self.assertEqual("added", str(e))

    # check timeout is added
    handler.redis_conn.zadd = MagicMock()
    handler.redis_conn.set = MagicMock(side_effect=AssertionError("expires"))
    valid['expires'] = 124242
    try:
        handler.handle(valid)
        self.fail("Expires not called")
    except AssertionError as e:
        self.assertEqual("expires", str(e))
示例7: __init__
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import TLDExtract [as 别名]
def __init__(self, server, persist, update_int, timeout, retries, logger,
             hits, window, mod, ip_refresh, add_type, add_ip, ip_regex,
             backlog_blacklist, queue_timeout):
    """
    Initialize the scheduler.

    Every argument is stored on the instance; ``ip_regex`` is compiled
    with ``re.compile`` first. After the attributes are set, a
    TLDExtract instance is created, ``self.update_ipaddress()`` is called,
    and a short identifier is derived from a random UUID.

    :param server: stored as ``self.redis_conn``
    :param persist: stored as ``self.persist``
    :param update_int: stored as ``self.update_interval``
    :param timeout: stored as ``self.rfp_timeout``
    :param retries: stored as ``self.item_retires`` (attribute spelling
        kept as-is because other code may read it)
    :param logger: stored as ``self.logger``
    :param hits: stored as ``self.hits``
    :param window: stored as ``self.window``
    :param mod: stored as ``self.moderated``
    :param ip_refresh: stored as ``self.ip_update_interval``
    :param add_type: stored as ``self.add_type``
    :param add_ip: stored as ``self.add_ip``
    :param ip_regex: pattern string, compiled into ``self.ip_regex``
    :param backlog_blacklist: stored as ``self.backlog_blacklist``
    :param queue_timeout: stored as ``self.queue_timeout``
    """
    self.redis_conn = server
    self.persist = persist
    self.queue_dict = {}
    self.update_interval = update_int
    self.hits = hits
    self.window = window
    self.moderated = mod
    self.rfp_timeout = timeout
    self.ip_update_interval = ip_refresh
    self.add_type = add_type
    self.add_ip = add_ip
    self.item_retires = retries
    self.logger = logger
    self.ip_regex = re.compile(ip_regex)
    self.backlog_blacklist = backlog_blacklist
    self.queue_timeout = queue_timeout

    # set up tldextract
    self.extract = tldextract.TLDExtract()

    # Runs only after the attributes above are assigned.
    self.update_ipaddress()

    # if we need better uuid's mod this line
    # (keeps only the last dash-separated group of the UUID string)
    self.my_uuid = str(uuid.uuid4()).split('-')[4]
示例8: tldextract_parser
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import TLDExtract [as 别名]
def tldextract_parser(url):
    """
    Parse *url* with a TLDExtract built from ``tld_extract_dict``.

    :param url: value handed to the extractor
    :return: the extraction result, or None when building the extractor
        or parsing raises (the exception is logged, not re-raised)
    """
    parts = None
    try:
        # A new extractor is constructed on every call, configured by the
        # keyword arguments in tld_extract_dict.
        parts = TLDExtract(**tld_extract_dict)(url)
    except Exception as e:
        logging.error(e)
    return parts
示例9: split_hostname
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import TLDExtract [as 别名]
def split_hostname(hostname):
    """
    Splits a hostname into its subdomain, domain and TLD parts.

    For example:

    >>> from golismero.api.net.web_utils import ParsedURL
    >>> d = ParsedURL("http://www.example.com/")
    >>> d.split_hostname()
    ('www', 'example', 'com')
    >>> d = ParsedURL("http://some.subdomain.of.example.co.uk/")
    >>> d.split_hostname()
    ('some.subdomain.of', 'example', 'co.uk')
    >>> '.'.join(d.split_hostname())
    'some.subdomain.of.example.co.uk'

    :param hostname: Hostname to split.
    :type hostname: str

    :returns: Subdomain, domain and TLD.
    :rtype: tuple(str, str, str)
    """
    # NOTE(review): fetch=False presumably stops tldextract from downloading
    # the suffix list; the keyword depends on the tldextract version in use
    # -- confirm.
    extract = TLDExtract(fetch=False)
    result = extract(to_utf8(hostname))
    return result.subdomain, result.domain, result.suffix
#------------------------------------------------------------------------------
示例10: get_domain
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import TLDExtract [as 别名]
def get_domain(url):
    """
    Return the ``"<domain>.<suffix>"`` part of *url*.

    :param url: URL or hostname handed to the extractor
    :return: the extracted domain and suffix joined by a dot; when the
        extractor finds no suffix the result ends with a bare dot
    """
    # The local name no longer shadows the tldextract module.
    # suffix_list_urls=None: no remote suffix-list URLs are configured.
    extractor = TLDExtract(suffix_list_urls=None)
    url_info = extractor(url)
    return "{}.{}".format(url_info.domain, url_info.suffix)
示例11: get_domain
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import TLDExtract [as 别名]
def get_domain(self, url):
    """
    Store the ``"<domain>.<suffix>"`` part of *url* in ``self.domain``.

    :param url: URL or hostname handed to the extractor
    :return: None; the result is only written to ``self.domain``
    """
    # suffix_list_urls=None: no remote suffix-list URLs are configured.
    no_fetch_extract = tldextract.TLDExtract(suffix_list_urls=None)
    tld = no_fetch_extract(url)
    self.domain = "{}.{}".format(tld.domain, tld.suffix)
示例12: __init__
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import TLDExtract [as 别名]
def __init__(self, config=None):
    """
    Build the client: resolve the configuration, derive the effective
    domain and instantiate the provider.

    :param config: a ConfigResolver, any other truthy object (wrapped by
        ``legacy_config_resolver``), or a falsy value (a non-interactive
        configuration is loaded instead).

    Sets ``self.config``, ``self.action``, ``self.provider_name`` and
    ``self.provider``.
    """
    if not config:
        # If there is not config specified, we load a non-interactive configuration.
        self.config = non_interactive_config_resolver()
    elif not isinstance(config, ConfigResolver):
        # If config is not a ConfigResolver, we are in a legacy situation.
        # We protect this part of the Client API.
        self.config = legacy_config_resolver(config)
    else:
        self.config = config

    # Validate configuration
    self._validate_config()

    runtime_config = {}

    # Process domain, strip subdomain
    domain_extractor = tldextract.TLDExtract(cache_file=TLDEXTRACT_CACHE_FILE,
                                             include_psl_private_domains=True)
    domain_parts = domain_extractor(
        self.config.resolve('lexicon:domain'))
    runtime_config['domain'] = '{0}.{1}'.format(
        domain_parts.domain, domain_parts.suffix)

    if self.config.resolve('lexicon:delegated'):
        # handle delegated domain
        delegated = self.config.resolve('lexicon:delegated').rstrip('.')
        if delegated != runtime_config.get('domain'):
            # convert to relative name
            if delegated.endswith(runtime_config.get('domain')):
                delegated = delegated[:-len(runtime_config.get('domain'))]
                delegated = delegated.rstrip('.')
            # update domain
            runtime_config['domain'] = '{0}.{1}'.format(
                delegated, runtime_config.get('domain'))

    self.action = self.config.resolve('lexicon:action')
    self.provider_name = (self.config.resolve('lexicon:provider_name')
                          or self.config.resolve('lexicon:provider'))

    # The computed domain is registered as a config source at position 0.
    self.config.add_config_source(DictConfigSource(runtime_config), 0)

    # The provider module is looked up by name under lexicon.providers and
    # must expose a class called Provider.
    provider_module = importlib.import_module(
        'lexicon.providers.' + self.provider_name)
    provider_class = getattr(provider_module, 'Provider')
    self.provider = provider_class(self.config)
示例13: extend_IOC
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import TLDExtract [as 别名]
def extend_IOC(self, argument, observable_list):
    """
    Extending IOC from URL into URL + DOMAIN + IP

    Derives the registered domain, the suffix and the dot-joined full
    domain from *argument* and, unless ``config['offline']`` is set,
    resolves each of them to an IP address. Every derived value that is
    not None and not already present is appended to *observable_list*.

    :param argument: URL or hostname handed to tldextract
    :param observable_list: list extended in place
    :return: None
    """
    # Stays None when running offline without a configured TLDE cache; the
    # lookups below then all yield None instead of hitting an unbound name.
    extract = None
    if config['offline']:
        # Cache search
        # TODO
        if "TLDE_cache" in config:
            cache_file = "%s%s" % (config['temporary_cache_path'], config['TLDE_cache'])
            cache_extract = tldextract.TLDExtract(cache_file=cache_file)
            extract = cache_extract(argument)
    else:
        # Live search
        no_cache_extract = tldextract.TLDExtract(cache_file=False)
        extract = no_cache_extract(argument)

    # Each part is extracted best-effort: a failure yields None for that
    # part only.
    try:
        registered_domain = extract.registered_domain
    except Exception:
        registered_domain = None
    try:
        suffix_domain = extract.suffix
    except Exception:
        suffix_domain = None
    try:
        complete_domain = '.'.join(part for part in extract if part)
    except Exception:
        complete_domain = None
    domains = [registered_domain, suffix_domain, complete_domain]

    resolved_ips = []
    if not config["offline"]:
        for domain in domains:
            if domain is None:
                continue
            try:
                resolved_ips.append(socket.gethostbyname(domain))
            except Exception:
                # DNS resolution is best-effort; unresolved names are skipped.
                continue

    for domain in domains:
        if domain is not None and domain not in observable_list:
            observable_list.append(domain)
    for ip_address in resolved_ips:
        if ip_address not in observable_list:
            observable_list.append(ip_address)