本文整理汇总了 Python 中 tldextract.extract 方法的典型用法代码示例。如果您正苦于以下问题：Python tldextract.extract 方法的具体用法？Python tldextract.extract 怎么用？Python tldextract.extract 使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块 tldextract 的用法示例。
在下文中一共展示了tldextract.extract方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: get_subdomains
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import extract [as 别名]
def get_subdomains(domain):
    """Validate ``domain`` and brute-force its subdomains.

    The domain is stripped and lower-cased, matched against a hostname
    pattern, and required to have a non-empty suffix according to
    ``tldextract``. Enumeration only runs for a name that passed validation.

    Params:
        domain: domain name to enumerate (string).

    Returns:
        The result of ``EnumSubDomain(...).run()``, or ``None`` when the
        domain is empty, fails validation, or enumeration raises.
    """
    if not domain:
        logger.warning('domain is empty')
        return None

    candidate = domain.strip().lower()
    matches = re.findall(r'^(([a-z0-9]+(-[a-z0-9]+)*\.)+[a-z]{2,})$', candidate)
    is_valid = (
        bool(matches)
        and matches[0][0] == candidate
        and tldextract.extract(candidate).suffix != ''
    )
    if not is_valid:
        logger.error('Domain validation failed: {d}'.format(d=candidate))
        # Stop here: previously the failure was only logged and the rejected
        # name was still handed to the enumerator.
        return None

    try:
        # Enumerate the normalised name, i.e. exactly what was validated.
        return EnumSubDomain(candidate).run()
    except Exception:
        logger.error('Unexpected error occured when brute subdomain for {}'.format(domain),exc_info=True)
        return None
示例2: internal_links
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import extract [as 别名]
def internal_links(target):
    """Collect anchors on the page whose href mentions the target's domain.

    Reads the module-level ``soup`` and merges every matching ``href`` into
    the global ``int_total``, which is left bound to a set. Progress and the
    final count are printed (``G``, ``C`` and ``W`` are module-level strings,
    presumably colour codes).

    Params:
        target: URL or host name whose registered domain marks a link as
            internal.
    """
    global int_total
    print(G + '[+]' + C + ' Extracting Internal Links' + W, end = '')

    domain = tldextract.extract(target).registered_domain
    # NOTE(review): if the registered domain is empty, ``domain in url`` is
    # true for every link - confirm whether such targets need special handling.

    # Start from a set: int_total is rebound to a set below, so the former
    # list.append() raised AttributeError when this ran a second time.
    collected = set(int_total)
    for link in soup.find_all('a'):
        url = link.get('href')
        if url is not None and domain in url:
            collected.add(url)

    int_total = collected
    print(G + '['.rjust(6, '.') + ' {} ]'.format(str(len(int_total))))
示例3: external_links
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import extract [as 别名]
def external_links(target):
    """Collect anchors on the page that point outside the target's domain.

    Reads the module-level ``soup`` and merges every ``href`` that contains
    ``'http'`` but not the target's registered domain into the global
    ``ext_total``, which is left bound to a set. Progress and the final count
    are printed (``G``, ``C`` and ``W`` are module-level strings, presumably
    colour codes).

    Params:
        target: URL or host name whose registered domain marks a link as
            internal (and therefore excluded here).
    """
    global ext_total
    print(G + '[+]' + C + ' Extracting External Links' + W, end = '')

    domain = tldextract.extract(target).registered_domain
    # NOTE(review): if the registered domain is empty, ``domain not in url``
    # is never true and nothing is collected - confirm this is acceptable.

    # Start from a set: ext_total is rebound to a set below, so the former
    # list.append() raised AttributeError when this ran a second time.
    collected = set(ext_total)
    for link in soup.find_all('a'):
        url = link.get('href')
        if url is not None and domain not in url and 'http' in url:
            collected.add(url)

    ext_total = collected
    print(G + '['.rjust(6, '.') + ' {} ]'.format(str(len(ext_total))))
示例4: remove_tld
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import extract [as 别名]
def remove_tld(domain):
    """Strip the suffix reported by ``extract`` from a domain name.

    The last occurrence of the suffix is cut out of the string and any
    leading/trailing dots are trimmed. If anything raises along the way, a
    warning is logged and the input is returned unchanged.

    Params:
        - domain: (type: string) FQDN.

    Returns:
        - domain: (type: string) FQDN without TLD.
    """
    try:
        suffix = extract(domain).suffix
        # Split once from the right so only the final occurrence is dropped.
        pieces = domain.rsplit(suffix, 1)
        domain = ''.join(pieces).strip('.')
    except Exception as err:
        message = 'Error stripping TLD ({0}): {1}'.format(domain, str(err))
        LOGGING.warning(message)
    return domain
示例5: transfer_session_cookies_to_driver
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import extract [as 别名]
def transfer_session_cookies_to_driver(self, domain=None):
    """Copy the session's cookies for one domain into the webdriver.

    Only cookies whose ``domain`` attribute contains ``domain`` are copied.
    When ``domain`` is not given, the registered domain of the last requested
    URL is used instead.

    Raises:
        Exception: if no domain is given and no page has been requested yet.
    """
    if not domain:
        if not self._last_requests_url:
            raise Exception('Trying to transfer cookies to selenium without specifying a domain '
                            'and without having visited any page in the current session')
        domain = tldextract.extract(self._last_requests_url).registered_domain

    # Select the matching cookies first, then hand them to the driver.
    selected = [cookie for cookie in self.cookies if domain in cookie.domain]
    for cookie in selected:
        payload = {'name': cookie.name, 'value': cookie.value, 'path': cookie.path,
                   'expiry': cookie.expires, 'domain': cookie.domain}
        self.driver.ensure_add_cookie(payload)
示例6: getDomain
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import extract [as 别名]
def getDomain(url):
    """
    Return the registered domain of a URL, recording its network location.

    When the URL has a non-empty netloc, that netloc is added to the
    module-level ``finalset`` as a side effect.

    Parameters
    -------
    url: str
        Original URL provided in the argument.

    Returns
    --------
    str
        The ``registered_domain`` reported by tldextract for the URL.
    """
    netloc = urlparse(url).netloc
    if netloc != '':
        finalset.add(netloc)
    return tldextract.extract(str(url)).registered_domain
示例7: parse_domain
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import extract [as 别名]
def parse_domain(url):
    """
    Return the registered domain found in a URL.

    The URL is lower-cased and stripped; if it contains several scheme
    prefixes, only the text after the last one is examined.

    Args:
        url (str)

    Returns: str
    """
    normalized = url.lower().strip()

    # Everything after the final `http://` / `https://` marker.
    tail = re.split('http[s]?:/{1,2}', normalized)[-1]

    return tldextract.extract(tail).registered_domain
示例8: can_create_user
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import extract [as 别名]
def can_create_user(email_address, blacklisted_domains=None):
    """
    Decide whether a user with the given e-mail address may be created.

    Checks, in order: the e-mail domain blacklist (when that feature is on),
    the global user-creation switch, and, in invite-only mode, whether an
    invite exists for the address.
    """
    if features.BLACKLISTED_EMAILS and email_address and "@" in email_address:
        denied = blacklisted_domains or []
        email_domain = email_address.split("@", 1)[1]
        registered = tldextract.extract(email_domain).registered_domain
        if registered.lower() in denied:
            return False

    if not features.USER_CREATION:
        return False

    if not features.INVITE_ONLY_USER_CREATION:
        # The user can be created (assuming it doesn't already exist, of course)
        return True

    if not email_address:
        return False

    # Invite-only: creation is allowed only if an invite exists for the address.
    return bool(model.team.lookup_team_invites_by_email(email_address))
示例9: collect_url
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import extract [as 别名]
def collect_url(web_url):
    """Fetch a page, feed it to the link crawler and save a copy of its HTML.

    The body is only read when the Content-Type header contains
    ``text/html``; otherwise an empty string is parsed and saved. The copy is
    written to ``<Crawl_bot.folder_name>/<domain>``, where the domain comes
    from tldextract.

    Params:
        web_url: URL of the page to fetch.

    Returns:
        The value of ``link_crawler(...).page_urls()``, or an empty set if
        any step raises (the error is printed).
    """
    html_data_string = ''
    try:
        # The with-block releases the HTTP response even when reading fails;
        # previously it was never closed.
        with urlopen(web_url) as received_response:
            if 'text/html' in received_response.getheader('Content-Type'):
                data_bytes = received_response.read()
                html_data_string = data_bytes.decode("latin-1")
        link_finder = link_crawler(Crawl_bot.start_link, web_url)
        link_finder.feed(html_data_string)
        # For scraping purposes: keep the page on disk, named after its domain.
        dump_path = Crawl_bot.folder_name + '/' + ((tldextract.extract(web_url)).domain)
        # The with-block closes the file even if the write raises.
        with open(dump_path, 'w') as f:
            f.write(html_data_string)
    except Exception as e:
        print(str(e))
        return set()
    return link_finder.page_urls()
示例10: search_google_
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import extract [as 别名]
def search_google_(target):
    """Search Google for the quoted target and parse each successful hit.

    Every result is printed, then fetched with a 3 second timeout. Pages
    answering with a 2xx status whose ``domain.suffix`` is not listed in
    ``config.BL_parserPhone`` have their tags removed and are passed to
    ``parser.parserMAIN``. Errors while handling a single result are printed
    and do not stop the loop.

    Params:
        target: search term; it is wrapped in single quotes for the query.
    """
    engine = Google()
    results = engine.search("'" + target + "'")
    for r in results:
        print("|--[INFO][GOOGLE][RESULTS][>] " + r["title"] + " | " + r["text"] + " | " + r["link"])
        try:
            # Use named fields rather than positional unpacking, which
            # depends on the exact shape of the extract result.
            extracted = extract(r["link"])
            domain = extracted.domain + '.' + extracted.suffix
            web = requests.get(r["link"], timeout=3)
            print("|----[INFO][WEB][HTTP CODE][>] " + str(web.status_code) + "\n")
            # Was ">= 200 or < 300", which is true for every status code.
            if 200 <= web.status_code < 300:
                if domain not in config.BL_parserPhone:
                    TEXT = er.remove_tags(str(web.text))
                    parser.parserMAIN(TEXT)
        except Exception as e:
            print("|----[ERROR][HTTP CONNECTION][>] " + str(e))
示例11: join_words_subdomains
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import extract [as 别名]
def join_words_subdomains(args, alteration_words):
    """Emit host names with each word glued onto each subdomain label.

    For every line of ``args.input``, every alteration word and every label
    of the line's subdomain, two candidates are passed to ``write_domain``:
    first the label with the word appended, then the label with the word
    prepended. The output file ``args.output_tmp`` is opened in append mode.
    """
    with open(args.input, "r") as source, open(args.output_tmp, "a") as sink:
        for raw_line in source:
            ext = tldextract.extract(raw_line.strip())
            labels = ext.subdomain.split(".")
            for word in alteration_words:
                token = word.strip()
                for position, label in enumerate(labels):
                    # Suffix form first, prefix form second.
                    for altered in (label + token, token + label):
                        # Swap in the altered label, leaving the rest as-is.
                        candidate = labels[:position] + [altered] + labels[position + 1:]
                        full_url = "{0}.{1}.{2}\n".format(
                            ".".join(candidate), ext.domain, ext.suffix)
                        write_domain(args, sink, full_url)
示例12: cleanup_url
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import extract [as 别名]
def cleanup_url(url, tls_support):
    """
    Prefix a scheme-less URL with http:// or https://, wrapped in single quotes.

    ``tls_support`` is a dict used as a cache: the TLS probe result is looked
    up by ``subdomain.domain.suffix`` and stored there after a miss.
    Returns None when the URL already carries a scheme.
    """
    if urlparse(url).scheme != '':
        return None

    parts = tldextract.extract(url)
    host_key = f'{parts.subdomain}.{parts.domain}.{parts.suffix}'
    try:
        tls_supported = tls_support[host_key]
    except KeyError:
        # Cache miss: probe once and remember the answer for this host.
        tls_supported = TlsTest.test_tls_supported(url)
        tls_support[host_key] = tls_supported
        log.info('Tested domain {}'.format(host_key))

    template = "'https://{}'" if tls_supported else "'http://{}'"
    return template.format(url)
示例13: getGoogleDomains
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import extract [as 别名]
def getGoogleDomains(self):
    """Print the distinct ``domain.suffix`` names found in the Google results.

    Reads ``results/<self.org>/google.txt`` line by line, reduces each
    non-blank line to ``domain.suffix`` via tldextract and prints every
    distinct name once, in first-seen order.

    Raises:
        SystemExit: after printing "Not found" when the file does not exist.
    """
    path = "results/"+self.org+"/google.txt"
    if not os.path.exists(path):
        print("Not found")
        # Same effect as the site-provided exit(), without depending on it.
        raise SystemExit

    domains = []
    seen = set()
    # The with-block closes the file even if extraction raises midway.
    with open(path) as fh:
        for line in fh:
            entry = line.strip()  # do not feed the trailing newline to extract
            if not entry:
                continue  # a blank line would otherwise produce a bare "."
            extracted = tldextract.extract(entry)
            name = extracted.domain+"."+extracted.suffix
            if name not in seen:
                seen.add(name)
                domains.append(name)
    for domain in domains:
        print(domain)
示例14: getBingDomains
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import extract [as 别名]
def getBingDomains(self):
    """Print the distinct ``domain.suffix`` names found in the Bing results.

    Reads ``results/<self.org>/bing.txt`` line by line, reduces each
    non-blank line to ``domain.suffix`` via tldextract and prints every
    distinct name once, in first-seen order.

    Raises:
        SystemExit: after printing "Not found" when the file does not exist.
    """
    path = "results/"+self.org+"/bing.txt"
    if not os.path.exists(path):
        print("Not found")
        # Same effect as the site-provided exit(), without depending on it.
        raise SystemExit

    domains = []
    seen = set()
    # The with-block closes the file even if extraction raises midway.
    with open(path) as fh:
        for line in fh:
            entry = line.strip()  # do not feed the trailing newline to extract
            if not entry:
                continue  # a blank line would otherwise produce a bare "."
            extracted = tldextract.extract(entry)
            name = extracted.domain+"."+extracted.suffix
            if name not in seen:
                seen.add(name)
                domains.append(name)
    for domain in domains:
        print(domain)
示例15: getYahooDomains
# 需要导入模块: import tldextract [as 别名]
# 或者: from tldextract import extract [as 别名]
def getYahooDomains(self):
    """Print the distinct ``domain.suffix`` names found in the Yahoo results.

    Reads ``results/<self.org>/yahoo.txt`` line by line, reduces each
    non-blank line to ``domain.suffix`` via tldextract and prints every
    distinct name once, in first-seen order.

    Raises:
        SystemExit: after printing "Not found" when the file does not exist.
    """
    path = "results/"+self.org+"/yahoo.txt"
    if not os.path.exists(path):
        print("Not found")
        # Same effect as the site-provided exit(), without depending on it.
        raise SystemExit

    domains = []
    seen = set()
    # The with-block closes the file even if extraction raises midway.
    with open(path) as fh:
        for line in fh:
            entry = line.strip()  # do not feed the trailing newline to extract
            if not entry:
                continue  # a blank line would otherwise produce a bare "."
            extracted = tldextract.extract(entry)
            name = extracted.domain+"."+extracted.suffix
            if name not in seen:
                seen.add(name)
                domains.append(name)
    for domain in domains:
        print(domain)