This page collects typical usage examples of the Python method Helper.Helper.get_domain. If you have been wondering what Helper.get_domain does, how to call it, or what it looks like in real code, the curated examples below may help. You can also explore the enclosing class Helper.Helper for further context.
Four code examples of Helper.get_domain are shown below, sorted by popularity by default.
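The Helper class itself is not reproduced on this page. For orientation, here is a minimal sketch of what it plausibly looks like, assuming get_domain returns the host portion of a URL via urllib.parse; the debug helper is likewise inferred from the call sites below, not taken from the original source.

from urllib.parse import urlparse

class Helper:
    # Hypothetical sketch of the Helper class exercised below; the real
    # implementation is not shown on this page.
    @staticmethod
    def get_domain(url):
        # urlparse("http://example.com/a").netloc == "example.com"
        return urlparse(url).netloc

    @staticmethod
    def debug(msg):
        # The examples also call Helper.debug; assume simple logging.
        print(msg)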
Example 1: fill_back_queue
# Required import: from Helper import Helper [as alias]
# Or: from Helper.Helper import get_domain [as alias]
# Also used here: from queue import Queue; from time import time
def fill_back_queue(self):
    # Route URLs from the shared front queue into per-domain back queues
    # until the target number of back queues is reached.
    while len(self.backQueue) < self.numBackQueues:
        if not self.frontQueue.empty():
            url = self.frontQueue.get()
        else:
            return -1  # front queue exhausted before all slots were filled
        domain = Helper.get_domain(url)
        if domain in self.backQueue:
            # Existing queue for this domain: append and refresh its timestamp.
            self.backQueue[domain][0].put(url)
            self.backQueue[domain][1] = time()
        else:
            # New domain: create its queue and record the access time.
            self.backQueue[domain] = [Queue(), time()]
            self.backQueue[domain][0].put(url)
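fill_back_queue assumes a frontier whose backQueue maps each domain to a two-element list of [URL queue, last-access timestamp] and whose frontQueue holds not-yet-routed URLs. The constructor is not shown on this page; a minimal sketch of the assumed state, with hypothetical names:

from queue import Queue

class Frontier:
    # Hypothetical constructor inferred from the methods on this page;
    # the original class definition is not shown.
    def __init__(self, seeds, numBackQueues=3):
        self.frontQueue = Queue()           # URLs waiting to be routed
        self.backQueue = {}                 # domain -> [Queue, last access time]
        self.numBackQueues = numBackQueues  # target number of domain queues
        for url in seeds:
            self.frontQueue.put(url)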
Example 2: process_url
# Required import: from Helper import Helper [as alias]
# Or: from Helper.Helper import get_domain [as alias]
# Also used here: import re; from urllib import request
def process_url(self, url):
    Helper.debug("process start")
    try:
        source = request.urlopen(url).read()
    except Exception:
        return set()  # unreachable page: contribute no new links
    Helper.debug("process 1:db")
    self.db_cache(url, source)
    # Earlier inline version of db_cache, kept for reference:
    # db = sqlite3.connect("data/pages.db")
    # cursor = db.cursor()
    # cursor.execute("""SELECT url FROM pages""")
    # all_urls = [''.join(item) for item in cursor.fetchall()]
    # if url in all_urls:
    #     cursor.execute("""
    #         UPDATE pages SET html = ? WHERE url = ? """, (source, url))
    # else:
    #     cursor.execute("""
    #         INSERT INTO pages(url, html) VALUES (?,?)""", (url, source))
    # db.commit()
    # db.close()
    Helper.debug("process 2:re")
    # Regex for finding links (raw string; '/' needs no escaping, and the
    # character class is corrected from the ill-formed [aA-zZ0-9])
    rgx = re.compile(r'a href="(/\S+|[/A-Za-z0-9]\S+\.\S+)"')
    linkMatches = rgx.findall(source.decode('utf-8', errors='ignore'))
    tempFrontier = set()
    tempFrontier.add(url)
    Helper.debug("process 3:add links")
    if self.frontier.frontQueue.qsize() < 10:
        for link in linkMatches:
            # Keep http(s) and root-relative links; skip ftp and mailto.
            if ('https://' in link or 'http://' in link or link[0] == '/') \
                    and 'ftp.' not in link \
                    and 'ftp://' not in link \
                    and 'mailto:' not in link:
                tempFrontier.add(self.normalize_url(link, Helper.get_domain(url)))
    # tempFrontier = tempFrontier - set(self.get_disallowed_sites(url, 'GingerWhiskeyCrawler'))
    Helper.debug("process end")
    return tempFrontier
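process_url also calls a normalize_url method that does not appear on this page. A plausible sketch, assuming it resolves root-relative links against the page's domain with urllib.parse.urljoin and strips fragments:

from urllib.parse import urljoin

def normalize_url(self, link, domain):
    # Hypothetical method on the crawler class: root-relative links such as
    # "/path" are resolved against the domain; absolute links pass through
    # urljoin unchanged. Fragments are dropped so equal pages dedupe.
    absolute = urljoin('http://' + domain + '/', link)
    return absolute.split('#')[0]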
Example 3: get
# Required import: from Helper import Helper [as alias]
# Or: from Helper.Helper import get_domain [as alias]
# Also used here: from time import time, sleep
def get(self):
    # Refill the back queues when fewer than 80% of the slots are in use.
    if len(self.backQueue) < (self.numBackQueues * 0.8):
        self.fill_back_queue()
    # Pick the least recently contacted domain ('next' would shadow the builtin).
    nextQueue = min(self.backQueue.values(), key=lambda x: x[1])
    url = nextQueue[0].get()
    domain = Helper.get_domain(url)
    # Politeness: wait until at least 2 seconds since the last access.
    if time() < nextQueue[1] + 2:
        sleep(nextQueue[1] + 2 - time())
    self.backQueue[domain][1] = time()
    if self.backQueue[domain][0].empty():
        self.backQueue.pop(domain)
        self.fill_back_queue()
    return url
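Taken together, get() enforces a roughly two-second politeness delay per domain: it picks the least recently contacted domain and sleeps out the remainder of the interval before returning a URL. A hypothetical driver loop over these methods might look like this; the Crawler class that owns process_url and a frontier attribute is assumed, not shown on this page:

def crawl(crawler, max_pages=100):
    # Hypothetical wiring: pull a polite URL, fetch it, feed new links back.
    for _ in range(max_pages):
        url = crawler.frontier.get()           # blocks to honor the 2 s delay
        for link in crawler.process_url(url):  # fetch, cache, extract links
            crawler.frontier.frontQueue.put(link)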
Example 4: get_disallowed_sites
# Required import: from Helper import Helper [as alias]
# Or: from Helper.Helper import get_domain [as alias]
# Also used here: import re; from urllib import request
def get_disallowed_sites(self, url, myAgent):
    Helper.debug("Get disallowed sites 1")
    domain = Helper.get_domain(url)
    # Serve robots.txt rules from the per-domain cache when possible.
    if domain in self.robots:
        return self.robots[domain]
    try:
        robot = request.urlopen('http://' + domain + '/robots.txt')
        Helper.debug(' Fetching robots.txt: ' + domain)
    except Exception:
        return []  # no robots.txt reachable: nothing is disallowed
    reAgent = re.compile(r"User-[aA]gent: *(\S+) *$")
    reDis = re.compile(r"Disallow: *(/\S*) *$")
    agent = None
    disallowed = {}
    Helper.debug("Get disallowed sites 2")
    for line in robot:
        # urlopen yields bytes; decode and strip the line endings.
        text = line.decode('utf-8', errors='ignore').rstrip()
        if reAgent.findall(text):
            agent = reAgent.findall(text)[0]
            disallowed[agent] = []
        if reDis.findall(text):
            if agent in disallowed:
                disallowed[agent].append(reDis.findall(text)[0])
    Helper.debug("Get disallowed sites 3")
    # Rules for our own agent plus wildcard rules both apply to us.
    result = []
    if myAgent in disallowed:
        for link in disallowed[myAgent]:
            result.append(link)  # self.normalize_url(link, domain))
    if '*' in disallowed:
        for link in disallowed['*']:
            result.append(link)  # self.normalize_url(link, domain))
    Helper.debug("Get disallowed sites 4")
    self.robots[domain] = result
    return result
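For comparison, the standard library's urllib.robotparser performs the same robots.txt parsing without hand-rolled regexes; an equivalent check looks like this (example.com stands in for a real domain):

from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url('http://example.com/robots.txt')
rp.read()
# True if the agent may fetch the given URL under the site's rules.
allowed = rp.can_fetch('GingerWhiskeyCrawler', 'http://example.com/some/page')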