

Python Helper.get_domain method code examples

This article collects typical usage examples of the Python method Helper.Helper.get_domain, gathered from open-source projects. If you are unsure what Helper.get_domain does or how to call it, the curated examples below should help. You can also explore other usage examples of the Helper.Helper class.


Four code examples of the Helper.get_domain method are shown below, sorted by popularity by default. All of them come from the pyCrawler project.
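Note that this page does not include the implementation of Helper.get_domain itself. Judging from how the examples call it (it receives a full URL, and its result is concatenated into strings such as 'http://' + domain + '/robots.txt'), it is a static method returning the host portion of a URL. A minimal sketch, assuming that behavior (hypothetical, not the pyCrawler source):

from urllib.parse import urlparse

class Helper:
    @staticmethod
    def get_domain(url):
        # "http://example.com/a/b" -> "example.com"
        return urlparse(url).netloc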

Example 1: fill_back_queue

# Required import: from Helper import Helper [as alias]
# Or alternatively: from Helper.Helper import get_domain [as alias]
    def fill_back_queue(self):
        # Move URLs from the shared front queue into per-domain back queues
        # until the target number of back queues is reached.
        while len(self.backQueue) < self.numBackQueues:

            if not self.frontQueue.empty():
                url = self.frontQueue.get()
            else:
                return -1  # front queue exhausted

            domain = Helper.get_domain(url)
            if domain in self.backQueue:
                # Known domain: enqueue the URL and refresh its timestamp.
                self.backQueue[domain][0].put(url)
                self.backQueue[domain][1] = time()
            else:
                # New domain: create a [queue, last-access-time] pair.
                self.backQueue[domain] = [Queue(), time()]
                self.backQueue[domain][0].put(url)
Author: Roknahr | Project: pyCrawler | Lines: 17 | Source: Frontier.py

Example 2: process_url

# Required import: from Helper import Helper [as alias]
# Or alternatively: from Helper.Helper import get_domain [as alias]
    def process_url(self, url):
        Helper.debug("process start")
        try:
            source = request.urlopen(url).read()
        except Exception:
            # Fetch failed; contribute nothing to the frontier.
            return set()
        Helper.debug("process 1:db")

        self.db_cache(url, source)

        # Superseded inline SQLite caching, now handled by db_cache():
        #db = sqlite3.connect("data/pages.db")
        #cursor = db.cursor()
        #cursor.execute("""SELECT url FROM pages""")
        #all_urls = [''.join(item) for item in cursor.fetchall()]
        #if url in all_urls:
        #    cursor.execute("""
        #        UPDATE pages SET html = ? WHERE url = ? """, (source, url))
        #else:
        #    cursor.execute("""
        #        INSERT INTO pages(url, html) VALUES (?,?)""", (url, source))
        #db.commit()
        #db.close()

        Helper.debug("process 2:re")
        # Regex for finding links (raw string; character class kept as in
        # the original pattern)
        rgx = re.compile(r'a href="(/\S+|[/aA-zZ0-9]\S+\.\S+)"')

        linkMatches = rgx.findall(str(source))

        tempFrontier = set()

        tempFrontier.add(url)
        Helper.debug("process 3:add links")
        # Only harvest new links while the front queue is running low.
        if self.frontier.frontQueue.qsize() < 10:
            for link in linkMatches:
                # Accept absolute http(s) links and root-relative paths;
                # skip FTP and mailto links.
                if ('https://' in link or 'http://' in link or link[0] == '/') \
                        and 'ftp.' not in link \
                        and 'ftp://' not in link \
                        and 'mailto:' not in link:
                    tempFrontier.add(self.normalize_url(link, Helper.get_domain(url)))

        #tempFrontier = tempFrontier - set(self.get_disallowed_sites(url, 'GingerWhiskeyCrawler'))
        Helper.debug("process end")
        return tempFrontier
Author: Roknahr | Project: pyCrawler | Lines: 46 | Source: WebCrawler.py

Example 3: get

# Required import: from Helper import Helper [as alias]
# Or alternatively: from Helper.Helper import get_domain [as alias]
    def get(self):
        # Refill when fewer than 80% of the back queues are populated.
        if len(self.backQueue) < (self.numBackQueues * 0.8):
            self.fill_back_queue()

        # Pick the back queue whose domain was accessed least recently.
        next_entry = sorted(self.backQueue.values(), key=lambda x: x[1])[0]

        url = next_entry[0].get()

        domain = Helper.get_domain(url)
        # Politeness delay: ensure at least 2 seconds between requests
        # to the same domain.
        # print(str(next_entry[1] + 2 - time()))
        if time() < next_entry[1] + 2:
            sleep(next_entry[1] + 2 - time())

        self.backQueue[domain][1] = time()

        if self.backQueue[domain][0].empty():
            self.backQueue.pop(domain)
            self.fill_back_queue()

        return url
Author: Roknahr | Project: pyCrawler | Lines: 22 | Source: Frontier.py
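The 2-second politeness window in Example 3 can also be exercised on its own. A standalone sketch of the same wait logic (polite_wait and min_gap are hypothetical names, not part of pyCrawler):

from time import time, sleep

def polite_wait(last_access, min_gap=2.0):
    # Block until at least min_gap seconds separate consecutive
    # requests to the same domain, then return the new timestamp.
    now = time()
    if now < last_access + min_gap:
        sleep(last_access + min_gap - now)
    return time()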

Example 4: get_disallowed_sites

# Required import: from Helper import Helper [as alias]
# Or alternatively: from Helper.Helper import get_domain [as alias]
    def get_disallowed_sites(self, url, myAgent):
        Helper.debug("Get disallowed sites 1")

        domain = Helper.get_domain(url)

        # Serve cached rules if this domain's robots.txt was already fetched.
        if domain in self.robots:
            return self.robots[domain]

        try:
            robot = request.urlopen('http://' + domain + '/robots.txt')
            Helper.debug('    Fetching robots.txt: ' + domain)
        except Exception:
            return []

        reAgent = re.compile(r"User-[aA]gent: *(\S+) *$")
        reDis = re.compile(r"Disallow: *(/\S*) *$")

        agent = None
        disallowed = {}
        Helper.debug("Get disallowed sites 2")
        for line in robot:
            # Crude bytes-to-text cleanup: strip the escaped line endings
            # and the trailing quote left by str() on a bytes object.
            l = str(line).replace("\\n", "").replace("\\r", "")[:-1]
            if reAgent.findall(l):
                # A new User-agent section starts here.
                agent = reAgent.findall(l)[0]
                disallowed[agent] = []
            if reDis.findall(l):
                if agent in disallowed:
                    disallowed[agent].append(reDis.findall(l)[0])
        Helper.debug("Get disallowed sites 3")
        result = []
        # Collect the rules addressed to this crawler plus the wildcard agent.
        if myAgent in disallowed:
            for link in disallowed[myAgent]:
                result.append(link)  # self.normalize_url(link, domain))
        if '*' in disallowed:
            for link in disallowed['*']:
                result.append(link)  # self.normalize_url(link, domain))
        Helper.debug("Get disallowed sites 4")
        self.robots[domain] = result
        return result
Author: Roknahr | Project: pyCrawler | Lines: 41 | Source: WebCrawler.py
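To see what the two regexes in Example 4 extract, here is a standalone sketch run on a few hand-written robots.txt lines (the sample data is illustrative only):

import re

reAgent = re.compile(r"User-[aA]gent: *(\S+) *$")
reDis = re.compile(r"Disallow: *(/\S*) *$")

sample = ["User-agent: *", "Disallow: /private/", "Disallow: /tmp"]
rules = {}
agent = None
for l in sample:
    if reAgent.findall(l):
        agent = reAgent.findall(l)[0]
        rules[agent] = []
    if reDis.findall(l) and agent in rules:
        rules[agent].append(reDis.findall(l)[0])

print(rules)  # {'*': ['/private/', '/tmp']}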


Note: the Helper.Helper.get_domain examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright remains with the original authors, and use and redistribution are subject to each project's license. Do not republish without permission.