

Python robotparser.RobotFileParser Class Code Examples

This article collects typical usage examples of the robotparser.RobotFileParser class in Python. If you are wondering how to use the RobotFileParser class, how it works in practice, or what real-world code that uses it looks like, the curated class examples below may help.


The following presents 15 code examples of the RobotFileParser class, sorted by popularity by default.
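Before the examples, here is a minimal, self-contained sketch of the typical RobotFileParser workflow: point the parser at a site's robots.txt, read it, then ask whether a given URL may be fetched. It assumes Python 2, where the class lives in the robotparser module (in Python 3 it moved to urllib.robotparser); example.com is only a placeholder host.

from robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url('http://www.example.com/robots.txt')  # location of the site's robots.txt
rp.read()                                        # fetch and parse the file
print rp.can_fetch('*', 'http://www.example.com/some/page')  # True if user-agent '*' may fetch this URL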

Example 1: accessible

def accessible(url):
    u = urlparse(url)
    if u.netloc not in robots_cache:
        resp = requests.get('http://%s/robots.txt' % u.netloc)
        rp = RobotFileParser()
        rp.parse(resp.content.splitlines())
        robots_cache[u.netloc] = rp
    return robots_cache[u.netloc].can_fetch('*', url)
Developer: Mondego, Project: pyreco, Lines of code: 8, Source file: allPythonContent.py

Example 2: _allowed_to_open

 def _allowed_to_open(self, url):
     host = urlparse.urlsplit(url)[1]
     robots_url = urlparse.urlunsplit(('http', host, '/robots.txt', '', ''))
     rp = RobotFileParser(robots_url)
     try:
         rp.read()
     except:
         return False
     return rp.can_fetch(self._agent_name, url)
Developer: AntonDatsik, Project: L4, Lines of code: 9, Source file: crawler.py

Example 3: _get_robot_parser

 def _get_robot_parser(self):
     try:
         return pickle.loads(str(self.robot_parser_pickle))
     except (TypeError, IndexError):
         parser = RobotFileParser()
         parser.set_url(str(self.protocol) + "://" + str(self.domain) + \
                        "/robots.txt")
         self.robot_parser = parser
         return parser
Developer: rafaduran, Project: rdc_crawler, Lines of code: 9, Source file: models.py

Example 4: _get_robot_parser

    def _get_robot_parser(self):
        if self.robot_parser_pickle is not None:
            return pickle.loads(base64.b64decode(self.robot_parser_pickle))
        else:
            parser = RobotFileParser()
            parser.set_url(self.protocol + "://" + self.domain + "/robots.txt")
            self.robot_parser = parser

            return parser
Developer: joskid, Project: celery-crawler, Lines of code: 9, Source file: models.py

Example 5: http_open

    def http_open(self, req):

        url = req.get_full_url()
        host = urlsplit(url)[1]
        robots_url = urlunsplit(('http', host, '/robots.txt', '', ''))
        robotfileparser = RobotFileParser(robots_url)
        robotfileparser.read()
        if not robotfileparser.can_fetch(self.crawlername, url):
            raise RuntimeError('Forbidden by robots.txt')
        return urllib2.HTTPHandler.http_open(self, req)
Developer: KroosT, Project: BSUIR_Python_Lab_003, Lines of code: 10, Source file: crawler.py

Example 6: check_robots

 def check_robots(self, url):
     '''check the robots.txt in this url's domain'''
     hostname = urlparse(url).netloc
     if hostname not in self.domain_list.keys():      # no records in domain_list
         rp = RobotFileParser('http://%s/robots.txt' % hostname)
         print("%s: fetching %s" % (url, rp.url))
         try:
             rp.read()                                # get new robots.txt
         except IOError, e:                           # URL's server not available (connection timeout)
             log.error(str(e))
             rp.disallow_all = True                   # reject all requests
         self.domain_list[hostname] = rp              # add domain entry into domain_list
Developer: YvesChan, Project: OpenSP, Lines of code: 12, Source file: spider.py

Example 7: getRobots

def getRobots(url):
    parsed = urlparse(url)
    robots_url = parsed.scheme + '://' + parsed.netloc + '/robots.txt'
    if robots_url not in robots:
        rp = RobotFileParser()
        try:
            r = requests.get(robots_url, verify=False, timeout=1)
            r.raise_for_status()
        except Exception:
            rp.parse([])                   # robots.txt unavailable: treat as having no rules
        else:
            rp.parse(r.text.splitlines())  # parse() expects a sequence of lines, not a single string
        #print "  new robot at " + robots_url
        robots[robots_url] = rp
    return robots[robots_url]
Developer: itsmeolivia, Project: webcrawler, Lines of code: 15, Source file: p5.py

Example 8: robots_precheck

 def robots_precheck(self, url):
     """
     If we have the robots.txt file available, check it to see if the
     request is permissible.
     
     This does not fetch robots.txt.
     """
     
     fetcher = RedFetcher(url)
     robots_txt = fetcher.fetch_robots_txt(url, lambda a:a, network=False)
     if robots_txt == "":
         return True
     checker = RobotFileParser()
     checker.parse(robots_txt.splitlines())
     return checker.can_fetch(UA_STRING, url)
Developer: jrottenberg, Project: redbot, Lines of code: 15, Source file: webui.py

Example 9: __init__

    def __init__(self, main_page=None, robotrules=True):
        """
            Constructor method that initializes the members used during the crawling process.
        :param main_page: The root page that needs to be crawled for generation of sitemap
        """

        logging.info("Consider Robot.txt ? ==> "+str(robotrules))
        self.robotrules = robotrules
        self.site_map = {}                          # map that records the visits of urls, datemodified and assets
        self.network = {}                           # map that maintains the network/graph of webpages visited
                                                    # The intention of this map is for visual rendering using d3.js

        self.unvisited = set([])                    # a set to keep the list of urls yet to be visited
        self.start_page = None                      # the root page; used to avoid cycles and to keep the crawl
                                                    # process limited to a single domain.
        self.robot_txt_rules = None

        if main_page:
            self.unvisited.add(main_page)
            try:
                self.start_page = urlparse(main_page).netloc
            except:
                logging.error("Improper URL, Please provide a Valid Url:"+main_page)
                exit(0)

        if self.robotrules == "True":
            try:
                logging.info("robot.txt respected")
                self.robot_txt_rules = RobotFileParser()
                self.robot_txt_rules.set_url(main_page + "/robots.txt")
                self.robot_txt_rules.read()
            except:
                logging.error("Unable to read the robot.txt file")
                self.robotrules = False # error reading robot.txt, ignore it forever
Developer: nchikkam, Project: projects, Lines of code: 34, Source file: crawler.py

Example 10: __init__

 def __init__(self, starturl, index_html='', maxlevel=1,
              cookie_file=None, acldb=None, urldb=None, default_charset=None,
              delay=0, timeout=300, debug=0):
   (proto, self.hostport, _x, _y, _z) = urlsplit(starturl)
   assert proto == 'http'
   #Thread.__init__(self)
   self.debug = debug
   self.index_html = index_html
   if cookie_file:
     self.cookiejar = MozillaCookieJar(cookie_file)
     self.cookiejar.load()
   else:
     self.cookiejar = None
   self.robotstxt = RobotFileParser()
   self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
   self.robotstxt.read()
   self.conn = None
   self.urldb = urldb
   self.acldb = acldb
   self.curlevel = 0
   self.delay = delay
   self.timeout = timeout
   self.default_charset = default_charset
   if starturl.endswith('/'):
     starturl += self.index_html
   self.urls = [(starturl, maxlevel)]
   self.crawled = {}                   # 1:injected, 2:crawled
   return
Developer: dreamfrog, Project: jophiel, Lines of code: 28, Source file: textcrawler.py

Example 11: checkRobots

	def checkRobots(self):
		if self.domain[len(self.domain)-1] != "/":
			self.domain += "/"
		request = Request(self.domain+"robots.txt", headers={"User-Agent":config.crawler_user_agent})
		self.rp = RobotFileParser()
		self.rp.set_url(self.domain+"robots.txt")
		self.rp.read()
Developer: vionemc, Project: python-sitemap, Lines of code: 7, Source file: crawler.py

Example 12: Host

class Host(object):
    ''' Represents one host. Responsible for parsing and analyzing
    ``robots.txt``.
    
    :param hostname: the name of the host extracted from a URL.
    '''
    def __init__(self, hostname):
        self.hostname = hostname
        
        self.rp = RobotFileParser()
        self.rp.set_url('http://%s/robots.txt' % self.hostname)
        
    def url_allowed(self, url):
        ''' Checks whether the given URL is allowed to be crawled.
        
        :param url: URL to check.
        '''
        return self.rp.can_fetch(USER_AGENT, url)
Developer: schlamar, Project: PySeeek, Lines of code: 18, Source file: crawler.py
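A possible usage sketch for the Host class above (hypothetical: the excerpt itself never calls rp.read(), so the surrounding project presumably fetches robots.txt elsewhere before url_allowed() is consulted, and USER_AGENT is a constant defined in that module):

host = Host('www.example.com')
host.rp.read()  # fetch and parse robots.txt (not shown in the excerpt)
print host.url_allowed('http://www.example.com/some/page')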

Example 13: crawl

    def crawl(self, seed_url, max_urls=30, max_depth=1, obey_robots=False, max_size=1000000, force_html=True, **kwargs):
        """Crawl website html and return list of URLs crawled

        seed_url: url to start crawling from
        max_urls: maximum number of URLs to crawl (use None for no limit)
        max_depth: maximum depth to follow links into website (use None for no limit)
        obey_robots: whether to obey robots.txt
        max_size is passed to get() and is limited to 1MB by default
        force_html is passed to get() and is set to True by default so that only HTML content is crawled
        **kwargs is passed to get()
        """
        user_agent = kwargs.get("user_agent", self.user_agent)
        server = "http://" + extract_domain(seed_url)
        robots = RobotFileParser()
        if obey_robots:
            robots.parse(self.get(server + "/robots.txt").splitlines())  # load robots.txt
        outstanding = [(seed_url, 0), (server, 0)]  # URLs that still need to be crawled
        crawled = []  # URLs that have already been crawled

        while outstanding:
            # more URLs to crawl
            if len(crawled) == max_urls:
                break
            url, cur_depth = outstanding.pop(0)
            if url not in crawled:
                html = self.get(url, max_size=max_size, force_html=force_html, **kwargs)
                crawled.append(url)
                if max_depth is None or cur_depth < max_depth:
                    # continue crawling
                    for scraped_url in re.findall(re.compile("<a[^>]+href=[\"'](.*?)[\"']", re.IGNORECASE), html):
                        if "#" in scraped_url:
                            scraped_url = scraped_url[
                                : scraped_url.index("#")
                            ]  # remove internal links to prevent duplicates
                        if os.path.splitext(scraped_url)[
                            -1
                        ].lower() not in Download.IGNORED_EXTENSIONS and robots.can_fetch(user_agent, scraped_url):
                            scraped_url = urljoin(server, scraped_url)  # support relative links
                            # check if same domain or sub-domain
                            this_server = extract_domain(scraped_url)
                            if this_server and (this_server in server or server in this_server):
                                outstanding.append((scraped_url, cur_depth + 1))
        return crawled
Developer: staticerror, Project: SeKing, Lines of code: 43, Source file: download.py

Example 14: run_continue

    def run_continue(self, robots_txt):
        """
        Continue after getting the robots file.
        TODO: refactor callback style into events.
        """
        if robots_txt == "": # empty or non-200
            pass
        else:
            checker = RobotFileParser()
            checker.parse(robots_txt.decode('ascii', 'replace').encode('ascii', 'replace').splitlines())
            if not checker.can_fetch(UA_STRING, self.request.uri):
                self.response.http_error = RobotsTxtError()
                self.finish_task()
                return # TODO: show error?

        if 'user-agent' not in [i[0].lower() for i in self.request.headers]:
            self.request.headers.append(
                (u"User-Agent", UA_STRING))
        self.exchange = self.client.exchange()
        self.exchange.on('response_start', self._response_start)
        self.exchange.on('response_body', self._response_body)
        self.exchange.on('response_done', self._response_done)
        self.exchange.on('error', self._response_error)
        if self.status_cb and self.name:
            self.status_cb("fetching %s (%s)" % (
                self.request.uri, self.name
            ))
        req_hdrs = [
            (k.encode('ascii', 'replace'), v.encode('latin-1', 'replace')) \
            for (k, v) in self.request.headers
        ]
        self.exchange.request_start(
            self.request.method, self.request.uri, req_hdrs
        )
        self.request.start_time = thor.time()
        if self.request.payload is not None:
            self.exchange.request_body(self.request.payload)
            self.transfer_out += len(self.request.payload)
        self.exchange.request_done([])
Developer: ElijahLynn, Project: redbot, Lines of code: 39, Source file: fetch.py

Example 15: is_page_robot_scannable

 def is_page_robot_scannable(self):
     """
     Returns a boolean that tells whether the page is robot scrapeable.
     """
     robotcheck = RobotFileParser()
     robotcheck.set_url(self.urlparse[0]+'://'+self.urlparse[1]+'/robots.txt')
     robotcheck.read()
     return robotcheck.can_fetch(settings.SPIDER_USER_AGENT, self.url)
Developer: gabrielmiller, Project: Py-Webscraper, Lines of code: 8, Source file: webpage.py


Note: The robotparser.RobotFileParser class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors. Please refer to each project's license for distribution and use; do not reproduce without permission.