

Python RobotFileParser.parse Method Code Examples

This article collects typical usage examples of the Python robotparser.RobotFileParser.parse method. If you are wondering what exactly RobotFileParser.parse does, how to use it, or what real-world code that calls it looks like, the curated examples below may help. You can also explore further usage examples of the class it belongs to, robotparser.RobotFileParser.


Six code examples of RobotFileParser.parse are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
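Before the project examples, a quick orientation: RobotFileParser.parse() expects an iterable of lines, not one big string, so the robots.txt body is normally split with splitlines() before being handed over (on Python 3 the same class lives in urllib.robotparser). The following minimal sketch illustrates this workflow; the helper name can_crawl, the timeout, and the example URLs are illustrative assumptions rather than code from any project cited on this page.

import urllib2
from robotparser import RobotFileParser

def can_crawl(user_agent, url, robots_url):
    """Return True if the robots.txt at robots_url allows user_agent to fetch url."""
    rp = RobotFileParser()
    try:
        body = urllib2.urlopen(robots_url, timeout=5).read()
    except Exception:
        return True  # robots.txt unreachable or invalid: assume no restrictions
    rp.parse(body.splitlines())  # parse() takes an iterable of lines, not a single string
    return rp.can_fetch(user_agent, url)

# e.g. can_crawl('MyBot', 'http://example.com/some/page', 'http://example.com/robots.txt')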

Example 1: accessible

# Module to import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import parse [as alias]
def accessible(url):
    u = urlparse(url)
    if u.netloc not in robots_cache:
        resp = requests.get('http://%s/robots.txt' % u.netloc)
        rp = RobotFileParser()
        rp.parse(resp.content.splitlines())
        robots_cache[u.netloc] = rp
    return robots_cache[u.netloc].can_fetch('*', url)
Developer ID: Mondego, Project: pyreco, Lines of code: 10, Source file: allPythonContent.py

Example 2: robots_precheck

# Module to import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import parse [as alias]
    def robots_precheck(self, url):
        """
        If we have the robots.txt file available, check it to see if the
        request is permissible.

        This does not fetch robots.txt.
        """

        fetcher = RedFetcher(url)
        robots_txt = fetcher.fetch_robots_txt(url, lambda a: a, network=False)
        if robots_txt == "":
            return True
        checker = RobotFileParser()
        checker.parse(robots_txt.splitlines())
        return checker.can_fetch(UA_STRING, url)
Developer ID: jrottenberg, Project: redbot, Lines of code: 17, Source file: webui.py

Example 3: getRobots

# Module to import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import parse [as alias]
def getRobots(url):
    parsed = urlparse(url)
    robots_url = parsed.scheme + '://' + parsed.netloc + '/robots.txt'
    if robots_url not in robots:
        rp = RobotFileParser()
        try:
            r = requests.get(robots_url, verify=False, timeout=1)
            r.raise_for_status()
        except Exception:
            rp.parse([])  # no robots.txt available; parse() expects an iterable of lines
        else:
            rp.parse(r.text.splitlines())  # split the body into lines before parsing
        #print "  new robot at " + robots_url
        robots[robots_url] = rp
    return robots[robots_url]
Developer ID: itsmeolivia, Project: webcrawler, Lines of code: 17, Source file: p5.py

Example 4: crawl

# Module to import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import parse [as alias]
    def crawl(self, seed_url, max_urls=30, max_depth=1, obey_robots=False, max_size=1000000, force_html=True, **kwargs):
        """Crawl website html and return list of URLs crawled

        seed_url: url to start crawling from
        max_urls: maximum number of URLs to crawl (use None for no limit)
        max_depth: maximum depth to follow links into website (use None for no limit)
        obey_robots: whether to obey robots.txt
        max_size is passed to get() and is limited to 1MB by default
        force_html is passed to get() and defaults to True so that only HTML content is crawled
        **kwargs is passed to get()
        """
        user_agent = kwargs.get("user_agent", self.user_agent)
        server = "http://" + extract_domain(seed_url)
        robots = RobotFileParser()
        if obey_robots:
            robots.parse(self.get(server + "/robots.txt").splitlines())  # load robots.txt
        outstanding = [(seed_url, 0), (server, 0)]  # URLs that still need to be crawled
        crawled = []  # URLs that have already been crawled

        while outstanding:
            # more URLs to crawl
            if len(crawled) == max_urls:
                break
            url, cur_depth = outstanding.pop(0)
            if url not in crawled:
                html = self.get(url, max_size=max_size, force_html=force_html, **kwargs)
                crawled.append(url)
                if max_depth is None or cur_depth < max_depth:
                    # continue crawling
                    for scraped_url in re.findall(re.compile("<a[^>]+href=[\"'](.*?)[\"']", re.IGNORECASE), html):
                        if "#" in scraped_url:
                            scraped_url = scraped_url[
                                : scraped_url.index("#")
                            ]  # remove internal links to prevent duplicates
                        if os.path.splitext(scraped_url)[
                            -1
                        ].lower() not in Download.IGNORED_EXTENSIONS and robots.can_fetch(user_agent, scraped_url):
                            scraped_url = urljoin(server, scraped_url)  # support relative links
                            # check if same domain or sub-domain
                            this_server = extract_domain(scraped_url)
                            if this_server and (this_server in server or server in this_server):
                                outstanding.append((scraped_url, cur_depth + 1))
        return crawled
Developer ID: staticerror, Project: SeKing, Lines of code: 45, Source file: download.py

Example 5: run_continue

# Module to import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import parse [as alias]
    def run_continue(self, robots_txt):
        """
        Continue after getting the robots file.
        TODO: refactor callback style into events.
        """
        if robots_txt == "": # empty or non-200
            pass
        else:
            checker = RobotFileParser()
            checker.parse(robots_txt.decode('ascii', 'replace').encode('ascii', 'replace').splitlines())
            if not checker.can_fetch(UA_STRING, self.request.uri):
                self.response.http_error = RobotsTxtError()
                self.finish_task()
                return # TODO: show error?

        if 'user-agent' not in [i[0].lower() for i in self.request.headers]:
            self.request.headers.append(
                (u"User-Agent", UA_STRING))
        self.exchange = self.client.exchange()
        self.exchange.on('response_start', self._response_start)
        self.exchange.on('response_body', self._response_body)
        self.exchange.on('response_done', self._response_done)
        self.exchange.on('error', self._response_error)
        if self.status_cb and self.name:
            self.status_cb("fetching %s (%s)" % (
                self.request.uri, self.name
            ))
        req_hdrs = [
            (k.encode('ascii', 'replace'), v.encode('latin-1', 'replace')) \
            for (k, v) in self.request.headers
        ]
        self.exchange.request_start(
            self.request.method, self.request.uri, req_hdrs
        )
        self.request.start_time = thor.time()
        if self.request.payload is not None:
            self.exchange.request_body(self.request.payload)
            self.transfer_out += len(self.request.payload)
        self.exchange.request_done([])
Developer ID: ElijahLynn, Project: redbot, Lines of code: 41, Source file: fetch.py

Example 6: knock

# Module to import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import parse [as alias]
	def knock(self, user_agent, url, override, retries=0, debug_force_status=None):

		"""
		Makes a request for '/robots.txt' and returns True if 'user_agent' can fetch 'url'; returns False otherwise.
		If we get an HTTP response code other than 200, or any request error occurs, this function returns True.
		If we get a gaierror (DNS lookup error), this function returns False, as everything else is doomed to fail.

		If 'override' is True, this function immediately returns True. The default value for 'override' is False.
		"""

		if override:

			return True

		host = net.urlparse(url)[1]

		robot = RobotFileParser()

		clearance = False

		if retries > 0:

			time_mod.sleep(self.crawl_delay)

		try:

			# We try to get the resource /robots.txt

			connection = net.HTTPConnection(host, 80)

			connection.request(
				self.GET,
				"/robots.txt",
				None,
				{ "User-Agent" : user_agent }
			)

			response = connection.getresponse()

			robot_lines = response.read().splitlines()

			connection.close()

			if debug_force_status:

				response.status = debug_force_status

			if response.status == 200 and filter(None, robot_lines) != []:

				# If everything went well, we feed the content of the resource to the parser

				robot.parse(robot_lines)

				# And resolve if we have clearance to fetch the url

				clearance = robot.can_fetch(user_agent, url)

				# We try to get the Crawl-delay directive, if it exists

				try:

					self.crawl_delay = int(
						"".join(list(
							directive for directive in robot_lines if directive.lower().startswith("crawl-delay")
						)).split(":")[1]
					)

				except IndexError:

					# If no 'Crawl-delay' is specified, we leave it at 1 second

					pass

			elif response.status in [408, 500, 503]:

				if retries < 3:

					try:

						time_mod.sleep(self.current_headers["retry-after"] - self.crawl_delay)

					except KeyError:

						pass

					except TypeError:

						pass

					clearance = self.knock(user_agent, url, False, retries + 1)

				else:

					clearance = True

			else:

				clearance = True			

			if retries < 1:
#......... part of the code omitted here .........
Developer ID: bworwa, Project: xcraper, Lines of code: 103, Source file: request.py


Note: The robotparser.RobotFileParser.parse method examples in this article were compiled by 纯净天空 from GitHub, MSDocs and other open-source code and documentation platforms. The code snippets were selected from open-source projects contributed by many developers, and the copyright of the source code remains with the original authors. Please consult the corresponding project's License before distributing or using it, and do not reproduce this article without permission.