本文整理匯總了Python中robotparser.RobotFileParser.parse方法的典型用法代碼示例。如果您正苦於以下問題:Python RobotFileParser.parse方法的具體用法?Python RobotFileParser.parse怎麼用?Python RobotFileParser.parse使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類robotparser.RobotFileParser的用法示例。
在下文中一共展示了RobotFileParser.parse方法的6個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: accessible
# 需要導入模塊: from robotparser import RobotFileParser [as 別名]
# 或者: from robotparser.RobotFileParser import parse [as 別名]
def accessible(url):
    """Return whether the host's robots.txt lets any crawler ('*') fetch ``url``.

    The parsed robots.txt is stored in the module-level ``robots_cache``,
    keyed by the URL's network location, so it is downloaded only when that
    host has no cache entry yet.
    """
    host = urlparse(url).netloc

    # Fast path: this host's rules are already cached.
    if host in robots_cache:
        return robots_cache[host].can_fetch('*', url)

    # First visit to this host: download and parse its robots.txt.
    response = requests.get('http://%s/robots.txt' % host)
    parser = RobotFileParser()
    parser.parse(response.content.splitlines())
    robots_cache[host] = parser
    return robots_cache[host].can_fetch('*', url)
示例2: robots_precheck
# 需要導入模塊: from robotparser import RobotFileParser [as 別名]
# 或者: from robotparser.RobotFileParser import parse [as 別名]
def robots_precheck(self, url):
    """
    Check ``url`` against robots.txt if that file is already available.

    The robots file is requested from a ``RedFetcher`` with
    ``network=False``; this does not fetch robots.txt. An empty result
    is treated as "allowed".

    Returns True when the request is permissible, otherwise the verdict of
    ``RobotFileParser.can_fetch`` for ``UA_STRING``.
    """
    robots_txt = RedFetcher(url).fetch_robots_txt(
        url, lambda a: a, network=False
    )
    if robots_txt == "":
        # Nothing available to consult, so do not block the request.
        permitted = True
    else:
        rules = RobotFileParser()
        rules.parse(robots_txt.splitlines())
        permitted = rules.can_fetch(UA_STRING, url)
    return permitted
示例3: getRobots
# 需要導入模塊: from robotparser import RobotFileParser [as 別名]
# 或者: from robotparser.RobotFileParser import parse [as 別名]
def getRobots(url):
    """Return the cached ``RobotFileParser`` for the site serving ``url``.

    The parser is stored in the module-level ``robots`` mapping, keyed by the
    site's robots.txt URL, and is built on first use. If robots.txt cannot be
    downloaded (request error or non-success status), the parser is fed no
    lines at all.

    ``RobotFileParser.parse`` expects an iterable of *lines*. Handing it the
    raw response text would make it iterate character by character and never
    recognise a single rule, so the body is split into lines first.
    """
    parsed = urlparse(url)
    robots_url = parsed.scheme + '://' + parsed.netloc + '/robots.txt'
    if robots_url not in robots:
        rp = RobotFileParser()
        try:
            # NOTE(review): verify=False disables TLS certificate checks;
            # kept as in the original, confirm this is intentional.
            r = requests.get(robots_url, verify=False, timeout=1)
            r.raise_for_status()
        except Exception:
            # Best effort: an unreachable robots.txt yields an empty rule set.
            lines = []
        else:
            lines = r.text.splitlines()
        rp.parse(lines)
        robots[robots_url] = rp
    return robots[robots_url]
示例4: crawl
# 需要導入模塊: from robotparser import RobotFileParser [as 別名]
# 或者: from robotparser.RobotFileParser import parse [as 別名]
def crawl(self, seed_url, max_urls=30, max_depth=1, obey_robots=False, max_size=1000000, force_html=True, **kwargs):
    """Crawl website html and return list of URLs crawled

    seed_url: url to start crawling from
    max_urls: maximum number of URLs to crawl (use None for no limit)
    max_depth: maximum depth to follow links into website (use None for no limit)
    obey_robots: whether to obey robots.txt
    max_size is passed to get() and is limited to 1MB by default
    force_html is passed to get() and is set to True by default so only crawl HTML content
    **kwargs is passed to get()
    """
    user_agent = kwargs.get("user_agent", self.user_agent)
    server = "http://" + extract_domain(seed_url)
    robots = RobotFileParser()
    if obey_robots:
        robots.parse(self.get(server + "/robots.txt").splitlines())  # load robots.txt
    # Compiled once up front instead of once per downloaded page.
    link_pattern = re.compile("<a[^>]+href=[\"'](.*?)[\"']", re.IGNORECASE)
    outstanding = [(seed_url, 0), (server, 0)]  # which URLs need to crawl
    crawled = []  # urls that have crawled
    while outstanding:
        # more URLs to crawl
        if len(crawled) == max_urls:
            break
        url, cur_depth = outstanding.pop(0)
        if url in crawled:
            continue
        html = self.get(url, max_size=max_size, force_html=force_html, **kwargs)
        crawled.append(url)
        if max_depth is not None and cur_depth >= max_depth:
            # depth limit reached: record the page but do not follow its links
            continue
        for scraped_url in link_pattern.findall(html):
            if "#" in scraped_url:
                # remove internal links to prevent duplicates
                scraped_url = scraped_url[:scraped_url.index("#")]
            if os.path.splitext(scraped_url)[-1].lower() in Download.IGNORED_EXTENSIONS:
                continue
            scraped_url = urljoin(server, scraped_url)  # support relative links
            # robots.txt rules are matched against the URL path, so the check
            # must see the resolved URL: a relative href such as "page.html"
            # would never match "Disallow: /page.html". The parser is only
            # consulted when it was actually loaded above.
            if obey_robots and not robots.can_fetch(user_agent, scraped_url):
                continue
            # check if same domain or sub-domain
            this_server = extract_domain(scraped_url)
            if this_server and (this_server in server or server in this_server):
                outstanding.append((scraped_url, cur_depth + 1))
    return crawled
示例5: run_continue
# 需要導入模塊: from robotparser import RobotFileParser [as 別名]
# 或者: from robotparser.RobotFileParser import parse [as 別名]
def run_continue(self, robots_txt):
    """
    Resume the request once the robots file has been obtained.

    A non-empty ``robots_txt`` is parsed; if it forbids ``UA_STRING`` from
    fetching the request URI, the response is marked with a
    ``RobotsTxtError``, the task is finished, and nothing is sent.
    Otherwise a User-Agent header is added when missing, the exchange's
    event handlers are wired up, and the request (with optional payload)
    is started.

    TODO: refactor callback style into events.
    """
    if robots_txt != "":  # "" means empty or non-200: nothing to check
        ascii_text = robots_txt.decode('ascii', 'replace').encode('ascii', 'replace')
        checker = RobotFileParser()
        checker.parse(ascii_text.splitlines())
        if not checker.can_fetch(UA_STRING, self.request.uri):
            self.response.http_error = RobotsTxtError()
            self.finish_task()
            return  # TODO: show error?

    # Supply our own User-Agent only when the caller did not set one.
    header_names = [hdr[0].lower() for hdr in self.request.headers]
    if 'user-agent' not in header_names:
        self.request.headers.append((u"User-Agent", UA_STRING))

    self.exchange = self.client.exchange()
    handlers = (
        ('response_start', self._response_start),
        ('response_body', self._response_body),
        ('response_done', self._response_done),
        ('error', self._response_error),
    )
    for event_name, handler in handlers:
        self.exchange.on(event_name, handler)

    if self.status_cb and self.name:
        message = "fetching %s (%s)" % (self.request.uri, self.name)
        self.status_cb(message)

    # Header names are sent as ASCII and values as Latin-1, with
    # unencodable characters replaced.
    req_hdrs = []
    for (k, v) in self.request.headers:
        req_hdrs.append(
            (k.encode('ascii', 'replace'), v.encode('latin-1', 'replace'))
        )

    self.exchange.request_start(self.request.method, self.request.uri, req_hdrs)
    self.request.start_time = thor.time()

    payload = self.request.payload
    if payload != None:
        self.exchange.request_body(payload)
        self.transfer_out += len(payload)
    self.exchange.request_done([])
示例6: knock
# 需要導入模塊: from robotparser import RobotFileParser [as 別名]
# 或者: from robotparser.RobotFileParser import parse [as 別名]
def knock(self, user_agent, url, override, retries=0, debug_force_status=None):
"""
Makes a request for '/robots.txt' and returns True if 'user_agent' can fetch 'url'. Returns False otherwise
If we get a HTTP response code other than '200' or any request error occurs, this function will return True
If we get a gaierror (DNS lookup error), this function will return False as everything else is doomed to fail
If 'override' is True, this function will automatically return True. Default value for override is False
"""
if override:
return True
host = net.urlparse(url)[1]
robot = RobotFileParser()
clearance = False
if retries > 0:
time_mod.sleep(self.crawl_delay)
try:
# We try to get the resource /robots.txt
connection = net.HTTPConnection(host, 80)
connection.request(
self.GET,
"/robots.txt",
None,
{ "User-Agent" : user_agent }
)
response = connection.getresponse()
robot_lines = response.read().splitlines()
connection.close()
if debug_force_status:
response.status = debug_force_status
if response.status == 200 and filter(None, robot_lines) != []:
# If everthing went well, we feed the content of the resource to the parser
robot.parse(robot_lines)
# And resolve if we have clearance to fetch the url
clearance = robot.can_fetch(user_agent, url)
# We try to get the Crawl-delay directive, if it exists
try:
self.crawl_delay = int(
"".join(list(
directive for directive in robot_lines if directive.lower().startswith("crawl-delay")
)).split(":")[1]
)
except IndexError:
# If no 'Crawl-delay' is specified, we leave it at 1 second
pass
elif response.status in [408, 500, 503]:
if retries < 3:
try:
time_mod.sleep(self.current_headers["retry-after"] - self.crawl_delay)
except KeyError:
pass
except TypeError:
pass
clearance = self.knock(user_agent, url, False, retries + 1)
else:
clearance = True
else:
clearance = True
if retries < 1:
#.........這裏部分代碼省略.........