This article collects typical usage examples of the Python robotparser.RobotFileParser class. If you are wondering what RobotFileParser is for, how to use it, or want to see it in real code, the curated class examples here may help. (robotparser is the Python 2 module name; in Python 3 the same class lives in urllib.robotparser.)
The following presents 15 code examples of the RobotFileParser class, sorted by popularity by default.
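Before the collected examples, here is a minimal, self-contained sketch of the basic RobotFileParser workflow: point it at a site's robots.txt, read it, then query can_fetch. The user agent string and URLs are placeholders chosen for illustration.

# Minimal sketch of the basic workflow; user agent and URLs are illustrative.
from robotparser import RobotFileParser   # urllib.robotparser in Python 3

rp = RobotFileParser()
rp.set_url('http://example.com/robots.txt')   # location of the site's robots.txt
rp.read()                                     # download and parse the file
print(rp.can_fetch('MyCrawler/1.0', 'http://example.com/private/page.html'))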
Example 1: accessible
def accessible(url):
    u = urlparse(url)
    if u.netloc not in robots_cache:
        resp = requests.get('http://%s/robots.txt' % u.netloc)
        rp = RobotFileParser()
        rp.parse(resp.content.splitlines())
        robots_cache[u.netloc] = rp
    return robots_cache[u.netloc].can_fetch('*', url)
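Example 1 relies on a module-level robots_cache dict and on imports that are not shown. A plausible surrounding context, inferred for illustration rather than taken from the original source, could be:

# Assumed context for Example 1 (inferred, not part of the original snippet):
import requests
from urlparse import urlparse             # Python 2; urllib.parse on Python 3
from robotparser import RobotFileParser

robots_cache = {}                          # maps netloc -> parsed RobotFileParser

# Hypothetical call: returns True if the '*' user agent may fetch the page.
# accessible('http://example.com/some/page')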
Example 2: _allowed_to_open
def _allowed_to_open(self, url):
    host = urlparse.urlsplit(url)[1]
    robots_url = urlparse.urlunsplit(('http', host, '/robots.txt', '', ''))
    rp = RobotFileParser(robots_url)
    try:
        rp.read()
    except:
        return False
    return rp.can_fetch(self._agent_name, url)
Example 3: _get_robot_parser
def _get_robot_parser(self):
    try:
        return pickle.loads(str(self.robot_parser_pickle))
    except (TypeError, IndexError):
        parser = RobotFileParser()
        parser.set_url(str(self.protocol) + "://" + str(self.domain) +
                       "/robots.txt")
        self.robot_parser = parser
        return parser
Example 4: _get_robot_parser
def _get_robot_parser(self):
    if self.robot_parser_pickle is not None:
        return pickle.loads(base64.b64decode(self.robot_parser_pickle))
    else:
        parser = RobotFileParser()
        parser.set_url(self.protocol + "://" + self.domain + "/robots.txt")
        self.robot_parser = parser
        return parser
Example 5: http_open
def http_open(self, req):
    url = req.get_full_url()
    host = urlsplit(url)[1]
    robots_url = urlunsplit(('http', host, '/robots.txt', '', ''))
    robotfileparser = RobotFileParser(robots_url)
    robotfileparser.read()
    if not robotfileparser.can_fetch(self.crawlername, url):
        raise RuntimeError('Forbidden by robots.txt')
    return urllib2.HTTPHandler.http_open(self, req)
Example 6: check_robots
def check_robots(self, url):
    '''check the robots.txt in this url's domain'''
    hostname = urlparse(url).netloc
    if hostname not in self.domain_list.keys():  # no record in domain_list yet
        rp = RobotFileParser('http://%s/robots.txt' % hostname)
        print("%s: fetching %s" % (url, rp.url))
        try:
            rp.read()  # fetch the current robots.txt
        except IOError, e:  # url's server not available (connection timeout)
            log.error(str(e))
            rp.disallow_all = True  # reject all requests
        self.domain_list[hostname] = rp  # add domain entry into domain_list
Example 7: getRobots
def getRobots(url):
    parsed = urlparse(url)
    robots_url = parsed.scheme + '://' + parsed.netloc + '/robots.txt'
    if robots_url not in robots:
        rp = RobotFileParser()
        try:
            r = requests.get(robots_url, verify=False, timeout=1)
            r.raise_for_status()
        except Exception:
            rp.parse('')  # unreachable or error status: treat as an empty robots.txt
        else:
            rp.parse(r.text.splitlines())  # parse() expects an iterable of lines
        #print " new robot at " + robots_url
        robots[robots_url] = rp
    return robots[robots_url]
Example 8: robots_precheck
def robots_precheck(self, url):
    """
    If we have the robots.txt file available, check it to see if the
    request is permissible.
    This does not fetch robots.txt.
    """
    fetcher = RedFetcher(url)
    robots_txt = fetcher.fetch_robots_txt(url, lambda a: a, network=False)
    if robots_txt == "":
        return True
    checker = RobotFileParser()
    checker.parse(robots_txt.splitlines())
    return checker.can_fetch(UA_STRING, url)
Example 9: __init__
def __init__(self, main_page=None, robotrules=True):
    """
    Constructor method that initializes the members used during the crawling process
    :param main_page: the root page that needs to be crawled for sitemap generation
    """
    logging.info("Consider robots.txt? ==> " + str(robotrules))
    self.robotrules = robotrules
    self.site_map = {}        # records the visited urls, date modified and assets
    self.network = {}         # maintains the network/graph of webpages visited;
                              # intended for visual rendering using d3.js
    self.unvisited = set([])  # urls yet to be visited
    self.start_page = None    # the root page; used to avoid cycles and to keep the
                              # crawl limited to a single domain
    self.robot_txt_rules = None

    if main_page:
        self.unvisited.add(main_page)
        try:
            self.start_page = urlparse(main_page).netloc
        except:
            logging.error("Improper URL, please provide a valid url: " + main_page)
            exit(0)

    if self.robotrules == "True":
        try:
            logging.info("robots.txt respected")
            self.robot_txt_rules = RobotFileParser()
            self.robot_txt_rules.set_url(main_page + "/robots.txt")
            self.robot_txt_rules.read()
        except:
            logging.error("Unable to read the robots.txt file")
            self.robotrules = False  # error reading robots.txt, ignore it forever
Example 10: __init__
def __init__(self, starturl, index_html='', maxlevel=1,
             cookie_file=None, acldb=None, urldb=None, default_charset=None,
             delay=0, timeout=300, debug=0):
    (proto, self.hostport, _x, _y, _z) = urlsplit(starturl)
    assert proto == 'http'
    #Thread.__init__(self)
    self.debug = debug
    self.index_html = index_html
    if cookie_file:
        self.cookiejar = MozillaCookieJar(cookie_file)
        self.cookiejar.load()
    else:
        self.cookiejar = None
    self.robotstxt = RobotFileParser()
    self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
    self.robotstxt.read()
    self.conn = None
    self.urldb = urldb
    self.acldb = acldb
    self.curlevel = 0
    self.delay = delay
    self.timeout = timeout
    self.default_charset = default_charset
    if starturl.endswith('/'):
        starturl += self.index_html
    self.urls = [(starturl, maxlevel)]
    self.crawled = {}  # 1:injected, 2:crawled
    return
Example 11: checkRobots
def checkRobots(self):
    if self.domain[len(self.domain)-1] != "/":
        self.domain += "/"
    request = Request(self.domain+"robots.txt", headers={"User-Agent": config.crawler_user_agent})
    self.rp = RobotFileParser()
    self.rp.set_url(self.domain+"robots.txt")
    self.rp.read()
Example 12: Host
class Host(object):
    ''' Represents one host. Responsible for parsing and analyzing
    ``robots.txt``.

    :param hostname: the name of the host extracted from a URL.
    '''

    def __init__(self, hostname):
        self.hostname = hostname
        self.rp = RobotFileParser()
        self.rp.set_url('http://%s/robots.txt' % self.hostname)

    def url_allowed(self, url):
        ''' Checks whether the given url is allowed to be crawled.

        :param url: URL to check.
        '''
        return self.rp.can_fetch(USER_AGENT, url)
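A hypothetical caller of the Host class in Example 12 might look like the sketch below. Note that the snippet as collected never downloads robots.txt, so the explicit rp.read() call here is an added assumption, as is the USER_AGENT constant that the original module is presumed to define elsewhere.

# Hypothetical usage of Example 12's Host class.
USER_AGENT = 'MyCrawler/1.0'   # assumed module-level constant

host = Host('example.com')
host.rp.read()                 # the class itself never fetches robots.txt; added here as an assumption
print(host.url_allowed('http://example.com/some/page'))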
Example 13: crawl
def crawl(self, seed_url, max_urls=30, max_depth=1, obey_robots=False, max_size=1000000, force_html=True, **kwargs):
    """Crawl website html and return list of URLs crawled
    seed_url: url to start crawling from
    max_urls: maximum number of URLs to crawl (use None for no limit)
    max_depth: maximum depth to follow links into website (use None for no limit)
    obey_robots: whether to obey robots.txt
    max_size is passed to get() and is limited to 1MB by default
    force_html is passed to get() and is set to True by default so only HTML content is crawled
    **kwargs is passed to get()
    """
    user_agent = kwargs.get("user_agent", self.user_agent)
    server = "http://" + extract_domain(seed_url)
    robots = RobotFileParser()
    if obey_robots:
        robots.parse(self.get(server + "/robots.txt").splitlines())  # load robots.txt
    outstanding = [(seed_url, 0), (server, 0)]  # URLs that still need to be crawled
    crawled = []  # urls that have been crawled
    while outstanding:
        # more URLs to crawl
        if len(crawled) == max_urls:
            break
        url, cur_depth = outstanding.pop(0)
        if url not in crawled:
            html = self.get(url, max_size=max_size, force_html=force_html, **kwargs)
            crawled.append(url)
            if max_depth is None or cur_depth < max_depth:
                # continue crawling
                for scraped_url in re.findall(re.compile("<a[^>]+href=[\"'](.*?)[\"']", re.IGNORECASE), html):
                    if "#" in scraped_url:
                        scraped_url = scraped_url[:scraped_url.index("#")]  # remove internal links to prevent duplicates
                    if os.path.splitext(scraped_url)[-1].lower() not in Download.IGNORED_EXTENSIONS and robots.can_fetch(user_agent, scraped_url):
                        scraped_url = urljoin(server, scraped_url)  # support relative links
                        # check if same domain or sub-domain
                        this_server = extract_domain(scraped_url)
                        if this_server and (this_server in server or server in this_server):
                            outstanding.append((scraped_url, cur_depth + 1))
    return crawled
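Example 13 is a method of a downloader class that also provides get(), extract_domain() and Download.IGNORED_EXTENSIONS, none of which are shown here. Assuming such an object exists (called downloader below purely for illustration), a call might look like:

# Hypothetical usage of Example 13's crawl(); 'downloader' is an assumed instance
# of the surrounding class, which is not included in the snippet.
urls = downloader.crawl('http://example.com/', max_urls=10, max_depth=2, obey_robots=True)
for url in urls:
    print(url)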
Example 14: run_continue
def run_continue(self, robots_txt):
    """
    Continue after getting the robots file.
    TODO: refactor callback style into events.
    """
    if robots_txt == "":  # empty or non-200
        pass
    else:
        checker = RobotFileParser()
        checker.parse(robots_txt.decode('ascii', 'replace').encode('ascii', 'replace').splitlines())
        if not checker.can_fetch(UA_STRING, self.request.uri):
            self.response.http_error = RobotsTxtError()
            self.finish_task()
            return  # TODO: show error?

    if 'user-agent' not in [i[0].lower() for i in self.request.headers]:
        self.request.headers.append(
            (u"User-Agent", UA_STRING))
    self.exchange = self.client.exchange()
    self.exchange.on('response_start', self._response_start)
    self.exchange.on('response_body', self._response_body)
    self.exchange.on('response_done', self._response_done)
    self.exchange.on('error', self._response_error)
    if self.status_cb and self.name:
        self.status_cb("fetching %s (%s)" % (
            self.request.uri, self.name
        ))
    req_hdrs = [
        (k.encode('ascii', 'replace'), v.encode('latin-1', 'replace'))
        for (k, v) in self.request.headers
    ]
    self.exchange.request_start(
        self.request.method, self.request.uri, req_hdrs
    )
    self.request.start_time = thor.time()
    if self.request.payload != None:
        self.exchange.request_body(self.request.payload)
        self.transfer_out += len(self.request.payload)
    self.exchange.request_done([])
Example 15: is_page_robot_scannable
def is_page_robot_scannable(self):
    """
    Returns a boolean that tells whether the page is robot scrapeable.
    """
    robotcheck = RobotFileParser()
    robotcheck.set_url(self.urlparse[0]+'://'+self.urlparse[1]+'/robots.txt')
    robotcheck.read()
    return robotcheck.can_fetch(settings.SPIDER_USER_AGENT, self.url)