This article collects typical usage examples of the Python method robotparser.RobotFileParser.can_fetch. If you are wondering what RobotFileParser.can_fetch does, how to call it, or what real-world uses look like, the hand-picked examples below should help. You can also explore further usage examples of the class the method belongs to, robotparser.RobotFileParser.
The following shows 15 code examples of the RobotFileParser.can_fetch method, sorted by popularity by default.
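
Before diving into the examples, here is a minimal sketch of the basic call pattern; the robots.txt URL, user agent string, and page URL below are placeholders rather than values taken from any of the examples:

from robotparser import RobotFileParser  # urllib.robotparser in Python 3

rp = RobotFileParser()
rp.set_url("http://example.com/robots.txt")  # placeholder robots.txt location
rp.read()                                    # download and parse robots.txt
# can_fetch(useragent, url) is True if the parsed rules allow this agent to fetch the URL
print(rp.can_fetch("my-crawler", "http://example.com/some/page"))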

Example 1: is_page_robot_scannable
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import can_fetch [as alias]
def is_page_robot_scannable(self):
    """
    Returns a boolean that tells whether the page is robot scrapeable.
    """
    robotcheck = RobotFileParser()
    robotcheck.set_url(self.urlparse[0] + '://' + self.urlparse[1] + '/robots.txt')
    robotcheck.read()
    return robotcheck.can_fetch(settings.SPIDER_USER_AGENT, self.url)

Example 2: _allowed_to_open
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import can_fetch [as alias]
def _allowed_to_open(self, url):
    host = urlparse.urlsplit(url)[1]
    robots_url = urlparse.urlunsplit(('http', host, '/robots.txt', '', ''))
    rp = RobotFileParser(robots_url)
    try:
        rp.read()
    except:
        return False
    return rp.can_fetch(self._agent_name, url)

Example 3: checkRobots
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import can_fetch [as alias]
def checkRobots(URL):
    time.sleep(1)
    parsed = urlparse(URL)
    robotsUrl = parsed.scheme + "://" + parsed.netloc + "/robots.txt"
    robotParser = RobotFileParser()
    robotParser.set_url(robotsUrl)
    robotParser.read()
    result = robotParser.can_fetch("*", URL)
    return result

Example 4: http_open
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import can_fetch [as alias]
def http_open(self, req):
    url = req.get_full_url()
    host = urlsplit(url)[1]
    robots_url = urlunsplit(('http', host, '/robots.txt', '', ''))
    robotfileparser = RobotFileParser(robots_url)
    robotfileparser.read()
    if not robotfileparser.can_fetch(self.crawlername, url):
        raise RuntimeError('Forbidden by robots.txt')
    return urllib2.HTTPHandler.http_open(self, req)
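
This http_open override presumably belongs to a urllib2.HTTPHandler subclass that defines a crawlername attribute; the class itself is not shown here. A hypothetical wiring sketch, with RobotsHTTPHandler as a made-up name for that subclass:

import urllib2

opener = urllib2.build_opener(RobotsHTTPHandler())  # hypothetical handler subclass around Example 4
opener.open("http://example.com/")  # raises RuntimeError when robots.txt forbids the fetch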

Example 5: can_fetch
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import can_fetch [as alias]
def can_fetch(self, url):
    host, path = urlparse.urlparse(url)[1:3]
    if host in self.rules:
        return self.rules[host].can_fetch(self.agent, url)
    else:
        rp = RobotFileParser()
        robot_url = "http://" + host + "/robots.txt"
        rp.set_url(robot_url)
        rp.read()
        self.rules[host] = rp
        return rp.can_fetch(self.agent, url)

Example 6: robots_check
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import can_fetch [as alias]
def robots_check(url):
    # creating the url for robots.txt
    root_url = tld.get_tld(url)
    prefix = "http://www."
    suffix = "/robots.txt"
    robots_url = prefix + root_url + suffix
    # check whether the url may be fetched
    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp.can_fetch("*", url)

Example 7: robots_precheck
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import can_fetch [as alias]
def robots_precheck(self, url):
    """
    If we have the robots.txt file available, check it to see if the
    request is permissible.

    This does not fetch robots.txt.
    """
    fetcher = RedFetcher(url)
    robots_txt = fetcher.fetch_robots_txt(url, lambda a: a, network=False)
    if robots_txt == "":
        return True
    checker = RobotFileParser()
    checker.parse(robots_txt.splitlines())
    return checker.can_fetch(UA_STRING, url)
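
Example 7 hands an already-downloaded robots.txt body to parse() instead of letting read() fetch it over the network. A standalone sketch of that pattern, using a made-up robots.txt body:

from robotparser import RobotFileParser

robots_txt = "User-agent: *\nDisallow: /private/"  # placeholder robots.txt content
checker = RobotFileParser()
checker.parse(robots_txt.splitlines())  # parse pre-fetched text; no network access
print(checker.can_fetch("*", "http://example.com/private/page"))  # False
print(checker.can_fetch("*", "http://example.com/public/page"))   # True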

Example 8: Host
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import can_fetch [as alias]
class Host(object):
    ''' Represents one host. Responsible for parsing and analyzing
    ``robots.txt``.

    :param hostname: the name of the host extracted from a URL.
    '''
    def __init__(self, hostname):
        self.hostname = hostname
        self.rp = RobotFileParser()
        self.rp.set_url('http://%s/robots.txt' % self.hostname)

    def url_allowed(self, url):
        ''' Checks whether the given URL is allowed to be crawled.

        :param url: URL to check.
        '''
        return self.rp.can_fetch(USER_AGENT, url)
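
A hypothetical usage of this class (the hostname is a placeholder, USER_AGENT is assumed to be defined at module level, and rp.read() is assumed to be called somewhere before the first check, since the constructor only sets the robots.txt URL):

host = Host("example.com")  # placeholder hostname
host.rp.read()              # fetch and parse http://example.com/robots.txt
if host.url_allowed("http://example.com/index.html"):
    print("allowed to crawl")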

Example 9: crawl
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import can_fetch [as alias]
def crawl(self, seed_url, max_urls=30, max_depth=1, obey_robots=False, max_size=1000000, force_html=True, **kwargs):
    """Crawl website html and return list of URLs crawled

    seed_url: url to start crawling from
    max_urls: maximum number of URLs to crawl (use None for no limit)
    max_depth: maximum depth to follow links into website (use None for no limit)
    obey_robots: whether to obey robots.txt
    max_size is passed to get() and is limited to 1MB by default
    force_html is passed to get() and is set to True by default so only HTML content is crawled
    **kwargs is passed to get()
    """
    user_agent = kwargs.get("user_agent", self.user_agent)
    server = "http://" + extract_domain(seed_url)
    robots = RobotFileParser()
    if obey_robots:
        robots.parse(self.get(server + "/robots.txt").splitlines())  # load robots.txt
    outstanding = [(seed_url, 0), (server, 0)]  # URLs that still need to be crawled
    crawled = []  # URLs that have been crawled
    while outstanding:
        # more URLs to crawl
        if len(crawled) == max_urls:
            break
        url, cur_depth = outstanding.pop(0)
        if url not in crawled:
            html = self.get(url, max_size=max_size, force_html=force_html, **kwargs)
            crawled.append(url)
            if max_depth is None or cur_depth < max_depth:
                # continue crawling
                for scraped_url in re.findall(re.compile("<a[^>]+href=[\"'](.*?)[\"']", re.IGNORECASE), html):
                    if "#" in scraped_url:
                        scraped_url = scraped_url[:scraped_url.index("#")]  # remove internal links to prevent duplicates
                    if os.path.splitext(scraped_url)[-1].lower() not in Download.IGNORED_EXTENSIONS and robots.can_fetch(user_agent, scraped_url):
                        scraped_url = urljoin(server, scraped_url)  # support relative links
                        # check if same domain or sub-domain
                        this_server = extract_domain(scraped_url)
                        if this_server and (this_server in server or server in this_server):
                            outstanding.append((scraped_url, cur_depth + 1))
    return crawled

Example 10: urlopen
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import can_fetch [as alias]
def urlopen(self, host):
    robo_url = host.get_robots_url()
    print self.robotdict

    cached_parser = self.robotdict.get(robo_url)
    if cached_parser:
        logging.info("Found in Cache: " + robo_url)
    else:
        logging.info("Fetching: " + robo_url)
        cached_parser = RobotFileParser()
        self.robotdict.put(robo_url, cached_parser)
        cached_parser.set_url(robo_url)
        cached_parser.read()

    if cached_parser.can_fetch('*', host.get_url()):
        print 'Going to fetch:', host.get_url()
        return self.fetch_file(host.get_url())
    else:
        logging.info("Forbidden by Robots.txt")
        return None

Example 11: _get_soup
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import can_fetch [as alias]
def _get_soup(path):
    """Gets soup from the given path, respecting robots.txt"""
    full_path = BASE_URL + path

    # Set a user-agent
    user_agent = 'dcnotify/%s' % __version__
    http_headers = {'User-Agent': '%s' % user_agent}

    # Honor robots.txt
    robots = RobotFileParser()
    robots.set_url("%s/robots.txt" % BASE_URL)
    robots.read()
    if not robots.can_fetch(user_agent, full_path):
        raise ValueError("Path disallowed by robots.txt")

    # Make a request, raising any HTTP errors that might occur
    request = get(full_path, headers=http_headers)
    request.raise_for_status()

    return bs(request.text)

Example 12: run_continue
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import can_fetch [as alias]
def run_continue(self, robots_txt):
    """
    Continue after getting the robots file.
    TODO: refactor callback style into events.
    """
    if robots_txt == "":  # empty or non-200
        pass
    else:
        checker = RobotFileParser()
        checker.parse(robots_txt.decode('ascii', 'replace').encode('ascii', 'replace').splitlines())
        if not checker.can_fetch(UA_STRING, self.request.uri):
            self.response.http_error = RobotsTxtError()
            self.finish_task()
            return  # TODO: show error?

    if 'user-agent' not in [i[0].lower() for i in self.request.headers]:
        self.request.headers.append((u"User-Agent", UA_STRING))

    self.exchange = self.client.exchange()
    self.exchange.on('response_start', self._response_start)
    self.exchange.on('response_body', self._response_body)
    self.exchange.on('response_done', self._response_done)
    self.exchange.on('error', self._response_error)

    if self.status_cb and self.name:
        self.status_cb("fetching %s (%s)" % (self.request.uri, self.name))

    req_hdrs = [
        (k.encode('ascii', 'replace'), v.encode('latin-1', 'replace'))
        for (k, v) in self.request.headers
    ]
    self.exchange.request_start(self.request.method, self.request.uri, req_hdrs)
    self.request.start_time = thor.time()
    if self.request.payload is not None:
        self.exchange.request_body(self.request.payload)
        self.transfer_out += len(self.request.payload)
    self.exchange.request_done([])

Example 13: __init__
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import can_fetch [as alias]
def __init__(self, url):
    self.url = urlManip.cleanURL(url)
    self.pages = []
    self.suggestions = set()
    self.loaded = False

    logger.info("Loading %s..." % (self.url))
    try:
        requests.get(self.url)
        self.loaded = True
    except IOError as e:
        logger.error("%s cannot be loaded: %s" % (self.url, e))

    # if the website can be loaded
    if self.loaded:
        logger.info("Load successful. Generating suggestions...")

        # get robots.txt
        rp = RobotFileParser(self.url + "robots.txt")
        try:
            rp.read()
        except IOError:
            logger.warning("robots.txt cannot be found.")

        # get home page
        self.pages.append(Page(self.url))

        # get all pages linked from the home page
        self.pages[0].load()
        for link in self.pages[0].internalLinks:
            if rp.can_fetch("*", link):
                if link[:4] == 'http':
                    self.pages.append(Page(link))
                else:
                    self.pages.append(Page(self.url + link))
            else:
                logger.debug("Ignoring %s based on robots.txt" % link)

Example 14: Crawler
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import can_fetch [as alias]
#......... part of the code is omitted here .........
            elif link.startswith('#'):
                link = 'http://' + url[1] + url[2] + link
            elif not link.startswith('http'):
                link = 'http://' + url[1] + '/' + link

            # Remove the anchor part if needed
            if "#" in link:
                link = link[:link.index('#')]

            # Drop attributes if needed
            for toDrop in self.drop:
                link = re.sub(toDrop, '', link)

            # Parse the url to get domain and file extension
            parsed_link = urlparse.urlparse(link)
            domain_link = parsed_link.netloc
            target_extension = os.path.splitext(parsed_link.path)[1][1:]

            if link in self.crawled:
                continue
            if link in self.tocrawl:
                continue
            if link in self.excluded:
                continue
            if domain_link != self.target_domain:
                continue
            if "javascript" in link:
                continue

            # Count one more URL
            self.nb_url += 1

            # Check whether navigation is allowed by robots.txt
            if not self.can_fetch(link):
                self.exclude_link(link)
                self.nb_rp += 1
                continue

            # Check whether the current file extension is allowed
            if target_extension in self.skipext:
                self.exclude_link(link)
                self.nb_exclude += 1
                continue

            # Check that the current url doesn't contain an excluded word
            if not self.exclude_url(link):
                self.exclude_link(link)
                self.nb_exclude += 1
                continue

            self.tocrawl.add(link)
        return None

    def __continue_crawling(self):
        if self.tocrawl:
            self.__crawling()

    def exclude_link(self, link):
        if link not in self.excluded:
            self.excluded.add(link)

    def checkRobots(self):
        if self.domain[len(self.domain) - 1] != "/":
            self.domain += "/"
        request = Request(self.domain + "robots.txt", headers={"User-Agent": config.crawler_user_agent})

Example 15: SiteMap
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import can_fetch [as alias]
#......... part of the code is omitted here .........
            try:
                list.append(x['src'])
            except KeyError:
                pass

        csss = []
        imgs = []
        jss = []
        for link in css:
            csss.append(link['href'])
        for link in img:
            imgs.append(link['src'])
        for link in js:
            jss.append(link['src'])

        return {
            'css': csss,
            'img': imgs,
            'js': jss
        }

    def crawl(self):
        """
        The main driver method that crawls the pages. It performs the steps below:
            for every unvisited [vertex|page] that belongs to the requested domain:
                crawl the page
                record valid links and their last-modified-dates
        :return: None
        """
        page = self.unvisited.pop()
        # If robots.txt is defined, use its Disallow rules to avoid pages. This domain's
        # robots.txt doesn't exist, so the crawler must find all the pages for the report.
        logging.info("Starting to Crawl Page: " + page)

        url = urlparse(page)
        try:
            response = urlopen(page)
        except:
            logging.debug("Issue with the url: " + page)
            return None
        try:
            html_body = response.read()  # response.getcode()
            response.close()
            # record visit and assets
            self.record_visit(page, response.headers, html_body)
            logging.debug("Queued Pages: {0}, Crawled Pages: {1}".format(len(self.unvisited), len(self.site_map)))
        except:
            logging.debug("Issue while opening url: " + page)
            return None

        connects = self.get_out_going_edges(url, html_body)

        # simple graph that keeps the order of the pages crawled
        for i, url in enumerate(connects):
            self.network[page] = {
                'to': connects,
                'assets': {
                    'css': self.site_map[page]['assets']['css'],
                    'js': self.site_map[page]['assets']['js'],
                    'img': self.site_map[page]['assets']['img']
                }
            }
        return None

    def get_site_map(self):
        """
        Returns the compiled sitemap structure.
        :return: sitemap data structure
        """
        return self.site_map

    def get_network_graph(self):
        """
        Returns the compiled network in the order of the crawled pages.
        :return: network graph
        """
        return self.network

    def get_network_json_format(self):
        """
        Returns the crawl traversal order in json format.
        :return: network in json format
        """
        return json.dumps(self.network)

    def set_start_page(self, url):
        """
        This could be useful if one is testing.
        :param url: start page to start the crawling.
        :return:
        """
        self.start_page = url

    def robot_allows(self, link):
        if not self.robotrules:
            return True
        try:
            if self.robot_txt_rules.can_fetch("*", link):
                return True
            return False
        except:
            # if the robots.txt rules cannot be evaluated, default to allowing the link
            return True