This article collects typical usage examples of the robotparser.RobotFileParser.read method in Python. If you have been wondering what exactly RobotFileParser.read does, how to call it, or what real-world code that uses it looks like, the curated method examples below may help. You can also explore further usage examples of the class the method belongs to, robotparser.RobotFileParser.
Fifteen code examples of RobotFileParser.read are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
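Before the individual examples, here is a minimal, self-contained sketch of the typical read() workflow: point the parser at a site's robots.txt, call read() to download and parse it, then query can_fetch(). The host and user-agent string below are placeholders chosen for illustration; note that the robotparser module shown throughout this page is the Python 2 name, and the same class lives in urllib.robotparser on Python 3.

# Minimal usage sketch (Python 2; placeholder host and user agent).
from robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url("http://example.com/robots.txt")   # robots.txt of the target site
rp.read()                                      # download and parse the file
# True if this user agent is allowed to fetch the URL according to robots.txt
print(rp.can_fetch("MyCrawler/1.0", "http://example.com/some/page"))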
Example 1: is_page_robot_scannable
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
def is_page_robot_scannable(self):
    """
    Returns a boolean that tells whether the page is robot scrapeable.
    """
    robotcheck = RobotFileParser()
    robotcheck.set_url(self.urlparse[0] + '://' + self.urlparse[1] + '/robots.txt')
    robotcheck.read()
    return robotcheck.can_fetch(settings.SPIDER_USER_AGENT, self.url)
Example 2: _allowed_to_open
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
def _allowed_to_open(self, url):
    host = urlparse.urlsplit(url)[1]
    robots_url = urlparse.urlunsplit(('http', host, '/robots.txt', '', ''))
    rp = RobotFileParser(robots_url)
    try:
        rp.read()
    except:
        return False
    return rp.can_fetch(self._agent_name, url)
Example 3: checkRobots
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
def checkRobots(URL):
    time.sleep(1)
    parsed = urlparse(URL)
    robotsUrl = parsed.scheme + "://" + parsed.netloc + "/robots.txt"
    robotParser = RobotFileParser()
    robotParser.set_url(robotsUrl)
    robotParser.read()
    result = robotParser.can_fetch("*", URL)
    return result
Example 4: http_open
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
def http_open(self, req):
    url = req.get_full_url()
    host = urlsplit(url)[1]
    robots_url = urlunsplit(('http', host, '/robots.txt', '', ''))
    robotfileparser = RobotFileParser(robots_url)
    robotfileparser.read()
    if not robotfileparser.can_fetch(self.crawlername, url):
        raise RuntimeError('Forbidden by robots.txt')
    return urllib2.HTTPHandler.http_open(self, req)
Example 5: can_fetch
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
def can_fetch(self, url):
    host, path = urlparse.urlparse(url)[1:3]
    if self.rules.has_key(host):
        return self.rules[host].can_fetch(self.agent, url)
    else:
        rp = RobotFileParser()
        robot_url = "http://" + host + "/robots.txt"
        rp.set_url(robot_url)
        rp.read()
        self.rules[host] = rp
        return rp.can_fetch(self.agent, url)
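Example 5 above (and Example 9 below) caches one parser per host, so robots.txt is downloaded only once per domain and read() is not called on every request. The following is a minimal sketch of the same caching idea, written here against Python 3's urllib.robotparser; the names allowed and _robots_cache are illustrative and do not appear in the examples.

# Per-host robots.txt cache, assuming Python 3 (urllib.robotparser).
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

_robots_cache = {}  # hostname -> RobotFileParser

def allowed(agent, url):
    host = urlparse(url).netloc
    rp = _robots_cache.get(host)
    if rp is None:                     # first time this host is seen
        rp = RobotFileParser("http://%s/robots.txt" % host)
        rp.read()                      # fetch and parse once, then reuse
        _robots_cache[host] = rp
    return rp.can_fetch(agent, url)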
Example 6: check_robots
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
def check_robots(self, url):
    '''check the robots.txt in this url's domain'''
    hostname = urlparse(url).netloc
    if hostname not in self.domain_list.keys():  # no records in domain_list
        rp = RobotFileParser('http://%s/robots.txt' % hostname)
        print("%s: fetching %s" % (url, rp.url))
        try:
            rp.read()  # get new robots.txt
        except IOError, e:  # url's server not available (connection timeout)
            log.error(str(e))
            rp.disallow_all = True  # reject all requests
        self.domain_list[hostname] = rp  # add domain entry into domain_list
Example 7: robots_check
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
def robots_check(url):
    # creating url for robots.txt
    root_url = tld.get_tld(url)
    prefix = "http://www."
    suffix = "/robots.txt"
    robots_url = prefix + root_url + suffix
    # checking url validity
    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp.can_fetch("*", url)
Example 8: _get_soup
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
def _get_soup(path):
    """Gets soup from the given path, respecting robots.txt"""
    full_path = BASE_URL + path

    # Set a user-agent
    user_agent = 'dcnotify/%s' % __version__
    http_headers = {'User-Agent': '%s' % user_agent}

    # Honor robots.txt
    robots = RobotFileParser()
    robots.set_url("%s/robots.txt" % BASE_URL)
    robots.read()
    if not robots.can_fetch(user_agent, full_path):
        raise ValueError("Path disallowed by robots.txt")

    # Make a request, raising any HTTP errors that might occur
    request = get(full_path, headers=http_headers)
    request.raise_for_status()

    return bs(request.text)
Example 9: urlopen
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
def urlopen(self, host):
    robo_url = host.get_robots_url()
    print self.robotdict

    cached_parser = self.robotdict.get(robo_url)
    if cached_parser:
        logging.info("Found in Cache: " + robo_url)
    else:
        logging.info("Fetching: " + robo_url)
        cached_parser = RobotFileParser()
        self.robotdict.put(robo_url, cached_parser)
        cached_parser.set_url(robo_url)
        cached_parser.read()

    if cached_parser.can_fetch('*', host.get_url()):
        print 'Going to fetch:', host.get_url()
        return self.fetch_file(host.get_url())
    else:
        logging.info("Forbidden by Robots.txt")
        return None
Example 10: __init__
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
def __init__(self, url):
    self.url = urlManip.cleanURL(url)
    self.pages = []
    self.suggestions = set()
    self.loaded = False
    logger.info("Loading %s..." % (self.url))
    try:
        requests.get(self.url)
        self.loaded = True
    except IOError as e:
        logger.error("%s cannot be loaded: %s" % (self.url, e))

    # if the website can be loaded
    if self.loaded == True:
        logger.info("Load successful. Generating suggestions...")

        # get robots.txt
        rp = RobotFileParser(self.url + "robots.txt")
        try:
            rp.read()
        except IOError:
            logger.warning("robots.txt cannot be found.")

        # get home page
        self.pages.append(Page(self.url))

        # get all pages on homepage
        self.pages[0].load()
        for link in self.pages[0].internalLinks:
            if rp.can_fetch("*", link):
                if link[:4] == 'http':
                    self.pages.append(Page(link))
                else:
                    self.pages.append(Page(self.url + link))
            else:
                logger.debug("Ignoring %s based on robots.txt" % link)
Example 11: Crawler
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
# ......... part of the code omitted here .........
logging.debug("Crawling as reach the end of all found link")
print (config.xml_footer, file if file else self.output_file)
def __crawling(self):
crawling = self.tocrawl.pop()
url = urlparse.urlparse(crawling)
self.crawled.add(crawling)
request = Request(crawling, headers={"User-Agent":config.crawler_user_agent})
try:
response = urlopen(request)
except Exception as e:
if hasattr(e,'code'):
if e.code in self.response_code:
self.response_code[e.code]+=1
else:
self.response_code[e.code]=1
# Gestion des urls marked pour le reporting
if self.report:
if e.code in self.marked:
self.marked[e.code].append(crawling)
else:
self.marked[e.code] = [crawling]
logging.debug ("{1} ==> {0}".format(e, crawling))
return self.__continue_crawling()
# Read the response
try:
msg = response.read()
if response.getcode() in self.response_code:
self.response_code[response.getcode()]+=1
else:
self.response_code[response.getcode()]=1
response.close()
# Get the last modify date
if 'last-modified' in response.headers:
date = response.headers['Last-Modified']
else:
date = response.headers['Date']
date = datetime.strptime(date, '%a, %d %b %Y %H:%M:%S %Z')
except Exception as e:
logging.debug ("{1} ===> {0}".format(e, crawling))
return None
print ("<url><loc>"+url.geturl()+"</loc><lastmod>"+date.strftime('%Y-%m-%dT%H:%M:%S+00:00')+"</lastmod></url>", file if file else self.output_file)
if self.output_file:
self.output_file.flush()
# Found links
links = self.linkregex.findall(msg)
for link in links:
link = link.decode("utf-8")
#logging.debug("Found : {0}".format(link))
if link.startswith('/'):
link = 'http://' + url[1] + link
elif link.startswith('#'):
Example 12: SiteMap
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
class SiteMap():

    def __init__(self, main_page=None, robotrules=True):
        """
        Constructor method that initializes the members that are used during the crawling process
        :param main_page: The root page that needs to be crawled for generation of sitemap
        """
        logging.info("Consider Robot.txt ? ==> " + str(robotrules))
        self.robotrules = robotrules
        self.site_map = {}        # map that records the visits of urls, datemodified and assets
        self.network = {}         # map that maintains the network/graph of webpages visited
                                  # The intention of this map is for visual rendering using d3.js
        self.unvisited = set([])  # a set to keep the list of urls yet to be visited
        self.start_page = None    # the root page, this is used to avoid cycles and keep the crawl
                                  # process limited to a single domain.
        self.robot_txt_rules = None

        if main_page:
            self.unvisited.add(main_page)
            try:
                self.start_page = urlparse(main_page).netloc
            except:
                logging.error("Improper URL, Please provide a Valid Url:" + main_page)
                exit(0)

        if self.robotrules == "True":
            try:
                logging.info("robot.txt respected")
                self.robot_txt_rules = RobotFileParser()
                self.robot_txt_rules.set_url(main_page + "/robots.txt")
                self.robot_txt_rules.read()
            except:
                logging.error("Unable to read the robot.txt file")
                self.robotrules = False  # error reading robot.txt, ignore it forever

    @timeit
    def generate(self, site_map=None):
        """
        This method holds the invoking control of the crawler method and drives the crawling process.
        Basically a BFS style method that keeps popping the elements from the queue [self.unvisited set]
        and scraping the urls.
        Once the crawling process is done, this creates the sitemap using the self.site_map dictionary with
        just url, date-modified tags with dummy frequency and priorities.
        :param site_map: name of the site_map file so as to create xml entries.
        :return:
        """
        while self.unvisited:
            self.crawl()

        # create xml from the site_map dictionary
        header = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
    xmlns:xhtml="http://www.w3.org/1999/xhtml"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
    http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
"""
        footer = """\n</urlset>\n"""
        entry = "\t<url>\n\
\t\t<loc>%s</loc>\n\
\t\t<lastmod>%s</lastmod>\n\
\t\t<changefreq>monthly</changefreq>\n\
\t\t<priority> 1 </priority>\n\
\t</url>\
"

        xml = header
        for url in self.site_map.keys():
            xml += entry % (url, self.site_map[url]['date']) + "\n"

        xml += footer

        if site_map != None:
            self.write_to_file(site_map, xml)
        else:
            self.write_to_file("sitemap.xml", xml)
        return xml

    def write_to_file(self, file_name, content):
        """
        A utility method to just write the contents of the file into a given file name.
        Alert: This overwrites if the file does exist in the current directory.
        :param file_name: name of the file, sitemap in our case.
        :param content: contents of the file
        :return: None
        """
        f = open(file_name, 'w')
        f.write(content)
        f.close()

    def compose_url_from_href(self, url, href):
        """
        There are different ways a href could specify a location and it varies in different ways based on how
        the page is designed. This method takes a few styles into consideration and ignores some, cleans and creates
        a valid url link so as to keep it ready for the crawl method.
        :param url: base url of the current page
        :param href: one of the hyperlinks of the page
        :return: a well formed and valid http link
# ......... part of the code omitted here .........
Example 13: test_parse
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
def test_parse(self):
    from robotparser import RobotFileParser
    rules = RobotFileParser()
    rules.set_url("http://www.sogou.com/robots.txt")
    rules.read()
    self.assertEqual(rules.can_fetch("mozilla", "http://www.sogou.com/sohu/robots.txt"), False)
Example 14: __init__
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
class SimpleCrawler:

    USER_AGENT = 'SimpleCrawler/0.1'
    HEADERS = {
        'User-Agent': USER_AGENT,
        'Accept-Encoding': 'gzip',
        'Connection': 'keep-alive'
    }
    CONTENT_TYPE_PAT = re.compile(r'([^\s;]+)(.*charset=([^\s;]+))?', re.I)

    def __init__(self, starturl, index_html='', maxlevel=1,
                 cookie_file=None, acldb=None, urldb=None, default_charset=None,
                 delay=0, timeout=300, debug=0):
        (proto, self.hostport, _x, _y, _z) = urlsplit(starturl)
        assert proto == 'http'
        #Thread.__init__(self)
        self.debug = debug
        self.index_html = index_html
        if cookie_file:
            self.cookiejar = MozillaCookieJar(cookie_file)
            self.cookiejar.load()
        else:
            self.cookiejar = None
        self.robotstxt = RobotFileParser()
        self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
        self.robotstxt.read()
        self.conn = None
        self.urldb = urldb
        self.acldb = acldb
        self.curlevel = 0
        self.delay = delay
        self.timeout = timeout
        self.default_charset = default_charset
        if starturl.endswith('/'):
            starturl += self.index_html
        self.urls = [(starturl, maxlevel)]
        self.crawled = {}  # 1:injected, 2:crawled
        return

    def accept_url(self, url):
        if url.endswith('/'):
            url += self.index_html
        if self.acldb and not self.acldb.allowed(url):
            return None
        return url

    def inject_url(self, url):
        if (not self.curlevel) or (not url) or (url in self.crawled): return False
        if not self.robotstxt.can_fetch(self.USER_AGENT, url):
            if self.debug:
                print >>stderr, 'DISALLOW: %r' % url
            return None
        if self.debug:
            print >>stderr, 'INJECT: %r' % url
        self.crawled[url] = 1
        self.urls.append((url, self.curlevel-1))
        return True

    def get1(self, url, maxretry=3, maxredirect=3):
        if self.debug:
            print >>stderr, 'GET: %r' % url
        # loop
        for rtry in range(maxredirect):
            # forge urllib2.Request object.
            req = Request(url)
            # add cookie headers if necessary.
            if self.cookiejar:
                self.cookiejar.add_cookie_header(req)
                headers = req.unredirected_hdrs
                headers.update(self.HEADERS)
            else:
                headers = self.HEADERS
            # get response.
            for ctry in range(maxretry):
                try:
                    if not self.conn:
                        print >>stderr, 'Making connection: %r...' % (self.hostport,)
                        self.conn = HTTPConnection(self.hostport)
                    self.conn.request('GET', req.get_selector().replace(' ',''), '', headers)
                    self.conn.sock.settimeout(self.timeout)
                    resp = self.conn.getresponse()
                    break
                except BadStatusLine, x:
                    # connection closed unexpectedly
                    print >>stderr, 'Connection closed unexpectedly.'
                    # it restarts the connection...
                    self.conn.close()
                    self.conn = None
                except socket.error, x:
                    # connection closed unexpectedly
                    print >>stderr, 'Socket error:', x
                    self.conn.close()
                    self.conn = None
            else:
Example 15: Webpage
# Required module import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
class Webpage(object):
    """
    Objects that refer to individual webpages. If the url is scrapeable the
    object will be filled with that data, indexed, and inserted into a database
    to be searched.
    """
    number_of_scraped_pages = 0

    def __init__(self, url):
        """
        Creates a webpage object and assigns it the provided url.
        """
        self.url = url
        if self.url not in black_list and self.url not in scraped_urls:
            self.needs_to_be_scraped = True
        else:
            self.needs_to_be_scraped = False

    def page_robot_scannable(self):
        """
        Checks whether the page is allowed to be crawled
        """
        if self.need_to_be_scraped is True:
            # REFACTOR to remove try statement.
            try:
                headers = {'User-agent': settings.SPIDER_USER_AGENT}
                self.urlparse = urlparse.urlparse(self.url)
                self.robotcheck = RobotFileParser()
                self.robotcheck.set_url('http://' + self.urlparse[1] + '/robots.txt')  # Only works with http right now.
                self.robotcheck.read()
                self.need_to_be_scraped = self.robotcheck.can_fetch(settings.SPIDER_USER_AGENT, self.url)
            except:
                self.need_to_be_scraped = False

    def get_page(self):
        """
        The url is requested with a GET request. The page html is scraped
        directly, while elements of it are scraped in parse_page
        """
        self.headers = {'User-agent': settings.SPIDER_USER_AGENT}
        # REFACTOR to remove try
        try:
            self.request = requests.get(self.url, headers=self.headers)
            self.pagehtml = BeautifulSoup(self.request.text)  # REFACTOR, don't use BeautifulSoup
            self.count = self.instanceID.next()
            Webpage.number_of_scraped_pages += 1
        except:
            raise Exception

    def get_visible_elements(self, element):
        """
        Checks that the element is not contained in <style>, <script>, <head>,
        <title> or [document]. It also cannot be commented out.
        """
        if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
            return False
        elif re.match('<!--.*-->', str(element)):
            return False
        return True

    def parse_page(self):
        """
        This method parses the HTML page and extracts the title of the page,
        the outgoing links, the number of outgoing links, and the text.
        """
        self.title = self.pagehtml.find('title').text
        self.page_text = self.pagehtml.findAll(text=True)

        for item in filter(get_visible_elements, self.pagetext):
            if item != '\n':
                self.pagetext += item

        self.pagelinks = {}
        for link in soup.findAll('a'):
            self.pagelinks[link.get('href')] = 1

        for link in self.pagehtml:
            pass
            # determine if link is relative or absolute. if relative, change it to absolute

    def inverted_index_page_text(self):
        """
        Iterates through the words in the page text and creates and adds them
        to an index.
        """
        self.pagetextlist = self.pagetext.split(' ')  # Noted error: This catches punctuation along with words.
        for index, word in enumerate(self.pagetextlist):
            if word not in STOP_WORDS:
                if not inverted_index.get(word):
                    inverted_index[word] = {'url': self.url, 'offsets': [index]}
                else:
                    inverted_index[word]['offsets'].append(index)

    def set_page_scraped(self):
        """
        Once the page is scraped it is flagged as such
        """
        self.needs_to_be_scraped = False
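A closing observation: several of the examples above (2, 6, 10, 12, 15) wrap read() in a try/except because the robots.txt fetch itself can fail (missing file, unreachable server, timeout). Below is a minimal sketch of that defensive pattern under the same Python 2 robotparser assumption; the function name load_rules and the fail_closed flag are illustrative and not taken from any example above.

# Defensive read(); load_rules and fail_closed are illustrative names.
from robotparser import RobotFileParser

def load_rules(robots_url, fail_closed=True):
    rp = RobotFileParser(robots_url)
    try:
        rp.read()
    except IOError:
        # Could not fetch robots.txt: either deny everything or allow everything.
        if fail_closed:
            rp.disallow_all = True
        else:
            rp.allow_all = True
    return rp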