This page collects typical usage examples of Python's robotparser.RobotFileParser.set_url method. If you are wondering what RobotFileParser.set_url is for, or how exactly to use it, the curated code examples below may help. You can also explore further usage examples of the class the method belongs to, robotparser.RobotFileParser.
Listed below are 15 code examples of RobotFileParser.set_url, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
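Before diving into the examples, here is a minimal, self-contained sketch of the pattern they all share: point the parser at a site's robots.txt with set_url, download and parse it with read, then ask can_fetch whether a given user agent may crawl a given URL. The host and user-agent strings below are placeholders, not taken from any specific example.
from robotparser import RobotFileParser  # Python 2 module; see the note after Example 15 for Python 3

rp = RobotFileParser()
rp.set_url("http://example.com/robots.txt")   # placeholder host
rp.read()                                     # fetch and parse robots.txt
# True if the rules allow this user agent to fetch this URL
print rp.can_fetch("my-crawler/1.0", "http://example.com/some/page.html")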
Example 1: is_page_robot_scannable
# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import set_url [as alias]
def is_page_robot_scannable(self):
    """
    Returns a boolean that tells whether the page is robot scrapeable.
    """
    robotcheck = RobotFileParser()
    robotcheck.set_url(self.urlparse[0]+'://'+self.urlparse[1]+'/robots.txt')
    robotcheck.read()
    return robotcheck.can_fetch(settings.SPIDER_USER_AGENT, self.url)
Example 2: _get_robot_parser
# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import set_url [as alias]
def _get_robot_parser(self):
    if self.robot_parser_pickle is not None:
        return pickle.loads(base64.b64decode(self.robot_parser_pickle))
    else:
        parser = RobotFileParser()
        parser.set_url(self.protocol + "://" + self.domain + "/robots.txt")
        self.robot_parser = parser
        return parser
Example 3: _get_robot_parser
# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import set_url [as alias]
def _get_robot_parser(self):
    try:
        return pickle.loads(str(self.robot_parser_pickle))
    except (TypeError, IndexError):
        parser = RobotFileParser()
        parser.set_url(str(self.protocol) + "://" + str(self.domain) +
                       "/robots.txt")
        self.robot_parser = parser
        return parser
Example 4: checkRobots
# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import set_url [as alias]
def checkRobots(URL):
    time.sleep(1)
    parsed = urlparse(URL)
    robotsUrl = parsed.scheme + "://" + parsed.netloc + "/robots.txt"
    robotParser = RobotFileParser()
    robotParser.set_url(robotsUrl)
    robotParser.read()
    result = robotParser.can_fetch("*", URL)
    return result
Example 5: can_fetch
# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import set_url [as alias]
def can_fetch(self, url):
    host, path = urlparse.urlparse(url)[1:3]
    if self.rules.has_key(host):
        return self.rules[host].can_fetch(self.agent, url)
    else:
        rp = RobotFileParser()
        robot_url = "http://" + host + "/robots.txt"
        rp.set_url(robot_url)
        rp.read()
        self.rules[host] = rp
        return rp.can_fetch(self.agent, url)
Example 6: robots_check
# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import set_url [as alias]
def robots_check(url):
    # creating url for robots.txt
    root_url = tld.get_tld(url)
    prefix = "http://www."
    suffix = "/robots.txt"
    robots_url = prefix + root_url + suffix
    # checking url validity
    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp.can_fetch("*", url)
Example 7: Host
# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import set_url [as alias]
class Host(object):
    ''' Represents one host. Responsible for parsing and analyzing
    ``robots.txt``.

    :param hostname: the name of the host extracted from an URL.
    '''

    def __init__(self, hostname):
        self.hostname = hostname
        self.rp = RobotFileParser()
        self.rp.set_url('http://%s/robots.txt' % self.hostname)

    def url_allowed(self, url):
        ''' Checks if the given url is allowed to crawl.

        :param url: URL to check.
        '''
        return self.rp.can_fetch(USER_AGENT, url)
Example 8: _get_soup
# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import set_url [as alias]
def _get_soup(path):
    """Gets soup from the given path, respecting robots.txt"""

    full_path = BASE_URL + path

    # Set a user-agent
    user_agent = 'dcnotify/%s' % __version__
    http_headers = {'User-Agent': '%s' % user_agent}

    # Honor robots.txt
    robots = RobotFileParser()
    robots.set_url("%s/robots.txt" % BASE_URL)
    robots.read()
    if not robots.can_fetch(user_agent, full_path):
        raise ValueError("Path disallowed by robots.txt")

    # Make a request, raising any HTTP errors that might occur
    request = get(full_path, headers=http_headers)
    request.raise_for_status()

    return bs(request.text)
Example 9: urlopen
# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import set_url [as alias]
def urlopen(self, host):
    robo_url = host.get_robots_url()
    print self.robotdict

    cached_parser = self.robotdict.get(robo_url)
    if cached_parser:
        logging.info("Found in Cache: " + robo_url)
    else:
        logging.info("Fetching: " + robo_url)
        cached_parser = RobotFileParser()
        self.robotdict.put(robo_url, cached_parser)
        cached_parser.set_url(robo_url)
        cached_parser.read()

    if cached_parser.can_fetch('*', host.get_url()):
        print 'Going to fetch:', host.get_url()
        return self.fetch_file(host.get_url())
    else:
        logging.info("Forbidden by Robots.txt")
        return None
Example 10: Crawler
# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import set_url [as alias]
#......... part of the code omitted here .........
            # Drop attributes if needed
            for toDrop in self.drop:
                link = re.sub(toDrop, '', link)

            # Parse the url to get domain and file extension
            parsed_link = urlparse.urlparse(link)
            domain_link = parsed_link.netloc
            target_extension = os.path.splitext(parsed_link.path)[1][1:]

            if (link in self.crawled):
                continue
            if (link in self.tocrawl):
                continue
            if (link in self.excluded):
                continue
            if (domain_link != self.target_domain):
                continue
            if ("javascript" in link):
                continue

            # Count one more URL
            self.nb_url += 1

            # Check if the navigation is allowed by the robots.txt
            if (not self.can_fetch(link)):
                self.exclude_link(link)
                self.nb_rp += 1
                continue

            # Check if the current file extension is allowed or not.
            if (target_extension in self.skipext):
                self.exclude_link(link)
                self.nb_exclude += 1
                continue

            # Check if the current url doesn't contain an excluded word
            if (not self.exclude_url(link)):
                self.exclude_link(link)
                self.nb_exclude += 1
                continue

            self.tocrawl.add(link)

        return None

    def __continue_crawling(self):
        if self.tocrawl:
            self.__crawling()

    def exclude_link(self, link):
        if link not in self.excluded:
            self.excluded.add(link)

    def checkRobots(self):
        if self.domain[len(self.domain)-1] != "/":
            self.domain += "/"
        request = Request(self.domain+"robots.txt", headers={"User-Agent": config.crawler_user_agent})
        self.rp = RobotFileParser()
        self.rp.set_url(self.domain+"robots.txt")
        self.rp.read()

    def can_fetch(self, link):
        try:
            if self.parserobots:
                if self.rp.can_fetch("*", link):
                    return True
                else:
                    logging.debug("Crawling of {0} disabled by robots.txt".format(link))
                    return False

            if not self.parserobots:
                return True

            return True
        except:
            # On error continue!
            logging.debug("Error during parsing robots.txt")
            return True

    def exclude_url(self, link):
        for ex in self.exclude:
            if ex in link:
                return False
        return True

    def make_report(self):
        print("Number of found URL : {0}".format(self.nb_url))
        print("Number of link crawled : {0}".format(len(self.crawled)))
        if self.parserobots:
            print("Number of link block by robots.txt : {0}".format(self.nb_rp))
        if self.skipext or self.exclude:
            print("Number of link exclude : {0}".format(self.nb_exclude))

        for code in self.response_code:
            print("Nb Code HTTP {0} : {1}".format(code, self.response_code[code]))

        for code in self.marked:
            print("Link with status {0}:".format(code))
            for uri in self.marked[code]:
                print("\t- {0}".format(uri))
Example 11: SiteMap
# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import set_url [as alias]
class SiteMap():

    def __init__(self, main_page=None, robotrules=True):
        """
        Constructor method that initializes the members that are used during crawling process
        :param main_page: The root page that needs to be crawled for generation of sitemap
        """
        logging.info("Consider Robot.txt ? ==> " + str(robotrules))
        self.robotrules = robotrules
        self.site_map = {}          # map that records the visits of urls, datemodified and assets
        self.network = {}           # map that maintains the network/graph of webpages visited
                                    # The intention of this map is for visual rendering using d3.js
        self.unvisited = set([])    # a set to keep the list of urls yet to be visited
        self.start_page = None      # the root page, this is used to avoid cycle and keeping crawl
                                    # process limited to single domain.
        self.robot_txt_rules = None

        if main_page:
            self.unvisited.add(main_page)
            try:
                self.start_page = urlparse(main_page).netloc
            except:
                logging.error("Improper URL, Please provide a Valid Url:" + main_page)
                exit(0)

        if self.robotrules == "True":
            try:
                logging.info("robot.txt respected")
                self.robot_txt_rules = RobotFileParser()
                self.robot_txt_rules.set_url(main_page + "/robots.txt")
                self.robot_txt_rules.read()
            except:
                logging.error("Unable to read the robot.txt file")
                self.robotrules = False  # error reading robot.txt, ignore it forever

    @timeit
    def generate(self, site_map=None):
        """
        This method holds the invoking control of the crawler method and drives the crawling process.
        Basically a BFS style method that keeps popping the elements from the queue [self.unvisited set]
        and scraping the urls.

        Once the crawling process is done, this creates sitemap using the self.site_map dictionary with
        just url, date-modified tags with dummy frequency and priorities.
        :param site_map: name of the site_map file so as to create xml entries.
        :return:
        """
        while self.unvisited:
            self.crawl()

        # create xml from the site_map dictionary
        header = """<?xml version="1.0" encoding="UTF-8"?>
            <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                xmlns:xhtml="http://www.w3.org/1999/xhtml"
                xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
                    http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
        """
        footer = """\n</urlset>\n"""
        entry = "\t<url>\n\
            \t\t<loc>%s</loc>\n\
            \t\t<lastmod>%s</lastmod>\n\
            \t\t<changefreq>monthly</changefreq>\n\
            \t\t<priority> 1 </priority>\n\
            \t</url>\
        "

        xml = header
        for url in self.site_map.keys():
            xml += entry % (url, self.site_map[url]['date']) + "\n"

        xml += footer

        if site_map != None:
            self.write_to_file(site_map, xml)
        else:
            self.write_to_file("sitemap.xml", xml)

        return xml

    def write_to_file(self, file_name, content):
        """
        A utility method to just write the contents of the file into a given file name.
        Alert: This overwrites if the file does exist in the current directory.
        :param file_name: name of the file, sitemap in our case.
        :param content: contents of the file
        :return: None
        """
        f = open(file_name, 'w')
        f.write(content)
        f.close()

    def compose_url_from_href(self, url, href):
        """
        There are different ways a href could specify a location and it varies in different ways based on how
        the page is designed. This method takes few styles into consideration and ignores some, cleans and creates
        a valid url link so as to keep it ready for the crawl method.
        :param url: base url of the current page
        :param href: one of the hyper links of the page
        :return: a well formed and valid http link
        """
#......... part of the code omitted here .........
Example 12: __init__
# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import set_url [as alias]
class SimpleCrawler:

    USER_AGENT = 'SimpleCrawler/0.1'
    HEADERS = {
        'User-Agent': USER_AGENT,
        'Accept-Encoding': 'gzip',
        'Connection': 'keep-alive'
    }
    CONTENT_TYPE_PAT = re.compile(r'([^\s;]+)(.*charset=([^\s;]+))?', re.I)

    def __init__(self, starturl, index_html='', maxlevel=1,
                 cookie_file=None, acldb=None, urldb=None, default_charset=None,
                 delay=0, timeout=300, debug=0):
        (proto, self.hostport, _x, _y, _z) = urlsplit(starturl)
        assert proto == 'http'
        #Thread.__init__(self)
        self.debug = debug
        self.index_html = index_html
        if cookie_file:
            self.cookiejar = MozillaCookieJar(cookie_file)
            self.cookiejar.load()
        else:
            self.cookiejar = None
        self.robotstxt = RobotFileParser()
        self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
        self.robotstxt.read()
        self.conn = None
        self.urldb = urldb
        self.acldb = acldb
        self.curlevel = 0
        self.delay = delay
        self.timeout = timeout
        self.default_charset = default_charset
        if starturl.endswith('/'):
            starturl += self.index_html
        self.urls = [(starturl, maxlevel)]
        self.crawled = {}  # 1:injected, 2:crawled
        return

    def accept_url(self, url):
        if url.endswith('/'):
            url += self.index_html
        if self.acldb and not self.acldb.allowed(url):
            return None
        return url

    def inject_url(self, url):
        if (not self.curlevel) or (not url) or (url in self.crawled): return False
        if not self.robotstxt.can_fetch(self.USER_AGENT, url):
            if self.debug:
                print >>stderr, 'DISALLOW: %r' % url
            return None
        if self.debug:
            print >>stderr, 'INJECT: %r' % url
        self.crawled[url] = 1
        self.urls.append((url, self.curlevel-1))
        return True

    def get1(self, url, maxretry=3, maxredirect=3):
        if self.debug:
            print >>stderr, 'GET: %r' % url
        # loop
        for rtry in range(maxredirect):
            # forge urllib2.Request object.
            req = Request(url)
            # add cookie headers if necessary.
            if self.cookiejar:
                self.cookiejar.add_cookie_header(req)
                headers = req.unredirected_hdrs
                headers.update(self.HEADERS)
            else:
                headers = self.HEADERS
            # get response.
            for ctry in range(maxretry):
                try:
                    if not self.conn:
                        print >>stderr, 'Making connection: %r...' % (self.hostport,)
                        self.conn = HTTPConnection(self.hostport)
                    self.conn.request('GET', req.get_selector().replace(' ',''), '', headers)
                    self.conn.sock.settimeout(self.timeout)
                    resp = self.conn.getresponse()
                    break
                except BadStatusLine, x:
                    # connection closed unexpectedly
                    print >>stderr, 'Connection closed unexpectedly.'
                    # it restarts the connection...
                    self.conn.close()
                    self.conn = None
                except socket.error, x:
                    # connection closed unexpectedly
                    print >>stderr, 'Socket error:', x
                    self.conn.close()
                    self.conn = None
            else:
Example 13: Webpage
# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import set_url [as alias]
class Webpage(object):
    """
    Objects that refer to individual webpages. If the url is scrapeable the
    object will be filled with that data, indexed, and inserted into a database
    to be searched.
    """
    number_of_scraped_pages = 0

    def __init__(self, url):
        """
        Creates a webpage object and assigns it the provided url.
        """
        self.url = url
        if self.url not in black_list and self.url not in scraped_urls:
            self.needs_to_be_scraped = True
        else:
            self.needs_to_be_scraped = False

    def page_robot_scannable(self):
        """
        Checks whether the page is allowed to be crawled
        """
        if self.need_to_be_scraped is True:
            # REFACTOR to remove try statement.
            try:
                headers = {'User-agent': settings.SPIDER_USER_AGENT}
                self.urlparse = urlparse.urlparse(self.url)
                self.robotcheck = RobotFileParser()
                self.robotcheck.set_url('http://'+self.urlparse[1]+'/robots.txt')  # Only works with http right now.
                self.robotcheck.read()
                self.need_to_be_scraped = self.robotcheck.can_fetch(settings.SPIDER_USER_AGENT, self.url)
            except:
                self.need_to_be_scraped = False

    def get_page(self):
        """
        The url is requested with a GET request. The page html is scraped
        directly, while elements of it are scraped in parse_page
        """
        self.headers = {'User-agent': settings.SPIDER_USER_AGENT}
        #REFACTOR to remove try
        try:
            self.request = requests.get(self.url, headers=headers)
            self.pagehtml = BeautifulSoup(self.request.text)  #REFACTOR, don't use BeautifulSoup
            self.count = self.instanceID.next()
            Webpage.number_of_scraped_pages += 1
        except:
            raise Exception

    def get_visible_elements(self, element):
        """
        Checks that the element is not contained in <style>, <script>, <head>,
        <title> or [document]. It also cannot be commented out.
        """
        if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
            return False
        elif re.match('<!--.*-->', str(element)):
            return False
        return True

    def parse_page(self):
        """
        This method parses the HTML page and extracts the title of the page,
        the outgoing links, the number of outgoing links, and the text.
        """
        self.title = self.pagehtml.find('title').text
        self.page_text = self.pagehtml.findAll(text=True)

        for item in filter(get_visible_elements, self.pagetext):
            if item != '\n':
                self.pagetext += item

        self.pagelinks = {}
        for link in soup.findAll('a'):
            self.pagelinks[link.get('href')] = 1

        for link in self.pagehtml:
            pass
            # determine if link is relative or absolute. if relative, change it to absolute

    def inverted_index_page_text(self):
        """
        Iterates through the words in the page text and creates and adds them
        to an index.
        """
        self.pagetextlist = self.pagetext.split(' ')  # Noted error: This catches punctuation along with words.
        for index, word in enumerate(self.pagetextlist):
            if word not in STOP_WORDS:
                if not inverted_index.get(word):
                    inverted_index[word] = {'url': self.url, 'offsets': [index]}
                else:
                    inverted_index[word]['offsets'].append(index)

    def set_page_scraped(self):
        """
        Once the page is scraped it is flagged as such
        """
        self.needs_to_be_scraped = False
# 需要导入模块: from robotparser import RobotFileParser [as 别名]
# 或者: from robotparser.RobotFileParser import set_url [as 别名]
class MarioDepth:
def __init__(self, starturl, callback, callpre=None, callfail=None, concount=MAXCONCOUNT, depth=2, accept_url_patterns=None, reject_url_patterns=None):
self.concount = concount
self.callback = callback
self.callpre = callpre
self.callfail = callfail
self.depth = depth
self.starturl = starturl
self.baseurl = URL.baseurl(starturl)
self.urls = []
self.crawled = {}
self.link_title_db = LinkTitleDB()
self.accept_url_patterns = accept_url_patterns
self.reject_url_patterns = reject_url_patterns
self.robotstxt = RobotFileParser()
self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
self.referer = starturl
try:
self.robotstxt.read()
except:
logger.debug(Traceback())
#self.lightcloud = LightCloud.connect('n0')
def __call__(self, n=None):
if n: self.concount = n
current_depth = self.depth
self.urls.append((self.starturl, current_depth))
while self.urls:
self.depth_get()
logger.debug('%d unprocessed urls'%(len(self.urls)))
def depth_get(self):
mario = MarioBatch(callback=self.next_depth, callpre=self.callpre, callfail=self.callfail)
pool = coros.CoroutinePool(max_size=len(self.urls))
while self.urls:
waiters = []
#self.add_job(mario)
counter = 0
while self.urls:
if counter > 9: break;
counter += 1
waiters.append(pool.execute(self.add_job, mario))
logger.debug('Depth break')
for waiter in waiters:
waiter.wait()
mario(self.concount)
def add_job(self, mario):
if not self.urls: return
url, depth = self.urls.pop()
if self.visited(url, depth): return
mario.add_job(url, args=depth)
def visited(self, url, depth):
#is_duplicate = URL.is_duplicate(url, self.lightcloud)
return depth==0 and is_duplicate or depth < self.depth and self.crawled.has_key(url) and self.crawled[url] == 2
def next_depth(self, response):
#with_timeout(1, self.lightcloud.set, LightCloud.crawled_url_key(response.effective_url), response.url, timeout_value=None)
for link, title in URL.link_title(response.body, response.effective_url):
if not self.inject_url(link, response.args):continue
self.link_title_db.add(link, response.effective_url, title)
if callable(self.callback): self.callback(response)
self.crawled[response.effective_url] = 2
if response.effective_url != response.url:
self.crawled[response.url] = 2
self.referer = response.effective_url
def inject_url(self, url, depth):
if not (depth and url and url not in self.crawled):
#logger.debug('IGNORE(%d): %r'%(depth, url))
return None
if isinstance(url, unicode): url = url.encode('utf-8')
if self.reject_url(url):
logger.debug('REJECT: %r' % url)
return None
try:
can_fetch = self.robotstxt.can_fetch(USER_AGENT['safari'], url)
except:
can_fetch = True
if self.baseurl!='http://hi.baidu.com/' and not can_fetch:
logger.debug('DISALLOW: %r' % url)
return None
logger.debug('INJECT(%d): %r' % (depth-1, url))
self.crawled[url] = 1
self.urls.append((url, depth-1))
return True
def reject_url(self, url):
return self.baseurl != URL.baseurl(url) and (not self.accept_url_patterns or not re.match('|'.join(self.accept_url_patterns), url) or self.reject_url_patterns or re.match('|'.join(self.reject_url_patterns), url))
示例15: test_parse
# 需要导入模块: from robotparser import RobotFileParser [as 别名]
# 或者: from robotparser.RobotFileParser import set_url [as 别名]
def test_parse(self):
from robotparser import RobotFileParser
rules=RobotFileParser()
rules.set_url("http://www.sogou.com/robots.txt")
rules.read()
self.assertEqual(rules.can_fetch("mozilla","http://www.sogou.com/sohu/robots.txt"),False)
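Note that all of the examples above target Python 2, where RobotFileParser lives in the top-level robotparser module. On Python 3 the same class, with the same set_url/read/can_fetch API, is available from urllib.robotparser, so a rough Python 3 port of Example 15 might look like the sketch below (whether the final call prints False still depends on the live robots.txt served by the target site):
from urllib.robotparser import RobotFileParser

rules = RobotFileParser()
rules.set_url("http://www.sogou.com/robots.txt")  # same robots.txt as Example 15
rules.read()
print(rules.can_fetch("mozilla", "http://www.sogou.com/sohu/robots.txt"))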