

Python RobotFileParser.read Method Code Examples

This article collects typical usage examples of the robotparser.RobotFileParser.read method in Python. If you are wondering what RobotFileParser.read does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples of the robotparser.RobotFileParser class that the method belongs to.


The following presents 15 code examples of the RobotFileParser.read method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
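Before the individual examples, here is a minimal usage sketch of the pattern they all share: point a RobotFileParser at a site's robots.txt, call read() to download and parse the file, then ask can_fetch() whether a given user agent may crawl a given URL. The target site and user-agent string below are placeholders for illustration; note that the robotparser module name used throughout this page is the Python 2 one (in Python 3 the class lives in urllib.robotparser).

# Minimal usage sketch (Python 2); the URL and user agent are placeholders.
from robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url("http://example.com/robots.txt")   # location of the site's robots.txt
rp.read()                                     # fetch and parse the file over HTTP
if rp.can_fetch("my-crawler", "http://example.com/some/page"):
    print("allowed by robots.txt")
else:
    print("disallowed by robots.txt")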

Example 1: is_page_robot_scannable

# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
 def is_page_robot_scannable(self):
     """
     Returns a boolean that tells whether the page is robot scrapeable.
     """
     robotcheck = RobotFileParser()
     robotcheck.set_url(self.urlparse[0]+'://'+self.urlparse[1]+'/robots.txt')
     robotcheck.read()
     return robotcheck.can_fetch(settings.SPIDER_USER_AGENT, self.url)
Developer: gabrielmiller, Project: Py-Webscraper, Lines: 10, Source: webpage.py

Example 2: _allowed_to_open

# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
 def _allowed_to_open(self, url):
     host = urlparse.urlsplit(url)[1]
     robots_url = urlparse.urlunsplit(('http', host, '/robots.txt', '', ''))
     rp = RobotFileParser(robots_url)
     try:
         rp.read()
     except:
         return False
     return rp.can_fetch(self._agent_name, url)
Developer: AntonDatsik, Project: L4, Lines: 11, Source: crawler.py

Example 3: checkRobots

# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
def checkRobots(URL):

	time.sleep(1)
	parsed = urlparse(URL)
	robotsUrl = parsed.scheme + "://"+ parsed.netloc+"/robots.txt"
	robotParser = RobotFileParser()
	robotParser.set_url(robotsUrl)
	robotParser.read()
	result = robotParser.can_fetch("*",URL)
	return result
Developer: kdin, Project: Focused-Crawler, Lines: 12, Source: crawler.py

Example 4: http_open

# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
    def http_open(self, req):

        url = req.get_full_url()
        host = urlsplit(url)[1]
        robots_url = urlunsplit(('http', host, '/robots.txt', '', ''))
        robotfileparser = RobotFileParser(robots_url)
        robotfileparser.read()
        if not robotfileparser.can_fetch(self.crawlername, url):
            raise RuntimeError('Forbidden by robots.txt')
        return urllib2.HTTPHandler.http_open(self, req)
Developer: KroosT, Project: BSUIR_Python_Lab_003, Lines: 12, Source: crawler.py

Example 5: can_fetch

# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
	def can_fetch(self,url):
		host,path=urlparse.urlparse(url)[1:3]
		if	(self.rules.has_key(host)):
			return self.rules[host].can_fetch(self.agent,url)
		else:
			rp=RobotFileParser()
			robot_url="http://"+host+"/robots.txt"
			rp.set_url(robot_url)
			rp.read()
			self.rules[host]=rp
			return rp.can_fetch(self.agent,url)	
Developer: lennon310, Project: hyer, Lines: 13, Source: rules_monster.py

Example 6: check_robots

# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
 def check_robots(self, url):
     '''check the robots.txt in this url's domain'''
     hostname = urlparse(url).netloc
     if hostname not in self.domain_list.keys():      # no records in domain_list
         rp = RobotFileParser('http://%s/robots.txt' % hostname)
         print("%s: fetching %s" % (url, rp.url))
         try:
             rp.read()                                # get new robots.txt
         except IOError, e:                           # url's server not available(connection timeout)
             log.error(str(e))
             rp.disallow_all = True                   # reject all request
         self.domain_list[hostname] = rp              # add domain entry into domain_list
Developer: YvesChan, Project: OpenSP, Lines: 14, Source: spider.py

Example 7: robots_check

# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
def robots_check(url):

    # creating url for robots.txt
    root_url = tld.get_tld(url)
    prefix = "http://www."
    suffix = "/robots.txt"
    robots_url = prefix + root_url + suffix

    # checking url validity
    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp.can_fetch("*", url)
Developer: ujaco, Project: crawley, Lines: 15, Source: crawley.py

Example 8: _get_soup

# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
def _get_soup(path):
    """Gets soup from the given path, respecting robots.txt"""

    full_path = BASE_URL + path

    # Set a user-agent
    user_agent = 'dcnotify/%s' % __version__
    http_headers = {'User-Agent': '%s' % user_agent}

    # Honor robots.txt
    robots = RobotFileParser()
    robots.set_url("%s/robots.txt" % BASE_URL)
    robots.read()
    if not robots.can_fetch(user_agent, full_path):
        raise ValueError("Path disallowed by robots.txt")

    # Make a request, raising any HTTP errors that might occur
    request = get(full_path, headers=http_headers)
    request.raise_for_status()

    return bs(request.text)
Developer: seanthegeek, Project: dcnotify, Lines: 23, Source: scraper.py

Example 9: urlopen

# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
    def urlopen(self, host):
        robo_url = host.get_robots_url()

        print self.robotdict

        cached_parser = self.robotdict.get(robo_url)
        if cached_parser:
            logging.info("Found in Cache: " + robo_url)
        else:
            logging.info("Fetching: " + robo_url)
            cached_parser = RobotFileParser()
            self.robotdict.put(robo_url, cached_parser)
            cached_parser.set_url(robo_url)
            cached_parser.read()

        if cached_parser.can_fetch('*', host.get_url()):
            print 'Going to fetch:', host.get_url()
            return self.fetch_file(host.get_url())
        else:
            logging.info("Forbidden by Robots.txt")
            return None
Developer: nilanjanbasu, Project: Webcrawler, Lines: 23, Source: crawler.py

Example 10: __init__

# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
    def __init__(self, url):
        self.url = urlManip.cleanURL(url)
        self.pages = []
        self.suggestions = set()
        self.loaded = False
        logger.info("Loading %s..." % (self.url))
        try:
            requests.get(self.url)
            self.loaded = True
        except IOError as e:
            logger.error("%s cannot be loaded: %s" % (self.url, e))

        # if the website can be loaded
        if self.loaded == True:
            logger.info("Load successful. Generating suggestions...")

            # get robots.txt
            rp = RobotFileParser(self.url + "robots.txt")
            try:
                rp.read()
            except IOError:
                logger.warning("robots.txt cannot be found.")

            # get home page
            self.pages.append(Page(self.url))

            # get all pages on homepage
            self.pages[0].load()
            for link in self.pages[0].internalLinks:
                if rp.can_fetch("*", link):
                    if link[:4] == 'http':
                        self.pages.append(Page(link))
                    else:
                        self.pages.append(Page(self.url + link))
                else:
                    logger.debug("Ignoring %s based on robots.txt" % link)
Developer: paywithscratch, Project: scratchsuggestionscraper, Lines: 38, Source: scraper.py

Example 11: Crawler

# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]

#......... part of the code omitted here .........
		logging.debug("Crawling as reach the end of all found link")

		print (config.xml_footer, file if file else self.output_file)


	def __crawling(self):
		crawling = self.tocrawl.pop()

		url = urlparse.urlparse(crawling)
		self.crawled.add(crawling)
		request = Request(crawling, headers={"User-Agent":config.crawler_user_agent})
		
		try:
			response = urlopen(request)
		except Exception as e:
			if hasattr(e,'code'):
				if e.code in self.response_code:
					self.response_code[e.code]+=1
				else:
					self.response_code[e.code]=1

				# Gestion des urls marked pour le reporting
				if self.report:
					if e.code in self.marked:
						self.marked[e.code].append(crawling)
					else:
						self.marked[e.code] = [crawling]

			logging.debug ("{1} ==> {0}".format(e, crawling))
			return self.__continue_crawling()

		# Read the response
		try:
			msg = response.read()
			if response.getcode() in self.response_code:
				self.response_code[response.getcode()]+=1
			else:
				self.response_code[response.getcode()]=1

			response.close()

			# Get the last modify date
			if 'last-modified' in response.headers:
				date = response.headers['Last-Modified']
			else:
				date = response.headers['Date']

			date = datetime.strptime(date, '%a, %d %b %Y %H:%M:%S %Z')

		except Exception as e:
			logging.debug ("{1} ===> {0}".format(e, crawling))
			return None


		print ("<url><loc>"+url.geturl()+"</loc><lastmod>"+date.strftime('%Y-%m-%dT%H:%M:%S+00:00')+"</lastmod></url>", file if file else self.output_file)
		if self.output_file:
			self.output_file.flush()

		# Found links
		links = self.linkregex.findall(msg)
		for link in links:
			link = link.decode("utf-8")
			#logging.debug("Found : {0}".format(link))		
			if link.startswith('/'):
				link = 'http://' + url[1] + link
			elif link.startswith('#'):
Developer: vionemc, Project: python-sitemap, Lines: 70, Source: crawler.py

Example 12: SiteMap

# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
class SiteMap():

    def __init__(self, main_page=None, robotrules=True):
        """
            Constructor method that initializes the members that are used during the crawling process
        :param main_page: The root page that needs to be crawled for generation of sitemap
        """

        logging.info("Consider Robot.txt ? ==> "+str(robotrules))
        self.robotrules = robotrules
        self.site_map = {}                          # map that records the visits of urls, datemodified and assets
        self.network = {}                           # map that maintains the network/graph of webpages visited
                                                    # The intention of this map is for visual rendering using d3.js

        self.unvisited = set([])                    # a set to keep the list of urls yet to be visited
        self.start_page = None                       # the root page, this is used to avoid cycle and keeping crawl
                                                    # process limited to single domain.
        self.robot_txt_rules = None

        if main_page:
            self.unvisited.add(main_page)
            try:
                self.start_page = urlparse(main_page).netloc
            except:
                logging.error("Improper URL, Please provide a Valid Url:"+main_page)
                exit(0)

        if self.robotrules == "True":
            try:
                logging.info("robot.txt respected")
                self.robot_txt_rules = RobotFileParser()
                self.robot_txt_rules.set_url(main_page + "/robots.txt")
                self.robot_txt_rules.read()
            except:
                logging.error("Unable to read the robot.txt file")
                self.robotrules = False # error reading robot.txt, ignore it forever

    @timeit
    def generate(self, site_map=None):
        """
            This method holds the invoking control of the crawler method and drives the crawling process.
            Basically a BFS style method that keeps popping the elements from the queue [self.unvisited set]
            and scraping the urls.

            Once the crawling process is done, this creates sitemap using the self.site_map dictionary with
            just url, date-modified tags with dummy frequency and priorities.
        :param site_map: name of the site_map file so as to create xml entries.
        :return:
        """
        while self.unvisited:
            self.crawl()
        # create xml from the site_map dictionary
        header = """<?xml version="1.0" encoding="UTF-8"?>
            <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
            xmlns:xhtml="http://www.w3.org/1999/xhtml"
            xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
            xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
            http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
        """
        footer = """\n</urlset>\n"""
        entry = "\t<url>\n\
                 \t\t<loc>%s</loc>\n\
                 \t\t<lastmod>%s</lastmod>\n\
                 \t\t<changefreq>monthly</changefreq>\n\
                 \t\t<priority> 1 </priority>\n\
                 \t</url>\
        "

        xml = header
        for url in self.site_map.keys():
            xml += entry % (url, self.site_map[url]['date']) + "\n"

        xml += footer
        if site_map != None:
            self.write_to_file(site_map, xml)
        else:
            self.write_to_file("sitemap.xml", xml)
        return xml

    def write_to_file(self, file_name, content):
        """
            A utility method to just write the contents of the file into a given file name.
            Alert: This overwrites if the file does exist in the current directory.
        :param file_name: name of the file, sitemap in our case.
        :param content:   contents of the file
        :return: None
        """
        f = open(file_name, 'w')
        f.write(content)
        f.close()

    def compose_url_from_href(self, url, href):
        """
            There are different ways a href could specify a location and it varies in different ways based on how
            the page is designed. This method takes few styles into consideration and ignores some, cleans and creates
            a valid url link so as to keep it ready for the crawl method.
        :param url:   base url of the current page
        :param href:  one of the hyper links of the page
        :return:      a well formed and valid http link
        """
#......... part of the code omitted here .........
Developer: nchikkam, Project: projects, Lines: 103, Source: crawler.py

Example 13: test_parse

# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
	def test_parse(self):
		from robotparser import RobotFileParser
		rules=RobotFileParser()
		rules.set_url("http://www.sogou.com/robots.txt")
		rules.read()
		self.assertEqual(rules.can_fetch("mozilla","http://www.sogou.com/sohu/robots.txt"),False)
Developer: lennon310, Project: hyer, Lines: 8, Source: robot.py

Example 14: __init__

# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
class SimpleCrawler:

  USER_AGENT = 'SimpleCrawler/0.1'
  HEADERS = {
    'User-Agent': USER_AGENT,
    'Accept-Encoding': 'gzip',
    'Connection': 'keep-alive'
    }
  CONTENT_TYPE_PAT = re.compile(r'([^\s;]+)(.*charset=([^\s;]+))?', re.I)
  
  def __init__(self, starturl, index_html='', maxlevel=1,
               cookie_file=None, acldb=None, urldb=None, default_charset=None,
               delay=0, timeout=300, debug=0):
    (proto, self.hostport, _x, _y, _z) = urlsplit(starturl)
    assert proto == 'http'
    #Thread.__init__(self)
    self.debug = debug
    self.index_html = index_html
    if cookie_file:
      self.cookiejar = MozillaCookieJar(cookie_file)
      self.cookiejar.load()
    else:
      self.cookiejar = None
    self.robotstxt = RobotFileParser()
    self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
    self.robotstxt.read()
    self.conn = None
    self.urldb = urldb
    self.acldb = acldb
    self.curlevel = 0
    self.delay = delay
    self.timeout = timeout
    self.default_charset = default_charset
    if starturl.endswith('/'):
      starturl += self.index_html
    self.urls = [(starturl, maxlevel)]
    self.crawled = {}                   # 1:injected, 2:crawled
    return

  def accept_url(self, url):
    if url.endswith('/'):
      url += self.index_html
    if self.acldb and not self.acldb.allowed(url):
      return None
    return url
  
  def inject_url(self, url):
    if (not self.curlevel) or (not url) or (url in self.crawled): return False
    if not self.robotstxt.can_fetch(self.USER_AGENT, url):
      if self.debug:
        print >>stderr, 'DISALLOW: %r' % url
      return None
    if self.debug:
      print >>stderr, 'INJECT: %r' % url
    self.crawled[url] = 1
    self.urls.append((url, self.curlevel-1))
    return True

  def get1(self, url, maxretry=3, maxredirect=3):
    if self.debug:
      print >>stderr, 'GET: %r' % url
    # loop
    for rtry in range(maxredirect):
      # forge urllib2.Request object.
      req = Request(url)
      # add cookie headers if necessary.
      if self.cookiejar:
        self.cookiejar.add_cookie_header(req)
        headers = req.unredirected_hdrs
        headers.update(self.HEADERS)
      else:
        headers = self.HEADERS
      # get response.
      for ctry in range(maxretry):
        try:
          if not self.conn:
            print >>stderr, 'Making connection: %r...' % (self.hostport,)
            self.conn = HTTPConnection(self.hostport)
          self.conn.request('GET', req.get_selector().replace(' ',''), '', headers)
          self.conn.sock.settimeout(self.timeout)
          resp = self.conn.getresponse()
          break
        except BadStatusLine, x:
          # connection closed unexpectedly
          print >>stderr, 'Connection closed unexpectedly.'
          # it restarts the connection...
          self.conn.close()
          self.conn = None
        except socket.error, x:
          # connection closed unexpectedly
          print >>stderr, 'Socket error:', x
          self.conn.close()
          self.conn = None
      else:
Developer: dreamfrog, Project: jophiel, Lines: 96, Source: textcrawler.py

Example 15: Webpage

# Required import: from robotparser import RobotFileParser [as alias]
# Or: from robotparser.RobotFileParser import read [as alias]
class Webpage(object):
    """
    Objects that refer to individual webpages. If the url is scrapeable the
    object will be filled with that data, indexed, and inserted into a database
    to be searched.
    """
    number_of_scraped_pages = 0

    def __init__(self, url):
        """
        Creates a webpage object and assigns it the provided url.
        """
        self.url = url
        if self.url not in black_list and self.url not in scraped_urls:
            self.needs_to_be_scraped = True
        else:
            self.needs_to_be_scraped = False

    def page_robot_scannable(self):
        """
        Checks whether the page is allowed to be crawled
        """
        if self.need_to_be_scraped is True:
            # REFACTOR to remove try statement.
            try:
                headers = {'User-agent':settings.SPIDER_USER_AGENT}
                self.urlparse = urlparse.urlparse(self.url)
                self.robotcheck = RobotFileParser()
                self.robotcheck.set_url('http://'+self.urlparse[1]+'/robots.txt') # Only works with http right now.
                self.robotcheck.read()
                self.need_to_be_scraped = self.robotcheck.can_fetch(settings.SPIDER_USER_AGENT, self.url)
            except:
                self.need_to_be_scraped = False

    def get_page(self):
        """
        The url is requested with a GET request. The page html is scraped
        directly, while elements of it are scraped in parse_page
        """
        self.headers = {'User-agent':settings.SPIDER_USER_AGENT}
        #REFACTOR to remove try
        try:
            self.request = requests.get(self.url, headers=headers)
            self.pagehtml = BeautifulSoup(self.request.text) #REFACTOR, don't use BeautifulSoup
            self.count = self.instanceID.next()
            Webpage.number_of_scraped_pages += 1
        except:
            raise Exception

    def get_visible_elements(self, element):
        """
        Checks that the element is not contained in <style>, <script>, <head>,
        <title> or [document]. It also cannot be commented out.
        """
        if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
            return False
        elif re.match('<!--.*-->', str(element)):
            return False
        return True

    def parse_page(self):
        """
        This method parses the HTML page and extracts the title of the page,
        the outgoing links, the number of outgoing links, and the text.
        """
        self.title = self.pagehtml.find('title').text
        self.page_text = self.pagehtml.findAll(text=True)

        for item in filter(get_visible_elements, self.pagetext):
            if item != '\n':
                self.pagetext+= item
        self.pagelinks = {}

        for link in soup.findAll('a'):
            self.pagelinks[link.get('href')] = 1

        for link in self.pagehtml:
            pass

        # determine if link is relative or absolute. if relative, change it to absolute

    def inverted_index_page_text(self):
        """
        Iterates through the words in the page text and creates and adds them
        to an index.
        """
        self.pagetextlist = self.pagetext.split(' ') #Noted error: This catches punctuation along with words.
        for index, word in enumerate(self.pagetextlist):
            if word not in STOP_WORDS:
                if not inverted_index.get(word):
                    inverted_index[word]={'url':self.url,'offsets':[index]}
                else:
                    inverted_index[word]['offsets'].append(index)

    def set_page_scraped(self):
        """
        Once the page is scraped it is flagged as such
        """
        self.needs_to_be_scraped = False
Developer: gabrielmiller, Project: Py-Webscraper, Lines: 101, Source: spider.py


Note: The robotparser.RobotFileParser.read examples in this article were compiled by 純淨天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The code snippets are selected from open-source projects contributed by many developers, and copyright of the source code remains with the original authors; please follow the corresponding project's License when distributing or using the code. Do not reproduce without permission.