當前位置: 首頁>>代碼示例>>Python>>正文


Python RobotFileParser.set_url方法代碼示例

本文整理匯總了Python中robotparser.RobotFileParser.set_url方法的典型用法代碼示例。如果您正苦於以下問題：Python RobotFileParser.set_url方法的具體用法？Python RobotFileParser.set_url怎麼用？Python RobotFileParser.set_url使用的例子？那麼，這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在robotparser.RobotFileParser的用法示例。


在下文中一共展示了RobotFileParser.set_url方法的15個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: is_page_robot_scannable

# 需要導入模塊: from robotparser import RobotFileParser [as 別名]
# 或者: from robotparser.RobotFileParser import set_url [as 別名]
 def is_page_robot_scannable(self):
     """
     Report whether robots.txt permits the spider to fetch this page.

     The robots.txt address is assembled from the first two components of
     ``self.urlparse`` and the file is downloaded on every call.

     Returns whatever ``RobotFileParser.can_fetch`` answers for
     ``settings.SPIDER_USER_AGENT`` and ``self.url``.
     """
     parser = RobotFileParser()
     scheme, netloc = self.urlparse[0], self.urlparse[1]
     robots_location = scheme + '://' + netloc + '/robots.txt'
     parser.set_url(robots_location)
     parser.read()
     verdict = parser.can_fetch(settings.SPIDER_USER_AGENT, self.url)
     return verdict
開發者ID:gabrielmiller,項目名稱:Py-Webscraper,代碼行數:10,代碼來源:webpage.py

示例2: _get_robot_parser

# 需要導入模塊: from robotparser import RobotFileParser [as 別名]
# 或者: from robotparser.RobotFileParser import set_url [as 別名]
    def _get_robot_parser(self):
        """Return the robots.txt parser belonging to this object.

        When ``self.robot_parser_pickle`` holds a value it is base64-decoded,
        unpickled and returned. Otherwise a new ``RobotFileParser`` pointed at
        ``<protocol>://<domain>/robots.txt`` is created, assigned to
        ``self.robot_parser`` and returned; the file is not downloaded here.
        """
        stored = self.robot_parser_pickle
        if stored is None:
            fresh = RobotFileParser()
            fresh.set_url(self.protocol + "://" + self.domain + "/robots.txt")
            self.robot_parser = fresh

            return fresh

        # NOTE(review): unpickling is only safe when the stored value is
        # trusted - confirm where robot_parser_pickle comes from.
        return pickle.loads(base64.b64decode(stored))
開發者ID:joskid,項目名稱:celery-crawler,代碼行數:11,代碼來源:models.py

示例3: _get_robot_parser

# 需要導入模塊: from robotparser import RobotFileParser [as 別名]
# 或者: from robotparser.RobotFileParser import set_url [as 別名]
 def _get_robot_parser(self):
     """Return the stored robots.txt parser, or build a new one.

     ``self.robot_parser_pickle`` is converted with ``str()`` and unpickled.
     If that raises TypeError or IndexError, a new ``RobotFileParser`` aimed
     at ``<protocol>://<domain>/robots.txt`` is created, assigned to
     ``self.robot_parser`` and returned without being read.
     """
     try:
         # NOTE(review): unpickling is only safe when the stored value is
         # trusted - confirm where robot_parser_pickle comes from.
         restored = pickle.loads(str(self.robot_parser_pickle))
     except (TypeError, IndexError):
         robots_url = "".join([str(self.protocol), "://", str(self.domain),
                               "/robots.txt"])
         fresh = RobotFileParser()
         fresh.set_url(robots_url)
         self.robot_parser = fresh
         return fresh
     else:
         return restored
開發者ID:rafaduran,項目名稱:rdc_crawler,代碼行數:11,代碼來源:models.py

示例4: checkRobots

# 需要導入模塊: from robotparser import RobotFileParser [as 別名]
# 或者: from robotparser.RobotFileParser import set_url [as 別名]
def checkRobots(URL):
	"""Return whether robots.txt lets any user agent ("*") fetch URL.

	Every call pauses for one second, then downloads robots.txt from the
	scheme and network location of URL.
	"""
	# Fixed one-second pause before each robots.txt lookup.
	time.sleep(1)

	target = urlparse(URL)
	robotsUrl = "://".join([target.scheme, target.netloc]) + "/robots.txt"

	rules = RobotFileParser()
	rules.set_url(robotsUrl)
	rules.read()
	return rules.can_fetch("*", URL)
開發者ID:kdin,項目名稱:Focused-Crawler,代碼行數:12,代碼來源:crawler.py

示例5: can_fetch

# 需要導入模塊: from robotparser import RobotFileParser [as 別名]
# 或者: from robotparser.RobotFileParser import set_url [as 別名]
	def can_fetch(self,url):
		"""Return whether ``self.agent`` may fetch *url* according to robots.txt.

		Parsers are cached per host in ``self.rules``. The first request for
		a host downloads ``http://<host>/robots.txt`` and stores the parser;
		later requests for the same host reuse it.
		"""
		host=urlparse.urlparse(url)[1]
		# ``in`` replaces dict.has_key(), which no longer exists in Python 3.
		if host not in self.rules:
			rp=RobotFileParser()
			# NOTE(review): robots.txt is always requested over plain http,
			# whatever scheme *url* itself uses.
			rp.set_url("http://"+host+"/robots.txt")
			rp.read()
			self.rules[host]=rp
		return self.rules[host].can_fetch(self.agent,url)
開發者ID:lennon310,項目名稱:hyer,代碼行數:13,代碼來源:rules_monster.py

示例6: robots_check

# 需要導入模塊: from robotparser import RobotFileParser [as 別名]
# 或者: from robotparser.RobotFileParser import set_url [as 別名]
def robots_check(url):
    """Return whether robots.txt allows any user agent ("*") to fetch *url*.

    The robots.txt address is assembled as ``http://www.<x>/robots.txt``
    where ``<x>`` is whatever ``tld.get_tld(url)`` returns.
    """
    # Build the robots.txt address around the value tld.get_tld() yields.
    robots_url = "http://www." + tld.get_tld(url) + "/robots.txt"

    # Download the rules and ask whether the wildcard agent may fetch url.
    parser = RobotFileParser()
    parser.set_url(robots_url)
    parser.read()
    allowed = parser.can_fetch("*", url)
    return allowed
開發者ID:ujaco,項目名稱:crawley,代碼行數:15,代碼來源:crawley.py

示例7: Host

# 需要導入模塊: from robotparser import RobotFileParser [as 別名]
# 或者: from robotparser.RobotFileParser import set_url [as 別名]
class Host(object):
    ''' A single host together with the parser for its ``robots.txt``.

    :param hostname: the name of the host extracted from an URL.
    '''
    def __init__(self, hostname):
        self.hostname = hostname

        # The parser is only pointed at the file here; nothing is fetched.
        parser = RobotFileParser()
        parser.set_url('http://%s/robots.txt' % self.hostname)
        self.rp = parser

    def url_allowed(self, url):
        ''' Ask the host's robots.txt parser whether ``USER_AGENT`` may
        crawl the given url.

        :param url: URL to check.
        '''
        verdict = self.rp.can_fetch(USER_AGENT, url)
        return verdict
開發者ID:schlamar,項目名稱:PySeeek,代碼行數:20,代碼來源:crawler.py

示例8: _get_soup

# 需要導入模塊: from robotparser import RobotFileParser [as 別名]
# 或者: from robotparser.RobotFileParser import set_url [as 別名]
def _get_soup(path):
    """Fetch ``BASE_URL + path`` and return it parsed, honoring robots.txt.

    Raises ValueError when robots.txt disallows the path for this client's
    user agent; HTTP error statuses surface through ``raise_for_status()``.
    """

    target = BASE_URL + path

    # Identify this client in both the robots.txt check and the request
    agent = 'dcnotify/%s' % __version__

    # Consult robots.txt before touching the page itself
    rules = RobotFileParser()
    rules.set_url("%s/robots.txt" % BASE_URL)
    rules.read()
    if not rules.can_fetch(agent, target):
        raise ValueError("Path disallowed by robots.txt")

    # Make the request, raising any HTTP errors that might occur
    response = get(target, headers={'User-Agent': '%s' % agent})
    response.raise_for_status()

    return bs(response.text)
開發者ID:seanthegeek,項目名稱:dcnotify,代碼行數:23,代碼來源:scraper.py

示例9: urlopen

# 需要導入模塊: from robotparser import RobotFileParser [as 別名]
# 或者: from robotparser.RobotFileParser import set_url [as 別名]
    def urlopen(self, host):
        """Fetch the URL of *host* when its robots.txt allows it.

        The robots.txt parser is looked up in ``self.robotdict`` under the
        host's robots URL and downloaded on a miss.

        :param host: object providing ``get_robots_url()`` and ``get_url()``.
        :return: the result of ``self.fetch_file(url)``, or None when
                 robots.txt forbids the URL for the "*" user agent.
        """
        robo_url = host.get_robots_url()
        target_url = host.get_url()

        print(self.robotdict)

        cached_parser = self.robotdict.get(robo_url)
        if cached_parser:
            logging.info("Found in Cache: " + robo_url)
        else:
            logging.info("Fetching: " + robo_url)
            cached_parser = RobotFileParser()
            cached_parser.set_url(robo_url)
            cached_parser.read()
            # Cache only after a successful read: storing the parser first
            # left an unread parser behind whenever the download raised, and
            # later calls would then trust that empty entry.
            self.robotdict.put(robo_url, cached_parser)

        if not cached_parser.can_fetch('*', target_url):
            logging.info("Forbidden by Robots.txt")
            return None

        # Single-argument print() behaves the same on Python 2 and 3.
        print('Going to fetch: %s' % target_url)
        return self.fetch_file(target_url)
開發者ID:nilanjanbasu,項目名稱:Webcrawler,代碼行數:23,代碼來源:crawler.py

示例10: Crawler

# 需要導入模塊: from robotparser import RobotFileParser [as 別名]
# 或者: from robotparser.RobotFileParser import set_url [as 別名]

#.........這裏部分代碼省略.........
			# Drop attributes if needed
			for toDrop in self.drop:
				link=re.sub(toDrop,'',link)

			# Parse the url to get domain and file extension
			parsed_link = urlparse.urlparse(link)
			domain_link = parsed_link.netloc
			target_extension = os.path.splitext(parsed_link.path)[1][1:]

			if (link in self.crawled):
				continue
			if (link in self.tocrawl):
				continue
			if (link in self.excluded):
				continue
			if (domain_link != self.target_domain):
				continue
			if ("javascript" in link):
				continue
			
			# Count one more URL
			self.nb_url+=1

			# Check if the navigation is allowed by the robots.txt
			if (not self.can_fetch(link)):
				self.exclude_link(link)
				self.nb_rp+=1
				continue

			# Check if the current file extension is allowed or not.
			if (target_extension in self.skipext):
				self.exclude_link(link)
				self.nb_exclude+=1
				continue

			# Check if the current url doesn't contain an excluded word
			if (not self.exclude_url(link)):
				self.exclude_link(link)
				self.nb_exclude+=1
				continue

			self.tocrawl.add(link)
			
		return None

	def __continue_crawling(self):
		"""Run another crawling pass while ``self.tocrawl`` is non-empty."""
		if not self.tocrawl:
			return
		self.__crawling()

	def exclude_link(self,link):
		"""Record *link* in ``self.excluded`` unless it is already there."""
		if link in self.excluded:
			return
		self.excluded.add(link)

	def checkRobots(self):
		"""Download and parse robots.txt for ``self.domain``.

		``self.domain`` is normalised in place so that it ends with "/",
		and the parser holding the downloaded rules is stored in ``self.rp``.
		"""
		# endswith() also copes with an empty domain, where indexing the
		# last character raised IndexError.
		if not self.domain.endswith("/"):
			self.domain += "/"
		# A Request object used to be built here but was never used, so it
		# has been dropped; the parser fetches the URL it is given itself.
		self.rp = RobotFileParser()
		self.rp.set_url(self.domain+"robots.txt")
		self.rp.read()

	def can_fetch(self, link):
		"""Return True when *link* may be crawled.

		Every link is allowed while ``self.parserobots`` is false. Otherwise
		the decision comes from ``self.rp`` for the "*" user agent. Any error
		raised while deciding is logged and treated as "allowed".
		"""
		try:
			if not self.parserobots:
				return True
			if self.rp.can_fetch("*", link):
				return True
			logging.debug ("Crawling of {0} disabled by robots.txt".format(link))
			return False
		except Exception:
			# On error continue! Catching Exception rather than everything
			# lets KeyboardInterrupt and SystemExit propagate.
			logging.debug ("Error during parsing robots.txt")
			return True

	def exclude_url(self, link):
		"""Return False when *link* contains any entry of ``self.exclude``, True otherwise."""
		return not any(fragment in link for fragment in self.exclude)

	def make_report(self):
		"""Print the crawl statistics gathered on this instance to stdout.

		The numbers of URLs found and crawled are always reported. The
		robots.txt counter is shown when ``self.parserobots`` is set, and the
		exclusion counter when ``self.skipext`` or ``self.exclude`` is set.
		The tallies in ``self.response_code`` and the URLs recorded per
		status in ``self.marked`` follow.
		"""
		report = [
			"Number of found URL : {0}".format(self.nb_url),
			"Number of link crawled : {0}".format(len(self.crawled)),
		]
		if self.parserobots:
			report.append("Number of link block by robots.txt : {0}".format(self.nb_rp))
		if self.skipext or self.exclude:
			report.append("Number of link exclude : {0}".format(self.nb_exclude))

		for status in self.response_code:
			report.append("Nb Code HTTP {0} : {1}".format(status, self.response_code[status]))

		for status in self.marked:
			report.append("Link with status {0}:".format(status))
			report.extend("\t- {0}".format(uri) for uri in self.marked[status])

		for line in report:
			print (line)
開發者ID:vionemc,項目名稱:python-sitemap,代碼行數:104,代碼來源:crawler.py

示例11: SiteMap

# 需要導入模塊: from robotparser import RobotFileParser [as 別名]
# 或者: from robotparser.RobotFileParser import set_url [as 別名]
class SiteMap():

    def __init__(self, main_page=None, robotrules=True):
        """
            Constructor method that initializes the members that are used during crawling process
        :param main_page: The root page that needs to be crawled for generation of sitemap
        :param robotrules: whether robots.txt is honoured; both the boolean True and the string
                           "True" enable it. It is reset to False when robots.txt cannot be read.
        """

        logging.info("Consider Robot.txt ? ==> "+str(robotrules))
        self.robotrules = robotrules
        self.site_map = {}                          # map that records the visits of urls, datemodified and assets
        self.network = {}                           # map that maintains the network/graph of webpages visited
                                                    # The intention of this map is for visual rendering using d3.js

        self.unvisited = set()                      # a set to keep the list of urls yet to be visited
        self.start_page = None                      # the root page, this is used to avoid cycle and keeping crawl
                                                    # process limited to single domain.
        self.robot_txt_rules = None

        if main_page:
            self.unvisited.add(main_page)
            try:
                self.start_page = urlparse(main_page).netloc
            except Exception:
                logging.error("Improper URL, Please provide a Valid Url:"+str(main_page))
                exit(0)

        # The default value is the boolean True, so comparing against the string "True" alone
        # silently switched robots.txt handling off for default-constructed instances.
        if self.robotrules in (True, "True"):
            try:
                logging.info("robot.txt respected")
                root = urlparse(main_page)
                self.robot_txt_rules = RobotFileParser()
                # robots.txt lives at the root of the host, not below the start page's path.
                self.robot_txt_rules.set_url(root.scheme + "://" + root.netloc + "/robots.txt")
                self.robot_txt_rules.read()
            except Exception:
                logging.error("Unable to read the robot.txt file")
                self.robotrules = False # error reading robot.txt, ignore it forever

    @timeit
    def generate(self, site_map=None):
        """
            Crawl until no unvisited url is left, then render the collected pages as a sitemap.

            ``self.crawl()`` is called repeatedly while ``self.unvisited`` is non-empty. Afterwards
            one ``<url>`` entry (location and last-modified date, with a fixed change frequency and
            priority) is produced per key of ``self.site_map`` and the document is written out
            through ``self.write_to_file``.
        :param site_map: name of the output file; "sitemap.xml" is used when it is None.
        :return: the generated xml text
        """
        while self.unvisited:
            self.crawl()
        # create xml from the site_map dictionary
        header = """<?xml version="1.0" encoding="UTF-8"?>
            <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
            xmlns:xhtml="http://www.w3.org/1999/xhtml"
            xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
            xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
            http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
        """
        footer = """\n</urlset>\n"""
        entry = "\t<url>\n\
                 \t\t<loc>%s</loc>\n\
                 \t\t<lastmod>%s</lastmod>\n\
                 \t\t<changefreq>monthly</changefreq>\n\
                 \t\t<priority> 1 </priority>\n\
                 \t</url>\
        "

        rows = [entry % (url, record['date']) + "\n"
                for url, record in self.site_map.items()]
        xml = header + "".join(rows) + footer

        target = "sitemap.xml" if site_map is None else site_map
        self.write_to_file(target, xml)
        return xml

    def write_to_file(self, file_name, content):
        """
            A utility method that writes the given content into a file of the given name.
            Alert: This overwrites the file if it already exists.
        :param file_name: name of the file, sitemap in our case.
        :param content:   contents of the file
        :return: None
        """
        # The context manager closes the handle even when write() raises.
        with open(file_name, 'w') as handle:
            handle.write(content)

    def compose_url_from_href(self, url, href):
        """
            There are different ways a href could specify a location and it varies in different ways based on how
            the page is designed. This method takes few styles into consideration and ignores some, cleans and creates
            a valid url link so as to keep it ready for the crawl method.
        :param url:   basae url of the current page
        :param href:  one of the hyper links of the page
        :return:      a well formed and valid http link
        """
#.........這裏部分代碼省略.........
開發者ID:nchikkam,項目名稱:projects,代碼行數:103,代碼來源:crawler.py

示例12: __init__

# 需要導入模塊: from robotparser import RobotFileParser [as 別名]
# 或者: from robotparser.RobotFileParser import set_url [as 別名]
class SimpleCrawler:

  # User-agent string announced in HEADERS and passed to the robots.txt check.
  USER_AGENT = 'SimpleCrawler/0.1'
  # Default request headers: they reuse USER_AGENT and ask for gzip encoding
  # and a kept-alive connection.
  HEADERS = {
    'User-Agent': USER_AGENT,
    'Accept-Encoding': 'gzip',
    'Connection': 'keep-alive'
    }
  # Case-insensitive pattern: group 1 is the leading run of characters that
  # are neither whitespace nor ';', group 3 the value following "charset=".
  # NOTE(review): presumably applied to Content-Type header values - confirm.
  CONTENT_TYPE_PAT = re.compile(r'([^\s;]+)(.*charset=([^\s;]+))?', re.I)
  
  def __init__(self, starturl, index_html='', maxlevel=1,
               cookie_file=None, acldb=None, urldb=None, default_charset=None,
               delay=0, timeout=300, debug=0):
    """Set up a crawl that starts at ``starturl``.

    Only ``http`` start URLs are accepted (enforced with an assertion).
    Cookies are loaded from ``cookie_file`` when one is given, and the
    robots.txt found at the root of ``starturl`` is downloaded right away.
    A start URL ending in "/" gets ``index_html`` appended before it is
    queued together with ``maxlevel``.
    """
    parts = urlsplit(starturl)
    proto = parts[0]
    self.hostport = parts[1]
    assert proto == 'http'
    self.debug = debug
    self.index_html = index_html
    self.cookiejar = None
    if cookie_file:
      jar = MozillaCookieJar(cookie_file)
      self.cookiejar = jar
      jar.load()
    robots = RobotFileParser()
    self.robotstxt = robots
    robots.set_url(urljoin(starturl, '/robots.txt'))
    robots.read()
    self.conn = None
    self.urldb = urldb
    self.acldb = acldb
    self.curlevel = 0
    self.delay = delay
    self.timeout = timeout
    self.default_charset = default_charset
    first_url = starturl + self.index_html if starturl.endswith('/') else starturl
    self.urls = [(first_url, maxlevel)]
    self.crawled = {}                   # 1:injected, 2:crawled

  def accept_url(self, url):
    """Return the url to crawl, or None when the ACL database rejects it.

    A url ending in "/" has ``self.index_html`` appended before it is
    checked against ``self.acldb`` (when one is configured).
    """
    candidate = url + self.index_html if url.endswith('/') else url
    acl = self.acldb
    if acl and not acl.allowed(candidate):
      return None
    return candidate
  
  def inject_url(self, url):
    """Queue ``url`` for the next level when it may be crawled.

    Returns False when the current level is 0, the url is empty or it is
    already known; None when robots.txt disallows it for ``USER_AGENT``;
    True once it has been marked as injected and appended to ``self.urls``
    with the level below the current one.
    """
    if not self.curlevel or not url or url in self.crawled:
      return False
    allowed = self.robotstxt.can_fetch(self.USER_AGENT, url)
    if not allowed:
      if self.debug:
        print >>stderr, 'DISALLOW: %r' % url
      return None
    if self.debug:
      print >>stderr, 'INJECT: %r' % url
    next_level = self.curlevel-1
    self.crawled[url] = 1
    self.urls.append((url, next_level))
    return True

  def get1(self, url, maxretry=3, maxredirect=3):
    if self.debug:
      print >>stderr, 'GET: %r' % url
    # loop
    for rtry in range(maxredirect):
      # forge urllib2.Request object.
      req = Request(url)
      # add cookie headers if necessary.
      if self.cookiejar:
        self.cookiejar.add_cookie_header(req)
        headers = req.unredirected_hdrs
        headers.update(self.HEADERS)
      else:
        headers = self.HEADERS
      # get response.
      for ctry in range(maxretry):
        try:
          if not self.conn:
            print >>stderr, 'Making connection: %r...' % (self.hostport,)
            self.conn = HTTPConnection(self.hostport)
          self.conn.request('GET', req.get_selector().replace(' ',''), '', headers)
	  self.conn.sock.settimeout(self.timeout)
          resp = self.conn.getresponse()
          break
        except BadStatusLine, x:
          # connection closed unexpectedly
          print >>stderr, 'Connection closed unexpectedly.'
          # it restarts the connection...
          self.conn.close()
          self.conn = None
        except socket.error, x:
          # connection closed unexpectedly
          print >>stderr, 'Socket error:', x
          self.conn.close()
          self.conn = None
      else:
開發者ID:dreamfrog,項目名稱:jophiel,代碼行數:96,代碼來源:textcrawler.py

示例13: Webpage

# 需要導入模塊: from robotparser import RobotFileParser [as 別名]
# 或者: from robotparser.RobotFileParser import set_url [as 別名]
class Webpage(object):
    """
    Objects that refer to individual webpages. If the url is scrapeable the
    object will be filled with that data, indexed, and inserted into a database
    to be searched.
    """
    # Running total of pages fetched through get_page(), kept on the class.
    number_of_scraped_pages = 0

    def __init__(self, url):
        """
        Creates a webpage object and assigns it the provided url.

        The page is flagged for scraping unless its url is found in
        ``black_list`` or ``scraped_urls``.
        """
        self.url = url
        self.needs_to_be_scraped = (
            self.url not in black_list and self.url not in scraped_urls
        )

    def page_robot_scannable(self):
        """
        Checks whether the page is allowed to be crawled.

        Only pages still flagged for scraping are checked. The robots.txt of
        the page's host is downloaded and ``needs_to_be_scraped`` is replaced
        by the parser's verdict for ``settings.SPIDER_USER_AGENT``. Any
        failure along the way clears the flag.
        """
        if self.needs_to_be_scraped is not True:
            return
        # REFACTOR to remove try statement.
        try:
            self.urlparse = urlparse.urlparse(self.url)
            # Use the page's own scheme; fall back to http when it has none.
            scheme = self.urlparse[0] or 'http'
            self.robotcheck = RobotFileParser()
            self.robotcheck.set_url(scheme+'://'+self.urlparse[1]+'/robots.txt')
            self.robotcheck.read()
            self.needs_to_be_scraped = self.robotcheck.can_fetch(settings.SPIDER_USER_AGENT, self.url)
        except Exception:
            self.needs_to_be_scraped = False

    def get_page(self):
        """
        The url is requested with a GET request. The page html is scraped
        directly, while elements of it are scraped in parse_page.

        Raises Exception (carrying the url and the original error text) when
        anything in the fetch fails.
        """
        self.headers = {'User-agent':settings.SPIDER_USER_AGENT}
        #REFACTOR to remove try
        try:
            self.request = requests.get(self.url, headers=self.headers)
            self.pagehtml = BeautifulSoup(self.request.text) #REFACTOR, don't use BeautifulSoup
            # NOTE(review): instanceID is not defined anywhere in this class;
            # presumably it is supplied elsewhere - confirm.
            self.count = self.instanceID.next()
            Webpage.number_of_scraped_pages += 1
        except Exception as error:
            raise Exception('Unable to fetch %s: %s' % (self.url, error))

    def get_visible_elements(self, element):
        """
        Checks that the element is not contained in <style>, <script>, <head>,
        <title> or [document]. It also cannot be commented out.
        """
        if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
            return False
        if re.match('<!--.*-->', str(element)):
            return False
        return True

    def parse_page(self):
        """
        This method parses the HTML page and extracts the title of the page,
        the outgoing links and the visible text.

        Results are stored in ``title``, ``page_text`` (all text nodes),
        ``pagetext`` (the visible text joined into one string) and
        ``pagelinks`` (href -> 1).
        """
        title_tag = self.pagehtml.find('title')
        # A page without a <title> yields an empty title instead of a crash.
        self.title = title_tag.text if title_tag is not None else ''
        self.page_text = self.pagehtml.findAll(text=True)

        self.pagetext = ''.join(
            item for item in self.page_text
            if self.get_visible_elements(item) and item != '\n'
        )

        self.pagelinks = {}
        for link in self.pagehtml.findAll('a'):
            self.pagelinks[link.get('href')] = 1

        # TODO: determine if link is relative or absolute. if relative, change it to absolute

    def inverted_index_page_text(self):
        """
        Iterates through the words in the page text and creates and adds them
        to an index.
        """
        self.pagetextlist = self.pagetext.split(' ') #Noted error: This catches punctuation along with words.
        for index, word in enumerate(self.pagetextlist):
            if word in STOP_WORDS:
                continue
            # NOTE(review): an existing entry keeps the url it was created
            # with; only the offsets list grows.
            if not inverted_index.get(word):
                inverted_index[word] = {'url':self.url,'offsets':[index]}
            else:
                inverted_index[word]['offsets'].append(index)

    def set_page_scraped(self):
        """
        Once the page is scraped it is flagged as such
        """
        self.needs_to_be_scraped = False
開發者ID:gabrielmiller,項目名稱:Py-Webscraper,代碼行數:101,代碼來源:spider.py

示例14: __init__

# 需要導入模塊: from robotparser import RobotFileParser [as 別名]
# 或者: from robotparser.RobotFileParser import set_url [as 別名]
class MarioDepth:
    """Crawl outwards from a start url down to a fixed link depth.

    Urls waiting to be fetched are kept in ``self.urls`` as ``(url, depth)``
    pairs; ``self.crawled`` maps a url to 1 once it has been queued and to 2
    once its response has been processed.
    """
    def __init__(self, starturl, callback, callpre=None, callfail=None, concount=MAXCONCOUNT, depth=2, accept_url_patterns=None, reject_url_patterns=None):
        """Store the crawl settings and try to download robots.txt.

        A failure to read robots.txt is only logged at debug level.
        """
        self.concount = concount
        self.callback = callback
        self.callpre = callpre
        self.callfail = callfail
        self.depth = depth
        self.starturl = starturl
        self.baseurl = URL.baseurl(starturl)
        self.urls = []
        self.crawled = {}
        self.link_title_db = LinkTitleDB()
        self.accept_url_patterns = accept_url_patterns
        self.reject_url_patterns = reject_url_patterns
        self.robotstxt = RobotFileParser()
        self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
        self.referer = starturl
        try:
            self.robotstxt.read()
        except Exception:
            logger.debug(Traceback())

    def __call__(self, n=None):
        """Run the crawl; a truthy *n* replaces ``self.concount`` first."""
        if n: self.concount = n
        self.urls.append((self.starturl, self.depth))
        while self.urls:
            self.depth_get()
            logger.debug('%d unprocessed urls'%(len(self.urls)))

    def depth_get(self):
        """Hand the queued urls to a MarioBatch in batches and run each batch."""
        mario = MarioBatch(callback=self.next_depth, callpre=self.callpre, callfail=self.callfail)
        pool = coros.CoroutinePool(max_size=len(self.urls))
        while self.urls:
            waiters = []
            counter = 0
            # Schedule at most ten add_job calls before running the batch.
            while self.urls:
                if counter > 9: break
                counter += 1
                waiters.append(pool.execute(self.add_job, mario))
            logger.debug('Depth break')
            for waiter in waiters:
                waiter.wait()
            mario(self.concount)

    def add_job(self, mario):
        """Pop one queued url and add it to *mario* unless it counts as visited."""
        if not self.urls: return
        url, depth = self.urls.pop()
        if self.visited(url, depth): return
        mario.add_job(url, args=depth)

    def visited(self, url, depth):
        """Return True when *url* was already processed at a shallower depth.

        The expression used to reference an ``is_duplicate`` value that is
        never assigned, which raised NameError for every url at depth 0.
        Without that value the depth-0 term cannot hold, so it is dropped.
        """
        return depth < self.depth and self.crawled.get(url) == 2

    def next_depth(self, response):
        """Process a fetched page: queue its links, run the callback, mark it crawled."""
        for link, title in URL.link_title(response.body, response.effective_url):
            if not self.inject_url(link, response.args):continue
            self.link_title_db.add(link, response.effective_url, title)
        if callable(self.callback): self.callback(response)
        self.crawled[response.effective_url] = 2
        if response.effective_url != response.url:
            self.crawled[response.url] = 2
        self.referer = response.effective_url

    def inject_url(self, url, depth):
        """Queue *url* one level deeper when it passes all checks.

        Returns None when the depth is exhausted, the url is empty or already
        known, the url is rejected by ``reject_url`` or robots.txt disallows
        it; returns True once it has been queued.
        """
        if not (depth and url and url not in self.crawled):
            return None
        if isinstance(url, unicode): url = url.encode('utf-8')
        if self.reject_url(url):
            logger.debug('REJECT: %r' % url)
            return None
        try:
            can_fetch = self.robotstxt.can_fetch(USER_AGENT['safari'], url)
        except Exception:
            # A robots.txt check that fails is treated as "allowed".
            can_fetch = True
        # robots.txt is deliberately not enforced for this one base url.
        if self.baseurl!='http://hi.baidu.com/' and not can_fetch:
            logger.debug('DISALLOW: %r' % url)
            return None
        logger.debug('INJECT(%d): %r' % (depth-1, url))
        self.crawled[url] = 1
        self.urls.append((url, depth-1))
        return True

    def reject_url(self, url):
        """Return True when *url* must not be crawled.

        Urls sharing the start url's base are never rejected. Any other url
        is rejected unless it matches one of ``accept_url_patterns`` and none
        of ``reject_url_patterns``.
        """
        if self.baseurl == URL.baseurl(url):
            return False
        if not self.accept_url_patterns or not re.match('|'.join(self.accept_url_patterns), url):
            return True
        # The reject patterns used to be joined with ``or``, which raised
        # TypeError when they were None (the default) and rejected every
        # foreign url as soon as any pattern was configured.
        return bool(self.reject_url_patterns and re.match('|'.join(self.reject_url_patterns), url))
        
開發者ID:AmoebaFactor,項目名稱:SuperMario,代碼行數:92,代碼來源:Mario.py

示例15: test_parse

# 需要導入模塊: from robotparser import RobotFileParser [as 別名]
# 或者: from robotparser.RobotFileParser import set_url [as 別名]
	def test_parse(self):
		"""The live Sogou robots.txt must forbid "mozilla" from /sohu/robots.txt."""
		from robotparser import RobotFileParser
		parser=RobotFileParser()
		parser.set_url("http://www.sogou.com/robots.txt")
		parser.read()
		verdict=parser.can_fetch("mozilla","http://www.sogou.com/sohu/robots.txt")
		self.assertEqual(verdict,False)
開發者ID:lennon310,項目名稱:hyer,代碼行數:8,代碼來源:robot.py


注:本文中的robotparser.RobotFileParser.set_url方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。