

Python RobotFileParser.can_fetch Method Code Examples

This article collects typical usage examples of the Python robotparser.RobotFileParser.can_fetch method. If you are wondering how exactly to use RobotFileParser.can_fetch, how to call it, or what real-world usage looks like, the curated code examples below may help. You can also explore further usage examples of robotparser.RobotFileParser, the class this method belongs to.


Below are 15 code examples of the RobotFileParser.can_fetch method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
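Before the individual examples, here is a minimal, self-contained sketch of the pattern they all share: locate the host's robots.txt, parse it with RobotFileParser, and ask can_fetch whether a given user agent may request a URL. The helper name allowed and the example URL are placeholders rather than code from any of the projects cited below, and the import paths shown are for Python 2 (under Python 3 the class lives in urllib.robotparser).

from robotparser import RobotFileParser
from urlparse import urlsplit, urlunsplit

def allowed(url, user_agent="*"):
    # Build the robots.txt URL for the host that serves the given URL.
    scheme, host = urlsplit(url)[:2]
    robots_url = urlunsplit((scheme, host, "/robots.txt", "", ""))

    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()  # fetch and parse robots.txt over the network
    return rp.can_fetch(user_agent, url)

# allowed("http://example.com/some/page") -> True if crawling is permitted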

Example 1: is_page_robot_scannable

# Required import: from robotparser import RobotFileParser [as alias]
# Alternatively: from robotparser.RobotFileParser import can_fetch [as alias]
 def is_page_robot_scannable(self):
     """
     Returns a boolean that tells whether the page is robot scrapeable.
     """
     robotcheck = RobotFileParser()
     robotcheck.set_url(self.urlparse[0]+'://'+self.urlparse[1]+'/robots.txt')
     robotcheck.read()
     return robotcheck.can_fetch(settings.SPIDER_USER_AGENT, self.url)
Developer: gabrielmiller, Project: Py-Webscraper, Lines: 10, Source: webpage.py

Example 2: _allowed_to_open

# Required import: from robotparser import RobotFileParser [as alias]
# Alternatively: from robotparser.RobotFileParser import can_fetch [as alias]
 def _allowed_to_open(self, url):
     host = urlparse.urlsplit(url)[1]
     robots_url = urlparse.urlunsplit(('http', host, '/robots.txt', '', ''))
     rp = RobotFileParser(robots_url)
     try:
         rp.read()
     except:
         return False
     return rp.can_fetch(self._agent_name, url)
Developer: AntonDatsik, Project: L4, Lines: 11, Source: crawler.py

Example 3: checkRobots

# Required import: from robotparser import RobotFileParser [as alias]
# Alternatively: from robotparser.RobotFileParser import can_fetch [as alias]
def checkRobots(URL):

	time.sleep(1)
	parsed = urlparse(URL)
	robotsUrl = parsed.scheme + "://"+ parsed.netloc+"/robots.txt"
	robotParser = RobotFileParser()
	robotParser.set_url(robotsUrl)
	robotParser.read()
	result = robotParser.can_fetch("*",URL)
	return result
Developer: kdin, Project: Focused-Crawler, Lines: 12, Source: crawler.py

Example 4: http_open

# Required import: from robotparser import RobotFileParser [as alias]
# Alternatively: from robotparser.RobotFileParser import can_fetch [as alias]
    def http_open(self, req):

        url = req.get_full_url()
        host = urlsplit(url)[1]
        robots_url = urlunsplit(('http', host, '/robots.txt', '', ''))
        robotfileparser = RobotFileParser(robots_url)
        robotfileparser.read()
        if not robotfileparser.can_fetch(self.crawlername, url):
            raise RuntimeError('Forbidden by robots.txt')
        return urllib2.HTTPHandler.http_open(self, req)
Developer: KroosT, Project: BSUIR_Python_Lab_003, Lines: 12, Source: crawler.py

Example 5: can_fetch

# Required import: from robotparser import RobotFileParser [as alias]
# Alternatively: from robotparser.RobotFileParser import can_fetch [as alias]
	def can_fetch(self,url):
		host,path=urlparse.urlparse(url)[1:3]
		if	(self.rules.has_key(host)):
			return self.rules[host].can_fetch(self.agent,url)
		else:
			rp=RobotFileParser()
			robot_url="http://"+host+"/robots.txt"
			rp.set_url(robot_url)
			rp.read()
			self.rules[host]=rp
			return rp.can_fetch(self.agent,url)	
Developer: lennon310, Project: hyer, Lines: 13, Source: rules_monster.py

Example 6: robots_check

# Required import: from robotparser import RobotFileParser [as alias]
# Alternatively: from robotparser.RobotFileParser import can_fetch [as alias]
def robots_check(url):

    # creating url for robots.txt
    root_url = tld.get_tld(url)
    prefix = "http://www."
    suffix = "/robots.txt"
    robots_url = prefix + root_url + suffix

    # checking url validity
    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp.can_fetch("*", url)
Developer: ujaco, Project: crawley, Lines: 15, Source: crawley.py

Example 7: robots_precheck

# Required import: from robotparser import RobotFileParser [as alias]
# Alternatively: from robotparser.RobotFileParser import can_fetch [as alias]
 def robots_precheck(self, url):
     """
     If we have the robots.txt file available, check it to see if the
     request is permissible.
     
     This does not fetch robots.txt.
     """
     
     fetcher = RedFetcher(url)
     robots_txt = fetcher.fetch_robots_txt(url, lambda a:a, network=False)
     if robots_txt == "":
         return True
     checker = RobotFileParser()
     checker.parse(robots_txt.splitlines())
     return checker.can_fetch(UA_STRING, url)
Developer: jrottenberg, Project: redbot, Lines: 17, Source: webui.py

Example 8: Host

# Required import: from robotparser import RobotFileParser [as alias]
# Alternatively: from robotparser.RobotFileParser import can_fetch [as alias]
class Host(object):
    ''' Represents one host. Responsible for parsing and analyzing
    ``robots.txt``.
    
    :param hostname: the name of the host extracted from an URL.
    '''
    def __init__(self, hostname):
        self.hostname = hostname
        
        self.rp = RobotFileParser()
        self.rp.set_url('http://%s/robots.txt' % self.hostname)
        
    def url_allowed(self, url):
        ''' Checks if the given url is allowed to crawl.
        
        :param url: URL to check.
        '''
        return self.rp.can_fetch(USER_AGENT, url)
Developer: schlamar, Project: PySeeek, Lines: 20, Source: crawler.py

Example 9: crawl

# Required import: from robotparser import RobotFileParser [as alias]
# Alternatively: from robotparser.RobotFileParser import can_fetch [as alias]
    def crawl(self, seed_url, max_urls=30, max_depth=1, obey_robots=False, max_size=1000000, force_html=True, **kwargs):
        """Crawl website html and return list of URLs crawled

        seed_url: url to start crawling from
        max_urls: maximum number of URLs to crawl (use None for no limit)
        max_depth: maximum depth to follow links into website (use None for no limit)
        obey_robots: whether to obey robots.txt
        max_size is passed to get() and is limited to 1MB by default
        force_html is passed to get() and is set to True by default so that only HTML content is crawled
        **kwargs is passed to get()
        """
        user_agent = kwargs.get("user_agent", self.user_agent)
        server = "http://" + extract_domain(seed_url)
        robots = RobotFileParser()
        if obey_robots:
            robots.parse(self.get(server + "/robots.txt").splitlines())  # load robots.txt
        outstanding = [(seed_url, 0), (server, 0)]  # URLs that still need to be crawled
        crawled = []  # URLs that have been crawled

        while outstanding:
            # more URLs to crawl
            if len(crawled) == max_urls:
                break
            url, cur_depth = outstanding.pop(0)
            if url not in crawled:
                html = self.get(url, max_size=max_size, force_html=force_html, **kwargs)
                crawled.append(url)
                if max_depth is None or cur_depth < max_depth:
                    # continue crawling
                    for scraped_url in re.findall(re.compile("<a[^>]+href=[\"'](.*?)[\"']", re.IGNORECASE), html):
                        if "#" in scraped_url:
                            scraped_url = scraped_url[
                                : scraped_url.index("#")
                            ]  # remove internal links to prevent duplicates
                        if os.path.splitext(scraped_url)[
                            -1
                        ].lower() not in Download.IGNORED_EXTENSIONS and robots.can_fetch(user_agent, scraped_url):
                            scraped_url = urljoin(server, scraped_url)  # support relative links
                            # check if same domain or sub-domain
                            this_server = extract_domain(scraped_url)
                            if this_server and (this_server in server or server in this_server):
                                outstanding.append((scraped_url, cur_depth + 1))
        return crawled
Developer: staticerror, Project: SeKing, Lines: 45, Source: download.py

Example 10: urlopen

# Required import: from robotparser import RobotFileParser [as alias]
# Alternatively: from robotparser.RobotFileParser import can_fetch [as alias]
    def urlopen(self, host):
        robo_url = host.get_robots_url()

        print self.robotdict

        cached_parser = self.robotdict.get(robo_url)
        if cached_parser:
            logging.info("Found in Cache: " + robo_url)
        else:
            logging.info("Fetching: " + robo_url)
            cached_parser = RobotFileParser()
            self.robotdict.put(robo_url, cached_parser)
            cached_parser.set_url(robo_url)
            cached_parser.read()

        if cached_parser.can_fetch('*', host.get_url()):
            print 'Going to fetch:', host.get_url()
            return self.fetch_file(host.get_url())
        else:
            logging.info("Forbidden by Robots.txt")
            return None
Developer: nilanjanbasu, Project: Webcrawler, Lines: 23, Source: crawler.py

Example 11: _get_soup

# Required import: from robotparser import RobotFileParser [as alias]
# Alternatively: from robotparser.RobotFileParser import can_fetch [as alias]
def _get_soup(path):
    """Gets soup from the given path, respecting robots.txt"""

    full_path = BASE_URL + path

    # Set a user-agent
    user_agent = 'dcnotify/%s' % __version__
    http_headers = {'User-Agent': '%s' % user_agent}

    # Honor robots.txt
    robots = RobotFileParser()
    robots.set_url("%s/robots.txt" % BASE_URL)
    robots.read()
    if not robots.can_fetch(user_agent, full_path):
        raise ValueError("Path disallowed by robots.txt")

    # Make a request, raising any HTTP errors that might occur
    request = get(full_path, headers=http_headers)
    request.raise_for_status()

    return bs(request.text)
Developer: seanthegeek, Project: dcnotify, Lines: 23, Source: scraper.py

Example 12: run_continue

# Required import: from robotparser import RobotFileParser [as alias]
# Alternatively: from robotparser.RobotFileParser import can_fetch [as alias]
    def run_continue(self, robots_txt):
        """
        Continue after getting the robots file.
        TODO: refactor callback style into events.
        """
        if robots_txt == "": # empty or non-200
            pass
        else:
            checker = RobotFileParser()
            checker.parse(robots_txt.decode('ascii', 'replace').encode('ascii', 'replace').splitlines())
            if not checker.can_fetch(UA_STRING, self.request.uri):
                self.response.http_error = RobotsTxtError()
                self.finish_task()
                return # TODO: show error?

        if 'user-agent' not in [i[0].lower() for i in self.request.headers]:
            self.request.headers.append(
                (u"User-Agent", UA_STRING))
        self.exchange = self.client.exchange()
        self.exchange.on('response_start', self._response_start)
        self.exchange.on('response_body', self._response_body)
        self.exchange.on('response_done', self._response_done)
        self.exchange.on('error', self._response_error)
        if self.status_cb and self.name:
            self.status_cb("fetching %s (%s)" % (
                self.request.uri, self.name
            ))
        req_hdrs = [
            (k.encode('ascii', 'replace'), v.encode('latin-1', 'replace')) \
            for (k, v) in self.request.headers
        ]
        self.exchange.request_start(
            self.request.method, self.request.uri, req_hdrs
        )
        self.request.start_time = thor.time()
        if self.request.payload != None:
            self.exchange.request_body(self.request.payload)
            self.transfer_out += len(self.request.payload)
        self.exchange.request_done([])
Developer: ElijahLynn, Project: redbot, Lines: 41, Source: fetch.py

Example 13: __init__

# Required import: from robotparser import RobotFileParser [as alias]
# Alternatively: from robotparser.RobotFileParser import can_fetch [as alias]
    def __init__(self, url):
        self.url = urlManip.cleanURL(url)
        self.pages = []
        self.suggestions = set()
        self.loaded = False
        logger.info("Loading %s..." % (self.url))
        try:
            requests.get(self.url)
            self.loaded = True
        except IOError as e:
            logger.error("%s cannot be loaded: %s" % (self.url, e))

        # if the website can be loaded
        if self.loaded == True:
            logger.info("Load successful. Generating suggestions...")

            # get robots.txt
            rp = RobotFileParser(self.url + "robots.txt")
            try:
                rp.read()
            except IOError:
                logger.warning("robots.txt cannot be found.")

            # get home page
            self.pages.append(Page(self.url))

            # get all pages on homepage
            self.pages[0].load()
            for link in self.pages[0].internalLinks:
                if rp.can_fetch("*", link):
                    if link[:4] == 'http':
                        self.pages.append(Page(link))
                    else:
                        self.pages.append(Page(self.url + link))
                else:
                    logger.debug("Ignoring %s based on robots.txt" % link)
Developer: paywithscratch, Project: scratchsuggestionscraper, Lines: 38, Source: scraper.py

Example 14: Crawler

# Required import: from robotparser import RobotFileParser [as alias]
# Alternatively: from robotparser.RobotFileParser import can_fetch [as alias]

#......... part of the code is omitted here .........
			elif link.startswith('#'):
				link = 'http://' + url[1] + url[2] + link
			elif not link.startswith('http'):
				link = 'http://' + url[1] + '/' + link
			
			# Remove the anchor part if needed
			if "#" in link:
				link = link[:link.index('#')]

			# Drop attributes if needed
			for toDrop in self.drop:
				link=re.sub(toDrop,'',link)

			# Parse the url to get domain and file extension
			parsed_link = urlparse.urlparse(link)
			domain_link = parsed_link.netloc
			target_extension = os.path.splitext(parsed_link.path)[1][1:]

			if (link in self.crawled):
				continue
			if (link in self.tocrawl):
				continue
			if (link in self.excluded):
				continue
			if (domain_link != self.target_domain):
				continue
			if ("javascript" in link):
				continue
			
			# Count one more URL
			self.nb_url+=1

			# Check if the navigation is allowed by the robots.txt
			if (not self.can_fetch(link)):
				self.exclude_link(link)
				self.nb_rp+=1
				continue

			# Check if the current file extension is allowed or not.
			if (target_extension in self.skipext):
				self.exclude_link(link)
				self.nb_exclude+=1
				continue

			# Check if the current url doesn't contain an excluded word
			if (not self.exclude_url(link)):
				self.exclude_link(link)
				self.nb_exclude+=1
				continue

			self.tocrawl.add(link)
			
		return None

	def __continue_crawling(self):
		if self.tocrawl:
			self.__crawling()

	def exclude_link(self,link):
		if link not in self.excluded:
			self.excluded.add(link)

	def checkRobots(self):
		if self.domain[len(self.domain)-1] != "/":
			self.domain += "/"
		request = Request(self.domain+"robots.txt", headers={"User-Agent":config.crawler_user_agent})
Developer: vionemc, Project: python-sitemap, Lines: 70, Source: crawler.py

Example 15: SiteMap

# Required import: from robotparser import RobotFileParser [as alias]
# Alternatively: from robotparser.RobotFileParser import can_fetch [as alias]

#......... part of the code is omitted here .........
            try:
                list.append(x['src'])
            except KeyError:
                pass

        csss = []
        imgs = []
        jss = []
        for link in css:
            csss.append(link['href'])
        for link in img:
            imgs.append(link['src'])
        for link in js:  # iterate the source list (presumably `js`, built in the omitted code above); looping over the empty `jss` was a no-op
            jss.append(link['src'])

        return {
                'css': csss,
                'img': imgs,
                'js':  jss
        }

    def crawl(self):
        """
            The main driver method that crawls the pages. It performs the steps below:
            for every unvisited [vertex|page] that belongs to the requested domain:
                crawl the page
                record valid links and their last-modified-dates
        :return:   None
        """
        page = self.unvisited.pop()
        # If robots.txt is defined, honor its Disallow rules to skip pages. If the domain has no robots.txt,
        # the crawler must discover all the pages for the report.
        logging.info("Starting to Crawl Page: " + page)

        url = urlparse(page)
        try:
            response = urlopen(page)
        except:
            logging.debug("Issue with the url: " + page)
            return None
        try:
            html_body = response.read() # response.getcode()
            response.close()
            # record the visit and its assets
            self.record_visit(page, response.headers, html_body)
            logging.debug("Queued Pages: {0}, Crawled Pages: {1}".format(len(self.unvisited), len(self.site_map)))
        except:
            logging.debug("Issue while opening url: " + page)
            return None
        connects = self.get_out_going_edges(url, html_body)

        # simple Graph that keeps the order of the pages crawled.
        for i, url in enumerate(connects):
            self.network[page] = {
                'to': connects,
                'assets': {
                    'css': self.site_map[page]['assets']['css'],
                    'js':  self.site_map[page]['assets']['js'],
                    'img': self.site_map[page]['assets']['img']
                }
            }
        return None

    def get_site_map(self):
        """
            Returns the compiled sitemap structure
        :return:       sitemap data structure
        """
        return self.site_map

    def get_network_graph(self):
        """
            Returns the compiled network in the order of the crawled pages
        :return:       network graph
        """
        return self.network

    def get_network_json_format(self):
        """
            Returns the crawl traverse order sequence in json format
        :return:       network in json format
        """
        return json.dumps(self.network)

    def set_start_page(self, url):
        """
            This could be useful if one is testing
        :param url: start page to start the crawling.
        :return:
        """
        self.start_page = url

    def robot_allows(self, link):
        if not self.robotrules: return True
        try:
            if self.robot_txt_rules.can_fetch("*", link):
                    return True
            return False
        except:
            return True
Developer: nchikkam, Project: projects, Lines: 104, Source: crawler.py


Note: The robotparser.RobotFileParser.can_fetch examples in this article were compiled by 纯净天空 from open source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open source projects contributed by their respective developers, and the copyright of the source code belongs to the original authors. For distribution and use, please refer to the corresponding project's License. Do not reproduce without permission.