Python urlnorm.norm Function Code Examples

This article collects typical usage examples of the urlnorm.norm function in Python, gathered from open-source projects. If you are unsure what norm does, how to call it, or want to see it used in real code, the curated examples below should help.


The following 15 code examples of the norm function are presented, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.

Example 1: normalize_url

def normalize_url(base_url, url):
	myfile3 = open('normalization_log', 'a')
	myfile3.write("base url:{0}\n".format(base_url))
	myfile3.write("url:{0}\n".format(url))
	myfile3.close()
	result = ''

	# if url starts with http:// or https://
	allowed_scheme = ['http', 'https']
	url_scheme = urlparse(url).scheme
	if url_scheme in allowed_scheme:
		return urlnorm.norm(url)
	elif url_scheme == 'mailto':
		return False
	elif len(url_scheme) == 0:
		# relative reference, e.g. ../page or ./page
		if (url[:3] == '../') or (url[:2] == './'):
			return urlnorm.norm(base_url+'/'+url)
		elif url[0] == '/': # e.g. /page/page
			# That means it's the domain + url
			url_obj = urlparse(base_url)
			new_url = url_obj.scheme + "://" + url_obj.netloc + url
			return urlnorm.norm(new_url)

		else: # URL should be just html page e.g. research.html
			# so we need to replace the last part
			# if URL is 'http://www.test.com/page/page/12345':
			# results will be ['http://www.test.com/page/page', '12345']
			parts = base_url.rsplit('/', 1)
			return urlnorm.norm(parts[0]+'/'+url)
	result = url
	return result
Developer ID: sigmundc, Project: CS6965, Lines: 32, Source: crawler3.py

Example 2: main

def main():
	if (len(sys.argv) < 3 ):
		print "usage: python ll-print.py <url> <search term>"
		print "example: python ll-print.py http://www.hunch.com 'hunch team'"
		exit(0)
	root_URL = sys.argv[1]
	search_term = sys.argv[2]
	if (not validate_search_term(search_term)):
		print "Invalid search term.  Please only use valid url characters and spaces."
		exit(1)
	first_letter = search_term[0]
	first_letter_match = root_URL.find(first_letter.lower())
	if (first_letter_match != -1):
		try:
			br = mechanize.Browser()
			br._factory.is_html = True
			result = []
			br.open(root_URL)
			# print "visiting: " + urlnorm.norm(br.geturl())
			visited = set([urlnorm.norm(br.geturl()), urlnorm.norm(root_URL)])
			result = find_matching_links(br, search_term, result, visited)
			if (result):
				max_index = max(result, key=lambda u: u[1])[1]
				for l, i, c in result:
					print_url(l, i, max_index)
		except urlnorm.InvalidUrl:
			print "Invalid root URL"
		except urllib2.URLError, e:
			print "Error opening root URL"
			print e
		except Exception, e:
			print e
Developer ID: rickychang, Project: letter-link-crawl, Lines: 32, Source: llc.py

Example 3: find_matching_links

def find_matching_links(br, target_word, result, visited):
	if (not target_word):
		return result
	else:
		current_URL = urlnorm.norm(br.geturl())
		current_letter = target_word[0].lower()
		if (current_letter.isspace()):
			return find_matching_links(br, target_word[1:], result + [('', -1, ' ')], visited)
		else:
			matching_index = current_URL[7:].find(current_letter)  # search past the 'http://' prefix
			if (matching_index == -1):
				return []
			else:
				new_result = result + [(current_URL, matching_index + 7, current_letter)]
				links = list(br.links())
				for link in links:
					try:
						link_URL = urlnorm.norm(link.absolute_url)
						if (link_URL not in visited):
							br.open(link_URL)
							new_visited = visited.copy()
							new_visited.add(link_URL)
							# print "visiting: " + urlnorm.norm(br.geturl())
							new_visited.add(urlnorm.norm(br.geturl()))
							child_result = find_matching_links(br, target_word[1:], new_result, new_visited)
							if (child_result):
								return child_result
					except Exception, e:
						continue
Developer ID: rickychang, Project: letter-link-crawl, Lines: 29, Source: llc.py

Example 4: task_listener_crawler

def task_listener_crawler(gearman_worker, gearman_job):
	url = gearman_job.data
	url_frontier.add(url)
	urls = urlparse.urlparse(url)
	print "Crawling ", url
	response = requests.get(url, headers=crawler_headers)  # headers must be a keyword argument; the second positional arg of requests.get is params
	print 'Downloaded page'
	if response.status_code == 200:
		raw_data = response.text
		if response.encoding != 'utf8':
			raw_data = response.text.encode(response.encoding).decode('utf8')
		r.table(raw_result_table).insert({'url': url, 'raw': raw_data, 'status': 200}, conflict="replace").run(rethink)

		links = linkregex.findall(raw_data)
		for link in links:
			pre_norm_url = url_pre_norm(link, urls)
			norm_url = urlnorm.norm(pre_norm_url)
			norm_parts = urlparse.urlparse(norm_url)
			ext_url = norm_parts.path.split(".")[-1].lower()
			if ext_url not in except_url_suffixes and url_frontier.add(norm_url):
				print "Add ", norm_url, " to redis queue"
				redis_client.rpush("urls:enqueued", norm_url)
		print "Done"
		return "ok"
	else:
		r.table(raw_result_table).insert({'url': url, 'status': response.status_code}, conflict="replace").run(rethink)
	return "fail"
Developer ID: khanhicetea, Project: distributed-webcrawler, Lines: 27, Source: crawler.py

Example 5: __init__

    def __init__(self, url, previous=None, **info):
        # Apply the simple idempotent optimizations to all urls (no need to
        # ever deal with "HTTP://.."). This means case normalization, and a
        # whole lot of other things that the urlnorm library will do for us.
        # We call this the original url, even though it is a bit of a lie.
        try:
            self.original_url = urlnorm.norm(url)
        except urlnorm.InvalidUrl as e:
            raise urlnorm.InvalidUrl('{}: {}'.format(e, url))

        # For the normalized url that we'll be exposing, remove the
        # fragment, and treat https and http the same.
        url, fragment = urldefrag(self.original_url)
        self.lossy_url_data = {'fragment': fragment}
        if url.startswith('https:'):
            url = 'http' + url[5:]
            self.lossy_url_data.update({'protocol': 'https'})
        self.url = url

        self.set_previous(previous)
        self.info = info
        self.post = None

        # Runtime data
        self.response = None
        self.exception = None
        self.retries = 0
Developer ID: miracle2k, Project: track0, Lines: 27, Source: spider.py

Example 6: processPage

def processPage():
    while not urls.counter > urlcount:
        try:
            link = urlpool.get()
            newurl = urlparse.urljoin(link.base_url, link.url) # Converting relative URLs to Absolute ones
            newurl = unicode(urlnorm.norm(newurl)) # Normalizing URL
            print "out: " + newurl
            disassembled = urlparse.urlsplit(newurl)
            filename, file_ext = splitext(basename(disassembled.path)) # Finding file extension for filtering exclusions
            file_ext = file_ext.lower()
            if filename == 'index':
                newurl = newurl[:-len(filename + file_ext)]
            if (file_ext not in excludedExtensions and disassembled.scheme in ['http', 'https'] and disassembled.fragment == ''):
                print "in : " + newurl
                if newurl not in visited: # Checking to see if URL has already been queued once
                    visited.add(newurl)
                    if urlContains(newurl, searchTags) > 0:
                        urls.put(newurl, 1)
                    else:
                        priority = priorityCalculator.searchPage(newurl, searchTags)
                        if priority < len(searchTags) + 1:
                            urls.put(newurl, priority) # Adding URL to queue with calculated priority
        except UnicodeEncodeError:
            print "UnicodeEncodeError"
        except:
            print "Invalid URL"
Developer ID: Walliee, Project: FocusedCrawler, Lines: 26, Source: Crawler.py

Example 7: canonicalize

def canonicalize(url):
    """Canonicalize a URL in just a few easy steps:

        1. Resolve any redirects
        2. Normalize the URL
        3. Strip any superfluous query params
        4. Sort any remaining query params
        5. Profit!

    This relies on the urlnorm module for normalization, and, at the moment,
    just removes utm_* query params.

    TODO: Special case normalization for major sites (e.g. youtube)?
    """
    url = urlnorm.norm(resolve(url))
    url_parts = urlparse.urlsplit(url)
    scheme, netloc, path, query, fragment = url_parts

    params = []
    for key, value in cgi.parse_qs(query).iteritems():
        if exclude_param(url_parts, key, value):
            continue
        if isinstance(value, list):
            params.extend((key, v) for v in value)
        else:
            params.append((key, value))

    query = urllib.urlencode(sorted(params), doseq=1)
    return urlparse.urlunsplit((scheme, netloc, path, query, ''))
Developer ID: MattLeMay, Project: thresholderbot, Lines: 29, Source: urlwork.py

Example 8: test_invalid_urls

def test_invalid_urls(url):
    try:
        output = urlnorm.norm(url)
        print '%r' % output
    except urlnorm.InvalidUrl:
        return
    assert 1 == 0, "this should have raised an InvalidUrl exception"
Developer ID: jehiah, Project: urlnorm, Lines: 7, Source: test_urlnorm.py

Example 9: normalize_url

def normalize_url(url):
    # TODO: learn from https://github.com/hypothesis/h/blob/master/h/api/uri.py
    try:
        norm = urlnorm.norm(url)
        norm, _ = urldefrag(norm)
        return norm.rstrip('/')
    except:
        return None
Developer ID: adamchainz, Project: aleph, Lines: 8, Source: urls.py

Example 10: normalize_url

def normalize_url(url):
    norm_url = urlnorm.norm(url)
    if norm_url.startswith("https://"):
        return norm_url[8:]
    elif norm_url.startswith("http://"):
        return norm_url[7:]
    else:
        return norm_url
Developer ID: osks, Project: komfeeder, Lines: 8, Source: feedimporter.py

Example 11: googleSearch

def googleSearch(searchString):
    g = pygoogle(searchString)
    g.pages = 2
    urls = g.get_urls()
    urls = urls[:10]
    for i in range(len(urls)):
        urls[i] = unicode(urlnorm.norm(urls[i]))

    return urls
Developer ID: Walliee, Project: FocusedCrawler, Lines: 9, Source: gQuery.py

Example 12: new

    @classmethod  # assumed: new() receives cls and is used as a factory
    def new(cls, *args, **kwargs):
        obj = cls(*args)
        obj.source = kwargs['source']
        obj.duplicates = 0
        obj.priority = 0
        # normalize url
        if hasattr(obj, 'url'):
            obj.url = urlnorm.norm(obj.url)
        return obj
Developer ID: axknightroad, Project: metasearch, Lines: 9, Source: base.py

Example 13: normalize_url

def normalize_url(url):
    # TODO: learn from https://github.com/hypothesis/h/blob/master/h/api/uri.py
    try:
        url = urlnorm.norm(url)
        url, _ = urldefrag(url)
        url = url.rstrip("/")
        return url
    except:
        return None
Developer ID: rlugojr, Project: krauler, Lines: 9, Source: url.py

Example 14: clean

    def clean(self):
        """Ensures that URLs are canonicalized before saving"""
        self.value = refang(self.value.strip())
        try:
            if re.match(r"[^:]+://", self.value) is None:  # if no scheme is specified, assume http://
                self.value = u"http://{}".format(self.value)
            self.value = urlnorm.norm(self.value)
        except urlnorm.InvalidUrl:
            raise ObservableValidationError("Invalid URL: {}".format(self.value))
Developer ID: carriercomm, Project: yeti, Lines: 9, Source: url.py

Example 15: __init__

	def __init__(self, url):
		"""Construct from a string or Django request."""
		# A Django request exposes get_full_path(); reduce it to a plain
		# string before normalizing (urlnorm.norm returns a string).
		if hasattr(url, 'get_full_path'):
			url = url.get_full_path()
		nurl = urlnorm.norm(url.lower())

		self.scheme, self.netloc, self.path, self.params, \
			self.query, self.fragment = urlparse.urlparse(nurl)
		filename, self.ftype = os.path.splitext(self.path)
		self.args = dict(cgi.parse_qsl(self.query))
Developer ID: file-citas, Project: pyhtoncrawler, Lines: 10, Source: url.py


Note: The urlnorm.norm examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective authors; copyright of the source code remains with the original authors, and distribution or use should follow each project's license. Do not repost without permission.