This article collects typical usage examples of the urlnorm.norm function in Python. If you have been wondering what exactly the norm function does, how to call it, or what real-world code that uses it looks like, the selected examples below should help.
The following shows 15 code examples of the norm function, sorted by popularity by default.
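Before the examples, here is a minimal sketch of urlnorm.norm on its own. The URL below is made up purely for illustration, and the exact output may depend on the urlnorm version installed.

import urlnorm

# A hypothetical URL, chosen only to show the typical normalizations.
raw = 'HTTP://Example.COM:80/a/./b/../c'

try:
    # norm() applies idempotent normalizations: it lowercases the scheme and
    # host, drops the default port and resolves '.'/'..' path segments, so
    # this should print http://example.com/a/c
    print(urlnorm.norm(raw))
except urlnorm.InvalidUrl:
    # URLs that urlnorm cannot parse raise InvalidUrl (see Example 8 below).
    print('not a normalizable URL')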
Example 1: normalize_url
def normalize_url(base_url, url):
    myfile3 = open('normalization_log', 'a')
    myfile3.write("base url:{0}\n".format(base_url))
    myfile3.write("url:{0}\n".format(url))
    myfile3.close()
    result = ''
    # if url starts with http:// or https://
    allowed_scheme = ['http', 'https']
    url_scheme = urlparse(url).scheme
    if url_scheme in allowed_scheme:
        return urlnorm.norm(url)
    elif url_scheme == 'mailto':
        return False
    elif len(url_scheme) == 0:
        # check if URL is relative, i.e. starts with ../ or ./
        if (url[:3] == '../') or (url[:2] == './'):
            return urlnorm.norm(base_url + '/' + url)
        elif url[0] == '/':  # e.g. /page/page
            # that means it's the domain + url
            url_obj = urlparse(base_url)
            new_url = url_obj.scheme + "://" + url_obj.netloc + url
            return urlnorm.norm(new_url)
        else:  # URL should be just an html page, e.g. research.html,
            # so we need to replace the last path segment.
            # If base_url is 'http://www.test.com/page/page/12345',
            # parts will be ['http://www.test.com/page/page', '12345']
            parts = base_url.rsplit('/', 1)
            return urlnorm.norm(parts[0] + '/' + url)
    result = url
    return result
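As a side note, the relative-URL cases handled by hand above (../ and ./ prefixes, a leading /, and bare file names) are also covered by the standard library's urljoin. A minimal alternative sketch, assuming urlnorm is available and using a hypothetical helper name:

from urlparse import urljoin  # urllib.parse.urljoin on Python 3
import urlnorm

def normalize_url_via_urljoin(base_url, url):
    # urljoin resolves '../x.html', './x.html', '/page/page' and plain
    # 'research.html' against base_url in one call; norm() then cleans up
    # case, default ports and any remaining dot segments.
    return urlnorm.norm(urljoin(base_url, url))

# e.g. normalize_url_via_urljoin('http://www.test.com/page/page/12345', '../research.html')
# resolves to http://www.test.com/page/research.html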
Example 2: main
def main():
    if (len(sys.argv) < 3):
        print "usage: python ll-print.py <url> <search term>"
        print "example: python ll-print.py http://www.hunch.com 'hunch team'"
        exit(0)
    root_URL = sys.argv[1]
    search_term = sys.argv[2]
    if (not validate_search_term(search_term)):
        print "Invalid search term. Please only use valid url characters and spaces."
        exit(1)
    first_letter = search_term[0]
    first_letter_match = root_URL.find(first_letter.lower())
    if (first_letter_match != -1):
        try:
            br = mechanize.Browser()
            br._factory.is_html = True
            result = []
            br.open(root_URL)
            # print "visiting: " + urlnorm.norm(br.geturl())
            visited = set([urlnorm.norm(br.geturl()), urlnorm.norm(root_URL)])
            result = find_matching_links(br, search_term, result, visited)
            if (result):
                max_index = max(result, key=lambda u: u[1])[1]
                for l, i, c in result:
                    print_url(l, i, max_index)
        except urlnorm.InvalidUrl:
            print "Invalid root URL"
        except urllib2.URLError, e:
            print "Error opening root URL"
            print e
        except Exception, e:
            print e
Example 3: find_matching_links
def find_matching_links(br, target_word, result, visited):
    if (not target_word):
        return result
    else:
        current_URL = urlnorm.norm(br.geturl())
        current_letter = target_word[0].lower()
        if (current_letter.isspace()):
            return find_matching_links(br, target_word[1:], result + [('', -1, ' ')], visited)
        else:
            # search after the leading "http://" (7 characters)
            matching_index = current_URL[7:].find(current_letter)
            if (matching_index == -1):
                return []
            else:
                new_result = result + [(current_URL, matching_index + 7, current_letter)]
                links = list(br.links())
                for link in links:
                    try:
                        link_URL = urlnorm.norm(link.absolute_url)
                        if (link_URL not in visited):
                            br.open(link_URL)
                            new_visited = visited.copy()
                            new_visited.add(link_URL)
                            # print "visiting: " + urlnorm.norm(br.geturl())
                            new_visited.add(urlnorm.norm(br.geturl()))
                            child_result = find_matching_links(br, target_word[1:], new_result, new_visited)
                            if (child_result):
                                return child_result
                    except Exception, e:
                        continue
Example 4: task_listener_crawler
def task_listener_crawler(gearman_worker, gearman_job):
    url = gearman_job.data
    url_frontier.add(url)
    urls = urlparse.urlparse(url)
    print "Crawling ", url
    response = requests.get(url, headers=crawler_headers)
    print 'Downloaded page'
    if response.status_code == 200:
        raw_data = response.text
        if response.encoding != 'utf8':
            raw_data = response.text.encode(response.encoding).decode('utf8')
        r.table(raw_result_table).insert({'url': url, 'raw': raw_data, 'status': 200}, conflict="replace").run(rethink)
        links = linkregex.findall(raw_data)
        for link in (links.pop(0) for _ in xrange(len(links))):
            pre_norm_url = url_pre_norm(link, urls)
            norm_url = urlnorm.norm(pre_norm_url)
            norm_parts = urlparse.urlparse(norm_url)
            ext_url = norm_parts.path.split(".")[-1].lower()
            if ext_url not in except_url_suffixes and url_frontier.add(norm_url):
                print "Add ", norm_url, " to redis queue"
                redis_client.rpush("urls:enqueued", norm_url)
        print "Done"
        return "ok"
    else:
        r.table(raw_result_table).insert({'url': url, 'status': response.status_code}, conflict="replace").run(rethink)
        return "fail"
Example 5: __init__
def __init__(self, url, previous=None, **info):
    # Apply the simple idempotent optimizations to all urls (no need to
    # ever deal with "HTTP://.."). This takes care of case normalization
    # and a whole lot of other things that the urlnorm library will do
    # for us. We call this the original url, even though it is a bit of a lie.
    try:
        self.original_url = urlnorm.norm(url)
    except urlnorm.InvalidUrl as e:
        raise urlnorm.InvalidUrl('{}: {}'.format(e, url))
    # For the normalized url that we'll be exposing, remove the
    # fragment, and treat https and http the same.
    url, fragment = urldefrag(self.original_url)
    self.lossy_url_data = {'fragment': fragment}
    if url.startswith('https:'):
        url = 'http' + url[5:]
        self.lossy_url_data.update({'protocol': 'https'})
    self.url = url
    self.set_previous(previous)
    self.info = info
    self.post = None
    # Runtime data
    self.response = None
    self.exception = None
    self.retries = 0
Example 6: processPage
def processPage():
    while not urls.counter > urlcount:
        try:
            link = urlpool.get()
            newurl = urlparse.urljoin(link.base_url, link.url)  # Converting relative URLs to absolute ones
            newurl = unicode(urlnorm.norm(newurl))  # Normalizing URL
            print "out: " + newurl
            disassembled = urlparse.urlsplit(newurl)
            filename, file_ext = splitext(basename(disassembled.path))  # Finding file extension for filtering exclusions
            file_ext = file_ext.lower()
            if filename == 'index':
                newurl = newurl[:-len(filename + file_ext)]
            if (file_ext not in excludedExtensions and disassembled.scheme in ['http', 'https'] and disassembled.fragment == ''):
                print "in : " + newurl
                if newurl not in visited:  # Checking to see if URL has already been queued once
                    visited.add(newurl)
                    if urlContains(newurl, searchTags) > 0:
                        urls.put(newurl, 1)
                    else:
                        priority = priorityCalculator.searchPage(newurl, searchTags)
                        if priority < len(searchTags) + 1:
                            urls.put(newurl, priority)  # Adding URL to queue with calculated priority
        except UnicodeEncodeError:
            print "UnicodeEncodeError"
        except:
            print "Invalid URL"
Example 7: canonicalize
def canonicalize(url):
    """Canonicalize a URL in just a few easy steps:
    1. Resolve any redirects
    2. Normalize the URL
    3. Strip any superfluous query params
    4. Sort any remaining query params
    5. Profit!
    This relies on the urlnorm module for normalization and, at the moment,
    just removes utm_* query params.
    TODO: Special case normalization for major sites (e.g. youtube)?
    """
    url = urlnorm.norm(resolve(url))
    url_parts = urlparse.urlsplit(url)
    scheme, netloc, path, query, fragment = url_parts
    params = []
    for key, value in cgi.parse_qs(query).iteritems():
        if exclude_param(url_parts, key, value):
            continue
        if isinstance(value, list):
            params.extend((key, v) for v in value)
        else:
            params.append((key, value))
    query = urllib.urlencode(sorted(params), doseq=1)
    return urlparse.urlunsplit((scheme, netloc, path, query, ''))
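The resolve() and exclude_param() calls above are project-specific helpers that are not shown here. As a rough, self-contained sketch of steps 2-4 only (normalize, drop utm_* parameters, sort the rest), with a made-up function name and URL:

import cgi
import urllib
import urlparse  # urllib.parse on Python 3

import urlnorm

def canonicalize_simple(url):
    # Step 2: lowercase scheme/host, drop default ports and dot segments.
    url = urlnorm.norm(url)
    scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
    # Steps 3-4: drop tracking params, then sort what remains so that
    # equivalent URLs compare equal as plain strings.
    params = [(k, v) for k, vs in cgi.parse_qs(query).iteritems()
              for v in vs if not k.startswith('utm_')]
    query = urllib.urlencode(sorted(params))
    return urlparse.urlunsplit((scheme, netloc, path, query, ''))

# canonicalize_simple('http://Example.com/a?utm_source=x&b=2&a=1')
# -> 'http://example.com/a?a=1&b=2'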
Example 8: test_invalid_urls
def test_invalid_urls(url):
    try:
        output = urlnorm.norm(url)
        print '%r' % output
    except urlnorm.InvalidUrl:
        return
    assert 1 == 0, "this should have raised an InvalidUrl exception"
Example 9: normalize_url
def normalize_url(url):
    # TODO: learn from https://github.com/hypothesis/h/blob/master/h/api/uri.py
    try:
        norm = urlnorm.norm(url)
        norm, _ = urldefrag(norm)
        return norm.rstrip('/')
    except:
        return None
Example 10: normalize_url
def normalize_url(url):
    norm_url = urlnorm.norm(url)
    if norm_url.startswith("https://"):
        return norm_url[8:]
    elif norm_url.startswith("http://"):
        return norm_url[7:]
    else:
        return norm_url
Example 11: googleSearch
def googleSearch(searchString):
    g = pygoogle(searchString)
    g.pages = 2
    urls = g.get_urls()
    urls = urls[:10]
    for i in range(len(urls)):
        urls[i] = unicode(urlnorm.norm(urls[i]))
    return urls
Example 12: new
def new(cls, *args, **kwargs):
    obj = cls(*args)
    obj.source = kwargs['source']
    obj.duplicates = 0
    obj.priority = 0
    # normalize url
    if hasattr(obj, 'url'):
        obj.url = urlnorm.norm(obj.url)
    return obj
Example 13: normalize_url
def normalize_url(url):
    # TODO: learn from https://github.com/hypothesis/h/blob/master/h/api/uri.py
    try:
        url = urlnorm.norm(url)
        url, _ = urldefrag(url)
        url = url.rstrip("/")
        return url
    except:
        return None
Example 14: clean
def clean(self):
    """Ensures that URLs are canonicalized before saving"""
    self.value = refang(self.value.strip())
    try:
        if re.match(r"[^:]+://", self.value) is None:  # if no scheme is specified, assume http://
            self.value = u"http://{}".format(self.value)
        self.value = urlnorm.norm(self.value)
    except urlnorm.InvalidUrl:
        raise ObservableValidationError("Invalid URL: {}".format(self.value))
Example 15: __init__
def __init__(self, url):
    """Construct from a string or Django request."""
    # A Django request exposes get_full_path(); unwrap it before normalizing.
    if hasattr(url, 'get_full_path'):
        url = url.get_full_path()
    # urlnorm.norm already lowercases the scheme and host for us.
    nurl = urlnorm.norm(url)
    self.scheme, self.netloc, self.path, self.params, \
        self.query, self.fragment = urlparse.urlparse(nurl)
    filename, self.ftype = os.path.splitext(self.path)
    self.args = dict(cgi.parse_qsl(self.query))