This article collects typical usage examples of the Python function urllib2.urlparse.urlparse. If you are wondering what urlparse does, how to call it, and what it looks like in real code, the curated examples below should help.
In total, 15 code examples of the urlparse function are shown, drawn from open-source projects and ordered by popularity.
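Before the examples, a quick orientation: a minimal sketch of the call they all build on. Under Python 2, urllib2 does an internal "import urlparse", which is why urllib2.urlparse.urlparse works; the Python 3 equivalent is urllib.parse.urlparse. The URL below is illustrative.

try:  # Python 3
    from urllib.parse import urlparse
except ImportError:  # Python 2: the same module re-exported via urllib2
    from urllib2 import urlparse as _urlparse_module
    urlparse = _urlparse_module.urlparse

parts = urlparse('http://tracker.example.com:8080/announce?info_hash=abc#frag')
print(parts.scheme)    # 'http'
print(parts.netloc)    # 'tracker.example.com:8080'
print(parts.path)      # '/announce'
print(parts.query)     # 'info_hash=abc'
print(parts.fragment)  # 'frag'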
Example 1: __init__
def __init__(self, announce, piece_length=262144, **kw):
    self.piece_length = piece_length
    if not bool(urlparse.urlparse(announce).scheme):
        raise ValueError('No schema present for url')
    self.tdict = {
        'announce': announce,
        'creation date': int(time()),
        'info': {
            'piece length': self.piece_length
        }
    }
    if kw.get('comment'):
        self.tdict.update({'comment': kw.get('comment')})
    if kw.get('httpseeds'):
        if not isinstance(kw.get('httpseeds'), list):
            raise TypeError('httpseeds must be a list')
        else:
            self.tdict.update({'httpseeds': kw.get('httpseeds')})
    if kw.get('announcelist'):
        if not isinstance(kw.get('announcelist'), list):
            raise TypeError('announcelist must be a list of lists')
        if False in [isinstance(l, list) for l in kw.get('announcelist')]:
            raise TypeError('announcelist must be a list of lists')
        if False in [bool(urlparse.urlparse(f[0]).scheme) for f in kw.get('announcelist')]:
            raise ValueError('No schema present for url')
        else:
            self.tdict.update({'announce-list': kw.get('announcelist')})
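A hypothetical call, assuming this __init__ belongs to a torrent-metadata builder class; the class name Torrent below is an illustration, not taken from the source:

t = Torrent('http://tracker.example.com/announce',
            comment='example torrent',
            announcelist=[['http://backup.example.com/announce']])
print(t.tdict['announce'])       # 'http://tracker.example.com/announce'
print(t.tdict['announce-list'])  # [['http://backup.example.com/announce']]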
Example 2: classify_link
def classify_link(link):
    ''' classify link according to its domain
    '''
    if link is None:
        return link, SITE_TYPE['junk']
    original_url = link
    url = urlparse.urlparse(link)
    max_try_count = 10
    try_count = 0
    while url.netloc in _SHORT_SERVICE:
        if try_count >= max_try_count:
            # if multiple redirect, return as news
            return link, SITE_TYPE['news']
        # get original link of short link
        original_url = _get_original_link(original_url)
        url = urlparse.urlparse(original_url)
        try_count += 1
    domain_token = url.netloc.split('.')
    length = len(domain_token) - 2
    while length >= 0:
        domain = '.'.join(domain_token[length:])
        if domain in _BLACK_SITE_LIST:
            return original_url, _BLACK_SITE_LIST[domain]
        length -= 1
    # treat unclassified link as news link
    return original_url, SITE_TYPE['news']
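The second while loop is the interesting part: it walks domain suffixes from shortest to longest ('example.com', then 'sub.example.com', and so on) until one matches the blacklist. A standalone sketch of just that step, with made-up _BLACK_SITE_LIST contents:

from urllib.parse import urlparse  # urllib2.urlparse.urlparse under Python 2

_BLACK_SITE_LIST = {'video.example.com': 'video'}  # illustrative contents

def match_domain(link, default='news'):
    tokens = urlparse(link).netloc.split('.')
    for i in range(len(tokens) - 2, -1, -1):
        suffix = '.'.join(tokens[i:])  # shortest two-label suffix first
        if suffix in _BLACK_SITE_LIST:
            return _BLACK_SITE_LIST[suffix]
    return default

print(match_domain('http://video.example.com/watch?v=1'))  # 'video'
print(match_domain('http://news.example.net/story'))       # 'news'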
Example 3: __getParentPage
def __getParentPage(self):
    '''
    This will get the Parent Page info
    '''
    page = {}
    try:
        self.hierarchy = page['et_thread_hierarchy'] = [stripHtml(x.renderContents()) for x in self.soup.find('div', {'class': 'rd Microsoft_Msn_Boards_Read_List Web_Bindings_Base'}).findAll('li')]
    except:
        log.info(self.log_msg('Thread hierarchy is not found'))
    try:
        self.forum_title = page['title'] = stripHtml(self.soup.find('h2').renderContents())
    except:
        log.info(self.log_msg('Title Not Found'))
        page['title'] = ''
    if checkSessionInfo(self.genre, self.session_info_out, self.parent_uri, self.task.instance_data.get('update')):
        log.info(self.log_msg('Session info return True'))
        return False
    for each in ['et_author_name', 'ei_thread_replies_count', 'ei_thread_view_count', 'ei_author_count', 'et_last_post_author', 'edate_last_post_date', 'posted_date']:
        try:
            page[each] = self.task.pagedata[each]
        except:
            log.info(self.log_msg('Page data cannot be extracted for %s' % each))
    try:
        page['ei_thread_id'] = int(urlparse.urlparse(self.currenturi)[4].split('&')[0].split('ThreadId=')[1])
    except:
        log.info(self.log_msg('Thread id not found'))
    try:
        post_hash = get_hash(page)
        id = None
        if self.session_info_out == {}:
            id = self.task.id
        result = updateSessionInfo(self.genre, self.session_info_out, self.parent_uri, post_hash, 'Post', self.task.instance_data.get('update'), Id=id)
        if not result['updated']:
            return False
        page['path'] = [self.parent_uri]
        page['parent_path'] = []
        page['uri'] = normalize(self.currenturi)
        page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
        page['priority'] = self.task.priority
        page['level'] = self.task.level
        page['pickup_date'] = datetime.strftime(datetime.utcnow(), '%Y-%m-%dT%H:%M:%SZ')
        page['connector_instance_log_id'] = self.task.connector_instance_log_id
        page['connector_instance_id'] = self.task.connector_instance_id
        page['workspace_id'] = self.task.workspace_id
        page['client_id'] = self.task.client_id
        page['client_name'] = self.task.client_name
        page['last_updated_time'] = page['pickup_date']
        page['versioned'] = False
        page['data'] = ''
        page['task_log_id'] = self.task.id
        page['entity'] = 'Post'
        page['category'] = self.task.instance_data.get('category', '')
        self.pages.append(page)
        log.info(page)
        log.info(self.log_msg('Parent Page added'))
        return True
    except:
        log.exception(self.log_msg("parent post couldn't be parsed"))
        return False
Example 4: completeurl
def completeurl(fullurl, partialurl):
    from urllib2 import urlparse
    parsed_jobsurl = urlparse.urlparse(fullurl)
    parsed_joburl = urlparse.urlparse(partialurl)
    fulljoburl = urlparse.urlunparse([parsed_jobsurl.scheme, parsed_jobsurl.netloc,
                                      parsed_joburl.path, parsed_joburl.params,
                                      parsed_joburl.query, parsed_joburl.fragment])
    return fulljoburl
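To see what completeurl produces, a quick illustrative run (the URLs are made up): the scheme and host come from fullurl, while the path, params, query, and fragment come from partialurl:

base = 'http://jobs.example.com/listings?page=2'
print(completeurl(base, '/job/12345?ref=feed'))
# -> 'http://jobs.example.com/job/12345?ref=feed'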
Example 5: convert
def convert(self, value, context, ctx_opts):
    if value[:4] != u'http':
        value = u'http://%s' % value
    domain = urlparse.urlparse(value)[1]
    if not domain or domain == u'':
        domain = urlparse.urlparse(u'http://%s' % value)[1]
    if not domain or len(domain.split(u'.')) < 2 or \
            len(domain.split(u' ')) > 1:
        self.error('invalid_domain', value, context, ctx_opts)
    return domain.lower()
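Outside its validator class, the same extraction logic can be exercised standalone; a sketch under that assumption (extract_domain is an illustrative name, not from the source):

from urllib.parse import urlparse  # urlparse.urlparse in the Python 2 original

def extract_domain(value):
    if value[:4] != u'http':
        value = u'http://%s' % value
    domain = urlparse(value)[1]  # index 1 of the 6-tuple is the netloc
    if not domain:
        domain = urlparse(u'http://%s' % value)[1]
    return domain.lower()

print(extract_domain(u'Example.COM/some/path'))  # 'example.com'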
Example 6: homepage_url
def homepage_url(self):
    """Try to ensure we prepend http: to the url if there's nothing there.
    This is to ensure we're not generating relative links in the
    user templates."""
    if not self.homepage:
        return self.homepage
    parsed = urlparse.urlparse(self.homepage)
    if parsed.scheme:
        return self.homepage
    # Vague sanity check
    abs_url = ''.join(['http://', self.homepage])
    if urlparse.urlparse(abs_url).scheme == 'http':
        return abs_url
    return self.homepage
Example 7: check_config
def check_config():
    """
    Check crucial configuration details for existence and workability.

    Runs checks to see whether the bugtracker's URL is reachable, whether
    the backend is available under the right filename, and whether the
    script has the key arguments it needs to run: URL, backend, and
    database details.

    The filename for the backend in the backends/ directory needs to be the
    same as the configuration argument specifying that backend. For
    instance, invoking the Launchpad backend uses 'lp', and so the filename
    is 'lp.py'.
    """
    Config.check_params(['url', 'backend'])

    if Config.backend + ".py" not in Backend.get_all_backends():
        raise InvalidConfig('Backend "' + Config.backend + '" does not exist')

    url = urlparse.urlparse(Config.url)
    check_url = urlparse.urljoin(url.scheme + '://' + url.netloc, '')
    print("Checking URL: " + check_url)
    req = Request(check_url)

    if Config.backend != 'github':
        try:
            response = urlopen(req)
        except HTTPError, e:
            raise InvalidConfig('The server could not fulfill the request: '
                                + str(e.msg) + ' (' + str(e.code) + ')')
        except URLError, e:
            raise InvalidConfig('We failed to reach a server. ' + str(e.reason))
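The urljoin call above is a compact way to reduce a full URL to its scheme-plus-host base before probing it. A minimal sketch of just that step (the URL is illustrative):

from urllib.parse import urlparse, urljoin  # urlparse.urlparse/urljoin in the Python 2 original

url = urlparse('https://bugs.example.org/project/issues?id=7')
check_url = urljoin(url.scheme + '://' + url.netloc, '')
print(check_url)  # 'https://bugs.example.org'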
Example 8: __addPost
def __addPost(self, post):
    """
    This will take the post tag, fetch the data and metadata, and add it
    to self.pages
    """
    try:
        page = self.__getData(post)
        if not page:
            log.info(self.log_msg('page contains empty data, getdata '
                                  'returns False for uri %s' % self.currenturi))
            return True
        unique_key = get_hash(page)
        if checkSessionInfo(self.__genre, self.session_info_out, unique_key,
                            self.task.instance_data.get('update')):
            log.info(self.log_msg('Session info returns True for uri %s' % unique_key))
            return False
        result = updateSessionInfo(self.__genre, self.session_info_out, unique_key,
                                   get_hash(page), 'forum', self.task.instance_data.get('update'))
        if result['updated']:
            page['parent_path'] = []
            page['path'] = [unique_key]
            page['uri'] = self.currenturi
            page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
            log.info(page)
            page.update(self.__task_elements_dict)
            self.pages.append(page)
        else:
            log.info(self.log_msg('Update session info returns False for '
                                  'url %s' % self.currenturi))
    except:
        log.exception(self.log_msg('Cannot add the post for the uri %s' % self.currenturi))
    return True
Example 9: __addPost
def __addPost(self, post):
    '''It will add the post
    '''
    try:
        page = self.__getData(post)
        if not page:
            return True
        unique_key = get_hash({'data': page['data']})
        if checkSessionInfo('review', self.session_info_out, unique_key,
                            self.task.instance_data.get('update'),
                            parent_list=[self.currenturi]):
            log.info(self.log_msg('Session info returns True'))
            return False
        result = updateSessionInfo('review', self.session_info_out, unique_key,
                                   get_hash(page), 'Review', self.task.instance_data.get('update'),
                                   parent_list=[self.currenturi])
        if not result['updated']:
            log.info(self.log_msg('Update session info returns False'))
            return True
        page['path'] = [self.currenturi]
        page['parent_path'] = []
        #page['path'].append(unique_key)
        page['uri'] = self.currenturi
        page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
        page['entity'] = 'post'
        page.update(self.__task_elements_dict)
        self.pages.append(page)
        log.info(page)
        log.info(self.log_msg('Post Added'))
        return True
    except:
        log.exception(self.log_msg('Error while adding session info'))
        return False
Example 10: __setParentPage
def __setParentPage(self):
    """This will get the parent info
    """
    page = {}
    try:
        page['et_thread_hierarchy'] = self.__hierarchy = [x.strip() for x in stripHtml(self.soup.find('div', 'deck breadcrumbs').renderContents()).split('>') if x.strip()][1:]
        page['data'] = page['title'] = page['et_thread_hierarchy'][-1]
    except:
        log.exception(self.log_msg('Thread hierarchy and Title Not found '
                                   'for uri %s' % self.currenturi))
        return
    if checkSessionInfo(self.__genre, self.session_info_out, self.task.instance_data['uri'],
                        self.task.instance_data.get('update')):
        log.info(self.log_msg('Session info return True, Already exists'))
        return
    try:
        result = updateSessionInfo('review', self.session_info_out,
                                   self.task.instance_data['uri'], get_hash(page),
                                   'forum', self.task.instance_data.get('update'))
        if result['updated']:
            page['path'] = [self.task.instance_data['uri']]
            page['parent_path'] = []
            page['uri'] = self.currenturi
            page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
            page['data'] = ''
            page['entity'] = 'thread'
            page.update(self.__task_elements_dict)
            page['posted_date'] = page['pickup_date']
            self.pages.append(page)
            log.info(self.log_msg('Parent Page Added'))
        else:
            log.info(self.log_msg("result['updated'] returned False for "
                                  "uri %s" % self.currenturi))
    except:
        log.exception(self.log_msg("parent post couldn't be parsed"))
Example 11: generate_cookie
def generate_cookie(self, url_path, session_id, expiration=None, add_header=False):
    '''
    Return a session cookie containing the session id. The cookie
    will be constrained to the url path, defined for use
    with HTTP only, and only returned on secure connections (SSL).

    :parameters:
      url_path
        The cookie will be returned in a request if it begins
        with this url path.
      session_id
        The session id identified by the session cookie
      add_header
        If true format cookie string with Set-Cookie: header

    :returns:
      cookie string
    '''
    if not expiration:  # Catch zero unix timestamps
        expiration = None

    cookie = Cookie(self.session_cookie_name, session_id,
                    domain=urlparse.urlparse(api.env.xmlrpc_uri).netloc,
                    path=url_path, httponly=True, secure=True,
                    expires=expiration)
    if add_header:
        result = 'Set-Cookie: %s' % cookie
    else:
        result = str(cookie)
    return result
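The only urlparse use here derives the cookie domain from a configured server URI. In isolation (the URI below is illustrative, not a real IPA deployment):

from urllib.parse import urlparse
print(urlparse('https://ipa.example.test/ipa/xml').netloc)  # 'ipa.example.test'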
Example 12: test_compare_triples
def test_compare_triples():
    for mime, fext in MIME_TYPES.items():
        dump_path = path.join(DUMP_DIR, path.basename(mime))

        for url in URLs:
            if six.PY2:
                fname = '%s.%s' % (path.basename(urlparse.urlparse(url).path), fext)
            else:
                fname = '%s.%s' % (path.basename(urlparse(url).path), fext)
            fname = path.join(dump_path, fname)

            req = Request(url)
            req.add_header('Accept', mime)
            res = urlopen(req)

            g_fdp.parse(data=res.read(), format=mime)
            g_dump.parse(fname, format=mime)

            both, first, second = graph_diff(g_fdp, g_dump)
            n_first = len(first)
            # n_second = len(second)
            # n_both = len(both)

            assert_equals(
                n_first, 0, '{} triple(s) different from reference:\n\n{}===\n{}\n'.format(
                    n_first, first.serialize(format='turtle'), second.serialize(format='turtle')))
Example 13: on_navigation_requested
def on_navigation_requested(self, view, frame, req, data=None):
    uri = req.get_uri()
    parse = urlparse.urlparse(uri)
    if self.url_callback.find(parse.hostname) > 0:
        self.getAccessToken(parse)
        return True
    return False
Example 14: is_url
def is_url(name):
    try:
        result = urlparse.urlparse(name)
    except Exception:
        return False
    else:
        return result.scheme in ('http', 'https', 'file', 'ftp')
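A few illustrative calls showing which schemes pass the whitelist:

print(is_url('https://example.com/x'))       # True
print(is_url('ftp://example.com/file'))      # True
print(is_url('mailto:someone@example.com'))  # False: scheme not in the whitelist
print(is_url('just some text'))              # False: no scheme at all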
Example 15: __call__
def __call__(self, **kwargs):
    field = self.context.getField('provenances')
    provenances = field.getAccessor(self.context)()
    formatted_provenances = []
    for provenance in provenances:
        title = provenance.get('title', '')
        link = provenance.get('link', '')
        owner = provenance.get('owner', '')
        if title != '' or owner != '' or link != '':
            formatted_provenance = {'source': {}, 'owner': {}}
            formatted_provenance['source']['title'] = title
            formatted_provenance['source']['url'] = link
            if owner != '':
                if hasVocab:
                    owner_title = tmpOrganisationsVocabulary.\
                        getDisplayList(self.context).getValue(owner)
                else:
                    owner_title = owner
                formatted_provenance['owner']['title'] = owner_title
                parser = urlparse.urlparse(owner)
                if all((parser.scheme, parser.netloc)):
                    formatted_provenance['owner']['url'] = owner
                else:
                    formatted_provenance['owner']['url'] = link
            formatted_provenances.append(formatted_provenance)
    self.info['provenances'] = formatted_provenances
    return self.info