This article collects typical usage examples of the urlparse.urlparse function from Python's urlparse module. If you have been wondering how to use urlparse, what it does, or what real code that calls it looks like, the hand-picked examples below may help.
The sections that follow show 15 code examples of the urlparse function, sorted by popularity.
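All of the examples target Python 2, where urlparse is a top-level standard-library module (in Python 3 the same functions live in urllib.parse). As a quick orientation, here is a minimal sketch, with a made-up URL, of the two calls that recur throughout the examples: urlparse.urlparse to split a URL into its components and urlparse.parse_qs to read its query string.

# Minimal sketch (Python 2); the URL is illustrative only.
import urlparse

url = 'http://item.example.com/item.htm?id=12345&spm=a230r'
parts = urlparse.urlparse(url)
print parts.scheme        # 'http'
print parts.netloc        # 'item.example.com'
print parts.path          # '/item.htm'
query = urlparse.parse_qs(parts.query, True)   # True -> keep_blank_values
print query['id'][0]      # '12345'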
Example 1: searchcrawler

def searchcrawler(url, keyword=''):
    """
    Crawler for a Taobao (tb) search results page.
    """
    html = get_html(url)
    #print html
    if html:
        soup = BeautifulSoup(html, fromEncoding='gbk')
        items_row = soup.findAll('div', {'class': 'row item icon-datalink'})
        if items_row:
            print '=======================row search row=========================='
            #print items
            for item in items_row:
                item_info = item.find('div', {'class': 'col title'}).h3.a
                item_url = item_info['href']
                url_info = urlparse.urlparse(item_url)
                item_id = urlparse.parse_qs(url_info.query, True)['id'][0]
                print item_url
                print item_id
                judge_site(item_url, keyword)
        items_col = soup.findAll('div', {'class': 'col item icon-datalink'})
        if items_col:
            print '=======================row search col=========================='
            #print items
            for item in items_col:
                item_info = item.find('div', {'class': 'item-box'}).h3.a
                item_url = item_info['href']
                url_info = urlparse.urlparse(item_url)
                item_id = urlparse.parse_qs(url_info.query, True)['id'][0]
                print item_url
                print item_id
                judge_site(item_url, keyword)
Example 2: rendered_wall_posts

def rendered_wall_posts(wall_posts):
    for wall_post in wall_posts:
        title = ''
        desc = ''
        site_image = ''
        article_title = ''
        urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', wall_post.data['post_content'])
        for url in urls:
            parse_obj = urlparse.urlparse(url)
            site = parse_obj.netloc
            path = parse_obj.path
            conn = httplib.HTTPConnection(site)
            conn.request('HEAD', path)
            response = conn.getresponse()
            conn.close()
            ctype = response.getheader('Content-Type')
            if response.status < 400 and ctype.startswith('image'):
                # Bare image link: embed the image itself.
                wall_post.data['post_content'] = wall_post.data['post_content'] + "<br/><a href='" + url + "' target='_blank'><img width=300 src='" + url + "'/></a>"
            else:
                og = opengraph.OpenGraph(url)
                if not len(og.items()) == 2:
                    for x, y in og.items():
                        if x == 'type' and y == 'video':
                            for k, l in og.items():
                                if k == 'site_name' and l == 'YouTube':
                                    url_data = urlparse.urlparse(url)
                                    query = urlparse.parse_qs(url_data.query)
                                    video = query["v"][0]
                                    wall_post.data['post_content'] = wall_post.data['post_content'].replace(url, "") + "<br/><iframe width='300' height='200' src='//www.youtube.com/embed/" + video + "' frameborder='0' allowfullscreen></iframe>"
                                elif k == 'site_name' and l == 'Vimeo':
                                    url_data = urlparse.urlparse(url)
                                    video = url_data.path
                                    wall_post.data['post_content'] = wall_post.data['post_content'].replace(url, "") + "<br/><iframe src='//player.vimeo.com/video" + video + "' width='300' height='200' frameborder='0' webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe> <p></p>"
                        elif x == 'type' and y == 'article':
                            for k, l in og.items():
                                if k == 'title':
                                    article_title = l
                                elif k == 'site_name':
                                    title = l
                                elif k == 'description':
                                    desc = l
                                elif k == 'image':
                                    site_image = l
                            wall_post.data['post_content'] = wall_post.data['post_content'] + "<br/><table><tr><td><img width='50' src='" + site_image + "'/></td><td><a href='" + url + "' target='_blank'>" + article_title + "</a><br/>" + title + "</td></tr></table>"
                        elif x == 'type':
                            for k, l in og.items():
                                if k == 'site_name':
                                    title = l
                                elif k == 'description':
                                    desc = l
                                elif k == 'image':
                                    site_image = l
                            wall_post.data['post_content'] = wall_post.data['post_content'].replace(url, "<table><tr><td><img width='50' src='" + site_image + "'/></td><td><a href='" + url + "' target='_blank'>" + title + "</a><br/>" + desc + "</td></tr></table>")
                else:
                    wall_post.data['post_content'] = wall_post.data['post_content'].replace(url, "<a href='" + url + "' target='_blank'>" + url + "</a>")
    return wall_posts
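The YouTube branch above pulls the video id out of the watch URL's query string; in isolation, with a made-up video id, that step looks like this:

# Illustration only; the video id is made up.
import urlparse

url = 'https://www.youtube.com/watch?v=abc123XYZ_0'
url_data = urlparse.urlparse(url)
query = urlparse.parse_qs(url_data.query)
print query['v'][0]   # 'abc123XYZ_0'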
Example 3: searchcrawler

def searchcrawler(url):
    html = get_html(url)
    # print url
    if html:
        soup = BeautifulSoup(html, fromEncoding='gbk')
        items_row = soup.findAll('div', {'class': 'item-box st-itembox'})
        if items_row:
            print '=======================row search row=========================='
            for item in items_row:
                # print item
                item_info = item.find('h3', {'class': 'summary'}).a
                item_url = item_info['href']
                # print item_url
                sid_info = item.find('div', {'class': 'col seller feature-dsi-tgr'}).a
                print sid_info
                sid_item_url = sid_info['href']
                sid_url_info = urlparse.urlparse(sid_item_url)
                sid_id = urlparse.parse_qs(sid_url_info.query, True)['user_number_id'][0]
                print sid_id
                judge_site(item_url, sid_id)
                # logging.warning(item_id)
                # download_reply_by_id(item_id)
        items_col = soup.findAll('div', {'class': 'product-item row icon-datalink'})
        if items_col:
            print '=======================row search col=========================='
            #print items
            for item in items_col:
                item_info = item.find('div', {'class': 'title'}).a
                item_url = item_info['href']
                # url_info = urlparse.urlparse(item_url)
                # item_id = urlparse.parse_qs(url_info.query, True)['id'][0]
                print item_url
                # print item_id
                sid_info = item.find('div', {'class': 'seller'}).a
                print sid_info
                sid_item_url = sid_info['href']
                sid_url_info = urlparse.urlparse(sid_item_url)
                sid_id = urlparse.parse_qs(sid_url_info.query, True)['user_number_id'][0]
                print sid_id
                judge_site(item_url, sid_id)
Example 4: post

def post(self):
    try:
        name = self.request.POST['name']
        topic = MicroTopic.all().filter('name =', name).get()
        if not topic:
            raise ReatiweError("Topic %s does not exist." % name)
        if self.request.POST['mode']:
            mode = self.request.POST['mode']
        else:
            mode = "subscribe"
        form_fields = {"hub.mode": mode,
                       "hub.callback": "%s/callback/%s" % (settings.SITE_URL, topic.name),
                       "hub.topic": topic.url,
                       "hub.verify": "sync",
                       "hub.verify_token": topic.name}
        result = 200
        url = self.request.POST['hub']
        req = urllib2.Request(url, urllib.urlencode(form_fields))
        o = urlparse.urlparse(url)
        # superfeedr support: credentials embedded in the hub URL become a
        # Basic Authorization header and are stripped from the URL itself
        if o.username and o.password:
            base64string = base64.encodestring('%s:%s' % (o.username, o.password))[:-1]
            authheader = "Basic %s" % base64string
            new_url = "%s://%s%s" % (o.scheme, o.hostname, o.path)
            req = urllib2.Request(new_url, urllib.urlencode(form_fields))
            req.add_header("Authorization", authheader)
        urllib2.urlopen(req)
    except DownloadError, e:
        logging.error('DownloadError: %s' % repr(e))
        pass
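The superfeedr branch works because urlparse exposes credentials embedded in a URL as separate attributes. A small illustration with a made-up URL:

# Illustration only; the URL and credentials are made up.
import urlparse

o = urlparse.urlparse('https://myuser:mypass@example.com/hubbub')
print o.username   # 'myuser'
print o.password   # 'mypass'
print o.hostname   # 'example.com'
print o.scheme     # 'https'
print o.path       # '/hubbub'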
Example 5: searchcrawler

def searchcrawler(url):
    html = get_html(url)
    # print url
    if html:
        soup = BeautifulSoup(html, fromEncoding='gbk')
        items_row = soup.findAll('div', {'class': 'product-iWrap'})
        #items_row = soup.find('div',{'class':'item-box st-itembox'})
        # print items_row
        if items_row:
            print '=======================row search row=========================='
            for item in items_row:
                # print item
                try:
                    item_info = item.find('p', {'class': 'productTitle'}).a
                except:
                    item_info = item.find('div', {'class': 'productTitle productTitle-spu'}).a
                # print item_info
                item_url = item_info['href']
                # print item_url
                url_info = urlparse.urlparse(item_url)
                item_id = urlparse.parse_qs(url_info.query, True)['id'][0]
                print item_id
                logging.warning(item_id)
                # item_id = 16862466992
                download_reply_by_id(item_id)
Example 6: gensitemap

def gensitemap(server, urlformat):
    '''
    Builds the sitemap index URL for the given file server and connects to
    the second-level indexes to obtain their modification date.

    @type server: dict-like
    @param server: server document as it comes from MongoDB
    @rtype: tuple (str, datetime) or None
    @return: tuple with the URL and its modification date, or None if the
             URL cannot be obtained.
    '''
    subdomain = server["ip"].split(".")[0]
    serverno = int(subdomain[6:])
    url = urlformat % serverno
    domain = urlparse.urlparse(url)[1]
    con = httplib.HTTPConnection(domain)
    con.request("HEAD", url)
    response = con.getresponse()
    if response.status == 200:
        mtime = time.mktime(time.strptime(
            response.getheader("last-Modified"),
            "%a, %d %b %Y %H:%M:%S %Z"))
        return (url, datetime.datetime.fromtimestamp(mtime))
    return None
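Note that urlparse.urlparse(url)[1] above indexes the result as a plain 6-tuple of (scheme, netloc, path, params, query, fragment), so element 1 is the same value as the .netloc attribute. A quick illustration with a made-up URL:

# Index 1 of the parse result is the network location; the URL is made up.
import urlparse

parts = urlparse.urlparse('http://files7.example.com/sitemap/index.xml.gz')
print parts[1]        # 'files7.example.com'
print parts.netloc    # same value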
Example 7: startupagent

def startupagent(self, sender, **kwargs):
    if not self.bind_web_address:
        _log.info('Web server not started.')
        return
    import urlparse
    parsed = urlparse.urlparse(self.bind_web_address)
    hostname = parsed.hostname
    port = parsed.port
    _log.info('Starting web server binding to {}:{}.'
              .format(hostname, port))
    self.registeredroutes.append((re.compile('^/discovery/$'), 'callable',
                                  self._get_discovery))
    self.registeredroutes.append((re.compile('^/discovery/allow$'),
                                  'callable',
                                  self._allow))
    self.registeredroutes.append((re.compile('^/$'), 'callable',
                                  self._redirect_index))
    port = int(port)
    vhome = os.environ.get('VOLTTRON_HOME')
    logdir = os.path.join(vhome, "log")
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    self.appContainer = WebApplicationWrapper(self, hostname, port)
    svr = WSGIServer((hostname, port), self.appContainer)
    self._server_greenlet = gevent.spawn(svr.serve_forever)
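The hostname/port split used above comes straight from the parse result; with a made-up bind address it behaves like this:

# Illustration only; the bind address is made up.
import urlparse

parsed = urlparse.urlparse('http://127.0.0.1:8080')
print parsed.hostname   # '127.0.0.1'
print parsed.port       # 8080 (an int, or None when no port is given)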
Example 8: fps_ipn_handler

def fps_ipn_handler(self, request):
    uri = request.build_absolute_uri()
    parsed_url = urlparse.urlparse(uri)
    resp = self.fps_connection.verify_signature(
        UrlEndPoint="%s://%s%s" % (parsed_url.scheme,
                                   parsed_url.netloc,
                                   parsed_url.path),
        HttpParameters=request.body)
    if not resp.VerifySignatureResult.VerificationStatus == "Success":
        return HttpResponseForbidden()
    data = dict(map(lambda x: x.split("="), request.body.split("&")))
    for (key, val) in data.items():
        data[key] = urllib.unquote_plus(val)
    if AmazonFPSResponse.objects.filter(transactionId=data["transactionId"]).count():
        resp = AmazonFPSResponse.objects.get(transactionId=data["transactionId"])
    else:
        resp = AmazonFPSResponse()
    for (key, val) in data.items():
        attr_exists = hasattr(resp, key)
        if attr_exists and not callable(getattr(resp, key, None)):
            if key == "transactionDate":
                val = datetime.datetime(*time.localtime(float(val))[:6])
            setattr(resp, key, val)
    resp.save()
    if resp.statusCode == "Success":
        transaction_was_successful.send(sender=self.__class__,
                                        type=data["operation"],
                                        response=resp)
    else:
        if "Pending" not in resp.statusCode:
            transaction_was_unsuccessful.send(sender=self.__class__,
                                              type=data["operation"],
                                              response=resp)
    # Return a HttpResponse to prevent django from complaining
    return HttpResponse(resp.statusCode)
Example 9: startupagent

def startupagent(self, sender, **kwargs):
    if not self.bind_web_address:
        _log.info('Web server not started.')
        return
    import urlparse
    parsed = urlparse.urlparse(self.bind_web_address)
    hostname = parsed.hostname
    port = parsed.port
    _log.info('Starting web server binding to {}:{}.'
              .format(hostname, port))
    self.registeredroutes.append((re.compile('^/discovery/$'), 'callable',
                                  self._get_discovery))
    self.registeredroutes.append((re.compile('^/discovery/allow$'),
                                  'callable',
                                  self._allow))
    self.registeredroutes.append((re.compile('^/$'), 'callable',
                                  self._redirect_index))
    port = int(port)
    vhome = os.environ.get('VOLTTRON_HOME')
    logdir = os.path.join(vhome, "log")
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    with open(os.path.join(logdir, 'web.access.log'), 'wb') as accesslog:
        with open(os.path.join(logdir, 'web.error.log'), 'wb') as errlog:
            server = pywsgi.WSGIServer((hostname, port), self.app_routing,
                                       log=accesslog, error_log=errlog)
            server.serve_forever()
Example 10: judge_site

def judge_site(url, keyword=''):
    """
    Determine whether the item is from Taobao (tb) or Tmall (tm).
    """
    url_info = urlparse.urlparse(url)
    urlkey = urlparse.parse_qs(url_info.query, True)
    iid = int(urlkey['id'][0])
    #print 'url_info:',url_info[1]
    try:
        if url_info[1] == 'detail.tmall.com':
            print 'it is a tm item'
            if check_item_update_time(iid, 'tm'):
                return
            data = getTmallItemInfo(iid, keyword)
        elif urlkey.get('cm_id'):
            print 'it is a tm item'
            if check_item_update_time(iid, 'tm'):
                return
            data = getTmallItemInfo(iid, keyword)
        else:
            print 'it is a tb item'
            if check_item_update_time(iid, 'tb'):
                return
            data = getTaobaoItemInfo(iid, keyword)
    except Exception, e:
        traceback.print_exc()
        return
Example 11: judge_site

def judge_site(url, sid_id):
    """
    Determine whether the item is from Taobao (tb) or Tmall (tm).
    """
    url_info = urlparse.urlparse(url)
    urlkey = urlparse.parse_qs(url_info.query, True)
    iid = int(urlkey['id'][0])
    print iid
    # print 'url_info:',url_info[1]
    try:
        if url_info[1] == 'detail.tmall.com':
            print 'it is a tm item'
            # data = download_tm_reply_by_id(iid)
        elif urlkey.get('cm_id'):
            print 'it is a tm item cm_id'
            # data = download_tm_reply_by_id(iid)
        else:
            print 'it is a tb item'
            data = download_tb_reply_by_id(iid, sid_id)
    except Exception, e:
        traceback.print_exc()
        return
Example 12: is_local_service

def is_local_service(name):
    """
    Determine if a service definition describes a service running on
    the local node. This is true if the service URL points at localhost,
    matches the machine's name, or matches its EC2 public name.
    """
    if name is None:
        return False
    if "://" in name:
        url = urlparse.urlparse(name)
        if ":" in url.netloc:
            name = url.netloc.split(":")[0]
        else:
            name = url.netloc
    elif ":" in name:
        name = name.split(":")[0]
    if name == "localhost":
        return True
    if '.' in name:
        name = name.split('.')[0]
    node = platform.node()
    if '.' in node:
        node = node.split('.')[0]
    if name == node:
        return True
    pn = public_name()
    if pn is not None and pn.split(".")[0] == name:
        return True
    return False
Example 13: wait_for_servers

def wait_for_servers(urls, timeout):
    import time, urlparse, httplib
    from ssl import SSLError
    for u in urls:
        parsed = urlparse.urlparse(u.lower(), "https")
        netloc = parsed.hostname
        if parsed.port:
            netloc = "%s:%s" % (netloc, parsed.port)
        if parsed.scheme == "http":
            cnxn = httplib.HTTPConnection(netloc)
        elif parsed.scheme == "https":
            cnxn = httplib.HTTPSConnection(netloc)
        else:
            raise Exception("Don't know how to handle scheme %s" % parsed.scheme)
        i = 0
        while i < timeout:
            try:
                cnxn.connect()
            except SSLError:
                break
            except Exception as e:
                if "Connection refused" in str(e):
                    time.sleep(1)
                    i = i + 1  # count the elapsed second toward the timeout
                elif "SSL" in str(e):
                    break
                else:
                    raise
            else:
                # connected successfully
                break
Example 14: _extracturls

def _extracturls(self):
    #print "Extract URLs"
    urls = []
    htmlsrc, charset, parenturl = self.htmlSrcTuple
    if htmlsrc != None:
        resulturls = []
        urlExtractor = ExtractLinks(resulturls)
        try:
            if charset == None:
                urlExtractor.feed(htmlsrc)
            else:
                urlExtractor.feed(htmlsrc.decode(charset))
        except HTMLParser.HTMLParseError:
            pass
        try:
            urlExtractor.reset()  # I think close needs special treatment .close()
        except HTMLParser.HTMLParseError:
            urlExtractor.reset()
        # Turn the extracted URIs into full URLs by joining any URL that has
        # no network location with the parent URL.
        for i in xrange(len(resulturls)):  # xrange instead of range for performance
            urlres = urlparse.urlparse(resulturls[i], "http")
            if urlres.netloc == "":
                resulturls[i] = urlparse.urljoin(parenturl, resulturls[i])
        urls.extend(resulturls)
    return urls
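The relative-to-absolute step at the end of the loop above can be seen in isolation; the parent URL and link here are made up:

# Illustration only; URLs are made up.
import urlparse

parent = 'http://example.com/articles/page.html'
link = '../images/logo.png'
if urlparse.urlparse(link, 'http').netloc == '':
    link = urlparse.urljoin(parent, link)
print link   # 'http://example.com/images/logo.png'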
Example 15: getParams

def getParams(path):
    query = urlparse.urlparse(path).query
    queryDict = dict([x.split('=') for x in query.split('&')])
    width = queryDict['WIDTH']
    height = queryDict['HEIGHT']
    bbox = queryDict['BBOX']
    return Params(int(width), int(height), map(float, bbox.split(',')))
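The WIDTH/HEIGHT/BBOX keys suggest a WMS-style GetMap request. Assuming Params is a simple three-field container such as a namedtuple (an assumption, since its definition is not shown), a call might look like this, with a made-up request path:

# Hypothetical usage; the path is made up and Params is assumed to be a
# simple container such as a namedtuple.
path = '/wms?WIDTH=256&HEIGHT=256&BBOX=0.0,0.0,10.0,10.0'
params = getParams(path)
# -> Params(256, 256, [0.0, 0.0, 10.0, 10.0])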