This article collects typical usage examples of the pyquery.pyq function in Python. If you have been wondering what exactly pyq does and how to use it, the curated examples below should help.
Fifteen code examples of the pyq function are shown, ordered by popularity.
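All of the examples rely on the alias import from pyquery import PyQuery as pyq (spelled out in Example 8) and on the three ways pyq can build a document: from a markup string, from a URL, or from a local file. Here is a minimal sketch of those construction styles; the markup and file name are made up for illustration:

from pyquery import PyQuery as pyq

# From an HTML string, the usual case when the page was fetched separately.
d = pyq("<div><a href='/p/1' title='demo'>demo</a></div>")
print d("a").attr("href")  # -> /p/1

# From a URL: pyquery fetches the page itself (network access required).
# d = pyq(url='http://example.com/')

# From a local file, as in Example 2.
# d = pyq(filename='html.txt')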
Example 1: _fetch_user
def _fetch_user(self, url):
    try:
        html = urllib2.urlopen(url + "?tab=repositories").read()
    except urllib2.HTTPError as e:
        if e.code == 429:
            print "#" * 10, " 429 Too many requests. Sleeping %s seconds. " % self._too_many_request_sleep, "#" * 10
            eventlet.sleep(self._too_many_request_sleep)
            return self._fetch_user(url)
        raise e
    jq = pyq(html)
    data = {}
    data['url'] = url
    data['name'] = jq(".vcard-fullname").text()
    data['avatar'] = jq(".avatar").attr("src")
    data['location'] = jq("[aria-label='Home location']").attr("title")
    data['email'] = jq("[aria-label=Email] a").text()
    data['website'] = jq("[aria-label='Blog or website'] a").text()
    data['join'] = jq(".join-date").attr("datetime")
    data['followers'] = jq(".vcard-stat-count:eq(0)").text()
    data['starred'] = jq(".vcard-stat-count:eq(1)").text()
    data['following'] = jq(".vcard-stat-count:eq(2)").text()
    data['repositories'] = {}
    sources = jq(".repo-list-item.source")
    data['repositories']['source_count'] = len(sources)
    data['repositories']["source_lang"] = {}
    for i in sources:
        lang = pyq(i).find("[itemprop='programmingLanguage']").text()
        data['repositories']["source_lang"].setdefault(lang, 0)
        data['repositories']["source_lang"][lang] += 1
    return data
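One pattern in this example recurs throughout the rest: iterating over a PyQuery selection yields raw lxml elements, so each item has to be wrapped with pyq(i) again before PyQuery methods such as .find(), .attr(), or .text() are available. A minimal sketch, with markup invented for illustration:

from pyquery import PyQuery as pyq

d = pyq("<ul><li class='repo'>Python</li><li class='repo'>Go</li></ul>")
for i in d("li.repo"):   # i is a raw lxml element, not a PyQuery object
    print pyq(i).text()  # re-wrap it to get the PyQuery API back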
Example 2: main
def main():
    doc = pyq(filename='html.txt')
    doc1 = doc('div')
    doc2 = doc1('a')
    # print(doc2)
    TieBaDate = {}
    try:
        f = open('source.txt', 'w')
    except IOError:
        print("Error: open file failed.")
    iSum = 0
    for i in doc2:
        tmphref = pyq(i).attr('href')
        tmptitle = pyq(i).attr('title')
        strhref = repr(tmphref)
        strtitle = repr(tmptitle)
        aryhref = re.findall(r'/p/(\d+)', strhref)
        if aryhref and re.findall('(.*?)魔枪(.*?)', strtitle):
            # print(strtitle)
            # print(strhref)
            strsource = 'http://tieba.baidu.com/p/%s' % aryhref[0]
            f.write(strsource)
            f.write("\n")
            iSum += 1
            AnalyHtml(url=strsource, filePath='')
            break
    print('sum :', iSum)
    f.close()
Example 3: _fetch_query
def _fetch_query(self, url, page=0):
    print "-" * 10, " Fetch Page %s " % (page + 1), "-" * 10
    print url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.HTTPError as e:
        if e.code == 429:
            print "#" * 10, " 429 Too many requests. Sleeping %s seconds. " % self._too_many_request_sleep, "#" * 10
            eventlet.sleep(self._too_many_request_sleep)
            return self._fetch_query(url, page)
        raise e
    jq = pyq(html)
    urls = []
    user_list = jq(".user-list-item")
    for i in user_list:
        name = pyq(i).find(".user-list-info a")
        href = self._domain + name.attr("href")
        urls.append(href)
    users = []
    for user in pool.imap(self._fetch_user, urls):
        users.append(user)
    if page == 0:
        max_page_index = jq(".next_page").prev("a").text()
        users.extend(self._fetch_query_by_page(url, int(max_page_index)))
    return users
Example 4: get_jiandan_mm_pic
def get_jiandan_mm_pic(page_num):
    url = 'http://jandan.net/ooxx/page-' + str(page_num)
    html = pyq(url)
    print('reading ... http://jandan.net/ooxx/page-{0}\n'.format(page_num))
    sys.stdout.flush()
    #print(html)
    hash_pic_message = {}
    # Collect the image URLs
    for element in html('li div div.row div.text'):
        img = pyq(element).find('img')
        #img = pyq(element)('img')
        if img:
            id = pyq(element)('span a').text()
            #id = id.replace("vote-","")
            hash_pic_message[id] = {}
            hash_pic_message[id]['ID'] = id
            hash_pic_message[id]['URL'] = []
            hash_pic_message[id]['FileName'] = []
            if img.attr('org_src') is None:
                for t in img:
                    url = pyq(t).attr('src')
                    hash_pic_message[id]['URL'].append(url)
                    hash_pic_message[id]['FileName'].append(get_file_name2(url))
            else:
                for t in img:
                    url = pyq(t).attr('org_src')
                    hash_pic_message[id]['URL'].append(url)
                    hash_pic_message[id]['FileName'].append(get_file_name2(url))
    # Collect the image IDs and vote counts
    for element in html('li div div.row div.jandan-vote'):
        id = pyq(element)('a').attr('data-id')
        #id = id.replace("vote-","")
        vote = pyq(element).text()
        reg_vote = r'OO \[ (\d.*) \] XX \[ (\d.*) \]'
        pattern = re.compile(reg_vote)
        result = pattern.findall(vote)
        if result:
            support = result[0][0]
            unsupport = result[0][1]
            hash_pic_message[id]["Support"] = support
            hash_pic_message[id]["UnSupport"] = unsupport
            if unsupport != "0":
                scale = float(support) / float(unsupport)
            else:
                scale = 0.0
            rank = get_scale(scale)
            hash_pic_message[id]["Scale"] = scale
            hash_pic_message[id]["Rank"] = rank
    return hash_pic_message.values()
Example 5: run
def run(self):
    headers = {'connection': 'close'}
    response = requests.get(self.url, headers=headers)
    response.encoding = 'utf-8'
    column_jq = pyq(response.text)
    column = column_jq('title').text()
    parsed_body = html.fromstring(response.text)
    song_urls = parsed_body.xpath('//a[contains(@href, "/play/")]/@href')
    new_lyrics = []
    for song_url in song_urls:
        full_url = urlparse.urljoin("http://www.9ku.com", song_url)  # base_url ahead
        r = requests.get(full_url, headers=headers)
        r.encoding = 'utf-8'  # refer to test/get_chinese.py
        jq = pyq(r.text)
        # get title, author in song page
        brief = jq('h2#play_title').text()
        title = brief.split(' ')[1]
        author = brief.split(' ')[3]
        # two types of song pages
        if jq('div.lrcBox').text():
            content = jq('div.lrcBox').text()
        else:
            out_url = jq('h2#play_title').find('a').eq(2).attr('href')
            r_out = requests.get(out_url, headers=headers)
            r_out.encoding = 'utf-8'  # may not be needed
            jq_out = pyq(r_out.text)
            content = jq_out('div.ciInfo').eq(0).text()
        new_lyric = Lyric2(column=column, title=title, author=author,
                           content=content)
        new_lyric.save()
        print 'get data from %s at %s' % (full_url, time.ctime())
Example 6: dir_parse
def dir_parse(self, page, spider_list, result_list):
    print page
    doc = pyq(page)
    tmp = doc('table[class=tableList]')
    trl = tmp('tr')
    for v in trl:
        td = pyq(v)('td[class=title]')
        a = td('a')
        name = a.text().encode("UTF-8").decode("UTF-8")
        ename = ""
        print name
        if len(name) > 1:
            for uchar in name:
                #print uchar
                if is_alphabet(uchar):
                    ename += uchar
                #elif uchar == '.' or uchar == ' ' or uchar == '&':
                    #ename += uchar
                elif (uchar == '(' or is_number(uchar)) and len(ename) > 2:
                    break
        print "xxxx", ename
        link = "http://banyungong.net/" + a.attr('href')
        result_list.append((ename.lower() + "," + link).encode("UTF-8"))
    return ""
Example 7: getPages
def getPages(self):
    dirs = []
    #doc=pyq(self.url)
    #while doc("div:contains('Browse Problems')+div+table img").attr('alt')=="FOLDER" and (None in dirs[p].values()):
    #dirs[p].update(dict.fromkeys([self.root+'/'+a.attr('href') for a in doc("div:contains('Browse Problems')+div+table a")]))
    #for d,c in dirs[p].items():
    dirs.append(self.url)
    while dirs:
        curdir = dirs.pop()
        try:
            doc = pyq(curdir)
        except (httplib.IncompleteRead, urllib2.URLError):
            # Retry once over HTTP/1.0; some servers truncate HTTP/1.1 responses.
            print "IncompleteRead/URLError, retrying with HTTP/1.0"
            httplib.HTTPConnection._http_vsn = 10
            httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
            doc = pyq(curdir)
            #httplib.HTTPConnection._http_vsn = 11
            #httplib.HTTPConnection._http_vsn_str = 'HTTP/1.1'
        if doc("div:contains('Browse Problems')+div+table img").attr('alt') == "FOLDER":
            print "[folder]", curdir
            links = doc("div:contains('Browse Problems')+div+table a")
            for a in links:
                dirs.append(self.root + '/' + pyq(a).attr('href'))
        else:
            print "[file]", curdir
            self.pages.append(curdir)
def getHtmlByPyquery(tUrl):
posts =[]
from pyquery import PyQuery as pyq
r = requests.get(tUrl)
doc=pyq(r.text)
lis = doc(".car-monthlisting li a")
lis = lis[0:100]
lis.reverse()
i=1
for li in lis:
link = pyq(li).attr("href")
title = pyq(li).text()
print "抓取文章_%s(%s,link:%s)" %(i,title,link)
ir = requests.get(link)
idoc = pyq(ir.text)
content = idoc("#content .entrybody").remove(".wumii-hook").remove("script").remove("ins").remove(".ds-thread").remove("#ds-ssr").remove("div").remove("#comments").html()
content = content.replace("\"","\"\"");
#print content
post = Post()
post.category = urllib.quote("notes") + ":段子"
post.post_author = "geekzone"
post.post_title = title
post.post_content = "\""+content+"\""
posts.append(post)
i=i+1
return posts
Example 9: exportText
def exportText(section, idx, link):
    # url = "http://book.kanunu.org/book3/6630/115916.html"
    # req = urllib2.Request(url)
    # response = urllib2.urlopen(req).read()
    fileName = section + "/" + idx + ".html"
    textFile = open(fileName)
    mainHtml = textFile.read()
    textFile.close()
    html = unicode(mainHtml, "GBK")
    doc = pyq(html)
    tables = doc.find("table")
    a = []
    for table in tables:
        a.append(len(pyq(table).text()))
    mx = max(a)
    textIdx = a.index(mx)
    titleIdx = textIdx - 1
    mainText = pyq(tables[textIdx]).find("p").html()
    # afterTitle = mainText.index(r"<br/>")
    # mainTitle = mainText[0:afterTitle].replace(u" ", "").replace(u"】", "").replace(u"【", "").strip().encode("UTF-8")
    # mainTitle = pyq(tables[titleIdx]).text().replace(u"上部 ", "").replace(u"中部 ", "").replace(u"下部 ", "").encode("UTF-8")
    mainTitle = pyq(tables[titleIdx]).text().encode("UTF-8")
    outFile = open("Text/" + section + "/" + idx + ".xhtml", "w")
    outFile.write("<h1>" + mainTitle + "</h1>")
    # outFile.write("<p>")
    outFile.write(mainText.encode("UTF-8"))
    # outFile.write("</p>")
    outFile.write("<p><br/>" + link + "<br/></p>")
    outFile.close()
    titleList.append(mainTitle)
    return mainTitle
Example 10: main
def main():
    url = 'http://taiwan.huanqiu.com/news/'
    #url = 'http://world.huanqiu.com/observation/'
    #url = 'http://china.huanqiu.com/politics/'
    doc = pyq(url=url)
    alist = doc('.pad20 li a')
    for a in alist:
        link = pyq(a).attr('href')
        get_info(link)
Example 11: get_proxy_list
def get_proxy_list(self, page_range=15):
    __all_proxy_list = []
    for __page in range(page_range):
        __url = 'http://proxylist.hidemyass.com/%s#listable' % __page
        __request = urllib2.Request(__url, headers=self.__headers)
        __response = urllib2.urlopen(__request)
        __the_page = __response.read()
        doc = pyq(__the_page)
        for __list_idx in doc('#listable tbody>tr')[:]:
            __tmp = doc(__list_idx).outerHtml()
            p = pyq(__tmp)
            # Drop the cells the site hides with inline CSS to confuse scrapers.
            for __j in p('style').text().split('\n'):
                if 'display:none' in __j:
                    p.remove(__j.split('{')[0])
            p.remove('style')
            for __j in p('span,div'):
                if p(__j).attr('style') == 'display:none':
                    p(__j).remove()
            __proxy = {'last_update': p('td').eq(0).text(),
                       'ip_address': p('td').eq(1).text().replace(' ', ''),
                       'port': p('td').eq(2).text(),
                       'country': p('td').eq(3).text(),
                       'countryIsoCode': p('td').eq(3).attr('rel'),
                       'type': p('td').eq(6).text(),
                       'anon': p('td').eq(7).text(),
                       'speed': ''.join(re.findall(u'\d', p('td').eq(4)('.indicator').attr('style').split(';')[0])),
                       'connection_time': ''.join(re.findall(u'\d', p('td').eq(4)('.indicator').attr('style').split(';')[0]))
                       }
            print __proxy
            __all_proxy_list.append(__proxy)
    pickle.dump(__all_proxy_list, open('free_proxy_list', 'wb'))
    __all_proxy_list = pickle.load(open('free_proxy_list', 'r'))
    return __all_proxy_list
    # Note: the per-country statistics below are unreachable after the return above.
    all_count_cnt = {}
    for __i in __all_proxy_list:
        if all_count_cnt.has_key(__i['country']):
            all_count_cnt[__i['country']] = all_count_cnt[__i['country']] + 1
        else:
            all_count_cnt[__i['country']] = 1
    print all_count_cnt
    all_count_cnt = {}
    for __i in __all_proxy_list:
        if all_count_cnt.has_key(__i['countryIsoCode']):
            all_count_cnt[__i['countryIsoCode']] = all_count_cnt[__i['countryIsoCode']] + 1
        else:
            all_count_cnt[__i['countryIsoCode']] = 1
    print all_count_cnt
Example 12: _parse_data
def _parse_data(self, pyq_node, k, data, debug):
    keymap = []
    path = data['path']
    pathlist = path.split(',')
    node = pyq_node
    for p in pathlist:
        if 'attr@' in p:
            attr = p[5:]
            value = node.attr(attr)
            return value
        elif 'text' == p:
            if node.text() is not None:
                value = node.text().encode("utf-8")
            else:
                value = None
            return value
        elif '#' in p:
            pl = p.split('#')
            #print pl[0],pl[1]
            node = node(pl[0].encode("utf-8")).eq(int(pl[1]))
            if node:
                node = pyq(node)
            else:
                return None
        else:
            node = node(p.encode("utf-8"))
            if node:
                #node = pyq(node)(p)
                node = pyq(node)
            else:
                return None
        if debug:
            print "DEBUG,p", p
            print node
    # for key in data:
    #     if key != 'path':
    #         keymap[k] = []
    #         break
    if len(node) > 0:
        if debug:
            print "DEBUG", k
            print node
        for d in node:
            submap = {}
            for key in data:
                if key != 'path':
                    res = self._parse_data(pyq(d), key, data[key], debug)
                    submap[key] = res
            keymap.append(submap)
    return keymap
Example 13: parse
def parse(self, url):
    # Parse the first page of the goods list
    res = requests.get(url)
    assert res.status_code == 200
    jq = pyq(res.content)
    goods_list = jq('.list-container>ul>li>a')
    for r in goods_list:
        goods_url = r.get('href')
        if not goods_url:
            continue
        goods_url = '%s%s' % (CosstoresGoodsListPrase.COSSTORES_HOST, goods_url)
        goods_name = r.get('title')
        # print goods_url, goods_name
        goods_item = {
            'url': goods_url,
            'name': goods_name,
        }
        self.goods_list.append(goods_item)
    # Parse the AJAX-loaded goods list pages (pages 2 through n)
    next_page = jq('#infiload_nav>a')
    if next_page:
        next_page = next_page[0]
        max_page = int(next_page.get('data-maxpage'))
        next_url = next_page.get('href')
        np = re.findall(r'page=(\d+)', next_url)
        if not np:
            return
        np = int(np[0])
        while np <= max_page:
            next_url = re.sub(r'page=(\d+)', 'page=%s' % (np), next_url)
            np += 1
            res = requests.get('%s%s' % (CosstoresGoodsListPrase.COSSTORES_HOST, next_url))
            assert res.status_code == 200
            jq_page = pyq(res.content)
            goods_list = jq_page('li>a')
            if not goods_list:
                # Nothing left to parse
                break
            for r in goods_list:
                goods_url = r.get('href')
                if not goods_url:
                    continue
                goods_url = '%s%s' % (CosstoresGoodsListPrase.COSSTORES_HOST, goods_url)
                goods_name = r.get('title')
                goods_item = {
                    'url': goods_url,
                    'name': goods_name,
                }
                self.goods_list.append(goods_item)
Example 14: get_betting_odds_info_list
def get_betting_odds_info_list(self):
    h = requests.get(self.url, timeout=self.timeout)  # , proxies=self.proxies
    text = h.content
    pq = pyq(text)
    betting_odds_info_list = []
    startdate_html = pq('.event-holder.holder-scheduled>.eventLine.status-scheduled')
    url_html = pyq(startdate_html)("meta[itemprop='url']")
    matchup_html = pyq(startdate_html)("meta[itemprop='name']")
    for i in range(len(startdate_html)):
        betting_odds_info_list.append({'start_time': startdate_html.eq(i).attr('rel'),
                                       'url': url_html.eq(i).attr('content'),
                                       'away_team': matchup_html.eq(i).attr('content').split(' vs ')[0],
                                       'home_team': matchup_html.eq(i).attr('content').split(' vs ')[1]})
    return betting_odds_info_list
Example 15: parseThread
def parseThread(self, response):
    url = response.url.replace('http://bbs', 'http://www')
    reply = []
    for floor in response.css('div.tpc_content').extract():
        reply.append(pyq(floor).text())
    self.collection.update({"url": response.url}, {'$set': {"reply": reply}}, True)