This article collects typical code examples of the PyQuery.text method from the Python module pyquery.pyquery. If you are wondering what PyQuery.text does, how to use it, or where to find usage examples, the curated samples below should help. You can also explore the other methods of the containing class, pyquery.pyquery.PyQuery.
Below are 13 code examples of PyQuery.text, sorted by popularity by default.
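Before the examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the HTML fragment is made up) illustrating what PyQuery.text returns:

from pyquery.pyquery import PyQuery

# .text() returns the text content of the matched elements with tags stripped;
# when several elements match, their texts are joined with spaces.
doc = PyQuery('<ul><li><a href="/a">First</a></li><li><a href="/b">Second</a></li></ul>')
print(doc('li a').text())        # -> "First Second"
print(doc('li a').eq(0).text())  # -> "First"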
Example 1: get_list_info_html
# Required import: from pyquery.pyquery import PyQuery [as alias]
# Or: from pyquery.pyquery.PyQuery import text [as alias]
def get_list_info_html(html):
    # print("get_list_info_html")
    data = []
    album_items = html('ul.site-piclist').children('li')
    for album_item in album_items:
        album_item = PyQuery(album_item)
        site_piclist_info = PyQuery(album_item.children('div.site-piclist_info'))
        site_piclist_info_title = PyQuery(site_piclist_info.children('p.site-piclist_info_title'))
        site_piclist_info_title_a = PyQuery(site_piclist_info_title.children('a'))
        site_piclist_info_title_fs12 = PyQuery(site_piclist_info.children('p.fs12'))
        site_piclist_info_title_fs12_a = PyQuery(site_piclist_info_title_fs12.children('a'))
        no = site_piclist_info_title_a.text()
        # if re.search("预告", no):
        #     continue
        name = site_piclist_info_title_fs12_a.text()
        url = site_piclist_info_title_fs12_a.attr('href')
        if url is None:
            continue
        subtitle = site_piclist_info_title_fs12_a.text()
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url
        }
        data.append(info)
    return data
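A hypothetical invocation (not part of the original source; the file name is made up) would wrap an already-fetched page in PyQuery before passing it in:

# Usage sketch: get_list_info_html expects a parsed PyQuery document.
from pyquery.pyquery import PyQuery

with open('album_page.html', encoding='utf-8') as f:
    html = PyQuery(f.read())
for entry in get_list_info_html(html):
    print(entry['no'], entry['name'], entry['url'])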
Example 2: Parse_le
# Required import: from pyquery.pyquery import PyQuery [as alias]
# Or: from pyquery.pyquery.PyQuery import text [as alias]
def Parse_le(self, input_text):
    html = PyQuery(get_url(input_text))
    items = html('dt.d_tit')
    title = "LETV"
    i = 0
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": i,
        "type": "collection"
    }
    for item in items:
        a = PyQuery(item).children('a')
        name = a.text()
        no = a.text()
        subtitle = a.text()
        url = a.attr('href')
        if url is None:
            continue
        if not re.match(r'^http://www\.le\.com/.+\.html', url):
            continue
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url,
            "caption": "首页地址列表"
        }
        data["data"].append(info)
        i = i + 1
    total = i
    data["total"] = total
    return data
Example 3: __initPageNum
# Required import: from pyquery.pyquery import PyQuery [as alias]
# Or: from pyquery.pyquery.PyQuery import text [as alias]
def __initPageNum(self):
    initurl = "%s/%s/&act=personal&options=" % (self.baseUrl, self.urlpath)
    req = urllib2.Request(initurl, None, self.header)
    p = self.br.open(req).read()
    pg = PyQuery(p)("div#houses div.fl")
    if re.search(r'(\d+)', pg.text()):
        pg = re.search(r'(\d+)', pg.text()).group(1)
    r = self.__getPageAllLink(p)
    if not r:
        return
    self.pn = [i for i in range(int(pg) + 1)][2:]
    print ""
Example 4: parse
# Required import: from pyquery.pyquery import PyQuery [as alias]
# Or: from pyquery.pyquery.PyQuery import text [as alias]
async def parse(self, input_text, *k, **kk):
    html = await get_url_service.get_url_async(input_text)
    html = PyQuery(html)
    title = ""
    for meta in html('meta[itemprop="name"]'):
        meta = PyQuery(meta)
        title = meta.attr("content")
        break
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "list",
        "caption": "QQ视频全集"
    }
    for a in html(".mod_episode a"):
        a = PyQuery(a)
        _title = ""
        for span in PyQuery(a("span")):
            span = PyQuery(span)
            if span.attr("itemprop") == "episodeNumber":
                _title = "第%s集" % span.text()
            elif span.has_class("mark_v"):
                _title += span.children("img").attr("alt")
        info = {
            "name": _title,
            "no": _title,
            "subtitle": _title,
            "url": a.attr("href")
        }
        data["data"].append(info)
    data["total"] = len(data["data"])
    return data
Example 5: parse
# Required import: from pyquery.pyquery import PyQuery [as alias]
# Or: from pyquery.pyquery.PyQuery import text [as alias]
async def parse(self, input_text, *k, **kk):
    if not await self._check_support(input_text):
        return []
    html_text = await get_url_service.get_url_async(input_text)
    html = PyQuery(html_text)
    title = html('h1.main_title > a').text()
    if not title:
        for a in html('div.crumb-item > a'):
            a = PyQuery(a)
            if a.attr('href') in input_text:
                title = a.text()
    if not title:
        try:
            title = match1(html_text, '<title>([^<]+)').split('-')[0]
        except AttributeError:
            pass
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "list",
        "caption": "271视频全集"
    }
    data["data"] = await self._get_list_info_api(html_text)
    return data
Example 6: parse
# Required import: from pyquery.pyquery import PyQuery [as alias]
# Or: from pyquery.pyquery.PyQuery import text [as alias]
async def parse(self, input_text, *k, **kk):
    html = await get_url_service.get_url_async(input_text)
    m = re.findall('showid:"([0-9]+)",', html)  # showid:"307775"
    if not m:
        return []
    logging.info(m[0])
    html = PyQuery(html)
    p_title = html("li.p-row.p-title")
    p_title("li>a").remove()
    p_title("li>span").remove()
    title = p_title.text().replace(":", '')
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "list",
        "caption": "优酷视频全集"
    }
    last_num = 0
    while True:
        new_url = "https://list.youku.com/show/episode?id=" + m[0] + "&stage=reload_" + str(
            last_num) + "&callback=a"
        json_data = await get_url_service.get_url_async(new_url)
        info = json.loads(json_data[14:-2])
        if info.get("error", None) == 0 and info.get("message", None) == "success":
            new_html = info.get("html", None)
            if new_html:
                new_html = PyQuery(new_html)
                items = new_html("a")
                for item in items:
                    item = PyQuery(item)
                    num = int(item.text())
                    url = "http:" + item.attr("href")
                    title = "第%02d集" % num
                    info = {
                        "name": title,
                        "no": title,
                        "subtitle": title,
                        "url": url
                    }
                    data["data"].append(info)
                    last_num = num
                last_num += 1
            else:
                continue
        else:
            break
    data["total"] = len(data["data"])
    return data
Example 7: _parse
# Required import: from pyquery.pyquery import PyQuery [as alias]
# Or: from pyquery.pyquery.PyQuery import text [as alias]
def _parse(self, response):
    d = PyQuery(response)
    # page turning
    __url = map(lambda x: x.attr('href'),
                d.find(self.__css).items()
                )
    if config_dictionary.get(self.__url_start).get('basejoin'):
        new_url = map(lambda u: urlparse.urljoin(self.__url_base, u), __url)
    else:
        new_url = __url
    self.__url_pool = self.__url_pool.union(set(new_url))
    # IP address extracting
    rst = ':'.join(d.text().split(' '))
    proxy_list = re.findall(pattern_ip_address, rst)
    proxy_port_queue.put((proxy_list, self.__url_base))
Example 8: Parse
# Required import: from pyquery.pyquery import PyQuery [as alias]
# Or: from pyquery.pyquery.PyQuery import text [as alias]
def Parse(self, input_text):
    html = PyQuery(self.getUrl(input_text))
    items = html('a')
    title = html('title').text()
    i = 0
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": i,
        "type": "collection"
    }
    for item in items:
        a = PyQuery(item)
        name = a.attr('title')
        if name is None:
            name = a.text()
        no = name
        subtitle = name
        url = a.attr('href')
        if url is None:
            continue
        if name is None or name == "":
            continue
        if not re.match(r'(^(http|https)://.+\.(shtml|html))|(^(http|https)://.+/video/)', url):
            continue
        if re.search(r'(list|mall|about|help|shop|map|vip|faq|support|download|copyright|contract|product|tencent|upload|common|index.html|v.qq.com/u/|open.baidu.com)', url):
            continue
        if re.search(r'(下载|播 放|播放|投诉|评论|(\d{1,2}:\d{1,2}))', no):
            continue
        unsure = False
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url,
            "unsure": unsure
        }
        data["data"].append(info)
        i = i + 1
    total = i
    data["total"] = total
    return data
Example 9: get_field_data
# Required import: from pyquery.pyquery import PyQuery [as alias]
# Or: from pyquery.pyquery.PyQuery import text [as alias]
def get_field_data(self, url):
    """
    Fetches the data from the URL and tries to extract all of the tag
    information from the page.
    @param url -- the URL for the *concise* tag information page.
    @return tag (string), tag_info (dict)
        or False if information cannot be extracted from the page at url
    """
    dom = self.get_dom(url)
    tag_info = self.get_tag_def(dom)
    if tag_info:
        tag, title, repeatable = tag_info
    else:
        return False
    definition = dom("div.definition")
    if not definition.size():
        definition = dom("p").eq(0)
    if not definition.size():
        definition = PyQuery("<p>Bad HTML: %s</p>" % url)
    control_field = tag in self.CONTROL_FIELDS
    definition = normalize(definition.text())
    data = dict(title=title, definition=definition, repeatable=repeatable, control_field=control_field)
    if not control_field:
        subfields = self.get_subfields(dom)
        if '?' in subfields:
            raise Exception("can't parse subfields in " + url)
        try:
            indicators = self.get_indicators(dom)
        except Exception, e:
            import traceback, sys
            traceback.print_exception(*sys.exc_info())
            print e
            raise Exception("Can't get indicators from " + url, e)
        data['indicators'] = indicators
        data['subfields'] = subfields
Example 10: Parse_a
# Required import: from pyquery.pyquery import PyQuery [as alias]
# Or: from pyquery.pyquery.PyQuery import text [as alias]
def Parse_a(self, input_text):
    # modified from sceext2's list271.py
    def get_list_info_api1(html_text):
        RE_GET_AID = ' albumId: ([0-9]+),'  # albumId: 202340701,
        # http://cache.video.qiyi.com/jp/avlist/202340701/2/
        URL_JS_API_PORT = 'http://cache.video.qiyi.com/jp/avlist/'

        # get info from 271 javascript API port
        def get_info_from_js_port(html_text):
            # get album id
            aid = get_aid(html_text)
            # get info list
            vlist = get_vinfo_list(aid)
            # done
            return vlist

        # get album id
        def get_aid(html_text):
            m = re.findall(RE_GET_AID, html_text)
            return m[0]

        # make js API port URL
        def make_port_url(aid, page_n):
            url = URL_JS_API_PORT + str(aid) + '/' + str(page_n) + '/'
            # print(url)
            return url

        # get vinfo list: fetch the full list from the js API port
        def get_vinfo_list(aid):
            vlist = []
            # request each page
            page_n = 0
            urls = []
            while True:
                # make request url
                page_n += 1
                url = make_port_url(aid, page_n)
                # get text
                raw_text = common.getUrl(url)
                # get list
                sub_list = parse_one_page(raw_text)
                for sub in sub_list:
                    url = sub['url']
                    if url in urls:
                        sub_list = []
                    else:
                        urls.append(url)
                if len(sub_list) > 0:
                    vlist += sub_list
                else:  # no more data
                    break
            # get full vinfo list done
            return vlist

        # parse one page of raw info
        def parse_one_page(raw_text):
            # drop the 'var tvInfoJs=' prefix before the json text; the json ends with '}'
            json_text = '{' + raw_text.split('{', 1)[1]
            # load as json text
            info = json.loads(json_text)
            # check code: '"code":"A00000"' is OK, '"code":"A00004"' means out of index
            if info['code'] == 'A00004':
                return []  # just return an empty result
            # get and parse video info items
            vlist = info['data']['vlist']
            out = []  # output info
            for v in vlist:
                one = {}
                one['no'] = v['pd']
                one['title'] = v['vn']
                one['subtitle'] = v['vt']
                one['url'] = v['vurl']
                # get more info
                one['vid'] = v['vid']
                one['time_s'] = v['timeLength']
                one['tvid'] = v['id']
                out.append(one)
            # get video info done
            return out

        # get info from js API port
        info2 = get_info_from_js_port(html_text)
        # replace vlist with js port data
        vlist = []
        for i in info2:
            one = {}
            one['no'] = "第" + str(i['no']) + "集 " + str(i['subtitle'])
            one['subtitle'] = i['subtitle']
            one['url'] = i['url']
            vlist.append(one)
        # done
        return vlist

    def get_list_info_api2(html_text):
        RE_GET_AID = ' albumId: ([0-9]+),'  # albumId: 203342201,
        # http://cache.video.qiyi.com/jp/sdvlst/6/203342201/
# ......... remaining code omitted .........
Example 11: sell
# Required import: from pyquery.pyquery import PyQuery [as alias]
# Or: from pyquery.pyquery.PyQuery import text [as alias]
# ......... preceding code omitted .........
    if re.search(self.house_room_regex, detail_mer_str):
        house_room = re.search(self.house_room_regex, detail_mer_str).group(1)
        self.fd['house_room'] = int(house_room)
    else:
        self.fd['house_room'] = 0
    if re.search(self.house_hall_regex, detail_mer_str):
        house_hall = re.search(self.house_hall_regex, detail_mer_str).group(1)
        self.fd['house_hall'] = int(house_hall)
    else:
        self.fd['house_hall'] = 0
    if re.search(self.house_toilet_regex, detail_mer_str):
        house_toilet = re.search(self.house_toilet_regex, detail_mer_str).group(1)
        self.fd['house_toilet'] = int(house_toilet)
    else:
        self.fd['house_toilet'] = 0
    if re.search(self.house_veranda_regex, response):
        house_veranda = re.search(self.house_veranda_regex, response).group(1)
        self.fd['house_veranda'] = int(house_veranda)
    else:
        self.fd['house_veranda'] = 0
    if re.search(self.house_title_regex, response):
        house_title = re.search(self.house_title_regex, response).group(1)
        self.fd['house_title'] = house_title.replace("(求购)", "").replace("(求租)", "").replace("(出售)", "")
    else:
        self.fd['house_title'] = ''
    # description
    detail_box = soup.find('div', {'class': 'maincon'})
    if detail_box:
        house_desc = str(detail_box)
        self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时,请说是在58同城上看到的,谢谢!", "", house_desc)
    else:
        self.fd['house_desc'] = ""
    # neighborhood name
    lis = PyQuery(unicode(detail_mer_str, "UTF-8"))("li")
    for li in lis:
        lit = PyQuery(li)
        if "小区:" in lit.text():
            xq = lit.text().replace("小区:", "")
            if u"二手房信息" in xq:
                self.fd['borough_name'] = xq[:xq.find("(")]
            else:
                self.fd['borough_name'] = xq
            break
    # if re.search(self.borough_name1_regex, detail_mer_str):
    #     borough_name = re.search(self.borough_name1_regex, detail_mer_str).group(1)
    #     self.fd['borough_name'] = re.sub("\(.*\)|<.*?>", "", borough_name)
    # else:
    #     self.fd['borough_name'] = ''
    # lis = PyQuery(unicode(detail_mer_str, "UTF-8"))("li")
    for li in lis:
        lit = PyQuery(li).text()
        if "地址:" in lit:
            self.fd['house_addr'] = lit[lit.find(":") + 1:lit.find(u"(")]
            break
    # district
    try:
        area_box = detail_mer.find(text="区域:").parent.parent
        area_a = area_box('a')
        if area_a and len(area_a) > 1:
            self.fd['house_region'] = str(area_a[0].string)
            self.fd['house_section'] = str(area_a[1].string)
        elif area_a and len(area_a) == 1:
            self.fd['house_region'] = str(area_a[0].string)
            self.fd['house_section'] = ""
        else:
            self.fd['house_region'] = ""
            self.fd['house_section'] = ""
    except:
        self.fd['house_region'] = ""
        self.fd['house_section'] = ""
    if re.search(self.house_age_regex, response):
        house_age = re.search(self.house_age_regex, response).group(1)
        Y = int(time.strftime('%Y', time.localtime()))
        house_age = Y - int(house_age)
        self.fd['house_age'] = house_age
    else:
        self.fd['house_age'] = 0
    # orientation and fitment
    self.fd['house_toward'] = toward(detail_mer_str)
    self.fd['house_fitment'] = fitment(detail_mer_str)
    request = None
    response = None
    soup = None
    del request
    del response
    del soup
Example 12: parse
# Required import: from pyquery.pyquery import PyQuery [as alias]
# Or: from pyquery.pyquery.PyQuery import text [as alias]
def parse(self, input_text, *k, **kk):
    global TWICE_PARSE_TIMEOUT
    html = PyQuery(get_url(input_text))
    items = html('a')
    title = html('title').text()
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "collection"
    }
    urls = []
    for item in items:
        a = PyQuery(item)
        name = a.attr('title')
        if name is None:
            name = a.text()
        no = name
        subtitle = name
        url = a.attr('href')
        if url is None:
            continue
        if name is None or name == "":
            continue
        if re.match(r'^(http|https|ftp)://.+\.(mp4|mkv|ts|avi)', url):
            url = 'direct:' + url
        if not re.match(r'(^(http|https)://.+\.(shtml|html|mp4|mkv|ts|avi))|(^(http|https)://.+/video/)', url):
            continue
        if re.search(
                r'[^\?](list|mall|about|help|shop|map|vip|faq|support|download|copyright|contract|product|tencent|upload|common|index.html|v.qq.com/u/|open.baidu.com|www.iqiyi.com/lib/s_|www.iqiyi.com/dv/|top.iqiyi.com)',
                url):
            continue
        if re.search(r'(下载|播 放|播放|投诉|评论|(\d{1,2}:\d{1,2}))', no):
            continue
        unsure = False
        for temp in urls:
            if temp == str(url):
                # print("remove:" + url)
                url = None
                break
        if url is None:
            continue
        urls.append(url)
        if re.search('(www.iqiyi.com/a_)|(www.le.com/comic)', url):
            unsure = True
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url,
            "unsure": unsure
        }
        data["data"].append(info)
    if self.TWICE_PARSE:
        try:
            from .. import main
        except Exception as e:
            import main

        def runlist_parser(queue, url, pool):
            try:
                result = main.parse(url, types="list", parsers_name=["iqiyilistparser.IQiYiAListParser",
                                                                     "iqiyilistparser.IQiYiLibMListParser",
                                                                     "iqiyilistparser.IQiYiVListParser"],
                                    pool=pool)[0]
                if (result is not None) and (result != []) and (result["data"] is not None) and (
                        result["data"] != []):
                    queue.put({"result": result, "url": url})
            except IndexError:
                pass
            except Exception as e:
                # continue
                logging.exception("twice parse %s failed" % url)
                # import traceback
                # traceback.print_exc()

        pool = WorkerPool(20)
        parser_threads = []
        parse_urls = []
        t_results = []
        q_results = Queue()
        with WorkerPool() as pool:
            for url in urls:
                pool.spawn(runlist_parser, q_results, url, pool)
            pool.join(timeout=self.TWICE_PARSE_TIMEOUT)
        while not q_results.empty():
            t_results.append(q_results.get())
        oldddata = data["data"]
        data["data"] = []
        for t_result in t_results:
            parse_urls.append(t_result["url"])
            for tdata in t_result["result"]["data"]:
                tdata["no"] = t_result["result"]["title"] + " " + tdata["no"]
            data["data"].extend(t_result["result"]["data"])
# ......... remaining code omitted .........
Example 13: Parse
# Required import: from pyquery.pyquery import PyQuery [as alias]
# Or: from pyquery.pyquery.PyQuery import text [as alias]
def Parse(self, input_text, types=None):
    if (types is not None) and ("collection" not in types):
        return
    html = PyQuery(common.getUrl(input_text))
    items = html('a')
    title = html('title').text()
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "collection"
    }
    urls = []
    for item in items:
        a = PyQuery(item)
        name = a.attr('title')
        if name is None:
            name = a.text()
        no = name
        subtitle = name
        url = a.attr('href')
        if url is None:
            continue
        if name is None or name == "":
            continue
        if re.match(r'^(http|https|ftp)://.+\.(mp4|mkv|ts|avi)', url):
            url = 'direct:' + url
        if not re.match(r'(^(http|https)://.+\.(shtml|html|mp4|mkv|ts|avi))|(^(http|https)://.+/video/)', url):
            continue
        if re.search(r'(list|mall|about|help|shop|map|vip|faq|support|download|copyright|contract|product|tencent|upload|common|index.html|v.qq.com/u/|open.baidu.com|www.iqiyi.com/lib/s_|www.iqiyi.com/dv/|top.iqiyi.com)', url):
            continue
        if re.search(r'(下载|播 放|播放|投诉|评论|(\d{1,2}:\d{1,2}))', no):
            continue
        unsure = False
        for temp in urls:
            if temp == str(url):
                # print("remove:" + url)
                url = None
                break
        if url is None:
            continue
        urls.append(url)
        if re.search('(www.iqiyi.com/a_)|(www.le.com/comic)', url):
            unsure = True
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url,
            "unsure": unsure
        }
        data["data"].append(info)
    if self.TWICE_PARSE:
        try:
            from . import listparser
        except Exception as e:
            import listparser
        try:
            from .. import run
        except Exception as e:
            import run

        def runlist_parser(queue, parser, url):
            url2 = urlHandle(url)
            try:
                result = parser.Parse(url2)
                if (result is not None) and (result != []) and (result["data"] is not None) and (result["data"] != []):
                    queue.put({"result": result, "url": url})
            except Exception as e:
                # continue
                print(e)
                # import traceback
                # traceback.print_exc()

        list_parser = listparser.ListParser()
        urlHandle = run.urlHandle
        parser_threads = []
        parse_urls = []
        t_results = []
        q_results = queue.Queue()
        for url in urls:
            for filter in list_parser.getfilters():
                if re.search(filter, url):
                    parser_threads.append(threading.Thread(target=runlist_parser, args=(q_results, list_parser, url)))
        for parser_thread in parser_threads:
            parser_thread.start()
        for parser_thread in parser_threads:
            parser_thread.join()
        while not q_results.empty():
            t_results.append(q_results.get())
        oldddata = data["data"]
        data["data"] = []
        for t_result in t_results:
            parse_urls.append(t_result["url"])
            for tdata in t_result["result"]["data"]:
# ......... remaining code omitted .........