本文整理汇总了Python中pyquery.pyquery.PyQuery.find方法的典型用法代码示例。如果您正苦于以下问题:Python PyQuery.find方法的具体用法?Python PyQuery.find怎么用?Python PyQuery.find使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyquery.pyquery.PyQuery
的用法示例。
在下文中一共展示了PyQuery.find方法的14个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: extract_data
# 需要导入模块: from pyquery.pyquery import PyQuery [as 别名]
# 或者: from pyquery.pyquery.PyQuery import find [as 别名]
def extract_data(text):
    """Accumulate the page's 'p.data' text into the module-level
    total_data buffer and return the '.nextState' hidden-field value
    used to request the next page."""
    global total_data
    doc = PyQuery(text)
    total_data += doc.find('p.data').text()
    return doc.find('.nextState').attr('value')
示例2: detail_page
# 需要导入模块: from pyquery.pyquery import PyQuery [as 别名]
# 或者: from pyquery.pyquery.PyQuery import find [as 别名]
def detail_page(self, response):
    """Scrape a community detail page, queue the amenities sub-page
    crawl with the collected info, and return a storage row of
    [2, url, json_payload, crawl_time]."""
    cleaned = response.text.replace('​', '')
    doc = PyQuery(cleaned)
    saved = response.save
    page_url = response.url
    # sell/rent distribution blocks, keyed by each block's title
    fenbu = {}
    for block in doc.find(".right-border div").items():
        fenbu[block.find('.field-righttit').text()] = block.find('ul').text()
    # label/value pairs: label text (colon stripped) -> remainder of parent text
    basic_info = {}
    for label in doc.find('.fc-gray').items():
        key = label.text().replace(u':', "").strip()
        basic_info[key] = label.parent().text().replace(label.text(), "").strip()
    # dt/dd style extra attributes
    other_info = {}
    for dt in doc.find('.xiaoqu-otherinfo dt').items():
        other_info[dt.text().replace(u':', '')] = dt.next().text()
    info_temp = {
        'base': saved,
        'sell_rent_info': fenbu,
        'basic_info': basic_info,
        'other_info': other_info
    }
    # follow the amenities sub-page, carrying the scraped info along
    self.crawl(page_url + 'amenities/', callback=self.amenities_page,
               save=info_temp, retries=100)
    return [
        2,
        response.url,
        json.dumps(info_temp),
        time.strftime('%Y-%m-%d %X', time.localtime())
    ]
示例3: __getPageAllLink
# 需要导入模块: from pyquery.pyquery import PyQuery [as 别名]
# 或者: from pyquery.pyquery.PyQuery import find [as 别名]
def __getPageAllLink(self,p):
    """Collect recent listing links from one result page and fetch each.

    Selects list items depending on self.kind, keeps only links whose
    post time is recent (within 7 days / hours / minutes), fetches the
    content of each kept link, and finally returns True when the page
    was full (more pages likely exist) or False otherwise.
    """
    # if self.kind=="1":
    #    lis=PyQuery(p)("div.qiuzu li")
    # elif self.kind=="2":
    #    lis=PyQuery(p)("div.qiuzu li")
    # kinds 1/2 use "div.house" items; everything else "div.qiuzu li"
    if self.kind=="1" or self.kind=="2":
        lis=PyQuery(p)("div.house")
    else:
        lis=PyQuery(p)("div.qiuzu li")
    links=[]
    for li in lis:
        # if self.kind=="3":
        #    tm=PyQuery(li)("p.time span").eq(1).text()
        #    link=self.baseurl+PyQuery(li)("p.housetitle a").attr("href")
        if self.kind=="2" or self.kind=="1":
            tm=PyQuery(li)("p.time").text()
            # strip the "personal listing" marker from the time text
            tm=tm and tm.replace("个人","") or ""
            link=self.baseurl+PyQuery(li)("p.housetitle a").attr("href")
        else:
            tm=PyQuery(li)("span.li5").text()
            link=self.baseurl+PyQuery(li)("span.li2 a").attr("href")
            # kind 4: skip shared-rental ("合租") entries
            if self.kind=="4":
                if PyQuery(li)("span.li1").text()=="合租 ":
                    continue
        # tm=PyQuery(li)("span.li5").text()
        # link=self.baseurl+PyQuery(li)("span.li2 a").attr("href")
        #link=self.baseurl+PyQuery(li)("span.li2 a").attr("href")
        # print link
        # keep only recent posts: "N天" (days) must be < 8; results are
        # time-ordered, so an older post means the rest can be skipped
        if u"天" in tm:
            s=tm.find(u"天")
            tm=tm[:s]
            if int(tm)<8:
                links.append(link)
            else:
                break
        elif u"小时" in tm:
            # posted hours ago -> always recent enough
            links.append(link)
        elif u"分钟" in tm:
            # posted minutes ago -> always recent enough
            links.append(link)
        else:
            continue
        if 1:#not checkPath(homepath,self.folder,link):
            LinkLog.info("%s|%s"%(self.kind,link))
            try:
                getContent(link,self.citycode,self.kind)
            except Exception,e:print "ganji getContent Exception %s"%e
        # throttle between fetches (self.st seconds)
        # NOTE(review): original indentation was lost in extraction; the
        # sleep is assumed to run once per processed link — confirm.
        time.sleep(int(self.st))
    # fetch_quere.put({"mod":"soufang","link":link,"citycode":self.citycode,"kind":self.kind})
    # self.clinks.extend(links)
    # a full page (30 items for kinds 1/2, 35 otherwise) means more
    # pages are available
    if self.kind=="1" or self.kind=="2":
        if len(links)!=30:
            return False
        else:
            return True
    else:
        if len(links)!=35:
            return False
        else:
            return True
示例4: onSuccess
# 需要导入模块: from pyquery.pyquery import PyQuery [as 别名]
# 或者: from pyquery.pyquery.PyQuery import find [as 别名]
def onSuccess(self, tid, context, response,headers):
resp = PyQuery(response)
for h3 in resp.find("h3 a"):
url="http://dev.open.taobao.com/bbs/"+h3.attrib['href']
print h3.text
Spider.executeSql(self,"insert into task (task_type,url,status,http_code,task_context) values('topbbs文章',%s,0,-1,%s)",(url,h3.text))
Spider.onSuccess(self,tid, context,response,headers);
示例5: parse_html_page
# 需要导入模块: from pyquery.pyquery import PyQuery [as 别名]
# 或者: from pyquery.pyquery.PyQuery import find [as 别名]
def parse_html_page(self):
    """Parse the torrent detail page in self.html_page.

    Populates: info_hash, title, category, subcategory, language,
    cover_url, small_description, description, torrent_url and
    torrent_size (bytes).
    """
    pq = PyQuery(self.html_page)
    main_table = pq('#mainBody > table.coltable')
    def find_row(text):
        # Return the cell following the first-column cell whose text
        # equals *text*, or None when no row matches.
        for c in main_table.find('td:first-child').items():
            if c.text() == text:
                # Python 2 generator .next(); returns the sibling cell
                return c.nextAll().items().next()
    def find_row_text(text, default=''):
        row = find_row(text)
        if row:
            return row.text()
        return default
    def find_row_html(text, default=''):
        row = find_row(text)
        if row:
            return row.html()
        return default
    self.info_hash = find_row_text('Info hash')
    self.title = pq.find('#mainBody > h1').text()
    # "Type" cell looks like "Category - Subcategory"
    self.category, self.subcategory = find_row_text('Type').split(' - ', 1)
    self.language = find_row_text('Language')
    self.cover_url = find_row('Picture:').find('img').attr('src')
    self.small_description = find_row_html('Small Description')
    self.description = find_row_html('Description')
    self.torrent_url = find_row('Download').find('a#dlNormal').attr('href')
    size_string = find_row_text('Size')
    # e.g. "700.00 MB (734,003,200 bytes)" -> capture the byte count
    match = re.match('.* \((?P<size>\d+(,\d\d\d)*) bytes\)', size_string)
    self.torrent_size = int(match.group('size').replace(',', ''))
示例6: page_parse
# 需要导入模块: from pyquery.pyquery import PyQuery [as 别名]
# 或者: from pyquery.pyquery.PyQuery import find [as 别名]
def page_parse(content, url):
    """Extract shop/contact fields from a contact-info page and return a
    [shop_id, json_payload, crawl_time] row."""
    doc = PyQuery(content)
    # print content[:200].encode('utf8')
    shop_id = url.replace('contactinfo/', '').replace('.html', '')
    fields = [
        ('shop_name', doc.find('.shop-name>a').text()),
        ('contact_url', url),
        ('shop_years', doc.find('.shop-time>em').text()),
        ('open_time', doc.find('.store-time>em').text()),
        ('contact_person', doc.find('.contactName').text())
    ]
    # pattern_contact_info is a module-level regex yielding (key, value)
    # pairs from the contact block's HTML
    contact_block = doc.find('.box.block.clear-block').html()
    fields += re.findall(pattern_contact_info, contact_block)
    return [
        shop_id,
        json.dumps(dict(fields)),
        time.strftime('%Y-%m-%d %X', time.localtime())
    ]
示例7: _parse
# 需要导入模块: from pyquery.pyquery import PyQuery [as 别名]
# 或者: from pyquery.pyquery.PyQuery import find [as 别名]
def _parse(self, response):
    """Extract next-page URLs and proxy IP:port strings from *response*.

    Page links matched by the configured CSS selector are merged into
    the internal URL pool; extracted proxies are pushed to
    proxy_port_queue together with the site base URL.
    """
    d = PyQuery(response)
    # page_turning: collect href of every element the selector matches
    __url = map(lambda x: x.attr('href'),
                d.find(self.__css).items()
                )
    if config_dictionary.get(self.__url_start).get('basejoin'):
        # site serves relative links -> join against the base URL
        new_url = map(lambda u: urlparse.urljoin(self.__url_base, u), __url)
    else:
        new_url = __url
    self.__url_pool = self.__url_pool.union(set(new_url))
    # IP address extracting: join all visible text with ':' so the
    # module-level regex can find ip:port pairs in one pass
    rst = ':'.join(d.text().split(' '))
    proxy_list = re.findall(pattern_ip_address, rst)
    proxy_port_queue.put((proxy_list, self.__url_base))
示例8: serializeArray
# 需要导入模块: from pyquery.pyquery import PyQuery [as 别名]
# 或者: from pyquery.pyquery.PyQuery import find [as 别名]
def serializeArray(form):
    """jQuery-style serializeArray: return (name, value) pairs for every
    enabled, named input/select/textarea in *form*, skipping unchecked
    checkboxes. Returns [] when *form* is not a <form> element."""
    form = PyQuery(form)
    if not form.is_('form'):
        return []
    data = []
    for field in form.find('input, select, textarea'):
        field = PyQuery(field)
        if field.is_('[disabled]') or not field.is_('[name]'):
            continue
        if field.is_('[type=checkbox]') and not field.is_('[checked]'):
            continue
        data.append((field.attr('name'), field.val()))
    return data
示例9: rent
# 需要导入模块: from pyquery.pyquery import PyQuery [as 别名]
# 或者: from pyquery.pyquery.PyQuery import find [as 别名]
def rent(self,url):
hc= urlparse(url)[1].replace('.58.com',"")
hc2=citynameDict_sf.get(hc)
if hc2:
self.fd['house_city']=hc2
else:
self.fd['house_city']=hc
self.fd['house_flag'] = 2
request = urllib2.Request(url, None, self.header)
response = urllib2.urlopen(request).read()
if self.mayGetIt(response):
self.fd={}
return
# tree = etree.HTML(response)
soup =BeautifulSoup(response)
detail_mer = soup.find('ul',{'class':'info'})
detail_mer_str =re.sub("\n|\t\r| ","",str(detail_mer))
#print detail_mer_str
#非个人房源 return
#print re.search(self.agencyname_regex, response).group(1)
if re.search(self.agencyname_regex, response):
agencyname=re.search(self.agencyname_regex, response).group(1)
if agencyname != '个人房源':return
else:
return
if re.search(self.username_regex, response):
username=re.search(self.username_regex, response).group(1)
self.fd['owner_name'] = username
else:
self.fd['owner_name'] = ""
owner_phone = soup('img')
# print owner_phone
self.fd['owner_phone_pic'] = ''
for phone in owner_phone:
if phone['src'].find('58.com/showphone.aspx') != -1:
self.fd['owner_phone_pic'] = phone['src']
#没有联系方式 return
if not self.fd['owner_phone_pic']:return
if soup.find('div',{"class":'other'}):
posttime = soup.find('div',{"class":'other'}).contents[0]
posttime = re.sub('\n|\r| |\t','',posttime)
posttime = posttime.replace('发布时间:','').replace(' 浏览','')
else:
posttime = ''
if not posttime:
return
elif posttime.find('-') !=-1:
s = datetime.datetime(int(posttime.split('-')[0]),int(posttime.split('-')[1],),int(posttime.split('-')[2]))
posttime = int(time.mktime(s.timetuple()))
elif posttime.find('分钟') !=-1:
n = int(posttime.replace('分钟前',''))*60
posttime = int(time.time() - n)
elif posttime.find('小时') !=-1:
n = int(posttime.replace('小时前',''))*60*60
posttime = int(time.time() - n)
self.fd['house_posttime'] = posttime
if (time.time() - self.fd['house_posttime']) > 3600*24*7:
return
# print "++++++++++++++++"
# print time.strftime('%Y %m %d', time.localtime(self.fd['posttime']))
if re.search(self.house_floor_regex, detail_mer_str):
house_floor=re.search(self.house_floor_regex, detail_mer_str).group(1)
self.fd['house_floor'] = int(house_floor)
else:
self.fd['house_floor'] = 0
if re.search(self.house_topfloor_regex, detail_mer_str):
house_topfloor=re.search(self.house_topfloor_regex, detail_mer_str).group(1)
self.fd['house_topfloor'] = int(house_topfloor)
else:
self.fd['house_topfloor'] = 0
if re.search(self.house_totalarea_regex, detail_mer_str):
house_totalarea=re.search(self.house_totalarea_regex, detail_mer_str).group(1)
self.fd['house_addr'] = int(house_totalarea)
else:
self.fd['house_addr'] = 0
#类型
self.fd['house_type'] = housetype(detail_mer_str)
self.fd['house_price'] = str(detail_mer.em.string)
if re.search(self.house_room_regex, detail_mer_str):
house_room=re.search(self.house_room_regex, detail_mer_str).group(1)
self.fd['house_room'] =int(house_room)
else:
self.fd['house_room'] = 0
if re.search(self.house_hall_regex, detail_mer_str):
house_hall=re.search(self.house_hall_regex, detail_mer_str).group(1)
self.fd['house_hall'] = int(house_hall)
else:
self.fd['house_hall'] = 0
#.........这里部分代码省略.........
示例10: extract_upload_errors
# 需要导入模块: from pyquery.pyquery import PyQuery [as 别名]
# 或者: from pyquery.pyquery.PyQuery import find [as 别名]
def extract_upload_errors(html):
    """Return the text of every red, centered error paragraph shown in
    the upload page *html*."""
    doc = PyQuery(html)
    selector = '.thin > p[style="color: red; text-align: center;"]'
    return [PyQuery(elem).text() for elem in doc.find(selector)]
示例11: auto_save_img
# 需要导入模块: from pyquery.pyquery import PyQuery [as 别名]
# 或者: from pyquery.pyquery.PyQuery import find [as 别名]
def auto_save_img(html, skip_domain=None, img_url_base=''):
    """Download remote images referenced by <img> tags in *html* and
    register them in the material library.

    Images hosted on *skip_domain* are left alone; each other absolute
    URL is downloaded once, stored under a date-based folder, and
    deduplicated via MD5 against existing Material records. Collected
    old->new URL mappings accumulate in replace_list.

    NOTE(review): the example ends without returning the rewritten
    html or applying replace_list — the original source appears to be
    truncated here; confirm against the full file.
    """
    from web.flask.globals import g
    from web.flask.helpers import url_for
    from pyquery.pyquery import PyQuery
    from runkit.http_utility import domain
    from runkit.utility import build_date_folder_file
    #from config.globals import PHOTOS_PATH
    #import Image, ImageEnhance
    #from manage.models.material import Material, MaterialService
    """
    自动保存远端的图片
    """
    if not html:
        return html
    pq = PyQuery(html)
    img_list = pq.find("img")
    replace_list = {}
    for img in img_list:
        if 'src' in img.attrib:
            img_src = img.attrib['src']
            # only absolute (http...) URLs are fetched
            if img_src.find('http') != -1:
                img_domain = domain(img_src)
                # skip our own domain and URLs already processed
                if img_domain != skip_domain and img_src not in replace_list:
                    #print img_domain, img_src
                    new_img_file = img_src.split('/')[-1]
                    name, ext = os.path.splitext(new_img_file)
                    ext = ext[1:]
                    folder_name, file_name = build_date_folder_file()
                    file_name += new_img_file
                    directory = '%s%s' % (PHOTOS_PATH, folder_name)
                    # create the target directory when missing
                    if not os.path.exists(directory):
                        os.makedirs(directory)
                    local_file = '%s/%s' % (directory, file_name)
                    new_img_src = '%s%s/%s' % (img_url_base, folder_name, file_name)
                    #print local_file, new_img_src
                    # 1. download the data
                    # 2. compute its md5
                    # 3. look it up in the material library
                    try:
                        #urllib.urlretrieve(img_src, local_file)
                        sock = urllib2.urlopen(img_src)
                        rcv = sock.read()
                        sock.close()
                        m = hashlib.md5()
                        m.update(rcv)
                        material = MaterialService.get_by_file_signature(m.hexdigest())
                        if not material:
                            # unseen image: persist the file and record it
                            f = open(local_file, 'wb')
                            f.write(rcv)
                            size = f.tell()
                            f.close()
                            material = Material()
                            material.added_user_id = g.user.id
                            material.file_name = file_name
                            material.file_ext = ext
                            material.file_path = folder_name
                            material.file_type = ext
                            material.file_size = size
                            material.file_signature = m.hexdigest()
                            material.thumbnail_file = ''
                            material.url = new_img_src
                            if 'alt' in img.attrib:
                                material.title = img.attrib['alt']
                            MaterialService.add_or_update(material)
                        # serve through the photo endpoint
                        # NOTE(review): flattened indentation — this may
                        # belong inside the `if not material:` branch.
                        new_img_src = url_for('misc.photo', id=material.id, ext=ext)
                    except Exception, e:
                        raise e
                    replace_list[img_src] = new_img_src
        else:
            # NOTE(review): pairing reconstructed from flattened source —
            # presumably an <img> without src is treated as a hard error.
            raise Exception(u'内部错误')
示例12: Classifier
# 需要导入模块: from pyquery.pyquery import PyQuery [as 别名]
# 或者: from pyquery.pyquery.PyQuery import find [as 别名]
class Classifier(object):
"""classify verious licences.
>>> c = Classifier()
>>> c.segments
[SoftwareLicenses, DocumentationLicenses, OtherLicenses]
>>> c.segments[0].categories
[GPLCompatibleLicenses, GPLIncompatibleLicenses, NonFreeSoftwareLicenses]
>>> c.segments[0].categories[0].licenses
[GNUGPLv3, GPLv2, LGPLv3, LGPLv2.1, AGPLv3.0, ...
"""
default_data = 'lic_check/license.html'
def __init__(self):
"""initialize."""
with open(self.default_data) as fobj:
data = fobj.read()
self.html = PyQuery(data)
self.segments = self._parse()
def _parse(self):
"""parse license html."""
segments = []
for segment in self._segments():
segment.categories = self.categories(segment)
for category in segment.categories:
category.licenses = self.licenses(category)
segments.append(segment)
return segments
def _segments(self):
"""segments."""
return (Segment(i) for i in self.html.find('.big-section h3')
.filter(lambda i: i != 0))
def categories(self, segment=None):
"""categories.
>>> c = Classifier()
>>> c.categories(c.segments[0])
[GPLCompatibleLicenses, GPLIncompatibleLicenses, NonFreeSoftware...
>>> c.categories(c.segments[1])
[FreeDocumentationLicenses, NonFreeDocumentationLicenses]
>>> c.categories(c.segments[2])
[OtherLicenses, Fonts, OpinionLicenses, Designs]
>>> c.categories().get('SoftwareLicenses')
[GPLCompatibleLicenses, GPLIncompatibleLicenses, NonFreeSoftware...
>>> c.categories().get('DocumentationLicenses')
[FreeDocumentationLicenses, NonFreeDocumentationLicenses]
"""
if segment:
return [Category(i, segment)
for i in self.__retrieve_cat_elem(segment)]
else:
return {'{0}'.format(_seg): self.categories(_seg)
for _seg in self.segments}
def __retrieve_cat_elem(self, segment):
return (self.html.find('.toc ul li a')
.filter(lambda i, this: PyQuery(this)
.attr('href') == '#{0}'.format(segment))
.siblings('ul').find('a'))
def licenses(self, category=None):
"""licenses.
>>> c = Classifier()
>>> sw_lic = c.segments[0]
>>> gpl_compat_lic = c.categories(sw_lic)[0]
>>> gpl_compat_lics = c.licenses(gpl_compat_lic)
>>> len(gpl_compat_lics)
50
>>> gpl_compat_lics[0]
GNUGPLv3
>>> gpl_compat_lics[0].category
GPLCompatibleLicenses
>>> gpl_compat_lics[0].segment
SoftwareLicenses
>>> gpl_incompat_lic = c.categories(c.segments[0])[1]
>>> c.licenses(gpl_incompat_lic)
[AGPLv1.0, AcademicFreeLicense, apache1.1, ...
>>> nonfree_lic = c.categories(sw_lic)[2]
>>> c.licenses(nonfree_lic)
[NoLicense, Aladdin, apsl1, ...
>>> c.licenses().get('GPLCompatibleLicenses')
[GNUGPLv3, GPLv2, LGPLv3, LGPLv2.1, AGPLv3.0, ...
"""
if category:
return [License(i, category)
for i in self.__retrieve_lic_elem(category)
if i.get('id') and i.text]
else:
categories = []
for i in self.categories().values():
categories += i
return {'{0}'.format(cat): self.licenses(cat)
for cat in categories}
def __retrieve_lic_elem(self, category):
#.........这里部分代码省略.........
示例13: brand_list
# 需要导入模块: from pyquery.pyquery import PyQuery [as 别名]
# 或者: from pyquery.pyquery.PyQuery import find [as 别名]
def brand_list():
    """Fetch the JD category listing page and return the brand names
    (text before the opening parenthesis of each brand link)."""
    res = requests.get('http://list.jd.com/list.html?cat=1319%2C1523%2C7052&go=0')
    doc = PyQuery(res.content)
    anchors = list(doc.find('#brandsArea li a').items())
    return map(lambda a: a.text().split(u'(')[0], anchors)
示例14: brand_list
# 需要导入模块: from pyquery.pyquery import PyQuery [as 别名]
# 或者: from pyquery.pyquery.PyQuery import find [as 别名]
def brand_list(url):
    """Return the brand names listed on the JD page at *url*.

    Deprecated: a local file is now used as the brand source instead.
    """
    res = requests.get(url)
    doc = PyQuery(res.content)
    anchors = list(doc.find('#brandsArea li a').items())
    return map(lambda a: a.text().split(u'(')[0], anchors)