本文整理汇总了Python中pyquery.PyQuery.__unicode__方法的典型用法代码示例。如果您正苦于以下问题:Python PyQuery.__unicode__方法的具体用法?Python PyQuery.__unicode__怎么用?Python PyQuery.__unicode__使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyquery.PyQuery
的用法示例。
在下文中一共展示了PyQuery.__unicode__方法的11个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: fixLinks
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import __unicode__ [as 别名]
def fixLinks(text, parser):
d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
for element in d('a, link'):
e = PyQuery(element)
href = e.attr('href')
if href is None:
continue
print '// Drop queryString in included src'
print 'from: ', href
result = urlparse(href)
if result.scheme == 'https':
href = href
elif result.scheme == '':
href = result.path + (('#' + result.fragment) if result.fragment != '' else '')
print 'to: ', href
new_href = re.sub(r'(rss/index\.html)|(rss/?)$', 'rss/index.rss', href)
if not abs_url_regex.search(href):
new_href = re.sub(r'/index\.html$', '/', new_href)
if href != new_href:
e.attr('href', new_href)
print "\t", href, "=>", new_href
if parser == 'html':
return d.html(method='html').encode('utf8')
return d.__unicode__().encode('utf8')
示例2: extract
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import __unicode__ [as 别名]
def extract(self):
    """Extract a pconline article into a ContentItem.

    Strips pagination/boilerplate blocks from the article body, collects
    non-gif image URLs, and pulls the title, body HTML and a
    ``20xx年xx月xx日`` release date out of the page.
    """
    item = ContentItem()
    # Drop HTML comments before any parsing.
    self.html = re.sub('<!--.*?-->', '', self.html)
    content_node = self.hxs.select("//div[@class = 'art_con']").extract()
    content_node = PyQuery(content_node[0])
    # Remove pagination, mobile-promo and navigation boilerplate.
    content_node.remove('div[class = "pconline_page"]')
    content_node.remove('div[class = "pc3g"]')
    content_node.remove('div[class = "pageTips"]')
    content_node.remove('div[class = "art_nav_box mt10"]')
    content_node.remove('div[class = "art_bottom"]')
    content_node.remove('div[class = "art_con_top"]')
    # Keep every non-gif image, resolved to an absolute URL.
    item['image_urls'] = [self.getRealURI(img.get('src')) for img in content_node('img') if not img.get('src').endswith('.gif')]
    item['title'] = self.title = self.hxs.select("//h1/text()").extract()[0]
    if not item['title']:
        # Fallback template stores the title in a labelled div.
        item['title'] = self.title = self.hxs.select("//div[@id = 'UC_newsInfoDetail_lbl_newsTitle']/text()").extract()[0]
    item['content'] = self.content = content_node.__unicode__()
    release_time = self.hxs.select("//div[@class = 'art_con_top']").extract()[0]
    doc_t = PyQuery(release_time)
    release_time = doc_t('span').text()
    # Dates look like 2013年01月02日.
    p = re.compile(u'20\d\d年\d\d月\d\d日')
    item['release_time'] = self.release_time = p.search(release_time).group()
    item['source'] = u'pconline'
    item['author'] = ''
    item['pic_url'] = ''
    return item
示例3: extract
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import __unicode__ [as 别名]
def extract(self):
    """Extract a rayli.com.cn article into a ContentItem.

    The lead image URL is scraped out of an inline ``leftsmallimgurl``
    JavaScript assignment (the page stores its gallery image in a JS
    variable, not an <img> tag) and prepended to the body HTML.
    """
    self.html = re.sub('<!--.*?-->', '', self.html)
    doc = PyQuery(self.html)
    content_node = doc('div#rightdiv1')
    content_node.remove('span.white12')
    item = ContentItem()
    content_node = content_node.__unicode__()
    img_all = []
    img = 'leftsmallimgurl\[1\]\=\"(.*?)\"\;'
    ob = re.compile(img)
    imgs = ob.findall(doc.__unicode__())
    if not imgs:
        image = ''
    else:
        # Inline the lead image at the top of the body.
        image = '<br/><img src="' + imgs[0] + '"/><br/>'
        img_all.append(self.getRealURI(imgs[0]))
    content_node = image + content_node
    item['image_urls'] = img_all
    item['title'] = self.title = doc('h1').text()
    item['content'] = self.content = content_node
    item['release_time'] = ''
    item['source'] = u"瑞丽服饰网"
    item['author'] = ''
    item['pic_url'] = ''
    self.title = item['title']
    self.content = item['content']
    return item
示例4: extract
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import __unicode__ [as 别名]
def extract(self):
    """Extract a tiexue.net forum post into a ContentItem.

    Posts that quote another thread carry a "fromposty" marker, in which
    case the real body is the third content div instead of the second.
    Near-white (anti-copy) styling is stripped from the body.
    """
    item = ContentItem()
    self.html = re.sub('<!--.*?-->', '', self.html)
    tz_title = self.hxs.select("//h1/text()").extract()
    content = self.hxs.select("//ul[@class='content']/li/div").extract()
    tz_content = ''
    for con in content:
        if "fromposty" in con:
            tz_content = self.hxs.select("//ul[@class='content']/li/div")[2].extract()
            break
        else:
            tz_content = self.hxs.select("//ul[@class='content']/li/div")[1].extract()
    release_time = self.hxs.select("//div[@class='gray']/text()").extract()
    imgs = PyQuery(tz_content)
    ob = re.compile('src="(.*?)"')
    imgs = ob.findall(imgs.__unicode__())
    img_all = []
    # Skip animated gifs (both casings); keep everything else as absolute URLs.
    for img in imgs:
        if ".gif" in img:
            continue
        if ".GIF" in img:
            continue
        else:
            img_all.append(self.getRealURI(img))
    author = self.hxs.select("//td[@class='bbsname']/b/span/a/text()").extract()
    tz_content = PyQuery(tz_content)
    # The visible text is painted near-white (#FCFCCC); drop the style attr.
    cont_div = tz_content('div[style = "color:#FCFCCC"]')
    for cont in cont_div:
        cont_div.eq(cont_div.index(cont)).removeAttr('style')
    tz_content = tz_content.__unicode__()
    item['image_urls'] = img_all
    item['title'] = self.title = tz_title[0].strip()
    item['content'] = self.content = tz_content
    item['release_time'] = ''
    item['source'] = u"铁血网"
    item['author'] = author[0]
    item['pic_url'] = ''
    return item
示例5: fixLinks
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import __unicode__ [as 别名]
def fixLinks(text):
d = PyQuery(text, parser='html')
for element in d('a'):
e = PyQuery(element)
href = e.attr('href')
if not abs_url_regex.search(href):
new_href = re.sub(r'/index\.html$', '/', href)
e.attr('href', new_href)
print "\t", href, "=>", new_href
return d.__unicode__().encode('utf8')
示例6: fixLinks
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import __unicode__ [as 别名]
def fixLinks(text, parser):
d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
for element in d('a'):
e = PyQuery(element)
href = e.attr('href')
if not abs_url_regex.search(href):
new_href = re.sub(r'/index\.html$', '/', href)
new_href = re.sub(r'index.html', '/', new_href)
e.attr('href', new_href)
print "\t", href, "=>", new_href
if parser == 'html':
return d.html(method='html').encode('utf8')
return d.__unicode__().encode('utf8')
示例7: fix_share_links
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import __unicode__ [as 别名]
def fix_share_links(text,parser):
td_regex = re.compile(target_domain + '|' )
assert target_domain, "target domain must be specified --target_domain=<http://your-host-url>"
d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
for share_class in ['.icon-twitter','.icon-facebook','.icon-google-plus']:
for element in d(share_class):
e = PyQuery(element)
href = e.attr('href')
new_href = re.sub(domain, target_domain, href)
e.attr('href', new_href)
print "\t", href, "=>", new_href
if parser == 'html':
return d.html(method='html').encode('utf8')
return d.__unicode__().encode('utf8')
示例8: extract
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import __unicode__ [as 别名]
def extract(self):
    """Extract a Sina article into a ContentItem.

    Falls back through the title/body selectors used across Sina's page
    templates; gif and src-less images are skipped, and each remaining
    image is wrapped in <br> tags for readability.
    """
    self.html = re.sub('<!--.*?-->', '', self.html)
    doc = PyQuery(self.html)
    content_node = doc('div.kb_zw')
    if not content_node:
        # Alternate template stores the body under div.zw_text.
        content_node = PyQuery(self.hxs.select("//div[@class = 'zw_text']").extract()[0])
    content_node.remove('script')
    content_node.remove('style')
    content_node.remove('iframe')
    # Inline ad slot identified by its fixed style attribute.
    content_node.remove('div[style = "float:left; width:303px; height:250px; display:inline; margin:10px 10px 10px 10px;"]')
    content_node.remove('input')
    item = ContentItem()
    item['title'] = self.title = doc('td[align = "center"]')('b').text()
    if item['title'] is None:
        item['title'] = self.title = doc('div.zw_bt').text()
    if item['title'] is None:
        item['title'] = self.title = doc('h1.zw_title').text()
    item['release_time'] = ''
    item['source'] = u"新浪"
    item['author'] = ''
    item['pic_url'] = ''
    imgs = content_node('img')
    image_urls = []
    for img in imgs:
        # Check for a missing src FIRST: the original tested ".gif" in
        # img.get('src') before the None-check, raising TypeError on
        # <img> elements without a src attribute.
        if not img.get('src'):
            continue
        if ".gif" in img.get('src'):
            continue
        else:
            imgs.eq(imgs.index(img)).before('<br>')
            imgs.eq(imgs.index(img)).append('<br>')
            image_urls.append(self.getRealURI(img.get('src')))
    item['image_urls'] = image_urls
    content = content_node.__unicode__()
    item['content'] = self.content = content
    return item
示例9: fixLinks
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import __unicode__ [as 别名]
def fixLinks(text, parser):
d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
for element in d('a, link'):
e = PyQuery(element)
href = e.attr('href')
if href is None:
continue
new_href = re.sub(r'(rss/index\.html)|((?<!\.)rss/?)$', 'rss/index.rss', href)
if not abs_url_regex.search(href):
new_href = re.sub(r'/index\.html$', '/', new_href)
if href != new_href:
e.attr('href', new_href)
print "\t", href, "=>", new_href
if parser == 'html':
return d.html(method='html').encode('utf8')
return d.__unicode__().encode('utf8')
示例10: fixLinks
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import __unicode__ [as 别名]
def fixLinks(text, parser):
d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
for element in d('a'):
e = PyQuery(element)
href = e.attr('href')
print href
if href is None:
continue
new_href = re.sub(r'(rss/index\.html)|(rss/?)$', 'rss/index.rss', href)
if not abs_url_regex.search(href):
new_href = re.sub(r'/index\.html$', '/', new_href)
if href != new_href:
e.attr('href', new_href)
print "\t", href, "=>", new_href
# remove ?v=XXXXXXXXX in css
for element in d('link'):
e = PyQuery(element)
href = e.attr('href')
if href is None:
continue
if re.match(r'http://fonts',href) is not None:
continue
new_href = re.sub(r'\?.*', '',href)
if href != new_href:
e.attr('href',new_href)
print "\t", href, "=>", new_href
# remove ?v=XXXXXXXXX in js
for element in d('script'):
e = PyQuery(element)
src = e.attr('src')
if src is None:
continue
new_src = re.sub(r'\?.*', '',src)
if src != new_src:
e.attr('src',new_src)
print "\t", src, "=>", new_src
###################
if parser == 'html':
return d.html(method='html').encode('utf8')
return d.__unicode__().encode('utf8')
示例11: extract
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import __unicode__ [as 别名]
def extract(self):
    """Extract a tiexue.net article into a ContentItem.

    Pulls title, body, non-gif images and a ``20xx... :xx`` timestamp
    from the user-info line; near-white anti-copy styling is stripped
    from the body divs.
    """
    item = ContentItem()
    self.html = re.sub('<!--.*?-->', '', self.html)
    tz_title = self.hxs.select("//h1/text()").extract()
    tz_content = self.hxs.select("//div[@class='text']").extract()
    release_time = self.hxs.select("//div[@class='user']/ul/li/text()").extract()
    # Timestamp like "2013-01-02 12:34" embedded in the user line.
    ob = re.compile(u'20\d\d.*:\d\d')
    release_time = ob.findall(release_time[0])
    imgs = self.hxs.select("//div[@class='text']/div/div/p/a/img/@src").extract()
    img_all = []
    # Skip animated gifs (both casings); keep everything else as absolute URLs.
    for img in imgs:
        if ".gif" in img:
            continue
        if ".GIF" in img:
            continue
        else:
            img_all.append(self.getRealURI(img))
    item['image_urls'] = img_all
    item['title'] = self.title = tz_title[0]
    content = tz_content[0]
    content_html = PyQuery(content)
    # The visible text is painted near-white (#f9f9f9); drop the style attr.
    cont_div = content_html('div[style = "color:#f9f9f9"]')
    for cont in cont_div:
        cont_div.eq(cont_div.index(cont)).removeAttr('style')
    content_html = content_html.__unicode__()
    item['content'] = self.content = content_html
    item['release_time'] = release_time[0]
    item['source'] = u"铁血网"
    item['author'] = ''
    item['pic_url'] = ''
    return item