本文整理汇总了Python中pyquery.PyQuery.html方法的典型用法代码示例。如果您正苦于以下问题:Python PyQuery.html方法的具体用法?Python PyQuery.html怎么用?Python PyQuery.html使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyquery.PyQuery
的用法示例。
在下文中一共展示了PyQuery.html方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: no_fonts
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
def no_fonts(pq):
    """Strip <font> wrappers from *pq*, replacing each with its inner HTML.

    NOTE(review): the original author abandoned this approach — nested
    lxml/PyQuery objects are not removed reliably and only two nesting
    levels are handled below — so the function deliberately raises before
    doing any work.  Use HTML Tidy's ``drop-font-tags`` instead.

    :param pq: markup accepted by the PyQuery constructor
    :raises Exception: always; this code path is disabled on purpose
    """
    # Fix: ``raise Exception, "..."`` is Python 2 only syntax.
    raise Exception("yuk - it's a mess, use tidy!")
    # --- dead code below: kept for reference, never executed ---
    pq = PyQuery(pq)
    # First pass: unwrap top-level <font> elements.
    for font in pq('font'):
        font = PyQuery(font)
        font.replaceWith(font.html())
        print('font replaced:', font[:60])
    # Second pass: unwrap <font> elements that were nested one level deeper.
    for font in pq('font'):
        font = PyQuery(font)
        font.replaceWith(font.html())
        print('font 2 replaced:', font[:60])
    return pq
示例2: ReadURL
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
def ReadURL(url):
    """Fetch *url* and return its HTML, retrying up to three times.

    Returns '' when the fetched document has no parseable HTML content.
    Re-raises the last fetch/parse exception only after the final attempt.

    Bug fix: the original placed ``raise e`` BEFORE ``time.sleep`` inside
    the except block, so it raised on the first failure and both the retry
    loop and the sleep were dead code.
    """
    pq = None
    for trytime in range(3):
        try:
            pq = PyQuery(url=url)
            break
        except Exception as e:
            print('Exception!', url)
            if trytime == 2:
                # Third and final attempt failed: give up.
                raise e
            # Back off before retrying (module-level constant).
            time.sleep(SLEEP_BETWEEN_REQUEST)
    if pq is None or pq.html() is None:
        return ''
    return pq.html()
示例3: scrape
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
def scrape(slug, url, name, title=None):
    """Fetch *url*, tidy its HTML, localise images, and store a QuickPage.

    :param slug: page slug; its first path segment names the image folder
    :param url: source page to download
    :param name: page name stored in the database
    :param title: optional title; defaults to *name*
    """
    f = urlopen(url)
    doc = f.read()
    # Tidy first: html5lib alone did not clean this markup adequately.
    doc, errs = tidy_document(
        doc,
        options={
            "output-html": 1,
            "clean": 1,
            "drop-font-tags": 1,
        },
    )
    if errs:
        # Tidy warnings are informational; do not abort the scrape.
        print(errs)
    doc = html5lib.parse(doc, treebuilder="lxml")
    # Strip the XHTML namespace so plain CSS selectors work below.
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])
    td = jQuery("td#content")
    assert len(td) == 1
    # Download every referenced image and rewrite its src to the local copy.
    for img in td("img"):
        img = PyQuery(img)
        src = img.attr("src")
        rslt = getimage(src, slug.split("/")[0])
        img.attr("src", rslt)
        if trace:
            print(rslt)
    # TODO(review): links inside the content still need fixing here.
    content = PyQuery(td[0])
    content = no_namespaces(content.html())
    print(slug, content[:60])
    if dbteeth:
        qp, created = create_or_update(
            QuickPage,
            keys=dict(slug=slug),
            fields=dict(
                name=name,
                title=title if title else name,
                content=content,
            ),
        )
示例4: test_mount_tag
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
def test_mount_tag():
    """mount_tag should create a cached dom and render into the root node."""
    root = PyQuery('<root></root>')
    opts = {'txt': 'hello world'}
    tag = {'name': 'custom', 'html': '<custom><text>{opts.txt}</text></custom>'}
    dom = vdom.mount_tag(root, tag, opts)
    assert dom and dom.uuid  # a dom object came back with an id
    assert vdom.get_dom(dom.uuid)  # and was registered in the cache
    assert root.html()  # and something was rendered under the root
示例5: fixLinks
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
def fixLinks(text, parser):
    """Rewrite <a> and <link> hrefs in *text* for static/RSS hosting.

    Relative links lose their query string (path + fragment kept), rss index
    pages are pointed at ``rss/index.rss``, and local ``/index.html`` links
    become directory links.  Returns the serialised document as utf-8 bytes.

    :param text: document source as a str
    :param parser: PyQuery parser name; 'html' selects html serialisation
    """
    d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
    for element in d('a, link'):
        e = PyQuery(element)
        href = e.attr('href')
        if href is None:
            continue
        print('// Drop queryString in included src')
        print('from: ', href)
        result = urlparse(href)
        if result.scheme == '':
            # Relative link: drop any query string, keep path + fragment.
            # (Absolute links — https or otherwise — pass through unchanged;
            # the original's ``href = href`` no-op branch is removed.)
            href = result.path + (('#' + result.fragment) if result.fragment != '' else '')
        print('to: ', href)
        # Point RSS index pages at the actual .rss resource.
        new_href = re.sub(r'(rss/index\.html)|(rss/?)$', 'rss/index.rss', href)
        if not abs_url_regex.search(href):
            # Local pages: /foo/index.html -> /foo/
            new_href = re.sub(r'/index\.html$', '/', new_href)
        if href != new_href:
            e.attr('href', new_href)
            print("\t", href, "=>", new_href)
    if parser == 'html':
        return d.html(method='html').encode('utf8')
    return d.__unicode__().encode('utf8')
示例6: _split
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
def _split(inputfile, outputdir):
    """Split a reveal.js-style HTML file into one dump per slide.

    Vertical stacks (``section.stack``) get a numbered sub-directory and
    each child slide is dumped there; standalone top-level slides are
    dumped straight into *outputdir*.

    :param inputfile: path of the HTML file to read
    :param outputdir: directory to create output in (created if missing)
    """
    # Fix: use a context manager so the file is closed even on error;
    # the local was also renamed from ``html``, which shadowed a module name.
    with open(inputfile, 'r') as source:
        markup = source.read()
    if not os.path.isdir(outputdir):
        os.mkdir(outputdir)
    idx_slide = 0
    idx_section = 0
    parsed = PyQuery(markup)
    for section in parsed('section'):
        slide = PyQuery(section)
        if slide.has_class('stack'):
            # A vertical stack: dump its children into a numbered sub-dir.
            idx_section += 1
            stack_path = os.path.join(outputdir, '%02d' % idx_section)
            os.mkdir(stack_path)
            for sub_slide in PyQuery(slide.html())('section'):
                idx_slide += 1
                _dump_slide(sub_slide, idx_slide, stack_path)
        elif not slide.parent().has_class('stack'):
            # Standalone top-level slide (stack children were handled above).
            idx_slide += 1
            _dump_slide(slide, idx_slide, outputdir)
示例7: _enhance_text
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
def _enhance_text(self):
    """
    Transforms a simplified text into a valid mail.template text.
    :return: mail.template text
    """
    self.ensure_one()
    # Parse and set back the keywords into raw template code
    # (newlines are dropped so positions computed below stay stable)
    html_text = PyQuery(self.simplified_text.replace('\n', ''))

    def sort_keywords(kw):
        # Replace first if/for-clauses, then var, then code.
        # The index is a synthetic ordering key, not a text offset.
        index = kw.position
        if kw.type == 'if' or 'for' in kw.type:
            # Push clause keywords to the front, weighted by nesting depth.
            index += 2*len(self.body_html) * kw.nested_position
            # Take if and for in the appearing order in the text
            index -= kw.position
        elif kw.type == 'var':
            index += len(self.body_html)
        return index

    # Highest index first: clauses, then vars, then plain code keywords.
    keywords = self.keyword_ids.sorted(sort_keywords, reverse=True)
    # Replace automatic-generated keywords (matched by their html id).
    for keyword in keywords:
        keyword_text = html_text('#' + keyword.html_id)
        keyword_text.replace_with(keyword.final_text)
    # Replace user added keywords: literal [short_code] markers in the text.
    template_text = html_text.html()
    for keyword in keywords.filtered(lambda k: k.type == 'code'):
        to_replace = u"[{}]".format(keyword.short_code)
        template_text = template_text.replace(to_replace, keyword.raw_code)
    # Re-parse through BeautifulSoup to normalise the final markup, then
    # return only the body contents.
    final_text = PyQuery(BeautifulSoup(template_text).prettify())
    return final_text('body').html()
示例8: sanitize_description
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
def sanitize_description(value):
    """Strip play-metadata markup from *value* and return the description.

    Removes ``span.playMetaText``, ``time`` and ``strong`` elements, then
    returns whatever follows the last ``<span>`` opener with its closing
    ``</span>`` dropped.

    Fixes: ``.remove('span.playMetaText')`` was called twice (it mutates in
    place, once is enough); ``[-1:][0]`` simplified to ``[-1]``; and a None
    guard added so an empty result returns '' instead of raising
    AttributeError (consistent with the other sanitizer in this file).
    """
    cleaned = PyQuery(value)
    cleaned.remove('span.playMetaText')
    cleaned.remove('time')
    cleaned.remove('strong')
    desc = cleaned.html()
    if desc is None:
        return ''
    return desc.split('<span>')[-1].replace('</span>', '')
示例9: sanitize_html2
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
def sanitize_html2(value):
    """Drop play-metadata elements from *value*; return the markup after the
    last ``<span>`` opener as a single-element list."""
    doc = PyQuery(value)
    doc = doc.remove("span.playMetaText")
    # .remove() mutates in place; run the remaining selectors in one pass.
    for selector in ("span.playMetaText", "time", "strong"):
        doc.remove(selector)
    return doc.html().split("<span>")[-1:]
示例10: get_pastes
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
def get_pastes ( self ):
    # Fetch the Pastebin archive page; return (status_code, paste_links).
    # On success the second element is the PyQuery selection of paste
    # anchors; on any failure it is None.
    Logger ().log ( 'Getting pastes', True )
    try:
        page = PyQuery ( url = self.PASTES_URL )
    except KeyboardInterrupt:
        raise
    except:
        # Deliberate catch-all: any fetch problem is a connection failure.
        return self.CONNECTION_FAIL,None
    """
    There are a set of encoding issues which, coupled with some bugs in etree (such as in the Raspbian packages) can
    trigger encoding exceptions here. As a workaround, we try every possible encoding first, and even if that fails,
    we resort to a very hacky workaround whereby we manually get the page and attempt to encode it as utf-8. It's
    ugly, but it works for now.
    """
    try:
        page_html = page.html ()
    except KeyboardInterrupt:
        raise
    except:
        # Fallback 1: retry .html() with every encoding Python knows about.
        worked = False
        for enc in all_python_encodings():
            try:
                page_html = page.html(encoding=enc)
                worked = True
                break
            except KeyboardInterrupt:
                raise
            except:
                # Wrong encoding guess -- try the next one.
                pass
        if not worked:
            # One last try...
            # Fallback 2: fetch the raw page ourselves and force utf-8.
            try:
                f = urllib.request.urlopen(Crawler.PASTES_URL)
                page_html = PyQuery(str(f.read()).encode('utf8')).html()
                f.close()
            except KeyboardInterrupt:
                raise
            except:
                return self.OTHER_ERROR, None
    # Detect Pastebin's IP-block / access-denied interstitial pages.
    if re.match ( r'Pastebin\.com - Access Denied Warning', page_html, re.IGNORECASE ) or 'blocked your IP' in page_html:
        return self.ACCESS_DENIED,None
    else:
        # The paste links are the <a> siblings following the table images.
        return self.OK,page('.maintable img').next('a')
示例11: clean_body
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
def clean_body(body):
    """Normalise WordPress-style HTML *body* into blank-line separated text.

    Unwraps <p> tags, maps WordPress alignment classes/styles onto
    pull-left/pull-right classes, turns <br> variants into hard line
    breaks, collapses runs of newlines and trims the edges.
    """
    # NOTE(review): fetched but never used below — kept in case callers
    # rely on a side effect of the lookup; TODO confirm and drop.
    site = Site.objects.get_current()
    # Local renamed from ``html`` to avoid shadowing a common module name.
    doc = PyQuery('<body>' + body + '</body>')
    # Unwrap every <p> into blank-line separated text.
    for p in doc('p'):
        p = PyQuery(p)
        p.replaceWith('\n\n%s\n\n' % p.html())
    doc('.alignright').addClass('pull-right').removeClass('alignright')
    doc('.alignleft').addClass('pull-left').removeClass('alignleft')
    doc('[style="float: left;"]').removeAttr('style').addClass('alignleft')
    doc('[style="float: right;"]').removeAttr('style').addClass('alignright')
    # Fix: the original collapsed '\n\n\n'/'\r\r\r' runs HERE, but ``body``
    # was immediately reassigned below, discarding that work — removed.
    body = doc.html()
    # <br> variants become hard line breaks (trailing-space style).
    for br in ('<br />', '<br/>', '<br>'):
        body = body.replace(br, ' \n')
    body = body.replace('\r\n', '\n')
    body = body.replace('\n\r', '\n')
    # Collapse runs of 3+ newlines down to exactly two.
    while body.find('\n\n\n') > -1:
        body = body.replace('\n\n\n', '\n\n')
    # Trim edge newlines, then carriage returns, then leading tabs
    # (same order as the original while-loops; str.strip removes all
    # contiguous occurrences from both ends, which is equivalent).
    body = body.strip('\n')
    body = body.strip('\r')
    body = body.lstrip('\t')
    return body
示例12: extract
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
def extract(self):
    # Parse a Sohu article page (self.html) and build a ContentItem with
    # image urls, title, cleaned content, release time and author.
    self.html = re.sub('<!--.*?-->', '', self.html)  # drop HTML comments first
    doc = PyQuery(self.html)
    content_node = doc('div#contentText')
    # Strip scripts, styles and assorted site chrome from the article body.
    # Each selector below targets a known Sohu page furniture element.
    content_node.remove('script')
    content_node.remove('style')
    content_node.remove('.line')
    content_node.remove('#shareIn')
    content_node.remove('.tagHotg')
    content_node.remove('.blank8')
    content_node.remove('."editShare clear"')
    content_node.remove('select')
    #content_node.remove('table[width = "100%"]')('td[align = "center"]')
    content_node.remove('div[class = "jingbian_travel01_04"]')
    content_node.remove('div[class = "txt2"]')
    content_node.remove('iframe')
    content_node.remove('embed')
    content_node.remove('td[style = "font-size: 14px; font-weight: bold;"]')
    content_node.remove('table[style = "margin-right: 20px;"]')
    content_node.remove('digi_perpage_bottom')
    content_node.remove('div[class = "extract clear"]')
    content_node.remove('table[bgcolor = "#eeeeee"]')
    content_node.remove('img[alt = "搜狐教育频道"]')
    content_node.remove('table[bgcolor = "#e2e2e2"]')
    content_node.remove('table[bgcolor = "#66ccff"]')
    content_node.remove('div[class = "digi_digest"]')
    item = ContentItem()
    # Collect article images, skipping gifs (spacers/ads), and wrap the
    # kept ones in <br> so they sit on their own line.
    imgs = content_node('img')
    img_all = []
    for img in imgs:
        if".gif" in img.get('src'):
            continue
        else:
            imgs.eq(imgs.index(img)).append('<br>')
            imgs.eq(imgs.index(img)).before('<br>')
            img_all.append(self.getRealURI(img.get('src')))
    item['image_urls'] = img_all
    item['title'] = self.title = doc('h1').text()
    item['content'] = self.content = content_node.__unicode__()
    # The publication date only exists in an inline JS variable.
    t = re.compile(u'var club_artinputdate = "(.*?)";')
    release_time = t.search(doc.html())
    if release_time:
        item['release_time'] = self.release_time = release_time.group(1)
    # item['release_switch_time'] = time.mktime(time.strptime(t.search(doc.html()).group(1),'%Y-%m-%d %H:%M:%S'))
    item['source'] = u'搜狐'
    # Author sits in the left block of the "function clear" toolbar.
    author = doc('div[class = "function clear"]')
    self.author = author('div.l')('a').text()
    item['author'] = self.author
    item['pic_url'] = ''
    return item
示例13: sanitize_description
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
def sanitize_description(value):
    """Remove play-metadata markup from *value* and return the trimmed
    description text, or "" when nothing remains."""
    doc = PyQuery(value)
    doc = doc.remove('span.playMetaText')
    # .remove() mutates in place; run the remaining selectors in one pass.
    for selector in ('span.playMetaText', 'span.playCount', 'time', 'strong'):
        doc.remove(selector)
    markup = doc.html()
    if markup is None:
        return ""
    tail = markup.split('<span>')[-1]
    return tail.replace('</span>', '').strip()
示例14: fixLinks
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
def fixLinks(text, parser):
    """Rewrite local <a> hrefs so ``/foo/index.html`` links become ``/foo/``.

    Absolute links (matched by the module-level ``abs_url_regex``) are left
    untouched.  Returns the serialised document as utf-8 bytes.

    Fixes: anchors without an href crashed ``abs_url_regex.search(None)``
    (now skipped, consistent with the other fixLinks variant in this file),
    and the second substitution used an unescaped dot (``r'index.html'``)
    which also matched e.g. ``indexXhtml``.
    """
    d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
    for element in d('a'):
        e = PyQuery(element)
        href = e.attr('href')
        if href is None:
            continue
        if not abs_url_regex.search(href):
            new_href = re.sub(r'/index\.html$', '/', href)
            new_href = re.sub(r'index\.html', '/', new_href)
            e.attr('href', new_href)
            print("\t", href, "=>", new_href)
    if parser == 'html':
        return d.html(method='html').encode('utf8')
    return d.__unicode__().encode('utf8')
示例15: render_md5
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import html [as 别名]
def render_md5(self, post_content):
    """Render *post_content* markdown into ``self.body``, rewriting any
    bare (path-less) image srcs to full blog URLs."""
    config = Config()
    self.body = markdown2.markdown(post_content, extras=config.mdextras)
    # rewrite relative img-srcs to full paths.
    doc = PyQuery(self.body)
    for img in doc.find('img'):
        src = img.attrib['src']
        if '/' not in src:
            img.attrib['src'] = '{}{}/{}'.format(
                config.blogurl, self.outputpath, src)
    self.body = doc.html()