This article collects typical usage examples of the pyquery.PyQuery.remove method in Python. If you have been wondering exactly what PyQuery.remove does, how to use it, and where to find working examples, the hand-picked code samples below should help. You can also read more about the containing class, pyquery.PyQuery.
The following 15 code examples of PyQuery.remove are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code samples.
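Before the examples, here is a minimal self-contained sketch of the method itself: called with a CSS selector, remove() deletes every matching descendant from the document; called with no arguments, it detaches the currently selected elements. It mutates the document in place and also returns the selection, so re-assigning the result is optional.

from pyquery import PyQuery

d = PyQuery('<div><p class="ad">ad</p><p>keep me</p><script>track()</script></div>')
d.remove('script')   # selector form: drop all <script> descendants
d('p.ad').remove()   # no-argument form: drop the selected elements themselves
print(d)             # -> <div><p>keep me</p></div>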
Example 1: render_document
# Required import: from pyquery import PyQuery [as alias]
# Or: from pyquery.PyQuery import remove [as alias]
def render_document(vnode, expressions, context):
    for expression in expressions:
        evaluation = evaluate_expression(expression, context)
        node = expression.get('node')
        if isinstance(expression.get('value'), str) and expression.get('value') == evaluation:
            continue
        expression['value'] = evaluation
        if expression.get('type') == 'each':
            if expression.get('parent'):
                parent = expression.get('parent')
            else:
                parent = node.parent()
                expression['parent'] = parent
            riot_id = node.attr['data-riot-id']
            original_children = parent.children('[data-riot-id="%s"]' % riot_id)
            # 0. add placeholder
            placeholder = PyQuery('<text></text>')
            placeholder.insertBefore(original_children.eq(0))
            # 1. remove children
            original_node = original_children.clone()
            original_children.remove()
            expression['node'] = original_node
            # 2. insert children
            loopcontext = {}
            loopcontext.update(context if isinstance(context, dict) else vars(context))
            expressions_col = []
            for loop_index, item in enumerate(evaluation):
                loopcontext.update(item if isinstance(item, dict) else vars(item))
                loopcontext['loopindex'] = loop_index
                child_node = PyQuery(expression.get('impl'))
                child_node.attr['data-riot-loopindex'] = str(loop_index)
                # a separate name avoids shadowing the 'expressions' argument iterated above
                child_expressions = parse_document_expressions(child_node)
                expressions_col.append((child_expressions, loopcontext))
                render_document(vnode, child_expressions, loopcontext)
                child_node.insertBefore(placeholder)
            # 3. remove placeholder
            if len(evaluation) == 0:
                placeholder.attr['data-riot-id'] = str(riot_id)
            else:
                placeholder.remove()
            mark_dirty(parent)
            generate_widget(parent)
            for child_expressions, loopcontext in expressions_col:
                connect_signals(vnode, child_expressions, loopcontext)
            continue
        if expression.get('type') == 'markup':
            node.attr['markup'] = json.dumps(evaluation)
            node.html('')
            mark_dirty(node)
            continue
        if expression.get('type') == 'attribute':
            attribute = expression.get('attribute')
            node.attr[attribute] = str(evaluation)
            mark_dirty(node)
            continue
Example 2: __processImageTag
# Required import: from pyquery import PyQuery [as alias]
# Or: from pyquery.PyQuery import remove [as alias]
def __processImageTag(self, i, e):
    obj = PyQuery(e)
    style = obj.attr('style')
    if style is not None and style.find('display: none') != -1:
        obj.remove()
        return
    newObj = PyQuery("<img />")
    newObj.attr('src', obj.attr('rel:bf_image_src'))
    newObj.attr('style', obj.attr('style'))
    newObj.width(obj.width())
    newObj.height(obj.height())
    obj.replaceWith(newObj)
Example 3: sanitize_description
# Required import: from pyquery import PyQuery [as alias]
# Or: from pyquery.PyQuery import remove [as alias]
def sanitize_description(value):
    cleaned = PyQuery(value)
    # remove() works in place, so one call per selector is enough
    cleaned.remove('span.playMetaText')
    cleaned.remove('span.playCount')
    cleaned.remove('time')
    cleaned.remove('strong')
    desc = cleaned.html()
    if desc is None:
        return ""
    return desc.split('<span>')[-1].replace('</span>', '').strip()
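A quick way to see what this helper does — the markup below is a made-up stand-in for the player metadata it targets, not taken from the original project:

raw = ('<div><span class="playMetaText">Length: 30 min</span>'
       '<time>12:00</time><span>An episode description.</span></div>')
print(sanitize_description(raw))  # -> 'An episode description.'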
Example 4: extract
# Required import: from pyquery import PyQuery [as alias]
# Or: from pyquery.PyQuery import remove [as alias]
def extract(self):
    self.html = re.sub('<!--.*?-->', '', self.html)
    doc = PyQuery(self.html)
    content_node = doc('div.kb_zw')
    if not content_node:
        # content_node = doc('div.zw_text')
        content_node = PyQuery(self.hxs.select("//div[@class = 'zw_text']").extract()[0])
    content_node.remove('script')
    content_node.remove('style')
    content_node.remove('iframe')
    content_node.remove('div[style = "float:left; width:303px; height:250px; display:inline; margin:10px 10px 10px 10px;"]')
    content_node.remove('input')
    item = ContentItem()
    item['title'] = self.title = doc('td[align = "center"]')('b').text()
    if item['title'] is None:
        item['title'] = self.title = doc('div.zw_bt').text()
    if item['title'] is None:
        item['title'] = self.title = doc('h1.zw_title').text()
    item['release_time'] = ''
    item['source'] = u"新浪"
    item['author'] = ''
    item['pic_url'] = ''
    imgs = content_node('img')
    image_urls = []
    for img in imgs:
        # check for a missing src before substring-testing it
        if not img.get('src'):
            continue
        if ".gif" in img.get('src'):
            continue
        imgs.eq(imgs.index(img)).before('<br>')
        imgs.eq(imgs.index(img)).append('<br>')
        image_urls.append(self.getRealURI(img.get('src')))
    item['image_urls'] = image_urls
    content = content_node.__unicode__()
    item['content'] = self.content = content
    return item
Example 5: parseNextPageUrl
# Required import: from pyquery import PyQuery [as alias]
# Or: from pyquery.PyQuery import remove [as alias]
def parseNextPageUrl(self, category_page_content):
    doc = PyQuery(category_page_content)
    nodeAList = doc("span#view_47 > a")
    for nodeA in nodeAList:
        nodeAQ = PyQuery(nodeA)
        if nodeAQ.remove('span').text().strip().lower() == 'next':
            return nodeAQ.attr('href').strip()
    return None
Example 6: sanitize_html2
# Required import: from pyquery import PyQuery [as alias]
# Or: from pyquery.PyQuery import remove [as alias]
def sanitize_html2(value):
    soup = PyQuery(value)
    # remove() works in place; one call per selector suffices
    soup.remove("span.playMetaText")
    soup.remove("time")
    soup.remove("strong")
    return soup.html().split("<span>")[-1:]
Example 7: sanitize_description
# Required import: from pyquery import PyQuery [as alias]
# Or: from pyquery.PyQuery import remove [as alias]
def sanitize_description(value):
    cleaned = PyQuery(value)
    cleaned.remove('span.playMetaText')
    cleaned.remove('time')
    cleaned.remove('strong')
    return cleaned.html().split('<span>')[-1].replace('</span>', '')
Example 8: feed
# Required import: from pyquery import PyQuery [as alias]
# Or: from pyquery.PyQuery import remove [as alias]
def feed(request, get_feed=get_feed):
    with shows_db() as shows:
        show_list = list(shows.values())
    d = PyQuery(get_feed(), parser="xml")
    for item in d("item"):
        ditem = PyQuery(item)
        title = ditem.find("title").text()
        match = detect_show(show_list, title)
        if match:
            name, episode = match
            # TODO: Record episode in the feed so that future versions of this episode will be ignored
        else:
            ditem.remove()
    response = Response()
    response.content_type = "application/rss+xml"
    response.ubody = str(d)
    response.cache_control = "no-cache"
    return response
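Note that ditem.remove() works here without writing anything back into d: wrapping an element taken from a PyQuery selection keeps the same underlying lxml node, so removing it mutates the original tree. A minimal sketch of that behavior:

from pyquery import PyQuery

d = PyQuery('<rss><item><title>keep</title></item>'
            '<item><title>drop</title></item></rss>', parser='xml')
for item in d('item'):
    if PyQuery(item).find('title').text() == 'drop':
        PyQuery(item).remove()  # mutates the tree shared with d
print(d)  # only the 'keep' item remains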
Example 9: extract
# Required import: from pyquery import PyQuery [as alias]
# Or: from pyquery.PyQuery import remove [as alias]
def extract(self):
    self.html = re.sub('<!--.*?-->', '', self.html)
    doc = PyQuery(self.html)
    content_node = doc('.firstTopic')('div')
    content_node.remove('script')
    content_node.remove('.rate')
    content_node.remove('.affixContent')
    content_node.remove('.thread_gold')
    item = ContentItem()
    imgs = content_node('.p14')('img')
    img_all = []
    for img in imgs:
        # skip images without a src, then skip .gif placeholders
        if not img.get('src'):
            continue
        if ".gif" in img.get('src'):
            continue
        imgs.eq(imgs.index(img)).append('<br>')
        imgs.eq(imgs.index(img)).before('<br>')
        img_all.append(self.getRealURI(img.get('src')))
    item['image_urls'] = img_all
    item['title'] = self.title = doc('#thread_title').text()
    content = content_node('.p14').__unicode__()
    content = PyQuery(content)
    del_style = content('div')
    for d in del_style:
        if d.get('style'):
            del_style.eq(del_style.index(d)).attr['style'] = ''
    content.remove('dl.rate_list')
    content.remove('span[style = "font-size:12px"]')
    content.remove('dl.rate')
    item['content'] = self.content = content.__unicode__()
    release_time = doc('.firstTopic')('.postTime').text()
    ob = re.compile(r'20\d\d.*\d\d')
    release_time = ob.findall(release_time)
    item['release_time'] = release_time[0]
    # item['release_switch_time'] = self.release_switch_time = time.mktime(time.strptime(release_time[0], u'%Y-%m-%d %H:%M:%S'))
    item['source'] = u"17173论坛"
    item['author'] = doc('.th1').eq(0).text()
    item['pic_url'] = ''
    return item
Example 10: process
# Required import: from pyquery import PyQuery [as alias]
# Or: from pyquery.PyQuery import remove [as alias]
def process(self):
    self.discovered_urls = set()
    self.basic_content_type = "text/html"
    self.message_stack = ["-" * 80]
    # resolve the address
    uri = urlparse(self.current_job['url'])
    answers = dns.resolver.query(uri.hostname, 'A')
    for answer in answers:
        self.message_stack.append("DNS) %s" % answer)
    try:
        self.current_response = requests.get(self.current_job['url'], stream=True)
        self.basic_content_type = self.current_response.headers['content-type'].split(";")[0]
    except Exception:
        self.current_response = None
        self.basic_content_type = None
    if self.current_response:
        for r in self.current_response.history:
            self.message_stack.append("-URL (%s) %s" % (r.status_code, r.url))
        self.message_stack.append("+URL (%s) %s" % (self.current_response.status_code, self.current_response.url))
        self.message_stack.append("BASIC CONTENT-TYPE) %s" % self.basic_content_type)
        self.message_stack.append("CONTENT TYPE) %s" % self.current_response.headers['content-type'])
        self.message_stack.append("ENCODING) %s" % self.current_response.encoding)
        if self.basic_content_type in ACCEPTABLE_CONTENT_TYPES:
            # we need to handle the odd, but real case of the mystery <? palantir_blog_list('sidebar') ?> tag
            tidy_response_text = re.sub(r"<\?.*?\?>", "", self.current_response.text)
            tidy_response_text = re.sub(r"<!--.*?-->", "", tidy_response_text)
            self.dom = PyQuery(tidy_response_text, parser='html')
            self.titles = [safe_str(title.text) for title in self.dom("title")]
            for a in self.dom('a'):
                new_url = PyQuery(a).attr.href
                if new_url is not None:
                    new_url = urldefrag(urljoin(self.current_response.url, new_url))[0]
                    self.discovered_urls.add(new_url)
            self.message_stack.append("DISCOVERED) %s" % len(self.discovered_urls))
            # BOILERPIPE
            for excluded_tag in BOILERPIPE_REMOVE_TAGS:
                self.dom(excluded_tag).after("\n")
                self.dom.remove(excluded_tag)
            # remove tags with style="display:none"
            # http://www.microsoft.com/en-us/legal/intellectualproperty/copyright/default.aspx
            display_none_pattern = re.compile("display: ?none")
            for x in self.dom("*"):
                try:
                    tag = PyQuery(x)
                    if tag.attr("style") is not None:
                        if re.match(display_none_pattern, tag.attr("style")):
                            tag.remove()
                except Exception as inst:
                    print(type(inst))
                    print(inst.args)
                    print(inst)
            self.save()
        else:
            self.message_stack.append("DISCARDED")
    else:
        self.message_stack.append("NO RESPONSE")
Example 11: getTweets
# Required import: from pyquery import PyQuery [as alias]
# Or: from pyquery.PyQuery import remove [as alias]
def getTweets(tweetCriteria, receiveBuffer=None, bufferLength=100, proxy=None):
    refreshCursor = ''
    results = []
    resultsAux = []
    cookieJar = http.cookiejar.CookieJar()
    active = True
    while active:
        json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor, cookieJar, proxy)
        if len(json['items_html'].strip()) == 0:
            break
        refreshCursor = json['min_position']
        scrapedTweets = PyQuery(json['items_html'])
        # Remove incomplete tweets withheld by Twitter Guidelines
        scrapedTweets.remove('div.withheld-tweet')
        tweets = scrapedTweets('div.js-stream-tweet')
        if len(tweets) == 0:
            break
        for tweetHTML in tweets:
            tweetPQ = PyQuery(tweetHTML)
            tweet = models.Tweet()
            usernameTweet = tweetPQ("span.username.js-action-profile-name b").text()
            txt = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text().replace('# ', '#').replace('@ ', '@'))
            retweets = int(tweetPQ("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
            favorites = int(tweetPQ("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
            dateSec = int(tweetPQ("small.time span.js-short-timestamp").attr("data-time"))
            id = tweetPQ.attr("data-tweet-id")
            permalink = tweetPQ.attr("data-permalink-path")
            user_id = int(tweetPQ("a.js-user-profile-link").attr("data-user-id"))
            geo = ''
            geoSpan = tweetPQ('span.Tweet-geo')
            if len(geoSpan) > 0:
                geo = geoSpan.attr('title')
            urls = []
            for link in tweetPQ("a"):
                try:
                    urls.append(link.attrib["data-expanded-url"])
                except KeyError:
                    pass
            tweet.id = id
            tweet.permalink = 'https://twitter.com' + permalink
            tweet.username = usernameTweet
            tweet.text = txt
            tweet.date = datetime.datetime.fromtimestamp(dateSec)
            tweet.formatted_date = datetime.datetime.fromtimestamp(dateSec).strftime("%a %b %d %X +0000 %Y")
            tweet.retweets = retweets
            tweet.favorites = favorites
            tweet.mentions = " ".join(re.compile(r'(@\w*)').findall(tweet.text))
            tweet.hashtags = " ".join(re.compile(r'(#\w*)').findall(tweet.text))
            tweet.geo = geo
            tweet.urls = ",".join(urls)
            tweet.author_id = user_id
            results.append(tweet)
            resultsAux.append(tweet)
            if receiveBuffer and len(resultsAux) >= bufferLength:
                receiveBuffer(resultsAux)
                resultsAux = []
            if tweetCriteria.maxTweets > 0 and len(results) >= tweetCriteria.maxTweets:
                active = False
                break
    if receiveBuffer and len(resultsAux) > 0:
        receiveBuffer(resultsAux)
    return results
Example 12: HTMLGenerator
# Required import: from pyquery import PyQuery [as alias]
# Or: from pyquery.PyQuery import remove [as alias]
class HTMLGenerator(object):
    """HTML Generator
    """
    def __init__(self):
        self.MAX_WORKERS = 4
        self.MULTIPROCESS_BOUND = 20

    def load_tree_template(self):
        """Load tree HTML templates
        """
        with open(os.path.join(os.path.dirname(__file__), 'template', 'tree_template.html')) as f:
            self.template = PyQuery(f.read(), parser='html')
        with open(os.path.join(os.path.dirname(__file__), 'template', 'tree_node_template.html')) as f:
            self.node_template = PyQuery(f.read(), parser='html')
        self.node_template_html = self.node_template.html()

    def import_js(self, js_ids):
        """Import JS to HTML
        :param js_ids: dict type, {script_id with #: js_file_name}
            example: {"#script_jquery": "jquery.min.js"}
        """
        _path = os.path.dirname(__file__)
        for _id in js_ids:
            self.template(_id).attr("src", "%s/bin/js/%s" % (_path, js_ids[_id]))
            # In case lxml changes <script></script> to <script/>
            self.template(_id).html("var _lxml = 0;")

    def generate_tree_structure_HTML(self, root_node, output):
        """Generate an HTML file with tree structure.
        :param root_node: RDirNode root of the module
        :param output: Output html file
        """
        # Init
        self.load_tree_template()
        self.tree_nodes = []
        self.max_layer = 0
        self.import_js({
            # script_id: js_file_name
            "#script_jquery": "jquery.min.js",
            "#script_rdir_tree": "rdir_tree.js"
        })
        self.template('#header_name').html(root_node.name)
        self.template('#header_type').html(" <%s>" % root_node.type)
        header_doc = root_node.doc.replace('\t', ' ' * 4) \
            .replace(' ', '&nbsp;').replace('\n', '<br/>').strip()
        if len(header_doc) > 0:
            self.template('#header_doc').html(header_doc + '<br/>')
        else:
            self.template.remove('#header_doc')
        self.template('title').html(root_node.name)
        # Recur
        if len(root_node.list_children()) == 0:
            # self._add_node_to_HTML("No visible children methods or members.",
            #                        "If you see this, that means this object has nothing else to show.",
            #                        "404",
            #                        0)
            pass
        else:
            self.render_tree_html(root_node)
        # Render html
        for i in range(self.max_layer + 1):
            self.template("#choose_layer").append(
                "<option value='%d'>%d</option>" % (i, i)
            )
        self.template('#wrapper').append("\n".join(self.tree_nodes))
        # Write to file
        with open(output, 'w') as f:
            f.write(self.template.html())

    def render_tree_html(self, root_node):
        """Render the node html. Use multiprocessing to speed up if needed.
        :param root_node: RDirNode root of the module
        """
        job_list = self.get_job_list(root_node)
        job_size = len(job_list)
        if job_size > self.MULTIPROCESS_BOUND:
            jobs_list = Util.split_jobs(job_list, self.MAX_WORKERS)
        else:
            jobs_list = [job_list]
        pool = multiprocessing.Pool(processes=self.MAX_WORKERS)
        result = []
        html = self.node_template.html()
        for jobs in jobs_list:
            if len(jobs) > 0:
                result.append(pool.apply_async(parse_tree_node_worker, (html, jobs)))
        # pool.close()
#......... part of the code is omitted here .........
Example 13: extract
# Required import: from pyquery import PyQuery [as alias]
# Or: from pyquery.PyQuery import remove [as alias]
def extract(self):
    self.html = re.sub('<!--.*?-->', '', self.html)
    doc = PyQuery(self.html)
    doc.remove('div#tipswindow')
    content_node = doc('div#Cnt-Main-Article-QQ')
    if not content_node:
        content_node = doc('div#ArticleCnt')
    if not content_node:
        content_node = doc('div#textContent')
    if not content_node:
        content_node = doc('#content')
    if not content_node:
        content_node = doc('div[id = "qnews-content"]')
    content_node.remove('script')
    content_node.remove('style')
    content_node.remove('iframe')
    content_node.remove('div.adpip_Aritcle_QQ')
    content_node.remove('table#picInPic')
    content_node.remove('div.dayuw_ad')
    content_node.remove('div.tJieHot_')
    content_node.remove('div.b_new_mod')
    content_node.remove('div#awh_sports')
    content_node.remove('div[id = "photo-warp"]')
    content_node.remove('div#MorePic')
    content_node.remove('div#cmenu')
    content_node.remove('div#flashCff')
    content_node.remove('div#contTxt')
    content_node.remove('div#PGViframe')
    content_node.remove('div#Reading')
    content_node.remove('span[style = "BACKGROUND-COLOR: navy; COLOR: white"]')
    content_node.remove('img[width="592"][height="100"]')
    content = content_node.__unicode__()
    item = ContentItem()
    item['title'] = self.title = doc('h1').text()
    if not item['title']:
        item['title'] = self.title = doc('div#ArticleTit').text()
    if not item['title']:
        item['title'] = self.title = doc('h2').text()
    item['content'] = self.content = content
    item['release_time'] = self.release_time = doc('span.pubTime').text()
    p = re.compile(r"(20\d\d.*\d\d:\d\d)")
    if not self.release_time:
        self.release_time = doc('div[class = "info"]').text()
    if self.release_time is None:
        self.release_time = doc('div[id = "ArtFrom"]').text()
    if self.release_time is None:
        self.release_time = doc('div[class = "pubtime"]').text()
    if self.release_time is None:
        self.release_time = doc('span[id= "Freleasetime"]').text()
    if self.release_time is None:
        self.release_time = doc('td.xborderb1').eq(1).text()
        p = re.compile(r"(20.*-\d\d)")
    item['release_time'] = self.release_time = p.search(self.release_time).group()
    # item['release_switch_time'] = time.mktime(time.strptime(self.release_time, time_s))
    item['source'] = u"腾讯"
    item['author'] = ''
    item['pic_url'] = ''
    imgs = content_node('img')
    image_urls = []
    for img in imgs:
        # check for a missing src before substring-testing it
        if not img.get('src'):
            continue
        if ".gif" in img.get('src'):
            continue
        imgs.eq(imgs.index(img)).before('<br>')
        image_urls.append(self.getRealURI(img.get('src')))
    item['image_urls'] = image_urls
    return item
Example 14: extract_content
# Required import: from pyquery import PyQuery [as alias]
# Or: from pyquery.PyQuery import remove [as alias]
def extract_content(self, html):
    html = re.sub(r'xmlns="[^"]+"', "", html)
    doc = PyQuery(html)
    content_node = doc.find(self.content_css_selector)
    if self.should_remove_css_selector:
        doc.remove(self.should_remove_css_selector)
    return content_node.outer_html()
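The two selector attributes are the only configuration this method relies on, and because content_node shares its lxml tree with doc, the remove() call also strips matches from inside the extracted node. A standalone sketch of the same idea, with made-up selectors and markup (the original class is not shown on this page):

import re
from pyquery import PyQuery

def extract_content(html, content_selector='div.article-body',
                    remove_selector='script, .ad'):
    html = re.sub(r'xmlns="[^"]+"', "", html)   # drop namespaces so CSS selectors match
    doc = PyQuery(html)
    content_node = doc.find(content_selector)
    if remove_selector:
        doc.remove(remove_selector)             # also affects content_node (shared tree)
    return content_node.outer_html()

page = '<html><body><div class="article-body"><script>track()</script><p>Body</p></div></body></html>'
print(extract_content(page))  # -> <div class="article-body"><p>Body</p></div>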
Example 15: faltantes
# Required import: from pyquery import PyQuery [as alias]
# Or: from pyquery.PyQuery import remove [as alias]
def faltantes():
    bajados = {int(l.split('.')[0]) - 1 for l in glob.glob('*.md')}
    links = get_all_links()
    faltan = set(range(len(links))) - bajados
    return [links[i] if i in faltan else None for i in range(len(links))]

for did, url in enumerate(faltantes()):
    if not url:
        continue
    try:
        d = PyQuery(url=url, headers=headers)
        # cleanups
        d.remove('ul.actions, #fb-root, script, div[style="clear:both"]')
        for cf in d('.clearfix'):
            if d(cf).text() == "":
                d(cf).remove()
        fecha = d('dd.published').text()
        d('.article-info').before(u'<p>[{}]</p>'.format(fecha))
        d.remove('.article-info')
        # no link in the title
        titulo = d('.item-page h2 a').text()
        d('.item-page h2').text(titulo)
        # clean html content
        discurso = d('.item-page').html()
        import ipdb; ipdb.set_trace()