This page collects typical usage examples of the Python function w3lib.html.remove_tags. If you have been wondering what exactly remove_tags does and how to use it, the curated examples below should help.
It shows 15 code examples of remove_tags, sorted by popularity by default.
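Before the individual examples, a minimal usage sketch (not taken from any of the examples below; outputs assume a standard w3lib installation):

from w3lib.html import remove_tags

# strip every tag but keep the text content
remove_tags(u'<div><p>hello <b>world</b></p></div>')            # u'hello world'
# strip only the listed tags, leave the rest of the markup intact
remove_tags(u'<p>hello <b>world</b></p>', which_ones=('b',))    # u'<p>hello world</p>'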
Example 1: parse
def parse(self, response):
    ts = time.time()
    html_name = 'txt/artist/artist' + str(ts) + '.txt'
    file = codecs.open(html_name, 'w+', 'utf-8')
    # file.write(response.url)
    # file.write('\n')
    for body in response.css('div.layoutSingleColumn h3').extract():
        body = body.encode(response.encoding)
        body = remove_tags(body)
        print("Header")
        print(body)
        try:
            file.write(body)
        except AttributeError:
            print(AttributeError)
            sys.exit(0)
    for body in response.css('div.layoutSingleColumn p').extract():
        body = body.encode(response.encoding)
        body = remove_tags(body)
        print("Paragraph")
        print(body)
        try:
            file.write(body)
        except AttributeError:
            print(AttributeError)
            sys.exit(0)
    file.close()
Example 2: test_remove_tags
def test_remove_tags(self):
    # text with tags
    self.assertEqual(remove_tags(u'<p>one p tag</p>'), u'one p tag')
    self.assertEqual(remove_tags(u'<p>one p tag</p>', which_ones=('b',)), u'<p>one p tag</p>')
    self.assertEqual(remove_tags(u'<b>not will removed</b><i>i will removed</i>', which_ones=('i',)),
                     u'<b>not will removed</b>i will removed')
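The test above covers which_ones, the list of tags to strip. remove_tags also accepts a complementary keep argument (strip everything except the listed tags); the two arguments are mutually exclusive. A short sketch, with outputs as given in the w3lib documentation:

from w3lib.html import remove_tags

doc = u'<div><p><b>text</b></p></div>'
remove_tags(doc, which_ones=('b',))   # u'<div><p>text</p></div>'  - only <b> removed
remove_tags(doc, keep=('p',))         # u'<p>text</p>'             - everything but <p> removed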
Example 3: parse
def parse(self, response):
    max_position = ''
    koma = ','
    headers = response.headers
    itemselector = Selector(response).xpath('//div[@class="content"]')
    if headers['Content-Type'] == 'application/json;charset=utf-8':
        data = json.loads(response.body)
        itemselector = Selector(text=data['items_html']).xpath('//div[@class="content"]')
        max_position = data['min_position']
    yield Request("https://twitter.com/i/search/timeline?f=tweets&vertical=default&q=%22demam%20berdarah%22%20OR%20dbd%20OR%20dhf%20OR%20%22dengue%20fever%22%20OR%20%22dengue%20hemorrhagic%22%20OR%20%22sakit%20db%22%20lang%3Aid%20since%3A"+self.start+"%20until%3A"+self.end+"&src=typd&include_available_features=1&include_entities=1&max_position="+max_position+"&reset_error_state=false",
                  callback=self.parse,
                  method="GET",)
    for sel in itemselector:
        self.index += 1
        item = TwitterscrapingItem()
        item['index'] = self.index
        item['userid'] = ''.join(
            map(unicode.strip, sel.xpath('div[@class="stream-item-header"]/a/@data-user-id').extract()))
        item['username'] = ''.join(
            map(unicode.strip, sel.xpath('div[@class="stream-item-header"]/a/span[@class="username js-action-profile-name"]/b/text()').extract()))
        item['fullname'] = ''.join(
            map(unicode.strip, sel.xpath('div[@class="stream-item-header"]/a/strong/text()').extract()))
        text_tweet = ''.join(
            map(unicode.strip, sel.xpath('p[@class="TweetTextSize js-tweet-text tweet-text"]').extract()))
        item['text_tweet'] = remove_tags(text_tweet).replace('\n', ' ').replace('\u', ' ')
        item['original_text_tweet'] = text_tweet
        hash_tags = koma.join(
            map(unicode.strip, sel.xpath('p[@class="TweetTextSize js-tweet-text tweet-text"]'
                                         '/a[@class="twitter-hashtag pretty-link js-nav"]').extract()))
        item['hash_tags'] = remove_tags(hash_tags)
        item['time_tweet'] = ''.join(
            map(unicode.strip, sel.xpath('div[@class="stream-item-header"]/small[@class="time"]/a/@title').extract()))
        item['lang'] = ''.join(
            map(unicode.strip, sel.xpath('p[@class="TweetTextSize js-tweet-text tweet-text"]/@lang').extract()))
        retweets = ''.join(
            map(unicode.strip, sel.xpath('div[@class="stream-item-footer"]'
                                         '/div[@class="ProfileTweet-actionList js-actions"]'
                                         '/div[@class="ProfileTweet-action ProfileTweet-action--retweet js-toggleState js-toggleRt"]'
                                         '/button[@class="ProfileTweet-actionButton js-actionButton js-actionRetweet"]'
                                         '/div[@class="IconTextContainer"]').extract()))
        item['retweets'] = remove_tags(retweets).strip()
        favorite = ''.join(
            map(unicode.strip, sel.xpath('div[@class="stream-item-footer"]'
                                         '/div[@class="ProfileTweet-actionList js-actions"]'
                                         '/div[@class="ProfileTweet-action ProfileTweet-action--favorite js-toggleState"]'
                                         '/button[@class="ProfileTweet-actionButton js-actionButton js-actionFavorite"]'
                                         '/div[@class="IconTextContainer"]').extract()))
        item['favorite'] = remove_tags(favorite).strip()
        item['place_id'] = ''.join(
            map(unicode.strip, sel.xpath('div[@class="stream-item-header"]/span[@class="Tweet-geo u-floatRight js-tooltip"]/a/@data-place-id').extract()))
        item['place'] = ''.join(
            map(unicode.strip, sel.xpath('div[@class="stream-item-header"]/span[@class="Tweet-geo u-floatRight js-tooltip"]/a/span[@class="u-hiddenVisually"]/text()').extract()))
        item['max_position'] = max_position
        yield item
Example 4: test_returns_unicode
def test_returns_unicode(self):
    # make sure it always returns unicode
    assert isinstance(remove_tags(b'no tags'), six.text_type)
    assert isinstance(remove_tags(b'no tags', which_ones=('p',)), six.text_type)
    assert isinstance(remove_tags(b'<p>one tag</p>'), six.text_type)
    assert isinstance(remove_tags(b'<p>one tag</p>', which_ones=('p')), six.text_type)
    assert isinstance(remove_tags(b'<a>link</a>', which_ones=('b',)), six.text_type)
    assert isinstance(remove_tags(u'no tags'), six.text_type)
    assert isinstance(remove_tags(u'no tags', which_ones=('p',)), six.text_type)
    assert isinstance(remove_tags(u'<p>one tag</p>'), six.text_type)
    assert isinstance(remove_tags(u'<p>one tag</p>', which_ones=('p')), six.text_type)
    assert isinstance(remove_tags(u'<a>link</a>', which_ones=('b',)), six.text_type)
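This test passes both bytes and text, which works because the six-era w3lib releases decode bytes input (utf-8 by default, overridable via the encoding argument); newer w3lib versions expect str only, so treat the bytes calls as version-dependent. A sketch under that assumption:

from w3lib.html import remove_tags

remove_tags(b'<p>caf\xc3\xa9</p>')                  # u'café' (bytes decoded as utf-8)
remove_tags(b'<p>caf\xe9</p>', encoding='latin-1')  # u'café' (explicit encoding)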
Example 5: parse
def parse(self, response):
    hrefs = response.selector.xpath('//div[contains(@class, "c-container")]/h3/a/@href').extract()
    containers = response.selector.xpath('//div[contains(@class, "c-container")]')
    for container in containers:
        href = container.xpath('h3/a/@href').extract()[0]
        title = remove_tags(container.xpath('h3/a').extract()[0])
        c_abstract = container.xpath('div/div/div[contains(@class, "c-abstract")]').extract()
        abstract = ""
        if len(c_abstract) > 0:
            abstract = remove_tags(c_abstract[0])
        request = scrapy.Request(href, callback=self.parse_url)
        request.meta['title'] = title
        request.meta['abstract'] = abstract
        yield request
Example 6: parse
def parse(self, response):
    s = Selector(response)
    next_link = s.xpath('//div[@class="w-button-more"]/a/@href').extract()
    if len(next_link):
        yield Request("https://mobile.twitter.com" + next_link[0], callback=self.parse)
    itemselector = Selector(response).xpath('//*[@id="main_content"]/div/div[3]/table')
    # regex = re.compile(r"([\\]+u\d*)", re.MULTILINE)
    for sel in itemselector:
        self.index += 1
        item = TwitterscrapingItem()
        item['index'] = self.index
        item['username'] = ''.join(
            map(unicode.strip, sel.xpath('tr[1]/td[2]/a/div/text()').extract()))
        tweet = remove_tags(''.join(
            map(unicode.strip, sel.xpath('tr[2]/td/div').extract()))
        ).replace('&amp;', '&').replace('&nbsp;', '').replace('\n ', '').replace('\n', '').replace('\u', ' ')
        item['text_tweet'] = u'' + tweet
        item['original_tweet'] = ''.join(sel.xpath('tr[2]/td/div/div').extract())
        item['time_tweet'] = ''.join(
            map(unicode.strip, sel.xpath('tr[1]/td[3]/a/text()').extract()))
        item['url'] = ''.join(
            map(unicode.strip, sel.xpath('tr[2]/td/div/@data-id').extract()))
        item['data_id'] = ''.join(
            map(unicode.strip, sel.xpath('tr[3]/td/span[1]/a/@href').extract()))
        yield item
Example 7: _extract_features
def _extract_features(self, sel, item):
    description_xpath = '//div[@id="tab1"]/ul/li'
    data = sel.xpath(description_xpath).extract()
    if len(data) > 0:
        data = [remove_tags(v).strip().replace('&nbsp;', ' ').replace('&gt;', '>').strip() for v in data]
        data = filter(None, data)
        item['description'] = '<br>'.join(data)
Example 8: parse_speech
def parse_speech(self, response):
    paragraphs = response.css('p')[:-1]  # last p contains pagination
    text = remove_tags(''.join(paragraphs.extract()))
    l = ParlamentHuSpeechLoader(item=Speech(), selector=response,
                                scheme='parlament.hu/people')
    l.add_value('text', text)
    l.add_value('type', 'speech')
    l.add_value('sources', [response.url])
    l.add_xpath('position', '//b[1]/text()')
    l.add_xpath('video', '//table//tr[6]//td[2]/a/@href')
    l.add_xpath('creator_id', '//table//tr[2]//td[2]/a/@href',
                re=r'ogy_kpv\.kepv_adat\?p_azon=(\w\d+)')
    l.add_value('event_id', response.meta['event_id'])
    date = response.xpath(
        '//table//tr[1]/th/text()').re(r'\d{4}\.\d{2}.\d{2}\.')
    time = response.meta.get('time')
    if date:
        date = date[0]
        if time:
            date += time[0]
        l.add_value('date', date)
    item = l.load_item()
    yield item
    if 'creator_id' in item:
        yield scrapy.Request(self.get_api_url(
            self.PERSON_ENDPOINT, params={
                'p_azon': item['creator_id']['identifier']}),
            callback=self.parse_person, meta={
                'p_azon': item['creator_id']['identifier']})
Example 9: make_it_clean
def make_it_clean(line):
    '''
    Strip HTML tags, CSS styles and JS from the text.
    (string) line - input text
    '''
    cleari = remove_tags(line)
    soline = re.compile("(\<.+\n)", re.DOTALL)
    boline = re.compile("(.+\>)", re.DOTALL)
    alline = re.compile("\<.+\>", re.DOTALL)
    cleari = re.sub(soline, '', cleari)
    cleari = re.sub(boline, '', cleari)
    cssline = re.compile(r"\{.+\}{1}", re.DOTALL)
    cleari = re.sub(cssline, ' ', cleari)
    cleari = re.sub("async=\"async\"\n", '', cleari)
    cleari = re.sub("src=.+\"", '', cleari)
    cleari = re.sub("var\s_.+\)", '', cleari)
    cleari = re.sub("function.+\"\)", '', cleari)
    cleari = re.sub("document.+\);", " ", cleari)
    cleari = re.sub("function.+\)", " ", cleari)
    cleari = re.sub("&laquo;", " «", cleari)
    cleari = re.sub("&raquo;", "» ", cleari)
    cleari = re.sub("&rarr;", "→", cleari)
    cleari = re.sub(r'&nbsp;', ' ', cleari)
    cleari = re.sub(r'(—)|(–)', '-', cleari)
    cleari = re.sub(r'\t{2,}', ' ', cleari)
    cleari = re.sub(r'\s{2,}', ' ', cleari)
    cleari = re.sub(r'\n{2,}', '\n', cleari)
    cleari = re.sub(r"(\<\!\-\-.*\-\-\>)", '', cleari)
    return cleari
Example 10: parse_item
def parse_item(self, response):
    province = response.css('.dqwz>a:last-child::attr(title)').re_first(ur'2017年(.+?)省?本科')
    school = response.css('.nr>h2::text').extract_first()
    count = len(response.xpath('//div[@id="ivs_content"]/table//tr[1]/td').extract())
    for row in response.xpath('//div[@id="ivs_content"]/table//tr[position()>1]'):
        fields = [remove_tags(i).strip() for i in row.css('td').extract()]
        if count == 4:
            del fields[0]
        if len(fields) == 3:
            rowspan_count = [e.css('::attr(rowspan)').extract_first(1) for e in row.css('td')][-3:]
            rowspan_value = fields
            rowspans = len([i for i in rowspan_count if i > 1])
        elif len(fields) + rowspans == 3:
            new_fields = []
            fields.reverse()
            for k, v in zip(rowspan_count, rowspan_value):
                if k == 1:
                    new_fields.append(fields.pop())
                else:
                    new_fields.append(v)
            fields = new_fields
        else:
            continue
        yield ShmecItem(
            province=province,
            school=school,
            major=fields[0],
            require=fields[1],
            remark=fields[2],
        )
Example 11: parse_linklist
def parse_linklist(text, remove_tags=False):
    data = []
    for row in text.split('\n'):
        rowparts = row.strip().split(' ')
        if len(rowparts) < 2:
            break
        time = rowparts[0]
        if rowparts[1].startswith('<') and rowparts[1].endswith('>'):
            url = rowparts[1][1:-1]
            textparts = rowparts[2:]
        else:
            url = ''
            textparts = rowparts[1:]
        text = ' '.join(textparts)
        if remove_tags:
            text = html.remove_tags(text)
        data.append(
            {
                'time': time,
                'url': url,
                'text': text
            }
        )
    return data
Example 12: _extract_description
def _extract_description(self, sel, item):
    return  # early return disables this extractor; the code below is unreachable
    desc_xpath = '//div[@id="item-overview"]/ul/li/node()'
    data = sel.xpath(desc_xpath).extract()
    if len(data) != 0:
        data = [remove_tags(v.strip()) for v in data]
        description = ';'.join(data).replace(':;', ':').replace('from;', 'from ')
        item['description'] = description
Example 13: _extract_links
def _extract_links(self, response_text, response_url, response_encoding):
    base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
    clean_url = lambda u: urljoin_rfc(base_url, remove_entities(clean_link(u.decode(response_encoding))))
    clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()
    links_text = linkre.findall(response_text)
    urlstext = set([(clean_url(url), clean_text(text)) for url, _, text in links_text])
    return [Link(url, text) for url, text in urlstext]
Example 14: clean_tags_from_affiliations
def clean_tags_from_affiliations(value):
    """Clean the affiliation string for an author."""
    for affiliation in value.get('affiliations', []):
        # Remove tag AND content of any prefix like <label><sup>1</sup></label>
        affiliation['value'] = remove_tags_with_content(affiliation['value'], ('label',))
        # Now remove all tags but KEEP content
        affiliation['value'] = remove_tags(affiliation['value'])
        # Remove random whitespaces
        affiliation['value'] = clean_whitespace_characters(affiliation['value'])
    return value
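The two calls above divide the work: remove_tags_with_content drops a tag together with everything inside it, while remove_tags strips only the markup. A minimal sketch of the difference, reusing the affiliation-style input from the comment:

from w3lib.html import remove_tags, remove_tags_with_content

s = u'<label><sup>1</sup></label>CERN, Geneva'
remove_tags_with_content(s, ('label',))   # u'CERN, Geneva'  - tag and content gone
remove_tags(s)                            # u'1CERN, Geneva' - markup gone, content kept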
Example 15: process_response
def process_response(self, request, response, spider):
    # clean body
    orig_body = response.body_as_unicode()
    body = remove_tags_with_content(orig_body, which_ones=('script', 'head'))
    body = remove_tags(remove_comments(body))
    terms = tokenize(body.lower())
    request.meta['terms'] = terms
    request.meta['body'] = body
    return response
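The same cleaning pipeline works on any HTML string outside of a Scrapy middleware; a standalone sketch (tokenize is this spider's own helper, so a plain split() stands in for it here):

from w3lib.html import remove_comments, remove_tags, remove_tags_with_content

html = u'<head><title>t</title></head><body>Hello <!-- note --> <b>world</b><script>var x;</script></body>'
body = remove_tags_with_content(html, which_ones=('script', 'head'))
body = remove_tags(remove_comments(body))
print(body.split())  # [u'Hello', u'world']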