This page collects typical usage examples of the Python function w3lib.html.replace_entities. If you have been wondering what exactly replace_entities does, how to call it, or what it looks like in real code, the hand-picked examples below should help.
The following 15 code examples of replace_entities are shown, drawn from open-source projects and ordered roughly by popularity.
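Before the examples, here is a minimal sketch of what replace_entities does (the strings below are illustrative and not taken from any of the projects quoted later):

from w3lib.html import replace_entities

# Named, decimal and hexadecimal entities are all decoded to unicode text.
print(replace_entities(u'Price: &pound;42'))         # u'Price: \xa342'
print(replace_entities(u'x &#8804; y'))              # u'x \u2264 y'
print(replace_entities(u'a &amp; b', keep=['amp']))  # u'a &amp; b' (kept verbatim)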
Example 1: test_regular
def test_regular(self):
# regular conversions
    self.assertEqual(replace_entities(u'As low as &#163;100!'),
                     u'As low as \xa3100!')
    self.assertEqual(replace_entities(b'As low as &#163;100!'),
                     u'As low as \xa3100!')
    self.assertEqual(replace_entities('redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL &amp; SEASONAL-_-MR0221Y-_-9-carat gold &frac12;oz solid crucifix pendant'),
                     u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant')
Example 2: test_illegal_entities
def test_illegal_entities(self):
    self.assertEqual(replace_entities('a &lt; b &illegal; c &#12345678; six', remove_illegal=False),
                     u'a < b &illegal; c &#12345678; six')
    self.assertEqual(replace_entities('a &lt; b &illegal; c &#12345678; six', remove_illegal=True),
                     u'a < b  c  six')
    self.assertEqual(replace_entities('x&#x2264;y'), u'x\u2264y')
    self.assertEqual(replace_entities('x&#157;y'), u'xy')
    self.assertEqual(replace_entities('x&#157;y', remove_illegal=False), u'x&#157;y')
Example 3: clean_url
def clean_url(url):
clean_url = ''
try:
clean_url = urljoin(base_url, replace_entities(clean_link(url.decode(response_encoding))))
except ValueError:
pass
return clean_url
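Example 3 is a closure lifted out of context: base_url, response_encoding and clean_link all come from the enclosing scope. A self-contained sketch of the same idea (the signature below is chosen for illustration and is not the original project's API):

from urlparse import urljoin  # Python 2; use urllib.parse.urljoin on Python 3
from w3lib.html import replace_entities

def clean_url(url, base_url, response_encoding='utf-8'):
    """Decode, strip quotes/whitespace, replace entities and absolutize a raw href."""
    try:
        link = url.decode(response_encoding).strip("\t\r\n '\"")
        return urljoin(base_url, replace_entities(link))
    except ValueError:  # UnicodeDecodeError is a subclass of ValueError
        return ''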
Example 4: text
def text(region):
"""Converts HTML to text. There is no attempt at formatting other than
removing excessive whitespace,
For example:
>>> t = lambda s: text(htmlregion(s))
>>> t(u'<h1>test</h1>')
u'test'
Leading and trailing whitespace are removed
>>> t(u'<h1> test</h1> ')
u'test'
Comments are removed
>>> t(u'test <!-- this is a comment --> me')
u'test me'
Text between script tags is ignored
>>> t(u"scripts are<script>n't</script> ignored")
u'scripts are ignored'
HTML entities are converted to text
>>> t(u"only £42")
u'only \\xa342'
>>> t(u"<p>The text</p><?xml:namespace blabla/><p>is here</p>")
u'The text is here'
"""
text = replace_entities(region.text_content, encoding=region.htmlpage.encoding)
return _WS.sub(u' ', text).strip()
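The helpers _WS and htmlregion are scrapely internals that the excerpt does not show; _WS is presumably a compiled whitespace pattern. A rough standalone equivalent of the same pipeline, built only from w3lib functions (the name html_to_text is made up here):

import re
from w3lib.html import (remove_tags, remove_tags_with_content,
                        remove_comments, replace_entities)

_WS = re.compile(r'\s+')

def html_to_text(html, encoding='utf-8'):
    # Drop script blocks and comments, strip remaining tags, then decode entities.
    html = remove_tags_with_content(html, which_ones=('script', 'noscript'))
    html = remove_comments(html)
    html = remove_tags(html)
    text = replace_entities(html, encoding=encoding)
    return _WS.sub(u' ', text).strip()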
Example 5: test_missing_semicolon
def test_missing_semicolon(self):
for entity, result in (
    ('&lt&lt!', '<<!',),
    ('&LT!', '<!',),
    ('&#X41 ', 'A ',),
    ('&#x41!', 'A!',),
    ('&#x41h', 'Ah',),
    ('&#65!', 'A!',),
    ('&#65x', 'Ax',),
    ('&sup3!', u'\u00B3!',),
    ('&Aacute!', u'\u00C1!',),
    ('&#9731!', u'\u2603!',),
    ('&#153', u'\u2122',),
    ('&#x99', u'\u2122',),
):
self.assertEqual(replace_entities(entity, encoding='cp1252'), result)
self.assertEqual(replace_entities('x%sy' % entity, encoding='cp1252'), u'x%sy' % result)
Example 6: extract_raw_text
def extract_raw_text(html):
text = replace_entities(html)
text = re_clean_blanks.sub(u' ', text)
text = re_clean_comments.sub(u' ', text)
text = re_clean_javascript.sub(u' ', text)
text = re_clean_style.sub(u' ', text)
text = re_clean_balises.sub(u' ', text)
text = re_clean_blanks.sub(u' ', text).strip()
text = re_clean_multiCR.sub(u'\n', text)
return text
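The re_clean_* patterns are module-level regexes that the excerpt does not include. Plausible definitions, shown here only so the example can run (the originals may well differ):

import re

re_clean_blanks = re.compile(r'[ \t\f\v\xa0]+')      # runs of horizontal whitespace
re_clean_comments = re.compile(r'<!--.*?-->', re.DOTALL)
re_clean_javascript = re.compile(r'<script.*?</script>', re.DOTALL | re.IGNORECASE)
re_clean_style = re.compile(r'<style.*?</style>', re.DOTALL | re.IGNORECASE)
re_clean_balises = re.compile(r'<[^>]*>')            # "balise" is French for "tag"
re_clean_multiCR = re.compile(r'\s*\n\s*')           # collapse blank-line runs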
Example 7: _extract_links
def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
if base_url is None:
base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
clean_url = lambda u: urljoin(base_url, replace_entities(clean_link(u.decode(response_encoding))))
clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()
links_text = linkre.findall(response_text)
return [Link(clean_url(url).encode(response_encoding),
clean_text(text))
for url, _, text in links_text]
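This is the core of Scrapy's old regex-based link extractor: linkre is a module-level regex with three groups capturing the href, the rest of the opening tag, and the anchor text of each <a> element, and clean_link strips surrounding whitespace and quotes from the href (compare Example 13, which quotes the same helper).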
Example 8: image_url
def image_url(txt):
"""convert text to a url
this is quite conservative, since relative urls are supported
Example:
>>> image_url('')
>>> image_url(' ')
>>> image_url(' \\n\\n ')
>>> image_url('foo-bar.jpg')
['foo-bar.jpg']
>>> image_url('/images/main_logo12.gif')
['/images/main_logo12.gif']
>>> image_url("http://www.image.com/image.jpg")
['http://www.image.com/image.jpg']
>>> image_url("http://www.domain.com/path1/path2/path3/image.jpg")
['http://www.domain.com/path1/path2/path3/image.jpg']
>>> image_url("/path1/path2/path3/image.jpg")
['/path1/path2/path3/image.jpg']
>>> image_url("path1/path2/image.jpg")
['path1/path2/image.jpg']
>>> image_url("background-image : url(http://www.site.com/path1/path2/image.jpg)")
['http://www.site.com/path1/path2/image.jpg']
>>> image_url("background-image : url('http://www.site.com/path1/path2/image.jpg')")
['http://www.site.com/path1/path2/image.jpg']
>>> image_url('background-image : url("http://www.site.com/path1/path2/image.jpg")')
['http://www.site.com/path1/path2/image.jpg']
>>> image_url("background : url(http://www.site.com/path1/path2/image.jpg)")
['http://www.site.com/path1/path2/image.jpg']
>>> image_url("background : url('http://www.site.com/path1/path2/image.jpg')")
['http://www.site.com/path1/path2/image.jpg']
>>> image_url('background : url("http://www.site.com/path1/path2/image.jpg")')
['http://www.site.com/path1/path2/image.jpg']
>>> image_url('/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
['/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
>>> image_url('http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
['http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
>>> image_url('http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80')
['http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80']
>>> image_url('../image.aspx?thumb=true&boxSize=175&img=Unknoportrait[1].jpg')
['../image.aspx?thumb=true&boxSize=175&img=Unknoportrait[1].jpg']
>>> image_url('http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff')
['http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff']
>>> image_url('http://www.site.com/image.php')
['http://www.site.com/image.php']
>>> image_url('background-image:URL(http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom)')
['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom']
"""
imgurl = extract_image_url(txt)
return [safe_url_string(replace_entities(url(imgurl)))] if imgurl else None
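extract_image_url and url are other helpers from the same scrapely module; judging from the doctests, extract_image_url pulls the candidate address out of plain text or a CSS background-image declaration, and url presumably trims surrounding whitespace, after which replace_entities decodes any entity-encoded query string and safe_url_string escapes the result.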
Example 9: extract_regex
import re

import six
from scrapy.utils.python import flatten, to_unicode
from w3lib.html import replace_entities

def extract_regex(regex, text, encoding="utf-8"):
    """Extract a list of unicode strings from the given text/encoding using the following policies:
    * if the regex contains a named group called "extract", that group is returned
    * if the regex contains multiple numbered groups, all of them are returned (flattened)
    * if the regex doesn't contain any group, the entire match is returned
    """
    if isinstance(regex, six.string_types):
        regex = re.compile(regex, re.UNICODE)
    try:
        strings = [regex.search(text).group("extract")]  # named group
    except (AttributeError, IndexError):  # no match, or no group named "extract"
        strings = regex.findall(text)  # full regex or numbered groups
    strings = flatten(strings)
    if isinstance(text, six.text_type):
        return [replace_entities(s, keep=["lt", "amp"]) for s in strings]
    else:
        return [replace_entities(to_unicode(s, encoding), keep=["lt", "amp"]) for s in strings]
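A quick usage sketch of the group conventions described in the docstring (strings invented for illustration):

# A group named "extract" narrows the result to that group:
extract_regex(r'Price: (?P<extract>\d+) USD', u'Price: 100 USD')
# -> [u'100']

# With no groups at all, the whole match is returned:
extract_regex(r'\d+ USD', u'Price: 100 USD')
# -> [u'100 USD']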
Example 10: extract_regex
def extract_regex(regex, text, encoding='utf-8'):
"""Extract a list of unicode strings from the given text/encoding using the following policies:
* if the regex contains a named group called "extract" that will be returned
* if the regex contains multiple numbered groups, all those will be returned (flattened)
* if the regex doesn't contain any group the entire regex matching is returned
"""
if isinstance(regex, basestring):
regex = re.compile(regex, re.UNICODE)
try:
strings = [regex.search(text).group('extract')] # named group
except:
strings = regex.findall(text) # full regex or numbered groups
strings = flatten(strings)
#flatten 把列表中的列表或者字典等嵌套结构去除,返回一个统一的列表。
if isinstance(text, unicode):
return [replace_entities(s, keep=['lt', 'amp']) for s in strings]
else:
return [replace_entities(unicode(s, encoding), keep=['lt', 'amp']) for s in strings]
Example 11: parse_item
def parse_item(self, response):
links = dict()
link_titles = set()
url = response.url.split('#')[0].lower()
url_head = url.split('/pages/')[0] + '/pages/'
title = response.xpath('//meta[@name="DC.title"]/@content').extract_first()
if title and title.endswith('- NHS Choices'):
    # rstrip() removes a set of characters, not a suffix; slice the suffix off instead
    title = title[:-len('- NHS Choices')].strip()
subjects = response.xpath('//meta[@name="DC.Subject"][@scheme="NHSC.Ontology"]/@content').extract_first().split(', ')
subjects = [s.lower() for s in subjects if s]
if not subjects:
subjects = [title.lower()]
description = clean_text(response.xpath('//meta[@name="DC.description"]/@content').extract_first())
raw_page_content = response.xpath('//div[@class="main-content healthaz-content clear"]/.').extract_first()
page_content = clean_text(replace_entities(remove_tags(raw_page_content)))
for a in response.xpath('//div[@class="main-content healthaz-content clear"]/descendant::a'):
label = a.xpath('text()').extract_first()
href = a.xpath('@href').extract_first()
if href and label:
href = self.base_url + href.lstrip('/')
href = href.lower()
label = clean_text(label)
if '/conditions/' in href and url_head not in href:
link_titles.add(label)
if href in links:
links[href]['count'] += 1
else:
links[href] = {
'count': 1,
'label': label
}
if url_head in href and href != url:
print("********************", href)
yield scrapy.Request(href, self.parse_item)
article = NhsItem()
article['url'] = url
article['title'] = title
article['subjects'] = subjects
article['description'] = description
article['page_content'] = str(page_content)
article['links'] = links
article['link_titles'] = list(link_titles)
yield article
Example 12: _has_ajaxcrawlable_meta
def _has_ajaxcrawlable_meta(text):
"""
>>> _has_ajaxcrawlable_meta('<html><head><meta name="fragment" content="!"/></head><body></body></html>')
True
>>> _has_ajaxcrawlable_meta("<html><head><meta name='fragment' content='!'></head></html>")
True
>>> _has_ajaxcrawlable_meta('<html><head><!--<meta name="fragment" content="!"/>--></head><body></body></html>')
False
>>> _has_ajaxcrawlable_meta('<html></html>')
False
"""
# Stripping scripts and comments is slow (about 20x slower than
# just checking if a string is in text); this is a quick fail-fast
# path that should work for most pages.
if 'fragment' not in text:
return False
if 'content' not in text:
return False
text = html.remove_tags_with_content(text, ('script', 'noscript'))
text = html.replace_entities(text)
text = html.remove_comments(text)
return _ajax_crawlable_re.search(text) is not None
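This helper, from Scrapy's AjaxCrawl middleware, relies on a module-level pattern that the excerpt omits. A reasonable reconstruction that satisfies the doctests above (Scrapy's actual regex may differ in detail):

import re

# Matches <meta name="fragment" content="!"> with either quoting style,
# with or without a self-closing slash.
_ajax_crawlable_re = re.compile(r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']\s*/?>')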
Example 13: clean_link
import urllib
import urlparse
from urlparse import urljoin
from w3lib.html import replace_entities
def clean_link(link_text):
return link_text.strip("\t\r\n '\"")
# return the first item of a list, or None if it is empty
list_first_item = lambda x:x[0] if x else None
# join a link with the base url, stripping whitespace/quotes and decoding entities
clean_url = lambda base_url, u, response_encoding: urljoin(base_url, replace_entities(clean_link(u.decode(response_encoding))))
# read a query-string parameter from a url
def get_query(url, key):
bits = list(urlparse.urlparse(url))
query = urlparse.parse_qs(bits[4])
return query[key][0]
# set query-string parameters on a url
def set_query(url, **args):
bits = list(urlparse.urlparse(url))
query = urlparse.parse_qs(bits[4])
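The excerpt is cut off inside set_query. A plausible completion, mirroring get_query above (assumed, not the original author's code):

    query.update(args)
    bits[4] = urllib.urlencode(query, doseq=True)  # doseq handles the lists from parse_qs
    return urlparse.urlunparse(bits)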
Example 14: test_returns_unicode
def test_returns_unicode(self):
    # make sure it always returns unicode
    assert isinstance(replace_entities(b'no entities'), six.text_type)
    assert isinstance(replace_entities(b'Price: &pound;100!'), six.text_type)
    assert isinstance(replace_entities(u'no entities'), six.text_type)
    assert isinstance(replace_entities(u'Price: &pound;100!'), six.text_type)
Example 15: test_encoding
def test_encoding(self):
    self.assertEqual(replace_entities(b'x\x99&#153;&#8482;y', encoding='cp1252'),
                     u'x\u2122\u2122\u2122y')
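Note how the encoding argument covers two things at once in this last example: the raw byte \x99 is decoded as cp1252, where it maps to U+2122 (the trademark sign), and numeric references in the 128-159 range such as &#153; are interpreted the same way, matching common browser behaviour; &#8482; is simply the Unicode code point of the same character.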