本文整理汇总了Python中lxml.html.clean.Cleaner方法的典型用法代码示例。如果您正苦于以下问题:Python clean.Cleaner方法的具体用法?Python clean.Cleaner怎么用?Python clean.Cleaner使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块lxml.html.clean
的用法示例。
在下文中一共展示了clean.Cleaner方法的8个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: raw_scraper
# 需要导入模块: from lxml.html import clean [as 别名]
# 或者: from lxml.html.clean import Cleaner [as 别名]
def raw_scraper(url, memoize):
    """Download *url* with newspaper, strip scripts/styles, and minify.

    Returns a ``(html, metadata)`` tuple on success, or ``(None, None)``
    when the download/parse fails or the article has no extractable text.
    """
    start_time = time.time()
    try:
        cleaner = Cleaner()
        cleaner.javascript = True  # strip <script> content
        cleaner.style = True       # strip <style> content
        article = newspaper.Article(url, fetch_images=False, memoize_articles=memoize)
        article.download()
        html = minify(article.html)
        html = cleaner.clean_html(html)
        article.parse()
    # Was a bare `except:`, which also swallows SystemExit and
    # KeyboardInterrupt; `Exception` keeps the best-effort behavior
    # without trapping interpreter-exit signals.
    except Exception:
        return None, None
    if article.text == "":
        return None, None
    metadata = {"url": url, "elapsed": time.time() - start_time, "scraper": "raw"}
    return html, metadata
示例2: render
# 需要导入模块: from lxml.html import clean [as 别名]
# 或者: from lxml.html.clean import Cleaner [as 别名]
def render(self):
    """Convert this markdown text to sanitized HTML marked safe for templates."""
    # Empty content renders as itself — nothing to convert.
    if not self:
        return self
    converter = markdown.Markdown(
        extensions=self.extensions,
        extension_configs=self.extension_configs,
    )
    rendered = converter.convert(self)
    sanitizer = Cleaner(
        allow_tags=self.allow_tags,
        safe_attrs=self.safe_attrs,
        remove_unknown_tags=False,
        safe_attrs_only=True,
    )
    return safestring.mark_safe(sanitizer.clean_html(rendered))
示例3: sanitize_html
# 需要导入模块: from lxml.html import clean [as 别名]
# 或者: from lxml.html.clean import Cleaner [as 别名]
def sanitize_html(unsecure_html_content):
    """Sanitize untrusted HTML and return it as a ``str``.

    Strips scripts, frames, embedded objects and the <body> wrapper while
    keeping inline ``style`` attributes. Returns an empty string when the
    input cannot be parsed at all.
    """
    cleaner = Cleaner(inline_style=False, scripts=True, javascript=True,
                      safe_attrs=lxml.html.defs.safe_attrs | set(['style']),
                      frames=False, embedded=False,
                      meta=True, links=True, page_structure=True, remove_tags=['body'])
    try:
        secure_html_content = lxml.html.tostring(cleaner.clean_html(lxml.html.fromstring(unsecure_html_content)), method="html")
    # Was a bare `except:`; `Exception` preserves the fall-back-to-empty
    # behavior for unparseable input without also catching SystemExit or
    # KeyboardInterrupt.
    except Exception:
        secure_html_content = b''
    return secure_html_content.decode()
# Get the client IP address, considering proxies and RP
示例4: clean_html
# 需要导入模块: from lxml.html import clean [as 别名]
# 或者: from lxml.html.clean import Cleaner [as 别名]
def clean_html(text):
    """Return *text* sanitized by lxml's Cleaner, preserving inline styles."""
    # style=False keeps <style>/style attributes; everything else uses
    # Cleaner's defaults (scripts, javascript, comments removed).
    return Cleaner(style=False).clean_html(text)
示例5: clean_article_html
# 需要导入模块: from lxml.html import clean [as 别名]
# 或者: from lxml.html.clean import Cleaner [as 别名]
def clean_article_html(html_string):
    """Normalize article HTML to the limited tag set Telegraph accepts.

    Downgrades headings, rewrites <b> to <strong>, converts Telegram embed
    scripts to iframes, strips disallowed tags/attributes, and collapses
    <br> runs into single newlines.
    """
    # Telegraph has no <h1>; downgrade to <h3>.
    html_string = html_string.replace('<h1', '<h3').replace('</h1>', '</h3>')
    # telegram will convert <b> anyway
    html_string = re.sub(r'<(/?)b(?=\s|>)', r'<\1strong', html_string)
    html_string = re.sub(r'<(/?)(h2|h5|h6)', r'<\1h4', html_string)
    # convert telegram embed posts before cleaner
    html_string = re.sub(telegram_embed_script_re, r'<iframe src="https://t.me/\1"></iframe>', html_string)
    # remove <head> if present (can't do this with Cleaner)
    html_string = header_re.sub('', html_string)
    cleaner = Cleaner(
        safe_attrs=('src', 'href', 'class'),
        safe_attrs_only=True,
        allow_tags=allowed_tags,
        remove_unknown_tags=False,
        style=True,
        embedded=False,
    )
    # Wrap with a <div> so lxml never has to invent a parent element itself.
    wrapped = '<div>%s</div>' % html_string
    cleaned = cleaner.clean_html(wrapped)
    # Strip the wrapper added above: len('<div>') == 5, len('</div>') == 6.
    cleaned = cleaned[5:-6]
    # remove all line breaks and empty strings
    result = replace_line_breaks_except_pre(cleaned)
    # Collapse runs of <br> tags into one newline; telegraph will convert
    # it to <br class="inline">.
    result = re.sub(r'(<br(/?>|\s[^<>]*>)\s*)+', '\n', result)
    return result.strip(' \t')
示例6: cleaned_html
# 需要导入模块: from lxml.html import clean [as 别名]
# 或者: from lxml.html.clean import Cleaner [as 别名]
def cleaned_html(self):
    """Serialize ``self.dom`` to HTML after stripping non-content nodes.

    Scripts, javascript event handlers, comments, and styles are removed
    to save space before serialization.
    """
    # Cleaner accepts its flags as constructor kwargs — equivalent to
    # assigning the attributes one by one after construction.
    stripper = Cleaner(scripts=True, javascript=True, comments=True, style=True)
    self.dom = stripper.clean_html(self.dom)
    assert len(self.dom), 'The html needs to be parsed to get the cleaned html'
    return lxml.html.tostring(self.dom)
示例7: _load
# 需要导入模块: from lxml.html import clean [as 别名]
# 或者: from lxml.html.clean import Cleaner [as 别名]
def _load(self):
    """
    Load the ElementTree from the source.

    Normalizes typographic punctuation and whitespace in ``self.source``,
    then parses the cleaned HTML into ``self.tree``.
    """
    # Convert directional quotation marks to regular quotes.
    # NOTE: the original used Python-2-only ``ur''`` literals, which are a
    # syntax error on Python 3. Plain ``u''`` literals process the same
    # ``\uXXXX`` escapes and behave identically on both versions.
    double_quotes = u'[\u201c\u201d]'
    self.source = re.sub(double_quotes, u'"', self.source)
    single_quotes = u'[\u2019\u2018]'
    self.source = re.sub(single_quotes, u"'", self.source)
    # Convert full-width colons to ASCII colons
    self.source = self.source.replace(u'\uff1a', u':')
    # Remove line breaks and tabs
    self.source = self.source.replace(u'\n', u'')
    self.source = self.source.replace(u'\t', u'')
    # There are also some "zero width joiners" in random places in the text.
    # Remove them here, since they make string search unreliable
    # (related codes: zwj, nbsp, \xa0 (nbsp), \u200d).
    zero_width_joiners = u'\u200d'
    self.source = self.source.replace(zero_width_joiners, u'')
    # Also previously had some non-breaking spaces (\u00a0), but this may
    # have been fixed by changing the parser below.
    # Use the lxml cleaner with an explicit utf-8 HTML parser.
    cleaner = Cleaner()
    parser = HTMLParser(encoding='utf-8')
    # Finally, load the cleaned string to an ElementTree
    self.tree = cleaner.clean_html(lxml.html.fromstring(to_string(self.source), parser=parser))
示例8: render_body
# 需要导入模块: from lxml.html import clean [as 别名]
# 或者: from lxml.html.clean import Cleaner [as 别名]
def render_body(self):
    """Render ``self.body`` markdown to sanitized HTML marked safe for templates."""
    # No body → nothing to render.
    if not self.body:
        return ""
    rendered = markdown.markdown(self.body)
    sanitizer = Cleaner(
        remove_unknown_tags=False,
        safe_attrs_only=True,
        safe_attrs=["href"],
        allow_tags=["p", "a", "i", "b", "em", "strong", "ol", "ul", "li", "pre", "code"],
    )
    return safestring.mark_safe(sanitizer.clean_html(rendered))