本文整理汇总了 Python 中 html5lib.HTMLParser.parseFragment 方法的典型用法代码示例。如果您正苦于以下问题：Python HTMLParser.parseFragment 方法的具体用法？Python HTMLParser.parseFragment 怎么用？Python HTMLParser.parseFragment 使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 html5lib.HTMLParser 的用法示例。
在下文中一共展示了 HTMLParser.parseFragment 方法的 11 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: clean_html
# 需要导入模块: from html5lib import HTMLParser [as 别名]
# 或者: from html5lib.HTMLParser import parseFragment [as 别名]
def clean_html(input, sanitize=False):
    """
    Re-serialize an HTML fragment through html5lib so the result is well-formed.

    The fragment is parsed into a DOM tree, walked, and serialized back with
    optional tags kept in the output.

    :param input: HTML fragment to parse.
    :param sanitize: Remove unwanted HTML tags and attributes.

    NOTE(review): the expected outputs below look like they lost their HTML
    entity escaping (e.g. ``&amp;``) at some point -- confirm before running
    them as doctests.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>& oops<a href=#foo&bar>This is a <>link</a></p>'
    """
    tokenizer_options = {}
    output_options = {}
    if sanitize and HTMLSanitizer is None:
        # new syntax as of 0.99999999/1.0b9 (Released on July 14, 2016):
        # sanitizing is requested from the serializer instead of the parser.
        output_options["sanitize"] = True
    elif sanitize:
        tokenizer_options["tokenizer"] = HTMLSanitizer

    dom_builder = treebuilders.getTreeBuilder("dom")
    fragment = HTMLParser(tree=dom_builder, **tokenizer_options).parseFragment(input)
    token_stream = treewalkers.getTreeWalker("dom")(fragment)
    html_serializer = HTMLSerializer(omit_optional_tags=False, **output_options)
    return "".join(html_serializer.serialize(token_stream))
示例2: get_dom
# 需要导入模块: from html5lib import HTMLParser [as 别名]
# 或者: from html5lib.HTMLParser import parseFragment [as 别名]
def get_dom(self, buf):
    """Parse ``buf`` into a DOM fragment, or return None when it is blank.

    Surrounding whitespace is stripped before parsing. The parser is built
    with the tokenizer returned by ``self.token_sanitizer()``.
    """
    markup = buf.strip()
    if markup:
        dom_builder = treebuilders.getTreeBuilder("dom")
        parser = HTMLParser(tree=dom_builder, tokenizer=self.token_sanitizer())
        return parser.parseFragment(markup)
    return None
示例3: sanitize
# 需要导入模块: from html5lib import HTMLParser [as 别名]
# 或者: from html5lib.HTMLParser import parseFragment [as 别名]
def sanitize(content):
    """Return ``content`` parsed with the sanitizing tokenizer and re-serialized.

    The fragment is parsed into a DOM tree, walked, and serialized with
    optional tags kept and attribute values quoted; the pieces are joined
    into a single unicode string.
    """
    tokenizer_cls = sanitizer.HTMLSanitizer
    dom_builder = treebuilders.getTreeBuilder("dom")
    fragment = HTMLParser(tokenizer=tokenizer_cls, tree=dom_builder).parseFragment(content)

    walk = treewalkers.getTreeWalker("dom")
    html_serializer = serializer.HTMLSerializer(
        omit_optional_tags=False,
        quote_attr_values=True,
    )
    return u''.join(html_serializer.serialize(walk(fragment)))
示例4: sanitize_html
# 需要导入模块: from html5lib import HTMLParser [as 别名]
# 或者: from html5lib.HTMLParser import parseFragment [as 别名]
def sanitize_html(html):
    """Sanitize an HTML fragment and return it as a single unicode string."""
    parser = HTMLParser(
        tokenizer=HTMLSanitizer,
        tree=treebuilders.getTreeBuilder("dom"),
    )
    fragment = parser.parseFragment(html)

    # Walk the DOM and serialize it back, keeping optional tags and quoting
    # every attribute value.
    token_stream = treewalkers.getTreeWalker("dom")(fragment)
    html_serializer = serializer.HTMLSerializer(
        omit_optional_tags=False,
        quote_attr_values=True,
    )
    return u''.join(html_serializer.serialize(token_stream))
示例5: sanitize_input
# 需要导入模块: from html5lib import HTMLParser [as 别名]
# 或者: from html5lib.HTMLParser import parseFragment [as 别名]
def sanitize_input(chars):
    """
    Sanitize ``chars`` with html5lib, then round-trip it through BeautifulSoup.

    Example usage::

        html1 = "<b>shon</b>"
        html1 = "<b>shon</b><script>zzz</script>"
        print sanitize_input(html1)
    """
    # could use Beautiful Soup here instead
    parser = HTMLParser(
        tokenizer=sanitizer.HTMLSanitizer,
        tree=treebuilders.getTreeBuilder("dom"),
    )
    html_serializer = serializer.htmlserializer.HTMLSerializer(
        omit_optional_tags=False,
        quote_attr_values=True,
    )
    fragment = parser.parseFragment(chars)
    token_stream = treewalkers.getTreeWalker("dom")(fragment)
    sanitized = ''.join(html_serializer.serialize(token_stream))
    # BeautifulSoup is to convert <br> to <br />
    return str(BeautifulSoup(sanitized))
示例6: sanitize_html
# 需要导入模块: from html5lib import HTMLParser [as 别名]
# 或者: from html5lib.HTMLParser import parseFragment [as 别名]
def sanitize_html(input):
    """
    Strip unwanted HTML tags and attributes from a fragment using html5lib.

    The fragment is parsed with the sanitizing tokenizer into a DOM tree and
    serialized back with optional tags kept in the output.

    NOTE(review): the second expected output below shows ``<script>``
    unescaped; it looks like entity escaping was lost at some point --
    confirm before running these as doctests.

    >>> sanitize_html("foobar<p>adf<i></p>abc</i>")
    u'foobar<p>adf<i></i></p><i>abc</i>'
    >>> sanitize_html('foobar<p style="color:red; remove:me; background-image: url(http://example.com/test.php?query_string=bad);">adf<script>alert("Uhoh!")</script><i></p>abc</i>')
    u'foobar<p style="color: red;">adf<script>alert("Uhoh!")</script><i></i></p><i>abc</i>'
    """
    parser = HTMLParser(
        tokenizer=HTMLSanitizer,
        tree=treebuilders.getTreeBuilder("dom"),
    )
    fragment = parser.parseFragment(input)
    token_stream = treewalkers.getTreeWalker("dom")(fragment)
    html_serializer = HTMLSerializer(omit_optional_tags=False)
    return "".join(html_serializer.serialize(token_stream))
示例7: clean_html
# 需要导入模块: from html5lib import HTMLParser [as 别名]
# 或者: from html5lib.HTMLParser import parseFragment [as 别名]
def clean_html(input):
    """
    Re-serialize an HTML fragment through html5lib so the result is well-formed.

    No sanitizing tokenizer is used here: the fragment is only parsed into a
    DOM tree and serialized back with optional tags kept in the output.

    NOTE(review): the expected outputs below look like they lost their HTML
    entity escaping (e.g. ``&amp;``) at some point -- confirm before running
    them as doctests.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>& oops<a href=#foo&bar>This is a <>link</a></p>'
    """
    dom_builder = treebuilders.getTreeBuilder("dom")
    fragment = HTMLParser(tree=dom_builder).parseFragment(input)
    token_stream = treewalkers.getTreeWalker("dom")(fragment)
    html_serializer = HTMLSerializer(omit_optional_tags=False)
    return "".join(html_serializer.serialize(token_stream))
示例8: _cleanTask
# 需要导入模块: from html5lib import HTMLParser [as 别名]
# 或者: from html5lib.HTMLParser import parseFragment [as 别名]
def _cleanTask(self, task, org):
    """Validate and normalise one task row in place, collecting error messages.

    On success the following keys of ``task`` are rewritten: ``description``
    (sanitized HTML), ``time_to_complete`` (int hours), ``mentors`` (keys),
    ``mentor_entities``, ``types`` and ``tags``.

    Args:
      task: Dictionary as constructed by the csv.DictReader().
      org: the GCIOrganization for which the task is created.

    Returns:
      A list of error messages if any have occurred.
    """
    problems = []

    # Title: only its presence is checked.
    if not task['title']:
        problems.append('No valid title present.')

    # Description: run it through the sanitizing parser and normalise newlines.
    try:
        html_parser = HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
        fragment = html_parser.parseFragment(task['description'], encoding='utf-8')
        sanitized = ''.join(node.toxml() for node in fragment.childNodes)
        task['description'] = sanitized.strip().replace('\r\n', '\n')
    except (HTMLParseError, ParseError, TypeError) as e:
        logging.warning('Cleaning of description failed with: %s', e)
        problems.append(
            'Failed to clean the description, do not use naughty HTML such as '
            '<script>.')

    # Time to complete: must be an integer of at least 2 days (48hrs).
    try:
        hours = int(task['time_to_complete'])
        if hours >= 2*24:
            task['time_to_complete'] = hours
        else:
            problems.append('Time to complete must be at least 48 hrs, given was: %s'
                            % hours)
    except (ValueError, TypeError):
        problems.append('No valid time to completion found, given was: %s.'
                        % task['time_to_complete'])

    # Mentors: each distinct id must resolve to an active mentor profile of org.
    mentor_keys = []
    mentor_profiles = []
    for mentor_id in set(task['mentors'].split(',')):
        query = GCIProfile.all()
        query.filter('link_id', mentor_id.strip())
        query.filter('mentor_for', org)
        query.filter('status', 'active')
        profile = query.get()
        if not profile:
            problems.append('%s is not a mentor.' % mentor_id)
            continue
        mentor_keys.append(profile.key())
        mentor_profiles.append(profile)
    task['mentors'] = mentor_keys
    task['mentor_entities'] = mentor_profiles

    program = org.program

    # Task types: keep only those the organization's program declares.
    accepted_types = []
    for raw_type in set(task['task_type'].split(',')):
        candidate = raw_type.strip()
        if candidate in program.task_types:
            accepted_types.append(candidate)
        else:
            problems.append('%s is not a valid task type.' % candidate)
    task['types'] = accepted_types

    # Tags: distinct comma-separated entries, whitespace-trimmed.
    task['tags'] = [raw_tag.strip() for raw_tag in set(task['arbit_tag'].split(','))]

    return problems
示例9: as_unicode
# 需要导入模块: from html5lib import HTMLParser [as 别名]
# 或者: from html5lib.HTMLParser import parseFragment [as 别名]
def as_unicode(cls, events):
    """Yield each row of ``events`` with every cell replaced by sanitized XML.

    Each cell is decoded as UTF-8, parsed as an HTML fragment with the
    sanitizing tokenizer, and overwritten in place with the fragment's
    ``toxml()`` output. One parser instance is shared across all cells.
    """
    fragment_parser = HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    for record in events:
        for position, raw_cell in enumerate(record):
            decoded = unicode(raw_cell, 'utf-8')
            record[position] = fragment_parser.parseFragment(decoded).toxml()
        yield record
示例10: parse
# 需要导入模块: from html5lib import HTMLParser [as 别名]
# 或者: from html5lib.HTMLParser import parseFragment [as 别名]
def parse(self, value):
    """Parse ``value`` as an HTML fragment using the "dom" tree builder."""
    dom_builder = treebuilders.getTreeBuilder("dom")
    return HTMLParser(tree=dom_builder).parseFragment(value)
示例11: test_self_closing_col
# 需要导入模块: from html5lib import HTMLParser [as 别名]
# 或者: from html5lib.HTMLParser import parseFragment [as 别名]
def test_self_closing_col():
    """A self-closing ``<col />`` inside a colgroup must not record parse errors."""
    fragment_parser = HTMLParser()
    markup = '<table><colgroup><col /></colgroup></table>'
    fragment_parser.parseFragment(markup)
    assert not fragment_parser.errors