本文整理汇总了 Python 中 html5lib.parseFragment 方法的典型用法代码示例。如果您正苦于以下问题：Python html5lib.parseFragment 方法的具体用法？Python html5lib.parseFragment 怎么用？Python html5lib.parseFragment 使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块 html5lib 的用法示例。
在下文中一共展示了html5lib.parseFragment方法的12个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: runtest
# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import parseFragment [as 别名]
def runtest(self):
    """Check that sanitizing the test input yields the expected output.

    ``self.test["input"]`` is parsed as an HTML fragment and serialized
    with sanitization switched on; the result must be identical to
    ``self.test["output"]``.  On mismatch the assertion message lists the
    input, the expected text and the text actually produced.
    """
    source = self.test["input"]
    wanted = self.test["output"]

    # Fixed serializer settings so the output is directly comparable
    # with the stored expectation.
    serializer_options = {
        "sanitize": True,
        "omit_optional_tags": False,
        "use_trailing_solidus": True,
        "space_before_trailing_solidus": False,
        "quote_attr_values": "always",
        "quote_char": "'",
        "alphabetical_attributes": True,
    }
    actual = serialize(parseFragment(source), **serializer_options)

    report_lines = [
        "\n\nInput:", source,
        "\nExpected:", wanted,
        "\nReceived:", actual,
    ]
    failure_report = "\n".join(report_lines)
    assert wanted == actual, failure_report
示例2: span_serialize
# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import parseFragment [as 别名]
def span_serialize(self, attributes, max_length=None):
    """Returns concatenated HTML code with SPAN tag.

    Every chunk for which ``has_cjk()`` is true is wrapped in its own SPAN
    element, unless ``max_length`` is set and the chunk's word is longer
    than that.  All other chunks are emitted as plain text.  The assembled
    markup is finally re-parsed and serialized by html5lib with
    sanitization enabled.

    Args:
      attributes (dict): A map of name-value pairs for attributes of output
          SPAN tags.
      max_length (int, optional): Maximum length of span enclosed chunk.

    Returns:
      The organized HTML code. (str)
    """
    doc = ET.Element('span')
    for chunk in self:
        if (chunk.has_cjk() and
                not (max_length and len(chunk.word) > max_length)):
            ele = ET.Element('span')
            ele.text = chunk.word
            ele.attrib.update(attributes)
            doc.append(ele)
        elif len(doc):
            # Text following an element is stored in that element's tail.
            # Children are accessed by indexing because
            # Element.getchildren() no longer exists on Python 3.9+.
            last = doc[-1]
            last.tail = (last.tail or '') + chunk.word
        else:
            # No child element yet: the text belongs to the root itself.
            doc.text = (doc.text or '') + chunk.word
    result = ET.tostring(doc, encoding='utf-8').decode('utf-8')
    result = html5lib.serialize(
        html5lib.parseFragment(result), sanitize=True,
        quote_attr_values='always')
    return result
示例3: wbr_serialize
# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import parseFragment [as 别名]
def wbr_serialize(self):
    """Returns concatenated HTML code with WBR tag. This is still experimental.

    The chunks are collected inside one SPAN styled with
    ``word-break: keep-all``.  A WBR element is inserted before a chunk
    when ``has_cjk()`` is true for it and the SPAN already has leading
    text; every other chunk is appended as plain text.  The markup is then
    passed through the html5lib sanitizer filter, extended to allow the
    ``wbr`` element and the ``word-break`` CSS property.

    Returns:
      The organized HTML code. (str)
    """
    doc = ET.Element('span')
    doc.attrib['style'] = 'word-break: keep-all'
    for chunk in self:
        if chunk.has_cjk() and doc.text:
            # The word goes into the tail of the break opportunity it follows.
            ele = ET.Element('wbr')
            ele.tail = chunk.word
            doc.append(ele)
        elif len(doc):
            # Text following an element is stored in that element's tail.
            # Children are accessed by indexing because
            # Element.getchildren() no longer exists on Python 3.9+.
            last = doc[-1]
            last.tail = (last.tail or '') + chunk.word
        else:
            # No child element yet: the text belongs to the root itself.
            doc.text = (doc.text or '') + chunk.word
    content = ET.tostring(doc, encoding='utf-8').decode('utf-8')
    dom = html5lib.parseFragment(content)
    treewalker = getTreeWalker('etree')
    stream = treewalker(dom)
    serializer = html5lib.serializer.HTMLSerializer(
        quote_attr_values='always')
    # Start from the sanitizer's allow-lists and add what this output needs.
    allowed_elements = set(sanitizer.allowed_elements)
    allowed_elements.add((namespaces['html'], 'wbr'))
    allowed_css_properties = set(sanitizer.allowed_css_properties)
    allowed_css_properties.add('word-break')
    result = serializer.render(sanitizer.Filter(
        stream, allowed_elements=allowed_elements,
        allowed_css_properties=allowed_css_properties,
    ))
    return result
示例4: preprocess
# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import parseFragment [as 别名]
def preprocess(source):
    """Strip markup, line breaks and redundant white space from a sentence.

    The input is parsed as an HTML fragment and reduced to its text
    content.  Newlines are then removed, the ends are trimmed, and every
    run of two or more whitespace characters becomes a single space.

    Args:
      source (str): Input sentence.

    Returns:
      Preprocessed sentence. (str)
    """
    fragment = html5lib.parseFragment(source)
    raw_text = ET.tostring(fragment, encoding='utf-8', method='text')
    text = raw_text.decode('utf-8').replace(u'\n', u'').strip()
    return re.sub(r'\s\s+', u' ', text)
示例5: transform_collapsibles
# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import parseFragment [as 别名]
def transform_collapsibles(text):
    """Expand simple collapsible elements into their full HTML form.

    The text is parsed inside a ``div`` container.  Each direct child
    ``div.collapsible-item`` that holds both a ``collapsible-item-title``
    and a ``collapsible-item-body`` div has its content replaced by the
    rendered ``a4ckeditor/collapsible_fragment.html`` template.  Items
    missing either part are left untouched.  Returns the serialized tree.
    """
    root = parseFragment(text, container='div', treebuilder='etree',
                         namespaceHTMLElements=False)

    # The digits of the current timestamp serve as an id prefix for this call.
    stamp = ''.join(ch for ch in str(time.time()) if ch.isdigit())

    items = root.findall('./div[@class="collapsible-item"]')
    for index, item in enumerate(items):
        heading = item.find('./div[@class="collapsible-item-title"]')
        content = item.find('./div[@class="collapsible-item-body"]')
        if heading is None or content is None:
            continue

        # Drop the marker classes before the parts are serialized
        # into the template context.
        heading.tag = 'span'
        del heading.attrib['class']
        content.tag = 'div'
        del content.attrib['class']

        context = {
            'id': 'a4ckeditor-collapsible-{}_{}'.format(stamp, index),
            'title': serialize(heading),
            'body': serialize(content),
        }
        rendered = render_to_string(
            'a4ckeditor/collapsible_fragment.html', context)

        # Swap the item's old content for the rendered fragment.
        item.clear()
        replacement = parseFragment(rendered, treebuilder='etree',
                                    namespaceHTMLElements=False)
        item.append(replacement)

    return serialize(root)
示例6: walker_from_text
# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import parseFragment [as 别名]
def walker_from_text(self, text):
    """Parse ``text`` as an HTML fragment and return an "etree" tree walker over it."""
    fragment = html5lib.parseFragment(text)
    return html5lib.treewalkers.getTreeWalker("etree")(fragment)
示例7: run
# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import parseFragment [as 别名]
def run(self, text):
    """Sanitize ``text`` with ForgeHTMLSanitizerFilter and return the rendered HTML.

    This follows the same steps as ``html5lib.serialize(parsed,
    sanitize=True)``, except that the custom ForgeHTMLSanitizerFilter is
    applied in place of the standard sanitizer.
    """
    fragment = html5lib.parseFragment(text)
    walker_cls = html5lib.treewalkers.getTreeWalker("etree")
    # The custom step: wrap the token stream in our own sanitizer filter.
    filtered_stream = ForgeHTMLSanitizerFilter(walker_cls(fragment))
    return html5lib.serializer.HTMLSerializer().render(filtered_stream)
示例8: test_no_duplicate_clone
# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import parseFragment [as 别名]
def test_no_duplicate_clone():
    """The mis-nested markup below must parse into a fragment of length 2."""
    markup = "<b><em><foo><foob><fooc><aside></b></em>"
    fragment = parseFragment(markup)
    child_count = len(fragment)
    assert child_count == 2
示例9: test_self_closing_col
# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import parseFragment [as 别名]
def test_self_closing_col():
    """Parsing a self-closing ``<col />`` in a table must record no parser errors."""
    markup = '<table><colgroup><col /></colgroup></table>'
    fragment_parser = HTMLParser()
    fragment_parser.parseFragment(markup)
    recorded_errors = fragment_parser.errors
    assert not recorded_errors
示例10: runSanitizerTest
# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import parseFragment [as 别名]
def runSanitizerTest(_, expected, input):
    """Assert that sanitizing ``input`` gives the normalized form of ``expected``.

    ``expected`` is parsed and serialized without sanitization, using
    fixed serializer options, and the result is compared with
    ``sanitize_html(input)``.  The first argument is ignored.
    """
    normalization_options = {
        "omit_optional_tags": False,
        "use_trailing_solidus": True,
        "space_before_trailing_solidus": False,
        "quote_attr_values": "always",
        "quote_char": '"',
        "alphabetical_attributes": True,
    }
    reference = serialize(parseFragment(expected), **normalization_options)
    assert reference == sanitize_html(input)
示例11: sanitize_html
# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import parseFragment [as 别名]
def sanitize_html(stream):
    """Parse ``stream`` as an HTML fragment and return it serialized with sanitization on.

    Serialization uses fixed options: optional tags kept, trailing solidus
    without a preceding space, attribute values always quoted with double
    quotes, and attributes in alphabetical order.
    """
    fragment = parseFragment(stream)
    serializer_options = {
        "sanitize": True,
        "omit_optional_tags": False,
        "use_trailing_solidus": True,
        "space_before_trailing_solidus": False,
        "quote_attr_values": "always",
        "quote_char": '"',
        "alphabetical_attributes": True,
    }
    return serialize(fragment, **serializer_options)
示例12: html
# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import parseFragment [as 别名]
def html(self):
    """Parse ``self.content`` with html5lib and return the parsed fragment.

    html5lib is imported on first use, and the imported module is stored
    on ``self.html5lib``.

    Raises:
        ImproperlyConfigured: if an ImportError occurs while importing
            html5lib or parsing the content.
    """
    try:
        import html5lib
        self.html5lib = html5lib
        return html5lib.parseFragment(self.content)
    except ImportError as err:
        # "except ImportError, err" only parses on Python 2; the "as"
        # form works on Python 2.6+ and Python 3 alike.
        raise ImproperlyConfigured("Error while importing html5lib: %s" % err)