当前位置: 首页>>代码示例>>Python>>正文


Python html5lib.parseFragment方法代码示例

本文整理汇总了Python中html5lib.parseFragment方法的典型用法代码示例。如果您正苦于以下问题:Python html5lib.parseFragment方法的具体用法?Python html5lib.parseFragment怎么用?Python html5lib.parseFragment使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在html5lib的用法示例。


在下文中一共展示了html5lib.parseFragment方法的12个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: runtest

# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import parseFragment [as 别名]
def runtest(self):
        """Check that sanitizing the test input yields the expected output.

        Reads ``self.test["input"]`` and ``self.test["output"]``, parses the
        input as an HTML fragment, serializes it with ``sanitize=True`` and
        asserts that the serialized text equals the expected text. On
        mismatch the assertion message lists input, expected and received.
        """
        source = self.test["input"]
        wanted = self.test["output"]

        serializer_options = dict(
            sanitize=True,
            omit_optional_tags=False,
            use_trailing_solidus=True,
            space_before_trailing_solidus=False,
            quote_attr_values="always",
            quote_char="'",
            alphabetical_attributes=True,
        )
        received = serialize(parseFragment(source), **serializer_options)

        # The report is assembled before the assertion so the comparison
        # itself stays a single readable line.
        report_lines = [
            "\n\nInput:", source,
            "\nExpected:", wanted,
            "\nReceived:", received,
        ]
        failure_report = "\n".join(report_lines)
        assert wanted == received, failure_report
开发者ID:morpheus65535,项目名称:bazarr,代码行数:19,代码来源:sanitizer.py

示例2: span_serialize

# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import parseFragment [as 别名]
def span_serialize(self, attributes, max_length=None):
    """Returns concatenated HTML code with SPAN tag.

    Every chunk that contains CJK text (and is not longer than
    ``max_length``, when given) is wrapped in its own SPAN element carrying
    ``attributes``. All other chunks are emitted as plain text following the
    previous SPAN, or as leading text when no SPAN has been emitted yet. The
    resulting markup is round-tripped through html5lib with sanitizing on.

    Args:
      attributes (dict): A map of name-value pairs for attributes of output
          SPAN tags.
      max_length (int, optional): Maximum length of span enclosed chunk.

    Returns:
      The organized HTML code. (str)
    """
    doc = ET.Element('span')
    for chunk in self:
      if (chunk.has_cjk() and
          not (max_length and len(chunk.word) > max_length)):
        ele = ET.Element('span')
        ele.text = chunk.word
        ele.attrib.update(attributes)
        doc.append(ele)
      elif len(doc):
        # add word without span tag for non-CJK text (e.g. English)
        # by appending it after the last element.
        # NOTE: Element.getchildren() was removed in Python 3.9; len() and
        # indexing on the element are the supported equivalents.
        last = doc[-1]
        if last.tail is None:
          last.tail = chunk.word
        else:
          last.tail += chunk.word
      else:
        # No child element yet: the word becomes leading text of the root.
        if doc.text is None:
          doc.text = chunk.word
        else:
          doc.text += chunk.word
    result = ET.tostring(doc, encoding='utf-8').decode('utf-8')
    result = html5lib.serialize(
        html5lib.parseFragment(result), sanitize=True,
        quote_attr_values='always')
    return result
开发者ID:google,项目名称:budou,代码行数:40,代码来源:chunk.py

示例3: wbr_serialize

# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import parseFragment [as 别名]
def wbr_serialize(self):
    """Returns concatenated HTML code with WBR tag. This is still experimental.

    The chunks are collected inside one SPAN styled with
    ``word-break: keep-all``. A WBR element is inserted before each chunk
    that contains CJK text, except while the SPAN has no leading text yet.
    The markup is then re-parsed with html5lib and rendered through a
    sanitizer filter that additionally allows the ``wbr`` element and the
    ``word-break`` CSS property.

    Returns:
      The organized HTML code. (str)
    """
    doc = ET.Element('span')
    doc.attrib['style'] = 'word-break: keep-all'
    for chunk in self:
      if (chunk.has_cjk() and doc.text):
        ele = ET.Element('wbr')
        doc.append(ele)
        # ele is now the last child, so its tail is the text after the WBR.
        ele.tail = chunk.word
      elif len(doc):
        # add word without span tag for non-CJK text (e.g. English)
        # by appending it after the last element.
        # NOTE: Element.getchildren() was removed in Python 3.9; len() and
        # indexing on the element are the supported equivalents.
        last = doc[-1]
        if last.tail is None:
          last.tail = chunk.word
        else:
          last.tail += chunk.word
      else:
        # No child element yet: the word becomes leading text of the root.
        if doc.text is None:
          doc.text = chunk.word
        else:
          doc.text += chunk.word
    content = ET.tostring(doc, encoding='utf-8').decode('utf-8')
    dom = html5lib.parseFragment(content)
    treewalker = getTreeWalker('etree')
    stream = treewalker(dom)
    serializer = html5lib.serializer.HTMLSerializer(
            quote_attr_values='always')
    # Extend copies of the sanitizer's allow-lists so that the WBR elements
    # and the word-break style added above survive sanitizing.
    allowed_elements = set(sanitizer.allowed_elements)
    allowed_elements.add((namespaces['html'], 'wbr'))
    allowed_css_properties = set(sanitizer.allowed_css_properties)
    allowed_css_properties.add('word-break')
    result = serializer.render(sanitizer.Filter(
        stream, allowed_elements=allowed_elements,
        allowed_css_properties=allowed_css_properties,
        ))
    return result
开发者ID:google,项目名称:budou,代码行数:43,代码来源:chunk.py

示例4: preprocess

# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import parseFragment [as 别名]
def preprocess(source):
  """Reduces an HTML fragment to compact plain text.

  The input is parsed as an HTML fragment and serialized with the ``text``
  method. Newlines are then removed, the ends are stripped, and every run of
  two or more whitespace characters is replaced by a single space.

  Args:
    source (str): Input sentence.

  Returns:
    Preprocessed sentence. (str)
  """
  fragment = html5lib.parseFragment(source)
  raw_text = ET.tostring(fragment, encoding='utf-8', method='text')
  flattened = raw_text.decode('utf-8').replace(u'\n', u'').strip()
  return re.sub(r'\s\s+', u' ', flattened)
开发者ID:google,项目名称:budou,代码行数:16,代码来源:parser.py

示例5: transform_collapsibles

# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import parseFragment [as 别名]
def transform_collapsibles(text):
    """Expand simple collapsible items in ``text`` into their full HTML.

    ``text`` is parsed as a fragment inside a ``div`` container. Each direct
    child ``div`` with class ``collapsible-item`` that has both a
    ``collapsible-item-title`` and a ``collapsible-item-body`` child is
    emptied and refilled with the rendered
    ``a4ckeditor/collapsible_fragment.html`` template. Items missing either
    child are left untouched. Returns the serialized tree.
    """
    parse_options = dict(treebuilder='etree', namespaceHTMLElements=False)
    root = parseFragment(text, container='div', **parse_options)

    # Digits of the current timestamp, used as a prefix for the element ids.
    stamp = ''.join(ch for ch in str(time.time()) if ch.isdigit())

    items = root.findall('./div[@class="collapsible-item"]')
    for index, item in enumerate(items):
        heading = item.find('./div[@class="collapsible-item-title"]')
        content = item.find('./div[@class="collapsible-item-body"]')
        if heading is None or content is None:
            continue

        # Retag the parts and drop their marker classes before they are
        # serialized into the template context.
        heading.tag = 'span'
        del heading.attrib['class']
        content.tag = 'div'
        del content.attrib['class']

        context = {
            'id': 'a4ckeditor-collapsible-{}_{}'.format(stamp, index),
            'title': serialize(heading),
            'body': serialize(content),
        }
        rendered = render_to_string(
            'a4ckeditor/collapsible_fragment.html', context)

        item.clear()
        item.append(parseFragment(rendered, **parse_options))

    return serialize(root)
开发者ID:liqd,项目名称:adhocracy4,代码行数:33,代码来源:ckeditor_tags.py

示例6: walker_from_text

# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import parseFragment [as 别名]
def walker_from_text(self, text):
        """Parse ``text`` as an HTML fragment and return an etree tree walker over it."""
        fragment = html5lib.parseFragment(text)
        make_walker = html5lib.treewalkers.getTreeWalker("etree")
        return make_walker(fragment)
开发者ID:apache,项目名称:allura,代码行数:7,代码来源:test_utils.py

示例7: run

# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import parseFragment [as 别名]
def run(self, text):
        """Sanitize ``text`` with ForgeHTMLSanitizerFilter and return the rendered HTML.

        The text is parsed as an HTML fragment, walked with the etree tree
        walker, passed through ``ForgeHTMLSanitizerFilter`` and rendered by a
        default ``HTMLSerializer``.
        """
        fragment = html5lib.parseFragment(text)

        # Without a custom sanitizer this whole method could simply be:
        #     return html5lib.serialize(parsed, sanitize=True)
        # The steps below mirror that function, but plug in
        # ForgeHTMLSanitizerFilter where sanitize=True would have used the
        # standard filter.
        etree_walker = html5lib.treewalkers.getTreeWalker("etree")
        token_stream = ForgeHTMLSanitizerFilter(etree_walker(fragment))  # the custom step
        return html5lib.serializer.HTMLSerializer().render(token_stream)
开发者ID:apache,项目名称:allura,代码行数:15,代码来源:markdown_extensions.py

示例8: test_no_duplicate_clone

# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import parseFragment [as 别名]
def test_no_duplicate_clone():
    """The mis-nested fragment below must parse into exactly two top-level nodes."""
    markup = "<b><em><foo><foob><fooc><aside></b></em>"
    fragment = parseFragment(markup)
    assert len(fragment) == 2
开发者ID:morpheus65535,项目名称:bazarr,代码行数:5,代码来源:test_parser2.py

示例9: test_self_closing_col

# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import parseFragment [as 别名]
def test_self_closing_col():
    """Parsing a self-closing ``<col />`` inside a colgroup must record no errors."""
    markup = '<table><colgroup><col /></colgroup></table>'
    html_parser = HTMLParser()
    html_parser.parseFragment(markup)
    assert not html_parser.errors
开发者ID:morpheus65535,项目名称:bazarr,代码行数:6,代码来源:test_parser2.py

示例10: runSanitizerTest

# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import parseFragment [as 别名]
def runSanitizerTest(_, expected, input):
    """Assert that sanitizing ``input`` matches the normalized ``expected`` markup.

    ``expected`` is parsed and re-serialized (without sanitizing) using the
    serializer options below, and the result is compared with
    ``sanitize_html(input)``. The first argument is ignored.
    """
    serializer_options = dict(
        omit_optional_tags=False,
        use_trailing_solidus=True,
        space_before_trailing_solidus=False,
        quote_attr_values="always",
        quote_char='"',
        alphabetical_attributes=True,
    )
    reference = serialize(parseFragment(expected), **serializer_options)
    assert reference == sanitize_html(input)
开发者ID:morpheus65535,项目名称:bazarr,代码行数:12,代码来源:test_sanitizer.py

示例11: sanitize_html

# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import parseFragment [as 别名]
def sanitize_html(stream):
    """Parse ``stream`` as an HTML fragment and return it serialized with sanitizing on."""
    serializer_options = dict(
        sanitize=True,
        omit_optional_tags=False,
        use_trailing_solidus=True,
        space_before_trailing_solidus=False,
        quote_attr_values="always",
        quote_char='"',
        alphabetical_attributes=True,
    )
    fragment = parseFragment(stream)
    return serialize(fragment, **serializer_options)
开发者ID:morpheus65535,项目名称:bazarr,代码行数:13,代码来源:test_sanitizer.py

示例12: html

# 需要导入模块: import html5lib [as 别名]
# 或者: from html5lib import parseFragment [as 别名]
def html(self):
        """Parse ``self.content`` as an HTML fragment using html5lib.

        html5lib is imported lazily inside the method and the module is
        stored on ``self.html5lib``.

        Returns:
            The result of ``html5lib.parseFragment(self.content)``.

        Raises:
            ImproperlyConfigured: if an ImportError occurs while importing
                html5lib or while parsing.
        """
        try:
            import html5lib
            self.html5lib = html5lib
            return html5lib.parseFragment(self.content)
        # "except X as err" replaces the Python 2-only "except X, err" form,
        # which is a SyntaxError on Python 3; "as" works on 2.6+ and 3.
        except ImportError as err:
            raise ImproperlyConfigured("Error while importing html5lib: %s" % err)
开发者ID:canvasnetworks,项目名称:canvas,代码行数:9,代码来源:html5lib.py


注:本文中的html5lib.parseFragment方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。