Python HTMLParser.parseFragment方法代码示例

本文整理汇总了Python中html5lib.HTMLParser.parseFragment方法的典型用法代码示例。如果您正苦于以下问题：Python HTMLParser.parseFragment方法的具体用法？Python HTMLParser.parseFragment怎么用？Python HTMLParser.parseFragment使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类html5lib.HTMLParser的用法示例。

在下文中一共展示了HTMLParser.parseFragment方法的11个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: clean_html

# 需要导入模块: from html5lib import HTMLParser [as 别名]
# 或者: from html5lib.HTMLParser import parseFragment [as 别名]
def clean_html(input, sanitize=False):
    """
    Round-trip an HTML fragment through html5lib so the result is well-formed.

    The fragment is parsed into a DOM tree and serialized back to a string
    with optional tags kept (``omit_optional_tags=False``).

    :param input: The HTML fragment to process.
    :param sanitize: Remove unwanted HTML tags and attributes.
    :return: The re-serialized HTML as a single string.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>&amp; oops<a href=#foo&amp;bar>This is a &lt;&gt;link</a></p>'
    """
    parser_options = {}
    serializer_options = {}
    if sanitize:
        if HTMLSanitizer is not None:
            # Sanitizing is done by the tokenizer handed to the parser.
            parser_options["tokenizer"] = HTMLSanitizer
        else:
            # new syntax as of 0.99999999/1.0b9 (Released on July 14, 2016)
            serializer_options["sanitize"] = True

    dom_builder = treebuilders.getTreeBuilder("dom")
    fragment = HTMLParser(tree=dom_builder, **parser_options).parseFragment(input)
    token_stream = treewalkers.getTreeWalker("dom")(fragment)

    html_serializer = HTMLSerializer(omit_optional_tags=False, **serializer_options)
    pieces = html_serializer.serialize(token_stream)
    return "".join(pieces)

开发者ID:django-fluent，项目名称:django-fluent-contents，代码行数:31，代码来源:html.py

示例2: get_dom

# 需要导入模块: from html5lib import HTMLParser [as 别名]
# 或者: from html5lib.HTMLParser import parseFragment [as 别名]
 def get_dom(self, buf):
     """Parse ``buf`` as an HTML fragment into a DOM tree.

     Surrounding whitespace is stripped first. The parser is built with
     the tokenizer returned by ``self.token_sanitizer()``.

     Returns ``None`` when nothing is left after stripping, otherwise the
     result of ``parseFragment``.
     """
     markup = buf.strip()
     if markup:
         dom_builder = treebuilders.getTreeBuilder("dom")
         fragment_parser = HTMLParser(tree=dom_builder,
                                      tokenizer=self.token_sanitizer())
         return fragment_parser.parseFragment(markup)
     return None

开发者ID:riffm，项目名称:iktomi，代码行数:9，代码来源:html.py

示例3: sanitize

# 需要导入模块: from html5lib import HTMLParser [as 别名]
# 或者: from html5lib.HTMLParser import parseFragment [as 别名]
def sanitize(content):
    """Parse ``content`` with the sanitizing tokenizer and re-serialize it.

    The fragment is parsed into a DOM tree, walked, and serialized with
    optional tags kept and attribute values quoted. Returns the serialized
    pieces joined into one unicode string.
    """
    fragment_parser = HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
                                 tree=treebuilders.getTreeBuilder("dom"))
    fragment = fragment_parser.parseFragment(content)
    token_stream = treewalkers.getTreeWalker("dom")(fragment)
    html_serializer = serializer.HTMLSerializer(omit_optional_tags=False,
                                                quote_attr_values=True)
    return u''.join(html_serializer.serialize(token_stream))

开发者ID:jeffrz，项目名称:airship，代码行数:12，代码来源:content_sanitizer.py

示例4: sanitize_html

# 需要导入模块: from html5lib import HTMLParser [as 别名]
# 或者: from html5lib.HTMLParser import parseFragment [as 别名]
def sanitize_html(html):
    """Sanitizes an HTML fragment.

    The fragment is parsed into a DOM tree using ``HTMLSanitizer`` as the
    tokenizer, then serialized back with optional tags kept and attribute
    values quoted. Returns the result as one unicode string.
    """
    dom_parser = HTMLParser(tokenizer=HTMLSanitizer,
                            tree=treebuilders.getTreeBuilder("dom"))
    fragment = dom_parser.parseFragment(html)
    make_walker = treewalkers.getTreeWalker("dom")
    html_serializer = serializer.HTMLSerializer(omit_optional_tags=False,
                                                quote_attr_values=True)
    return u''.join(html_serializer.serialize(make_walker(fragment)))

开发者ID:teknolab，项目名称:teknolab-osqa，代码行数:13，代码来源:html.py

示例5: sanitize_input

# 需要导入模块: from html5lib import HTMLParser [as 别名]
# 或者: from html5lib.HTMLParser import parseFragment [as 别名]
def sanitize_input(chars):
    """Sanitize an HTML fragment and normalize it through BeautifulSoup.

    The fragment is parsed with html5lib's sanitizing tokenizer into a DOM
    tree, serialized with optional tags kept and attribute values quoted,
    and the resulting markup is finally passed through BeautifulSoup.

    Example::

        html1 = "<b>shon</b><script>zzz</script>"
        print(sanitize_input(html1))

    :param chars: HTML fragment to sanitize.
    :return: ``str()`` of the BeautifulSoup object built from the sanitized markup.
    """
    p = HTMLParser(tokenizer=sanitizer.HTMLSanitizer, tree=treebuilders.getTreeBuilder("dom"))  # could use Beautiful Soup here instead
    s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False, quote_attr_values=True)
    dom_tree = p.parseFragment(chars)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    # str.join consumes the serializer's output directly; wrapping it in
    # another generator expression added nothing.
    out = ''.join(s.serialize(stream))
    return str(BeautifulSoup(out))  # BeautifulSoup is to convert <br> to <br />

开发者ID:mightymau，项目名称:hubspace，代码行数:16，代码来源:uiutils.py

示例6: sanitize_html

# 需要导入模块: from html5lib import HTMLParser [as 别名]
# 或者: from html5lib.HTMLParser import parseFragment [as 别名]
def sanitize_html(input):
    """
    Strip unwanted HTML tags and attributes from a fragment, using html5lib.

    The fragment is parsed into a DOM tree with ``HTMLSanitizer`` as the
    tokenizer and serialized back with optional tags kept.

    >>> sanitize_html("foobar<p>adf<i></p>abc</i>")
    u'foobar<p>adf<i></i></p><i>abc</i>'
    >>> sanitize_html('foobar<p style="color:red; remove:me; background-image: url(http://example.com/test.php?query_string=bad);">adf<script>alert("Uhoh!")</script><i></p>abc</i>')
    u'foobar<p style="color: red;">adf&lt;script&gt;alert("Uhoh!")&lt;/script&gt;<i></i></p><i>abc</i>'
    """
    dom_builder = treebuilders.getTreeBuilder("dom")
    sanitizing_parser = HTMLParser(tokenizer=HTMLSanitizer, tree=dom_builder)
    fragment = sanitizing_parser.parseFragment(input)
    token_stream = treewalkers.getTreeWalker("dom")(fragment)

    pieces = HTMLSerializer(omit_optional_tags=False).serialize(token_stream)
    return "".join(pieces)

开发者ID:philippbosch，项目名称:django-fluent-contents，代码行数:18，代码来源:html.py

示例7: clean_html

# 需要导入模块: from html5lib import HTMLParser [as 别名]
# 或者: from html5lib.HTMLParser import parseFragment [as 别名]
def clean_html(input):
    """
    Round-trip an HTML fragment through html5lib so the result is well-formed.

    The fragment is parsed into a DOM tree and serialized back to a string
    with optional tags kept.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>&amp; oops<a href=#foo&amp;bar>This is a &lt;&gt;link</a></p>'
    """
    dom_builder = treebuilders.getTreeBuilder("dom")
    fragment = HTMLParser(tree=dom_builder).parseFragment(input)
    token_stream = treewalkers.getTreeWalker("dom")(fragment)

    html_serializer = HTMLSerializer(omit_optional_tags=False)
    pieces = html_serializer.serialize(token_stream)
    return "".join(pieces)

开发者ID:philippbosch，项目名称:django-fluent-contents，代码行数:20，代码来源:html.py

示例8: _cleanTask

# 需要导入模块: from html5lib import HTMLParser [as 别名]
# 或者: from html5lib.HTMLParser import parseFragment [as 别名]
  def _cleanTask(self, task, org):
    """Cleans the data given so that it can be safely stored as a task.

    The task dictionary is updated in place:
      - 'description' is replaced by its sanitized form when parsing succeeds.
      - 'time_to_complete' is replaced by an int when it is a valid number of
        hours and at least 48.
      - 'mentors' is replaced by a list of keys of the matching profiles, and
        'mentor_entities' holds the profiles themselves.
      - 'types' and 'tags' are set from 'task_type' and 'arbit_tag'.

    Items of the comma-separated fields ('mentors', 'task_type', 'arbit_tag')
    are stripped of surrounding whitespace before duplicates are removed, so
    "a, a" counts as a single item. Items are handled in the order in which
    they appear in the field.

    Args:
      task: Dictionary as constructed by the csv.DictReader().
      org: the GCIOrganization for which the task is created.

    Returns:
      A list of error messages if any have occurred.
    """

    def split_unique(raw):
      """Returns the stripped, de-duplicated items of a comma-separated string."""
      seen = set()
      items = []
      for item in raw.split(','):
        item = item.strip()
        if item not in seen:
          seen.add(item)
          items.append(item)
      return items

    errors = []

    # check title
    if not task['title']:
      errors.append('No valid title present.')

    # clean description
    try:
      parser = HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
      parsed = parser.parseFragment(task['description'], encoding='utf-8')
      cleaned_string = ''.join([tag.toxml() for tag in parsed.childNodes])
      task['description'] = cleaned_string.strip().replace('\r\n', '\n')
    except (HTMLParseError, ParseError, TypeError) as e:
      logging.warning('Cleaning of description failed with: %s', e)
      errors.append(
          'Failed to clean the description, do not use naughty HTML such as '
          '<script>.')

    # clean time to complete
    try:
      hours_to_complete = int(task['time_to_complete'])

      # Must be at least 2 days (48hrs)
      if hours_to_complete < 2*24:
        errors.append('Time to complete must be at least 48 hrs, given was: %s'
                      % hours_to_complete)
      else:
        task['time_to_complete'] = hours_to_complete
    except (ValueError, TypeError):
      errors.append('No valid time to completion found, given was: %s.'
                    % task['time_to_complete'])

    # clean mentors
    mentors = []
    mentor_entities = []
    for mentor_id in split_unique(task['mentors']):
      # Only active profiles that mentor for this organization qualify.
      q = GCIProfile.all()
      q.filter('link_id', mentor_id)
      q.filter('mentor_for', org)
      q.filter('status', 'active')
      mentor = q.get()
      if mentor:
        mentors.append(mentor.key())
        mentor_entities.append(mentor)
      else:
        errors.append('%s is not a mentor.' % mentor_id)

    task['mentors'] = mentors
    task['mentor_entities'] = mentor_entities

    program_entity = org.program

    # clean task types
    types = []
    for task_type in split_unique(task['task_type']):
      if task_type in program_entity.task_types:
        types.append(task_type)
      else:
        errors.append('%s is not a valid task type.' % task_type)
    task['types'] = types

    # clean task tags
    task['tags'] = split_unique(task['arbit_tag'])

    return errors

开发者ID:rhyolight，项目名称:nupic.son，代码行数:84，代码来源:bulk_create.py

示例9: as_unicode

# 需要导入模块: from html5lib import HTMLParser [as 别名]
# 或者: from html5lib.HTMLParser import parseFragment [as 别名]
 def as_unicode(cls, events):
     """Yield each row of ``events`` after sanitizing its cells in place.

     Every cell is decoded as UTF-8, parsed as an HTML fragment by a parser
     using the sanitizing tokenizer, and overwritten with the ``toxml()``
     output of the parse result. The same row object is then yielded.
     """
     fragment_parser = HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
     for record in events:
         for position, raw_cell in enumerate(record):
             decoded = unicode(raw_cell, 'utf-8')
             fragment = fragment_parser.parseFragment(decoded)
             record[position] = fragment.toxml()
         yield record

开发者ID:Faldrian，项目名称:kuma，代码行数:8，代码来源:models.py

示例10: parse

# 需要导入模块: from html5lib import HTMLParser [as 别名]
# 或者: from html5lib.HTMLParser import parseFragment [as 别名]
 def parse(self, value):
     """Parse ``value`` as an HTML fragment using the "dom" tree builder.

     Returns whatever ``HTMLParser.parseFragment`` produces for ``value``.
     """
     dom_builder = treebuilders.getTreeBuilder("dom")
     fragment_parser = HTMLParser(tree=dom_builder)
     return fragment_parser.parseFragment(value)

开发者ID:unk2k，项目名称:iktomi，代码行数:6，代码来源:widgets.py

示例11: test_self_closing_col

# 需要导入模块: from html5lib import HTMLParser [as 别名]
# 或者: from html5lib.HTMLParser import parseFragment [as 别名]
def test_self_closing_col():
    """A self-closing ``<col />`` inside a colgroup must not record parse errors."""
    markup = '<table><colgroup><col /></colgroup></table>'
    fragment_parser = HTMLParser()
    fragment_parser.parseFragment(markup)
    assert not fragment_parser.errors

开发者ID:Coder206，项目名称:servo，代码行数:6，代码来源:test_parser2.py

注：本文中的html5lib.HTMLParser.parseFragment方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。