当前位置: 首页>>代码示例>>Python>>正文


Python clean.Cleaner方法代码示例

本文整理汇总了Python中lxml.html.clean.Cleaner类的典型用法代码示例。如果您正苦于以下问题:Python clean.Cleaner类的具体用法?Python clean.Cleaner怎么用?Python clean.Cleaner使用的例子?那么恭喜您,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块lxml.html.clean的用法示例。


在下文中一共展示了clean.Cleaner类的8个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: raw_scraper

# 需要导入模块: from lxml.html import clean [as 别名]
# 或者: from lxml.html.clean import Cleaner [as 别名]
def raw_scraper(url, memoize):
    """Download *url* and return its minified, cleaned HTML plus metadata.

    Args:
        url: Address of the article to fetch.
        memoize: Forwarded to ``newspaper.Article`` as ``memoize_articles``.

    Returns:
        ``(html, metadata)`` on success, where ``metadata`` is a dict with the
        keys ``"url"``, ``"elapsed"`` (seconds spent in this call) and
        ``"scraper"`` (always ``"raw"``). ``(None, None)`` if any step of the
        download/clean/parse pipeline raises, or if the parsed article text
        is empty.
    """
    t1 = time.time()

    try:
        # NOTE(review): the flags are set as attributes after construction,
        # as in the original code; passing them to the constructor may not be
        # equivalent in lxml -- confirm before changing.
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        article = newspaper.Article(url, fetch_images=False, memoize_articles=memoize)
        article.download()
        html = minify(article.html)
        html = cleaner.clean_html(html)
        article.parse()
    except Exception:
        # Best-effort scraper: any ordinary failure means "no result".
        # KeyboardInterrupt/SystemExit are deliberately not swallowed so a
        # long-running crawl can still be stopped.
        return None, None
    if article.text == "":
        return None, None

    metadata = {"url": url, "elapsed": time.time() - t1, "scraper": "raw"}
    return html, metadata
开发者ID:jcpeterson,项目名称:openwebtext,代码行数:21,代码来源:scrapers.py

示例2: render

# 需要导入模块: from lxml.html import clean [as 别名]
# 或者: from lxml.html.clean import Cleaner [as 别名]
def render(self):
    """Convert this Markdown text to sanitized HTML marked as safe.

    A falsy ``self`` is returned unchanged. Otherwise the text is converted
    with ``markdown.Markdown`` (using ``self.extensions`` and
    ``self.extension_configs``), passed through a ``Cleaner`` restricted to
    ``self.allow_tags`` / ``self.safe_attrs``, and wrapped with
    ``safestring.mark_safe``.
    """
    if not self:
        return self

    sanitizer_options = {
        "allow_tags": self.allow_tags,
        "safe_attrs": self.safe_attrs,
        "remove_unknown_tags": False,
        "safe_attrs_only": True,
    }
    sanitizer = Cleaner(**sanitizer_options)
    converter = markdown.Markdown(
        extensions=self.extensions,
        extension_configs=self.extension_configs,
    )
    rendered = sanitizer.clean_html(converter.convert(self))
    return safestring.mark_safe(rendered)
开发者ID:Inboxen,项目名称:Inboxen,代码行数:16,代码来源:fields.py

示例3: sanitize_html

# 需要导入模块: from lxml.html import clean [as 别名]
# 或者: from lxml.html.clean import Cleaner [as 别名]
def sanitize_html(unsecure_html_content):
    """Return *unsecure_html_content* sanitized by an lxml ``Cleaner``.

    The input is parsed with ``lxml.html.fromstring``, cleaned, and
    serialized again with ``lxml.html.tostring(method="html")``. The cleaner
    is given lxml's default safe attributes plus ``style`` and is told to
    remove ``body`` tags.

    Args:
        unsecure_html_content: Untrusted HTML markup.

    Returns:
        The cleaned markup decoded to ``str``; an empty string if parsing,
        cleaning or serialization raises.
    """
    cleaner = Cleaner(inline_style=False, scripts=True, javascript=True,
                      safe_attrs=lxml.html.defs.safe_attrs | {'style'},
                      frames=False, embedded=False,
                      meta=True, links=True, page_structure=True, remove_tags=['body'])
    try:
        parsed = lxml.html.fromstring(unsecure_html_content)
        secure_html_content = lxml.html.tostring(cleaner.clean_html(parsed), method="html")
    except Exception:
        # Unparseable input degrades to empty output rather than an error,
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        secure_html_content = b''
    return secure_html_content.decode()

# Get the client IP address, considering proxies and RP 
开发者ID:pytition,项目名称:Pytition,代码行数:14,代码来源:helpers.py

示例4: clean_html

# 需要导入模块: from lxml.html import clean [as 别名]
# 或者: from lxml.html.clean import Cleaner [as 别名]
def clean_html(text):
    """Return *text* after passing it through a ``Cleaner(style=False)``."""
    return Cleaner(style=False).clean_html(text)
开发者ID:Net-ng,项目名称:kansha,代码行数:5,代码来源:validator.py

示例5: clean_article_html

# 需要导入模块: from lxml.html import clean [as 别名]
# 或者: from lxml.html.clean import Cleaner [as 别名]
def clean_article_html(html_string):
    """Normalize article HTML and strip it down to the allowed tag set.

    Steps, in order: remap heading and ``<b>`` tags, turn telegram embed
    scripts into iframes, drop the ``<head>`` section, run the markup through
    a ``Cleaner`` limited to ``allowed_tags``, then collapse line breaks and
    runs of ``<br>`` tags.

    Returns the resulting markup with leading/trailing spaces and tabs
    removed.
    """
    # Heading remap: h1 -> h3 here, h2/h5/h6 -> h4 below.
    markup = html_string.replace('<h1', '<h3').replace('</h1>', '</h3>')
    # telegram will convert <b> anyway
    markup = re.sub(r'<(/?)b(?=\s|>)', r'<\1strong', markup)
    markup = re.sub(r'<(/?)(h2|h5|h6)', r'<\1h4', markup)
    # telegram embed posts have to be converted before the cleaner runs
    markup = re.sub(telegram_embed_script_re, r'<iframe src="https://t.me/\1"></iframe>', markup)
    # <head> cannot be removed by Cleaner, so strip it by regex if present
    markup = header_re.sub('', markup)

    sanitizer = Cleaner(
        allow_tags=allowed_tags,
        style=True,
        remove_unknown_tags=False,
        embedded=False,
        safe_attrs_only=True,
        safe_attrs=('src', 'href', 'class')
    )
    # Wrap in a div so the wrapper is always there (otherwise lxml adds a
    # parent element of its own in some cases).
    wrapped = sanitizer.clean_html('<div>%s</div>' % markup)
    # Cut the wrapper off again: '<div>' in front, '</div>' at the end.
    unwrapped = wrapped[len('<div>'):-len('</div>')]
    # remove all line breaks and empty strings
    collapsed = replace_line_breaks_except_pre(unwrapped)
    # but replace multiple br tags with one line break, telegraph will convert it to <br class="inline">
    collapsed = re.sub(r'(<br(/?>|\s[^<>]*>)\s*)+', '\n', collapsed)

    return collapsed.strip(' \t')
开发者ID:mercuree,项目名称:html-telegraph-poster,代码行数:33,代码来源:html_to_telegraph.py

示例6: cleaned_html

# 需要导入模块: from lxml.html import clean [as 别名]
# 或者: from lxml.html.clean import Cleaner [as 别名]
def cleaned_html(self):
    """Clean ``self.dom`` in place and return it serialized.

    ``self.dom`` is replaced by the output of a ``Cleaner`` whose
    ``scripts``, ``javascript``, ``comments`` and ``style`` flags are all set
    to ``True``; the result of ``lxml.html.tostring`` on it is returned.
    Asserts that the cleaned ``self.dom`` has a non-zero length.
    """
    # Strip all unnecessary information to save space. The flags are set on
    # the instance after construction, exactly as before.
    stripper = Cleaner()
    for flag in ('scripts', 'javascript', 'comments', 'style'):
        setattr(stripper, flag, True)
    self.dom = stripper.clean_html(self.dom)
    assert len(self.dom), 'The html needs to be parsed to get the cleaned html'
    return lxml.html.tostring(self.dom)
开发者ID:ecoron,项目名称:SerpScrap,代码行数:13,代码来源:parser.py

示例7: _load

# 需要导入模块: from lxml.html import clean [as 别名]
# 或者: from lxml.html.clean import Cleaner [as 别名]
def _load(self):
    """
    Load the ElementTree from the source.

    Normalizes ``self.source`` in place (quotes, full-width colons, line
    breaks, tabs, zero width joiners), then parses it with an ``HTMLParser``
    using UTF-8 encoding, runs it through a default ``Cleaner`` and stores
    the result in ``self.tree``.
    """
    # Convert directional quotation marks to regular quotes.
    # Plain u'' literals replace the former ur'' ones: the ur prefix is a
    # syntax error on Python 3, and the patterns contain only \u escapes, so
    # the resulting strings are unchanged.
    double_quotes = u'[\u201c\u201d]'
    self.source = re.sub(double_quotes, u'"', self.source)
    single_quotes = u'[\u2019\u2018]'
    self.source = re.sub(single_quotes, u"'", self.source)
    # Convert full-width colons
    self.source = self.source.replace(u'\uff1a', u':')
    # Remove line breaks and tabs
    self.source = self.source.replace(u'\n', u'')
    self.source = self.source.replace(u'\t', u'')
    # There are also some "zero width joiners" in random places in the text
    # Should remove them here, since they make string search unreliable
    # these are the codes: &#8205, &#160 (nbsp), \xa0 (nbsp), \u200d
    zero_width_joiners = u'\u200d'
    self.source = self.source.replace(zero_width_joiners, u'')
    # Also previously had some non breaking spaces in unicode \u00a0, but this
    # may have been fixed by changing the parser below

    # Use the lxml cleaner
    cleaner = Cleaner()
    parser = HTMLParser(encoding='utf-8')
    # Finally, load the cleaned string to an ElementTree
    self.tree = cleaner.clean_html(lxml.html.fromstring(to_string(self.source), parser=parser))
开发者ID:legco-watch,项目名称:legco-watch,代码行数:30,代码来源:agenda.py

示例8: render_body

# 需要导入模块: from lxml.html import clean [as 别名]
# 或者: from lxml.html.clean import Cleaner [as 别名]
def render_body(self):
    """Render ``self.body`` from Markdown to sanitized HTML marked as safe.

    Returns an empty string when ``self.body`` is falsy. Otherwise the
    Markdown output is passed through a ``Cleaner`` that allows a fixed list
    of tags and only the ``href`` attribute, then wrapped with
    ``safestring.mark_safe``.
    """
    if not self.body:
        return ""

    permitted_tags = [
        "p", "a", "i", "b", "em", "strong",
        "ol", "ul", "li", "pre", "code",
    ]
    sanitizer = Cleaner(
        allow_tags=permitted_tags,
        safe_attrs=["href"],
        remove_unknown_tags=False,
        safe_attrs_only=True,
    )
    rendered = markdown.markdown(self.body)
    return safestring.mark_safe(sanitizer.clean_html(rendered))
开发者ID:Inboxen,项目名称:Inboxen,代码行数:15,代码来源:models.py


注:本文中的lxml.html.clean.Cleaner方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。