

Python clean.Cleaner Method Code Examples

This article collects typical usage examples of the Python method lxml.html.clean.Cleaner. If you are wondering how clean.Cleaner is used in practice, or are looking for concrete examples, the curated code samples here may help. You can also explore further usage examples from the containing module, lxml.html.clean.


Eight code examples of the clean.Cleaner method are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
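Before looking at the project excerpts, here is a minimal, self-contained sketch of the core pattern they all share: construct a Cleaner with the sanitization options you want, then pass an HTML string to clean_html. The sample markup and option values below are illustrative only and are not taken from any of the projects that follow. (Note that in recent lxml releases the cleaner has moved to the separate lxml_html_clean package, so the import path may need adjusting.)

from lxml.html.clean import Cleaner

# Strip <script> elements, javascript: URLs / on* handlers, and <style> blocks,
# while leaving the rest of the markup in place.
cleaner = Cleaner(scripts=True, javascript=True, style=True)

dirty = '<div><script>alert("hi")</script><p onclick="steal()">Hello</p></div>'
print(cleaner.clean_html(dirty))
# Expected output (roughly): <div><p>Hello</p></div>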

Example 1: raw_scraper

# Required import: from lxml.html import clean [as alias]
# Or: from lxml.html.clean import Cleaner [as alias]
def raw_scraper(url, memoize):
    t1 = time.time()

    try:
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        article = newspaper.Article(url, fetch_images=False, memoize_articles=memoize)
        article.download()
        html = minify(article.html)
        html = cleaner.clean_html(html)
        article.parse()
    except Exception:
        return None, None
    if article.text == "":
        return None, None

    metadata = {"url": url, "elapsed": time.time() - t1, "scraper": "raw"}
    return html, metadata 
Developer ID: jcpeterson, Project: openwebtext, Lines of code: 21, Source file: scrapers.py

Example 2: render

# Required import: from lxml.html import clean [as alias]
# Or: from lxml.html.clean import Cleaner [as alias]
def render(self):
        if not self:
            return self

        cleaner = Cleaner(
            allow_tags=self.allow_tags,
            safe_attrs=self.safe_attrs,
            remove_unknown_tags=False,
            safe_attrs_only=True,
        )
        markdown_obj = markdown.Markdown(extensions=self.extensions, extension_configs=self.extension_configs)
        text = markdown_obj.convert(self)
        text = cleaner.clean_html(text)
        return safestring.mark_safe(text) 
Developer ID: Inboxen, Project: Inboxen, Lines of code: 16, Source file: fields.py

Example 3: sanitize_html

# Required import: from lxml.html import clean [as alias]
# Or: from lxml.html.clean import Cleaner [as alias]
def sanitize_html(unsecure_html_content):
    cleaner = Cleaner(inline_style=False, scripts=True, javascript=True,
                      safe_attrs=lxml.html.defs.safe_attrs | set(['style']),
                      frames=False, embedded=False,
                      meta=True, links=True, page_structure=True, remove_tags=['body'])
    try:
        secure_html_content = lxml.html.tostring(cleaner.clean_html(lxml.html.fromstring(unsecure_html_content)), method="html")
    except Exception:
        secure_html_content = b''
    return secure_html_content.decode()

Developer ID: pytition, Project: Pytition, Lines of code: 14, Source file: helpers.py

Example 4: clean_html

# Required import: from lxml.html import clean [as alias]
# Or: from lxml.html.clean import Cleaner [as alias]
def clean_html(text):
    cleaner = Cleaner(style=False)
    return cleaner.clean_html(text) 
Developer ID: Net-ng, Project: kansha, Lines of code: 5, Source file: validator.py
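Example 4 passes style=False explicitly, which keeps <style> elements while scripts and other active content are still stripped; setting style=True would drop them as well. A quick sketch of the difference, using made-up sample HTML:

from lxml.html.clean import Cleaner

html = '<div><style>p {color: red}</style><p>Hi</p><script>alert(1)</script></div>'

# style=False (also the default) keeps the <style> block; the <script> is still removed.
print(Cleaner(style=False).clean_html(html))
# style=True removes the <style> block too.
print(Cleaner(style=True).clean_html(html))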

Example 5: clean_article_html

# Required import: from lxml.html import clean [as alias]
# Or: from lxml.html.clean import Cleaner [as alias]
def clean_article_html(html_string):

    html_string = html_string.replace('<h1', '<h3').replace('</h1>', '</h3>')
    # telegram will convert <b> anyway
    html_string = re.sub(r'<(/?)b(?=\s|>)', r'<\1strong', html_string)
    html_string = re.sub(r'<(/?)(h2|h5|h6)', r'<\1h4', html_string)
    # convert telegram embed posts before cleaner
    html_string = re.sub(telegram_embed_script_re, r'<iframe src="https://t.me/\1"></iframe>', html_string)
    # remove <head> if present (can't do this with Cleaner)
    html_string = header_re.sub('', html_string)

    c = Cleaner(
        allow_tags=allowed_tags,
        style=True,
        remove_unknown_tags=False,
        embedded=False,
        safe_attrs_only=True,
        safe_attrs=('src', 'href', 'class')
    )
    # wrap with div to be sure it is there
    # (otherwise lxml will add a parent element in some cases)
    html_string = '<div>%s</div>' % html_string
    cleaned = c.clean_html(html_string)
    # remove wrapped div
    cleaned = cleaned[5:-6]
    # remove all line breaks and empty strings
    html_string = replace_line_breaks_except_pre(cleaned)
    # but replace runs of <br> tags with a single line break; Telegraph will convert it to <br class="inline">
    html_string = re.sub(r'(<br(/?>|\s[^<>]*>)\s*)+', '\n', html_string)

    return html_string.strip(' \t') 
Developer ID: mercuree, Project: html-telegraph-poster, Lines of code: 33, Source file: html_to_telegraph.py

Example 6: cleaned_html

# Required import: from lxml.html import clean [as alias]
# Or: from lxml.html.clean import Cleaner [as alias]
def cleaned_html(self):
        # Try to parse the provided HTML string using lxml
        # strip all unnecessary information to save space
        cleaner = Cleaner()
        cleaner.scripts = True
        cleaner.javascript = True
        cleaner.comments = True
        cleaner.style = True
        self.dom = cleaner.clean_html(self.dom)
        assert len(self.dom), 'The html needs to be parsed to get the cleaned html'
        return lxml.html.tostring(self.dom) 
Developer ID: ecoron, Project: SerpScrap, Lines of code: 13, Source file: parser.py
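Example 6 configures the cleaner by assigning attributes after construction. The same options can equally be passed as keyword arguments to the constructor; a small equivalent sketch (not taken from the SerpScrap code):

from lxml.html.clean import Cleaner

# Equivalent to setting cleaner.scripts / .javascript / .comments / .style one by one.
cleaner = Cleaner(scripts=True, javascript=True, comments=True, style=True)
cleaned = cleaner.clean_html('<p><!-- tracking note -->text<script>x()</script></p>')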

Example 7: _load

# Required import: from lxml.html import clean [as alias]
# Or: from lxml.html.clean import Cleaner [as alias]
def _load(self):
        """
        Load the ElementTree from the source
        """
        # Convert directional quotation marks to regular quotes
        double_quotes = ur'[\u201c\u201d]'
        self.source = re.sub(double_quotes, u'"', self.source)
        single_quotes = ur'[\u2019\u2018]'
        self.source = re.sub(single_quotes, u"'", self.source)
        # Convert colons
        self.source = self.source.replace(u'\uff1a', u':')
        # Remove line breaks and tabs
        self.source = self.source.replace(u'\n', u'')
        self.source = self.source.replace(u'\t', u'')
        # There are also some "zero width joiners" in random places in the text
        # Should remove them here, since they make string search unreliable
        # these are the codes: &#8205, &#160 (nbsp), \xa0 (nbsp), \u200d
        zero_width_joiners = u'\u200d'
        self.source = self.source.replace(zero_width_joiners, u'')
        # Also previously had some non breaking spaces in unicode \u00a0, but this
        # may have been fixed by changing the parser below

        # Use the lxml cleaner
        cleaner = Cleaner()
        parser = HTMLParser(encoding='utf-8')
        # Finally, load the cleaned string to an ElementTree
        self.tree = cleaner.clean_html(lxml.html.fromstring(to_string(self.source), parser=parser))
        # self.tree = lxml.html.fromstring(to_string(self.source)) 
Developer ID: legco-watch, Project: legco-watch, Lines of code: 30, Source file: agenda.py

Example 8: render_body

# Required import: from lxml.html import clean [as alias]
# Or: from lxml.html.clean import Cleaner [as alias]
def render_body(self):
        if not self.body:
            return ""

        cleaner = Cleaner(
            allow_tags=["p", "a", "i", "b", "em", "strong", "ol", "ul", "li", "pre", "code"],
            safe_attrs=["href"],
            remove_unknown_tags=False,
            safe_attrs_only=True,
        )
        body = markdown.markdown(self.body)
        body = cleaner.clean_html(body)
        return safestring.mark_safe(body) 
Developer ID: Inboxen, Project: Inboxen, Lines of code: 15, Source file: models.py


Note: The lxml.html.clean.Cleaner method examples in this article were compiled by 純淨天空 from open source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open source projects contributed by their respective authors, and the source code copyright belongs to those authors. Please consult the corresponding project's License before distributing or using the code. Do not reproduce this article without permission.