當前位置: 首頁>>代碼示例>>Python>>正文


Python defs.safe_attrs方法代碼示例

本文整理匯總了Python中lxml.html.defs.safe_attrs方法的典型用法代碼示例。如果您正苦於以下問題:Python defs.safe_attrs方法的具體用法?Python defs.safe_attrs怎麼用?Python defs.safe_attrs使用的例子?那麼, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在lxml.html.defs的用法示例。


在下文中一共展示了defs.safe_attrs方法的2個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: clean_html

# 需要導入模塊: from lxml.html import defs [as 別名]
# 或者: from lxml.html.defs import safe_attrs [as 別名]
def clean_html(self, doc, allow_classes=None, safe_attrs=None):
        """Sanitize an lxml HTML element and strip unwanted markup.

        Steps, in order:

        1. Run ``doc`` through ``Cleaner`` with ``safe_attrs_only=True``,
           whitelisting the union of ``defs.safe_attrs``,
           ``self.SAFE_ATTRS`` and the caller-supplied ``safe_attrs``.
        2. While the current element has exactly one child, descend into
           that child.
        3. Rename every ``article`` element to ``div``.
        4. In each ``style`` attribute, delete every match of the patterns
           in ``self.STYLE_REMOVED_ATTRS`` (case-insensitive), collapse
           runs of whitespace, and drop the attribute if nothing is left.
        5. Remove every attribute listed in ``self.REMOVED_ATTRS``, except
           a ``class`` attribute whose stripped value is in
           ``allow_classes``.
        6. Drop ``i`` / ``ins`` elements without text nodes and ``div``
           elements that are direct children of ``a``.

        Args:
            doc: Element to clean (passed to ``Cleaner.clean_html``).
            allow_classes: Container of ``class`` attribute values to
                keep; ``None`` is treated as empty.
            safe_attrs: Extra attribute names to whitelist in addition to
                the defaults; ``None`` is treated as empty.

        Returns:
            The cleaned (and possibly unwrapped) element.
        """
        allow_classes = allow_classes or ()
        # NOTE(review): self.SAFE_ATTRS must support ``|`` with a set --
        # presumably a set/frozenset defined on the class; confirm.
        allowed_attrs = (set(defs.safe_attrs) |
                         self.SAFE_ATTRS |
                         set(safe_attrs or []))
        cleaner = Cleaner(safe_attrs_only=True,
                          safe_attrs=allowed_attrs)
        doc = cleaner.clean_html(doc)

        # Skip wrapper elements that contain nothing but one child element.
        while len(doc) == 1:
            doc = doc[0]

        # Pass 1: normalise <article> to <div>.
        for e in doc.iter():
            if e.tag.lower() == 'article':
                e.tag = 'div'

        # Pass 2: scrub style declarations and unwanted attributes.
        # The pattern is loop-invariant, so it is compiled once up front.
        style_pattern = re.compile('|'.join(self.STYLE_REMOVED_ATTRS),
                                   flags=re.IGNORECASE)
        for e in doc.iter():
            if 'style' in e.attrib:
                stripped = style_pattern.sub('', e.get('style')).strip()
                style = re.sub(r'\s{2,}', ' ', stripped).strip()
                if style:
                    e.attrib['style'] = style
                else:
                    e.attrib.pop('style')
            # Iterate over a snapshot of the attribute names: entries are
            # removed inside the loop, and mutating a mapping while
            # iterating it directly is unsafe in general.
            for attr in list(e.attrib):
                if attr not in self.REMOVED_ATTRS:
                    continue
                if attr == 'class' and e.get(attr).strip() in allow_classes:
                    continue
                e.attrib.pop(attr)

        # Pass 3: drop elements that carry no useful content. These XPath
        # expressions start with ``//`` and are therefore absolute: they
        # are evaluated against the whole tree ``doc`` belongs to.
        for expr in ('//i[not(text())]',
                     '//ins[not(text())]',
                     '//a/div'):
            for e in doc.xpath(expr):
                e.drop_tree()

        return doc 
開發者ID:hack4code,項目名稱:BlogSpider,代碼行數:58,代碼來源:content.py

示例2: process_item

# 需要導入模塊: from lxml.html import defs [as 別名]
# 或者: from lxml.html.defs import safe_attrs [as 別名]
def process_item(self, item, spider):
        """Normalise an item's title and clean its HTML content.

        The title is passed through ``self.format_title``. The content is
        parsed into an element when it is not already an ``HtmlElement``,
        then optionally pruned by class name and by XPath, cleaned with
        ``self.clean_html``, passed through ``self.make_abs_link`` together
        with ``item['link']``, and stored back into ``item['content']``.

        Per-spider behaviour is read from optional spider attributes whose
        names are held in ``self.REMOVED_CLASSES_NAME``,
        ``self.REMOVED_XPATH_NODES_NAME``, ``self.ALLOW_CLASSES_NAME`` and
        ``self.SAFE_ATTRS_NAME``; a missing attribute is treated as ``None``.

        Args:
            item: Mapping with at least ``title``, ``content`` and ``link``;
                ``encoding`` is also required when ``content`` is ``str``
                or ``bytes``.
            spider: Object consulted for the optional attributes above.

        Returns:
            The same ``item``, with ``title`` and ``content`` replaced.

        Raises:
            ContentException: If ``item['content']`` is neither an
                ``HtmlElement`` nor ``str``/``bytes``.
        """
        item['title'] = self.format_title(item['title'])
        doc = item['content']
        if not isinstance(doc, HtmlElement):
            if isinstance(doc, str):
                # Text is encoded with the item's declared encoding so the
                # parser (configured with the same encoding) can decode it.
                raw = doc.encode(item['encoding'])
            elif isinstance(doc, bytes):
                # Already raw bytes: hand them to the parser unchanged.
                # (``bytearray(doc, encoding=...)`` raises TypeError for
                # bytes input on Python 3.)
                raw = doc
            else:
                raise ContentException((
                    'Error in content pipeline unsupported doc type[{}]'
                    ).format(doc.__class__.__name__))
            doc = fromstring(raw,
                             parser=HTMLParser(encoding=item['encoding']))

        # remove element with class name for clean display
        removed_classes = getattr(spider,
                                  self.REMOVED_CLASSES_NAME,
                                  None)
        if removed_classes is not None:
            doc = self.remove_element_with_class(doc,
                                                 removed_classes)

        # remove element with xpath for clean display
        removed_xpath_nodes = getattr(spider,
                                      self.REMOVED_XPATH_NODES_NAME,
                                      None)
        if removed_xpath_nodes is not None:
            doc = self.remove_element_with_xpath(doc,
                                                 removed_xpath_nodes)

        # Optional per-spider whitelists forwarded to clean_html.
        allow_classes = getattr(spider,
                                self.ALLOW_CLASSES_NAME,
                                None)
        safe_attrs = getattr(spider,
                             self.SAFE_ATTRS_NAME,
                             None)
        doc = self.clean_html(doc,
                              allow_classes=allow_classes,
                              safe_attrs=safe_attrs)
        doc = self.make_abs_link(doc,
                                 item['link'])
        item['content'] = doc
        return item 
開發者ID:hack4code,項目名稱:BlogSpider,代碼行數:45,代碼來源:content.py


注:本文中的lxml.html.defs.safe_attrs方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。