当前位置: 首页>>代码示例>>Python>>正文


Python defs.safe_attrs属性代码示例

本文整理汇总了Python中lxml.html.defs.safe_attrs属性的典型用法代码示例。如果您正苦于以下问题:Python defs.safe_attrs的具体用法是什么?Python defs.safe_attrs怎么用?有哪些Python defs.safe_attrs的使用例子?那么恭喜您,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该属性所在模块lxml.html.defs的用法示例。


在下文中一共展示了defs.safe_attrs属性的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: clean_html

# 需要导入模块: from lxml.html import defs [as 别名]
# 或者: from lxml.html.defs import safe_attrs [as 别名]
def clean_html(self, doc, allow_classes=None, safe_attrs=None):
        """Sanitise an lxml HTML element and strip display-only clutter.

        The steps run in this order:

        1. Run lxml's ``Cleaner`` with ``safe_attrs_only=True``; the allowed
           attribute names are the union of ``defs.safe_attrs``,
           ``self.SAFE_ATTRS`` and the caller-supplied ``safe_attrs``.
        2. Descend through every wrapper that has exactly one child element.
        3. Rename ``<article>`` elements to ``<div>``.
        4. Delete every match of ``self.STYLE_REMOVED_ATTRS`` from inline
           ``style`` values (dropping ``style`` when nothing is left), then
           drop the attributes named in ``self.REMOVED_ATTRS``.
        5. Drop ``<i>``/``<ins>`` elements that have no direct text node and
           ``<div>`` elements that are direct children of ``<a>``.

        Args:
            doc: Element handed to ``Cleaner.clean_html``.
            allow_classes: ``class`` values that are kept even when ``class``
                is listed in ``self.REMOVED_ATTRS``. The whole (stripped)
                attribute value is compared, not the individual class tokens.
                ``None`` means "keep none".
            safe_attrs: Extra attribute names to allow on top of the defaults.
                ``None`` means "no extras".

        Returns:
            The innermost element reached in step 2, after all clean-up steps.
        """
        allow_classes = allow_classes or ()
        allowed_attrs = (set(defs.safe_attrs) |
                         self.SAFE_ATTRS |
                         set(safe_attrs or []))
        doc = Cleaner(safe_attrs_only=True,
                      safe_attrs=allowed_attrs).clean_html(doc)

        # Skip wrappers that contain nothing but a single child element.
        while len(doc) == 1:
            doc = doc[0]

        for el in doc.iter():
            if el.tag.lower() == 'article':
                el.tag = 'div'

        # One alternation pattern covering every style fragment to delete.
        style_junk = re.compile('|'.join(self.STYLE_REMOVED_ATTRS),
                                flags=re.IGNORECASE)
        for el in doc.iter():
            if 'style' in el.attrib:
                style = style_junk.sub('', el.get('style')).strip()
                # Deleting fragments can leave runs of whitespace behind.
                style = re.sub(r'\s{2,}', ' ', style).strip()
                if style:
                    el.attrib['style'] = style
                else:
                    el.attrib.pop('style')

            # Iterate over a snapshot of the names: attributes are removed
            # inside the loop, and mutating a mapping while iterating over it
            # directly is not something to rely on.
            for name in list(el.attrib):
                if name not in self.REMOVED_ATTRS:
                    continue
                if name == 'class' and el.get(name).strip() in allow_classes:
                    continue
                el.attrib.pop(name)

        # NOTE(review): these paths start with `//`, so XPath evaluates them
        # from the document root rather than only beneath `doc`.
        for path in ('//i[not(text())]',
                     '//ins[not(text())]',
                     '//a/div'):
            for el in doc.xpath(path):
                el.drop_tree()

        return doc 
开发者ID:hack4code,项目名称:BlogSpider,代码行数:58,代码来源:content.py

示例2: process_item

# 需要导入模块: from lxml.html import defs [as 别名]
# 或者: from lxml.html.defs import safe_attrs [as 别名]
def process_item(self, item, spider):
        """Normalise an item's title and turn its content into a cleaned tree.

        ``item['title']`` is passed through ``self.format_title``.
        ``item['content']`` is parsed into an ``HtmlElement`` when it is not
        one already, then run through the optional per-spider removals,
        ``self.clean_html`` and ``self.make_abs_link`` (with ``item['link']``)
        before being stored back on the item.

        Per-spider settings are read with ``getattr`` using the attribute
        names held in ``self.REMOVED_CLASSES_NAME``,
        ``self.REMOVED_XPATH_NODES_NAME``, ``self.ALLOW_CLASSES_NAME`` and
        ``self.SAFE_ATTRS_NAME``; a missing attribute is treated as ``None``.

        Args:
            item: Mapping with ``title``, ``content``, ``link`` and, for
                unparsed content, ``encoding`` keys.
            spider: Object that may carry the optional settings listed above.

        Returns:
            The same ``item``, with ``title`` and ``content`` replaced.

        Raises:
            ContentException: If ``item['content']`` is neither an
                ``HtmlElement`` nor ``str``/``bytes``.
        """
        item['title'] = self.format_title(item['title'])
        doc = item['content']
        if not isinstance(doc, HtmlElement):
            if isinstance(doc, str):
                raw = doc.encode(item['encoding'])
            elif isinstance(doc, bytes):
                # Already encoded. bytearray(<bytes>, encoding=...) raises
                # TypeError on Python 3, so bytes must be passed through as-is.
                raw = doc
            else:
                raise ContentException((
                    'Error in content pipeline unsupported doc type[{}]'
                    ).format(doc.__class__.__name__))
            doc = fromstring(raw,
                             parser=HTMLParser(encoding=item['encoding']))

        # remove element with class name for clean display
        removed_classes = getattr(spider,
                                  self.REMOVED_CLASSES_NAME,
                                  None)
        if removed_classes is not None:
            doc = self.remove_element_with_class(doc,
                                                 removed_classes)

        # remove element with xpath for clean display
        removed_xpath_nodes = getattr(spider,
                                      self.REMOVED_XPATH_NODES_NAME,
                                      None)
        if removed_xpath_nodes is not None:
            doc = self.remove_element_with_xpath(doc,
                                                 removed_xpath_nodes)

        allow_classes = getattr(spider,
                                self.ALLOW_CLASSES_NAME,
                                None)
        safe_attrs = getattr(spider,
                             self.SAFE_ATTRS_NAME,
                             None)
        doc = self.clean_html(doc,
                              allow_classes=allow_classes,
                              safe_attrs=safe_attrs)
        doc = self.make_abs_link(doc,
                                 item['link'])
        item['content'] = doc
        return item 
开发者ID:hack4code,项目名称:BlogSpider,代码行数:45,代码来源:content.py


注:本文中的lxml.html.defs.safe_attrs方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。