本文整理汇总了Python中lxml.html.defs.safe_attrs方法的典型用法代码示例。如果您正苦于以下问题:Python defs.safe_attrs方法的具体用法?Python defs.safe_attrs怎么用?Python defs.safe_attrs使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类lxml.html.defs的用法示例。
在下文中一共展示了defs.safe_attrs方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: clean_html
# 需要导入模块: from lxml.html import defs [as 别名]
# 或者: from lxml.html.defs import safe_attrs [as 别名]
def clean_html(self, doc, allow_classes=None, safe_attrs=None):
    """Sanitise *doc* with lxml's ``Cleaner`` and strip unwanted markup.

    Steps, in order:

    1. Run ``Cleaner(safe_attrs_only=True)`` with the union of
       ``defs.safe_attrs``, ``self.SAFE_ATTRS`` and the caller-supplied
       *safe_attrs* as the permitted attribute names.
    2. Descend into the tree for as long as the current element has
       exactly one child.
    3. Rename every ``article`` element to ``div``.
    4. Remove every match of the ``self.STYLE_REMOVED_ATTRS`` patterns
       (case-insensitive) from ``style`` attributes, collapse runs of
       whitespace, and drop the attribute when nothing is left.
    5. Remove attributes named in ``self.REMOVED_ATTRS``; a ``class``
       attribute is kept when its stripped value is in *allow_classes*.
    6. Drop ``<i>`` / ``<ins>`` elements that have no direct text node and
       ``<div>`` elements that are direct children of ``<a>``.

    Args:
        doc: Parsed lxml HTML element to clean.
        allow_classes: Optional collection of ``class`` values that must
            survive step 5. ``None`` means no class is preserved.
        safe_attrs: Optional iterable of extra attribute names that the
            cleaner should let through.

    Returns:
        The cleaned element reached after step 2, modified in place by
        the later steps.
    """
    allow_classes = allow_classes or ()
    permitted_attrs = (set(defs.safe_attrs) |
                       self.SAFE_ATTRS |
                       set(safe_attrs or []))
    cleaner = Cleaner(safe_attrs_only=True, safe_attrs=permitted_attrs)
    doc = cleaner.clean_html(doc)

    # Skip wrapper elements that contain nothing but a single child.
    while len(doc) == 1:
        doc = doc[0]

    # Compiled once; reused for every element carrying a style attribute.
    style_pattern = re.compile('|'.join(self.STYLE_REMOVED_ATTRS),
                               flags=re.IGNORECASE)

    for element in doc.iter():
        if element.tag.lower() == 'article':
            element.tag = 'div'

    for element in doc.iter():
        style_value = element.get('style')
        if style_value is not None:
            stripped = style_pattern.sub('', style_value).strip()
            collapsed = re.sub(r'\s{2,}', ' ', stripped).strip()
            if collapsed:
                element.attrib['style'] = collapsed
            else:
                element.attrib.pop('style')

        # Iterate over a snapshot of the names: attributes are removed
        # inside the loop, and mutating a mapping while iterating it is
        # not safe in general.
        for name in list(element.attrib):
            if name not in self.REMOVED_ATTRS:
                continue
            if (name == 'class' and
                    element.get(name).strip() in allow_classes):
                continue
            element.attrib.pop(name)

    # drop_tree() removes the element together with its children; the
    # queries run in the same order as the original helper.
    for query in ('//i[not(text())]', '//ins[not(text())]', '//a/div'):
        for element in doc.xpath(query):
            element.drop_tree()

    return doc
示例2: process_item
# 需要导入模块: from lxml.html import defs [as 别名]
# 或者: from lxml.html.defs import safe_attrs [as 别名]
def process_item(self, item, spider):
    """Normalise an item's title and turn its content into cleaned HTML.

    The item's ``content`` may already be an ``HtmlElement``; otherwise it
    must be ``str`` or ``bytes`` and is parsed with an ``HTMLParser``
    configured for ``item['encoding']``. The parsed tree then goes through
    the optional per-spider removals, ``clean_html`` and
    ``make_abs_link`` before being stored back on the item.

    Args:
        item: Mapping with at least ``title``, ``content`` and ``link``
            keys; ``encoding`` is also read when ``content`` is not yet
            an ``HtmlElement``.
        spider: Object whose optional attributes (looked up via the
            ``self.*_NAME`` constants) customise the clean-up.

    Returns:
        The same *item*, with ``title`` and ``content`` replaced.

    Raises:
        ContentException: If ``content`` is neither an ``HtmlElement``
            nor ``str``/``bytes``.
    """
    item['title'] = self.format_title(item['title'])
    doc = item['content']
    if not isinstance(doc, HtmlElement):
        if not isinstance(doc, (str, bytes)):
            raise ContentException((
                'Error in content pipeline unsupported doc type[{}]'
            ).format(doc.__class__.__name__))
        encoding = item['encoding']
        # bytearray(<bytes>, encoding=...) raises TypeError on Python 3,
        # so only text is encoded; bytes are handed to the parser as-is.
        raw = doc if isinstance(doc, bytes) else doc.encode(encoding)
        doc = fromstring(raw, parser=HTMLParser(encoding=encoding))

    # Remove elements by class name for clean display.
    removed_classes = getattr(spider, self.REMOVED_CLASSES_NAME, None)
    if removed_classes is not None:
        doc = self.remove_element_with_class(doc, removed_classes)

    # Remove elements by XPath for clean display.
    removed_xpath_nodes = getattr(spider,
                                  self.REMOVED_XPATH_NODES_NAME,
                                  None)
    if removed_xpath_nodes is not None:
        doc = self.remove_element_with_xpath(doc, removed_xpath_nodes)

    allow_classes = getattr(spider, self.ALLOW_CLASSES_NAME, None)
    safe_attrs = getattr(spider, self.SAFE_ATTRS_NAME, None)
    doc = self.clean_html(doc,
                          allow_classes=allow_classes,
                          safe_attrs=safe_attrs)
    doc = self.make_abs_link(doc, item['link'])
    item['content'] = doc
    return item