本文整理汇总了 Python 中 lxml.cssselect.CSSSelector.iter 方法的典型用法代码示例。如果您正苦于以下问题：Python CSSSelector.iter 方法的具体用法？Python CSSSelector.iter 怎么用？Python CSSSelector.iter 使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 lxml.cssselect.CSSSelector 的用法示例。
在下文中一共展示了CSSSelector.iter方法的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: process_html
# 需要导入模块: from lxml.cssselect import CSSSelector [as 别名]
# 或者: from lxml.cssselect.CSSSelector import iter [as 别名]
def process_html(self, html, path):
    """Parse one HTML document and collect its inline ``<style>`` blocks.

    The ``<body>`` element is appended to ``self._bodies``. When
    ``self.optimize_lookup`` is truthy, every ``id`` value and every
    whitespace-separated ``class`` name found on the body or its
    descendants is added to ``self._all_ids`` / ``self._all_classes``.
    Each non-empty inline ``<style>`` element is stored in ``self.blocks``
    under the key ``(line_number, path)``, where ``line_number`` is the
    1-based index of the first line of ``html`` that contains the first
    line of the style text.

    Args:
        html: The document as UTF-8 encoded bytes (it is decoded here).
        path: Value stored as the second item of every ``self.blocks`` key.

    Raises:
        ParserError: If parsing yields no root element.
        ValueError: If the document does not contain exactly one
            ``<body>`` element (the single-item unpacking enforces this).
    """
    # Decode once and use the same text for parsing and for the line
    # search below: looking for a str needle inside raw bytes lines
    # raises TypeError on Python 3.
    text = html.decode('utf-8')
    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.fromstring(text, parser).getroottree()
    page = tree.getroot()
    if page is None:
        print(repr(html))
        raise ParserError('Could not parse the html')

    lines = text.splitlines()
    body, = CSSSelector('body')(page)
    self._bodies.append(body)

    if self.optimize_lookup:
        for element in body.iter():
            identifier = element.attrib.get('id')
            if identifier:
                self._all_ids.add(identifier)
            classes = element.attrib.get('class')
            if classes:
                self._all_classes.update(classes.split())

    for style in CSSSelector('style')(page):
        css_text = style.text
        if css_text is None:
            # <style></style> with nothing inside: .text is None.
            continue
        stripped = css_text.strip()
        if not stripped:
            # The style tag held only whitespace.
            continue
        first_line = stripped.splitlines()[0]
        # Remember only the first source line that contains the opening
        # line of this style block.
        for line_number, line in enumerate(lines, 1):
            if first_line in line:
                self.blocks[(line_number, path)] = css_text
                break
示例2: process_html
# 需要导入模块: from lxml.cssselect import CSSSelector [as 别名]
# 或者: from lxml.cssselect.CSSSelector import iter [as 别名]
def process_html(self, html, url):
    """Parse one HTML document and collect its inline and linked CSS.

    The ``<body>`` element is appended to ``self._bodies``. When
    ``self.optimize_lookup`` is truthy, every ``id`` value and every
    whitespace-separated ``class`` name found on the body or its
    descendants is added to ``self._all_ids`` / ``self._all_classes``.

    Two kinds of entries are written to ``self.blocks``:

    * inline ``<style>`` text, keyed by ``(line_number, url)`` where
      ``line_number`` is the 1-based index of the first line of ``html``
      containing the first line of the style text;
    * the result of ``self.download()`` for each ``<link>`` whose ``rel``
      is exactly ``'stylesheet'`` or whose ``href`` (query string
      removed, lower-cased) ends in ``.css``, keyed by
      ``(absolute_url, href)``. When ``self.preserve_remote_urls`` is
      truthy the downloaded text is passed through
      ``self._rewrite_urls()``.

    Args:
        html: The document as text (it is encoded to UTF-8 for parsing).
        url: Address of the document; used in ``self.blocks`` keys and
            passed to ``self.make_absolute_url()`` with each ``href``.

    Raises:
        ParserError: If parsing yields no root element.
        ValueError: If the document does not contain exactly one
            ``<body>`` element (the single-item unpacking enforces this).
    """
    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.fromstring(html.encode('utf-8'), parser).getroottree()
    page = tree.getroot()
    if page is None:
        print(repr(html))
        raise ParserError('Could not parse the html')

    lines = html.splitlines()
    body, = CSSSelector('body')(page)
    self._bodies.append(body)

    if self.optimize_lookup:
        for element in body.iter():
            identifier = element.attrib.get('id')
            if identifier:
                self._all_ids.add(identifier)
            classes = element.attrib.get('class')
            if classes:
                self._all_classes.update(classes.split())

    for style in CSSSelector('style')(page):
        css_text = style.text
        if css_text is None:
            # <style></style> with nothing inside: .text is None.
            continue
        stripped = css_text.strip()
        if not stripped:
            # The style tag held only whitespace.
            continue
        first_line = stripped.splitlines()[0]
        # Remember only the first source line that contains the opening
        # line of this style block.
        for line_number, line in enumerate(lines, 1):
            if first_line in line:
                self.blocks[(line_number, url)] = css_text
                break

    for link in CSSSelector('link')(page):
        href = link.attrib.get('href')
        if href is None:
            # A <link> without href has nothing to download; indexing
            # attrib['href'] here used to raise KeyError.
            continue
        is_stylesheet = (
            link.attrib.get('rel', '') == 'stylesheet' or
            href.lower().split('?')[0].endswith('.css')
        )
        if not is_stylesheet:
            continue
        link_url = self.make_absolute_url(url, href)
        key = (link_url, href)
        self.blocks[key] = self.download(link_url)
        if self.preserve_remote_urls:
            self.blocks[key] = self._rewrite_urls(
                self.blocks[key],
                link_url
            )
示例3: process_html
# 需要导入模块: from lxml.cssselect import CSSSelector [as 别名]
# 或者: from lxml.cssselect.CSSSelector import iter [as 别名]
def process_html(self, html, url):
    """Parse one HTML document and collect its inline and linked CSS.

    The ``<body>`` element is appended to ``self._bodies``. When
    ``self.optimize_lookup`` is truthy, every ``id`` value and every
    whitespace-separated ``class`` name found on the body or its
    descendants is added to ``self._all_ids`` / ``self._all_classes``.

    Two kinds of entries are written to ``self.blocks``:

    * inline ``<style>`` text, keyed by ``(line_number, url)`` where
      ``line_number`` is the 1-based index of the first line of ``html``
      containing the first line of the style text;
    * the result of ``self._download()`` for each ``<link>`` whose
      ``rel`` is exactly ``'stylesheet'`` or whose ``href`` (query string
      removed, lower-cased) ends in ``.css``, keyed by
      ``(absolute_url, href)``. When ``self.preserve_remote_urls`` is
      truthy the downloaded text is passed through
      ``self._rewrite_urls()``.

    Args:
        html: The document, handed to ``etree.fromstring`` unchanged.
        url: Address of the document; used in ``self.blocks`` keys and
            passed to ``self.make_absolute_url()`` with each ``href``.

    Raises:
        ParserError: If parsing yields no root element.
        ValueError: If the document does not contain exactly one
            ``<body>`` element (the single-item unpacking enforces this).
    """
    parser = etree.HTMLParser()
    tree = etree.fromstring(html, parser).getroottree()
    page = tree.getroot()
    if page is None:
        # Call form instead of the Python 2 print statement, which is a
        # SyntaxError on Python 3; with one argument both print the same.
        print(repr(html))
        raise ParserError("Could not parse the html")

    lines = html.splitlines()
    body, = CSSSelector('body')(page)
    self._bodies.append(body)

    if self.optimize_lookup:
        for element in body.iter():
            # Named "identifier" so the builtin id() is not shadowed.
            identifier = element.attrib.get('id')
            if identifier:
                self._all_ids.add(identifier)
            classes = element.attrib.get('class')
            if classes:
                self._all_classes.update(classes.split())

    for style in CSSSelector('style')(page):
        css_text = style.text
        if css_text is None:
            # <style></style> with nothing inside: .text is None.
            continue
        stripped = css_text.strip()
        if not stripped:
            # The style tag held only whitespace.
            continue
        first_line = stripped.splitlines()[0]
        # Remember only the first source line that contains the opening
        # line of this style block.
        for line_number, line in enumerate(lines, 1):
            if first_line in line:
                self.blocks[(line_number, url)] = css_text
                break

    for link in CSSSelector('link')(page):
        href = link.attrib.get('href')
        if href is None:
            # A <link> without href has nothing to download; indexing
            # attrib['href'] here used to raise KeyError.
            continue
        is_stylesheet = (
            link.attrib.get('rel', '') == 'stylesheet' or
            href.lower().split('?')[0].endswith('.css')
        )
        if not is_stylesheet:
            continue
        link_url = self.make_absolute_url(url, href)
        key = (link_url, href)
        self.blocks[key] = self._download(link_url)
        if self.preserve_remote_urls:
            self.blocks[key] = self._rewrite_urls(
                self.blocks[key],
                link_url
            )