本文整理汇总了Python中lxml.cssselect.CSSSelector类的典型用法代码示例。如果您正苦于以下问题:Python CSSSelector类的具体用法?Python CSSSelector怎么用?Python CSSSelector使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了CSSSelector类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: get_selected_items
def get_selected_items(self):
    """Yield an ``Item`` for every entry on the "selected items" page.

    Fetches the page over the authenticated session, then scrapes each
    item container for its title text, link target and icon URL.
    """
    page = self.session.get(self.url("selected_items"))
    document = lxml.html.fromstring(page.text)
    select_containers = CSSSelector('div[headers="th_selected_items"]')
    select_title = CSSSelector("h4.il_ContainerItemTitle")
    select_icon = CSSSelector("img.ilListItemIcon")
    for container in select_containers(document):
        entry = Item()
        title = select_title(container)[0]
        # Prefer the anchor inside the title when one exists; a plain
        # text title (no link) is used as-is.
        anchors = CSSSelector("a")(title)
        if anchors:
            title = anchors[0]
        entry.name = title.text
        entry.url = title.get("href")
        entry.icon = select_icon(container)[0].get("src")
        yield entry
示例2: _fetch_from_cache
def _fetch_from_cache(language, url):
    """Return the parsed CMS page for *url* as ``{"title": ..., "contents": [...]}``.

    Raw HTML is served from the module-level ``cache`` when possible;
    otherwise it is fetched via ``utils.get_cms_page`` and stored.
    Each entry of ``contents`` describes one section of the page.
    """
    from . import utils
    cms_url = utils.get_cms_url(language, url)
    if cms_url in cache:
        html = cache.get(cms_url)
    else:
        html = utils.get_cms_page(language, url)
        cache.set(cms_url, html)
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(html), parser).getroot()
    toc = CSSSelector('.toc')
    # Removing all table of contents
    for table in toc(tree):
        table.getparent().remove(table)
    # The page title is extracted for the return value and removed
    # from the document body.
    title = CSSSelector('.page-title')(tree)[0]
    title.getparent().remove(title)
    elements = list(CSSSelector('.cms-content')(tree)[0])
    # Indices (into ``elements``) of every section header element.
    headers = [i for i, e in enumerate(elements) if CSSSelector('.section-header')(e)]
    title_icons = list(CSSSelector('.title-icon')(tree))
    page_contents = []
    for i, h in enumerate(headers):
        icon = ""
        # NOTE(review): icons are matched to sections by position only —
        # assumes one .title-icon per section in document order; confirm.
        if i < len(title_icons) and 'src' in title_icons[i].attrib:
            icon = title_icons[i].attrib['src']
        element = elements[h]
        # A section's contents run from just after its header up to the
        # next header (or to the end of the page for the last section).
        if (i + 1) == len(headers):
            contents = elements[h + 1:]
        else:
            contents = elements[h + 1:headers[i + 1]]
        # Strip text-direction attributes from every content element.
        for e in elements:
            if 'dir' in e.attrib:
                del e.attrib['dir']
        section_title = CSSSelector('a[name]')(element)[0].text
        section_body = ""
        # NOTE(review): concatenating etree.tostring output onto a str
        # implies Python 2 (tostring returns bytes on Python 3) — verify.
        for c in contents:
            section_body += etree.tostring(c, pretty_print=True, method="html")
        page_contents.append({
            "is_important": True if CSSSelector('.important')(element) else False,
            "title": section_title,
            "body": section_body,
            "icon": icon
        })
    return {
        "title": title.text,
        "contents": page_contents
    }
示例3: process_html
def process_html(self, html, path):
    """Parse *html*, record its <body>, and collect inline <style> blocks.

    Each <style> block is stored in ``self.blocks`` keyed by
    ``(line number, path)`` so later stages can report where a CSS
    block came from.

    :raises ParserError: if lxml cannot produce a document root.
    """
    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.fromstring(html.decode('utf-8'), parser).getroottree()
    page = tree.getroot()
    if page is None:
        print(repr(html))
        raise ParserError('Could not parse the html')
    lines = html.splitlines()
    body, = CSSSelector('body')(page)
    self._bodies.append(body)
    if self.optimize_lookup:
        # Pre-index every id and class in the document so selector
        # matching can quickly skip selectors that cannot match.
        for each in body.iter():
            identifier = each.attrib.get('id')
            if identifier:
                self._all_ids.add(identifier)
            classes = each.attrib.get('class')
            if classes:
                for class_ in classes.split():
                    self._all_classes.add(class_)
    for style in CSSSelector('style')(page):
        # Guard against degenerate <style> tags; previously this line
        # crashed with IndexError (whitespace-only text) or
        # AttributeError (style.text is None for an empty tag).
        try:
            first_line = style.text.strip().splitlines()[0]
        except IndexError:
            # the inline style tag was just whitespace
            continue
        except AttributeError:
            # the style tag has absolutely nothing in it, not even whitespace
            continue
        # Locate the style block in the raw source to record its line.
        for i, line in enumerate(lines):
            if line.count(first_line):
                key = (i + 1, path)
                self.blocks[key] = style.text
                break
示例4: detect_withdrawn
def detect_withdrawn(self, tree, url):
    """Return True (and log a notice) when the paper's comment field
    on the abstract page says it has been withdrawn."""
    matches = CSSSelector(".tablecell.comments")(tree)
    if not matches:
        return False
    text = matches[0].text_content()
    if "withdrawn" not in text.lower():
        return False
    print("Paper", url, "appears to be withdrawn!")
    return True
示例5: post_node
def post_node(title, datetime, content):
    """Build the DOM node for one blog post from a copy of the POST template."""
    node = copy(POST)
    CSSSelector('.title .text')(node)[0].text = title
    # niceday() supplies the ordinal day ("1st", "2nd", ...) for the %%s slot.
    stamp = datetime.strftime("%H:%M on %A the %%s of %B, %Y") % niceday(datetime)
    CSSSelector('.datetime')(node)[0].text = stamp
    container = CSSSelector('.content')(node)[0]
    # Sanitize the raw content, then graft each resulting fragment in.
    for fragment in fragments_fromstring(cleaner_trusted.clean_html(content)):
        container.append(fragment)
    return node
示例6: get_or_create_head
def get_or_create_head(root):
    """Return the <head> element of `root`, creating one if it is missing."""
    existing = CSSSelector('head')(root)
    if existing:
        return existing[0]
    # No <head> present: create one and insert it as the first child
    # of <body>'s parent (i.e. ahead of <body>).
    head = etree.Element('head')
    body = CSSSelector('body')(root)[0]
    body.getparent().insert(0, head)
    return head
示例7: process_html
def process_html(self, html, url):
    """Parse *html*, record its <body>, and collect all CSS sources.

    Inline <style> blocks are stored in ``self.blocks`` keyed by
    ``(line number, url)``; linked stylesheets are downloaded and keyed
    by ``(absolute url, href)``.

    :raises ParserError: if lxml cannot produce a document root.
    """
    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.fromstring(html.encode('utf-8'), parser).getroottree()
    page = tree.getroot()
    if page is None:
        print(repr(html))
        raise ParserError('Could not parse the html')
    lines = html.splitlines()
    body, = CSSSelector('body')(page)
    self._bodies.append(body)
    if self.optimize_lookup:
        # Pre-index every id and class so selector matching can bail
        # out early on selectors that cannot possibly match.
        for each in body.iter():
            identifier = each.attrib.get('id')
            if identifier:
                self._all_ids.add(identifier)
            classes = each.attrib.get('class')
            if classes:
                for class_ in classes.split():
                    self._all_classes.add(class_)
    for style in CSSSelector('style')(page):
        try:
            first_line = style.text.strip().splitlines()[0]
        except IndexError:
            # meaning the inline style tag was just whitespace
            continue
        except AttributeError:
            # happens when the style tag has absolutely nothing in it,
            # not even whitespace (style.text is None)
            continue
        # Locate the style block in the raw source to record its line.
        for i, line in enumerate(lines):
            if line.count(first_line):
                key = (i + 1, url)
                self.blocks[key] = style.text
                break
    for link in CSSSelector('link')(page):
        if (
            link.attrib.get('rel', '') == 'stylesheet' or
            link.attrib['href'].lower().split('?')[0].endswith('.css')
        ):
            link_url = self.make_absolute_url(url, link.attrib['href'])
            key = (link_url, link.attrib['href'])
            self.blocks[key] = self.download(link_url)
            if self.preserve_remote_urls:
                # Rewrite relative url() references inside the sheet so
                # they keep pointing at the remote origin.
                self.blocks[key] = self._rewrite_urls(
                    self.blocks[key],
                    link_url
                )
示例8: get_submission_dates
def get_submission_dates(self, arxiv_tree, queried_version):
    """Map arXiv version tags to their submission dates.

    Parses the "submission history" block of an arXiv abstract page.
    If *queried_version* is encountered, a single-entry dict for just
    that version is returned; otherwise every parsed version is returned.
    """
    history = CSSSelector("div.submission-history")(arxiv_tree)[0]
    found = {}
    for entry in history.text_content().split("\n"):
        m = self.version_re.match(entry)
        if m is None:
            continue
        tag = m.group(1)
        when = datetime.datetime.strptime(m.group(2), '%a, %d %b %Y').date()
        found[tag] = when
        if queried_version == tag:
            # Short-circuit: the caller asked for exactly this version.
            return {tag: when}
    return found
示例9: process_html
def process_html(self, html, url):
parser = etree.HTMLParser()
tree = etree.fromstring(html, parser).getroottree()
page = tree.getroot()
if page is None:
print repr(html)
raise ParserError("Could not parse the html")
lines = html.splitlines()
body, = CSSSelector('body')(page)
self._bodies.append(body)
if self.optimize_lookup:
for each in body.iter():
id = each.attrib.get('id')
if id:
self._all_ids.add(id)
classes = each.attrib.get('class')
if classes:
for class_ in classes.split():
self._all_classes.add(class_)
for style in CSSSelector('style')(page):
first_line = style.text.strip().splitlines()[0]
for i, line in enumerate(lines):
if line.count(first_line):
key = (i + 1, url)
self.blocks[key] = style.text
break
for link in CSSSelector('link')(page):
if (
link.attrib.get('rel', '') == 'stylesheet' or
link.attrib['href'].lower().split('?')[0].endswith('.css')
):
link_url = self.make_absolute_url(url, link.attrib['href'])
key = (link_url, link.attrib['href'])
self.blocks[key] = self._download(link_url)
if self.preserve_remote_urls:
self.blocks[key] = self._rewrite_urls(
self.blocks[key],
link_url
)
示例10: load_stations
def load_stations(file="stations-converted.json"):
    """Populate the global STATIONS dict from *file*, then scrape the
    CHMI "IS ARROW" web application for chemistry sample data per station.

    Python 2 code (urllib2, print statements). Failures for individual
    stations are logged and skipped so the loop continues.
    """
    global STATIONS
    with open(file) as f:
        STATIONS = anyjson.deserialize(f.read())
    for station in STATIONS.values():
        try:
            # Fixed query: river objects with chemistry data, 2007-2012.
            uri = "http://hydro.chmi.cz/isarrow/object.php?seq=2000855701&chemie=1&biota=1&ukol_p=1&id_objekt=&vod_typ=R&nadmh_sign=%3E&rickm_sign=%3E&rok_od=2007&rok_do=2012&objekty_chemdata=1&matrice=2000868184&typodb=41"
            # Pages are served in cp1250; the hidden 'seq' form input
            # carries the query id needed by the follow-up requests.
            seq = CSSSelector("form input[name='seq']")(fromstring(urllib2.urlopen(uri).read().decode("cp1250")))[
                0
            ].value
            # print 'seq is ' + seq
            uri = (
                "http://hydro.chmi.cz/isarrow/object.php?agenda=POV&objekty_chemdata=1&objekty_biodata=&taxon_tree=&seq="
                + seq
                + "&data_sel=chemdata&chemie=1&biota=1&rok_od=2007&rok_do=2012&matrice=2000868184&typodb=41&tscongrp=&tscon=&data_mez_stanovitelnosti=&data_od=&data_do=&taxon=&send=Chemick%E9+vzorky"
            )
            tree = fromstring(urllib2.urlopen(uri).read().decode("cp1250"))
            # The last link in the results table leads to the station's
            # chemical sample listing.
            link = CSSSelector("table.tbl a")(tree)[-1]
            uri = "http://hydro.chmi.cz/isarrow/" + link.get("href")
            tree = fromstring(urllib2.urlopen(uri).read().decode("cp1250"))
            csv_link = tree.xpath("//form[1]//a")[0]
            uri = "http://hydro.chmi.cz/isarrow/" + csv_link.get("href")
            # FIXME: CSV export is now broken on IS ARROW
            # wait for them to fix it or parse from table -- and store relevant data into structure
            reader = csv.reader(urllib2.urlopen(uri))
            for row in reader:
                print row
        except Exception:
            print "Failed to retrieve values for station " + station["id"]
            import traceback
            traceback.print_exc()
示例11: make_emoji_img_elem
def make_emoji_img_elem(emoji_span_elem: CSSSelector) -> Dict[str, Any]:
    """Convert an emoji <span> element to an equivalent <img> element.

    NOTE(review): the annotations look wrong — the argument is used as an
    lxml element (.get/.text/.tail) and the return value is an lxml
    element, not a Dict. Left unchanged to keep the public signature
    stable; confirm against callers before fixing.
    """
    # Convert the emoji spans to img tags.
    classes = emoji_span_elem.get('class')
    # Raw string literal: '\S' in a plain str is an invalid escape
    # (DeprecationWarning since 3.6, SyntaxWarning on modern Python).
    match = re.search(r'emoji-(?P<emoji_code>\S+)', classes)
    # re.search is capable of returning None,
    # but since the parent function should only be called with a valid css element
    # we assert that it does not.
    assert match is not None
    emoji_code = match.group('emoji_code')
    emoji_name = emoji_span_elem.get('title')
    alt_code = emoji_span_elem.text
    image_url = base_url + '/static/generated/emoji/images-%(emojiset)s-64/%(emoji_code)s.png' % {
        'emojiset': emojiset,
        'emoji_code': emoji_code
    }
    img_elem = lxml.html.fromstring(
        '<img alt="%(alt_code)s" src="%(image_url)s" title="%(title)s">' % {
            'alt_code': alt_code,
            'image_url': image_url,
            'title': emoji_name,
        })
    img_elem.set('style', 'height: 20px;')
    # Preserve any trailing text that followed the original span.
    img_elem.tail = emoji_span_elem.tail
    return img_elem
示例12: getView
def getView(self, document, sheet, media='all', name=None, styleCallback=None):
    """Compute the effective style view of *document* under *sheet*.

    document
        a DOM document, currently an lxml HTML document
    sheet
        a CSS StyleSheet object, currently a cssutils sheet
    media: optional
        TODO: view for which media it should be
    name: optional
        TODO: names of sheets only
    styleCallback: optional
        should return a css.CSSStyleDeclaration of inline styles for the
        element passed to it; defaults to ``self.styleattribute``

    returns style view
        a dict of {DOMElement: css.CSSStyleDeclaration} for html

    Selectors cssselect cannot translate are collected and re-inserted
    as a literal <style> element so their rules are not lost.
    """
    styleCallback = styleCallback or self.styleattribute
    _unmergable_rules = CSSStyleSheet()
    view = {}
    specificities = {}  # needed temporarily
    # TODO: filter rules simpler?, add @media
    rules = (rule for rule in sheet if rule.type == rule.STYLE_RULE)
    for rule in rules:
        for selector in rule.selectorList:
            self.log(0, 'SELECTOR', selector.selectorText)
            # TODO: make this a callback to be able to use other stuff than lxml
            try:
                cssselector = CSSSelector(selector.selectorText)
            except (ExpressionError, NotImplementedError):
                # Selector not expressible as XPath: keep its rule for
                # the literal <style> fallback below.
                _unmergable_rules.add(CSSStyleRule(selectorText=selector.selectorText,
                                                   style=rule.style))
                continue
            matching = cssselector.evaluate(document)
            for element in matching:
                if element.tag in self.NONVISUAL_TAGS:
                    continue
                # add styles for all matching DOM elements
                self.log(1, 'ELEMENT', id(element), element.text)
                if element not in view:
                    # add initial empty style declaration
                    view[element] = CSSStyleDeclaration()
                    specificities[element] = {}
                    # and add inline @style if present
                    inlinestyle = styleCallback(element)
                    if inlinestyle:
                        for p in inlinestyle:
                            # set inline style specificity
                            view[element].setProperty(p)
                            specificities[element][p.name] = (1, 0, 0, 0)
                for p in rule.style:
                    # update style declaration
                    if p not in view[element]:
                        # setProperty needs a new Property object and
                        # MUST NOT reuse the existing Property
                        # which would be the same for all elements!
                        # see Issue #23
                        view[element].setProperty(p.name, p.value, p.priority)
                        specificities[element][p.name] = selector.specificity
                        self.log(2, view[element].getProperty('color'))
                    else:
                        self.log(2, view[element].getProperty('color'))
                        sameprio = (p.priority ==
                                    view[element].getPropertyPriority(p.name))
                        if not sameprio and bool(p.priority) or (
                                sameprio and selector.specificity >=
                                specificities[element][p.name]):
                            # later, more specific or higher prio
                            view[element].setProperty(p.name, p.value, p.priority)
    _unmergable_css = _unmergable_rules.cssText
    if _unmergable_css:
        e = etree.Element('style')
        e.text = to_unicode(_unmergable_css, 'utf-8')
        # BUG FIX: previously ``document.find('body') or document`` — an
        # lxml element with no children is falsy, so an empty <body>
        # wrongly fell back to the document root. Test for None instead.
        body = document.find('body')
        if body is None:
            body = document
        body.insert(0, e)  # add <style> right into body
    return view
示例13: len
import lxml.etree
from lxml.cssselect import CSSSelector
from BeautifulSoup import BeautifulSoup

# Python 2 weather-report script: fetches forecast.weather.gov for a
# "CITY, STATE" query and prints condition/temperature/humidity, solved
# two ways (lxml, then BeautifulSoup).
# NOTE(review): relies on sys/urllib/urllib2 imported outside this view.
if len(sys.argv) < 2:
    print >>sys.stderr, 'usage: weather.py CITY, STATE'
    exit(2)
data = urllib.urlencode({'inputstring': ' '.join(sys.argv[1:])})
info = urllib2.urlopen('http://forecast.weather.gov/zipcity.php', data)
content = info.read()

# Solution #1
parser = lxml.etree.HTMLParser(encoding='utf-8')
tree = lxml.etree.fromstring(content, parser)
big = CSSSelector('td.big')(tree)[0]
# Some layouts wrap the text in a <font> child; descend when present.
if big.find('font') is not None:
    big = big.find('font')
print 'Condition:', big.text.strip()
print 'Temperature:', big.findall('br')[1].tail
# The humidity value sits in the row whose first cell is <b>Humidity</b>.
tr = tree.xpath('.//td[b="Humidity"]')[0].getparent()
print 'Humidity:', tr.findall('td')[1].text
print

# Solution #2
soup = BeautifulSoup(content)  # doctest: +SKIP
big = soup.find('td', 'big')
if big.font is not None:
    big = big.font
print 'Condition:', big.contents[0].string.strip()
temp = big.contents[3].string or big.contents[4].string  # can be either
示例14: getView
def getView(document, css, media='all', name=None,
            styleCallback=lambda element: None):
    """Compute the effective style view of *document* under the CSS string *css*.

    document
        a DOM document, currently an lxml HTML document
    css
        a CSS StyleSheet string
    media: optional
        TODO: view for which media it should be
    name: optional
        TODO: names of sheets only
    styleCallback: optional
        should return a css.CSSStyleDeclaration of inline styles for the
        element passed to it; defaults to returning None (no inline styles)

    returns style view
        a dict of {DOMElement: css.CSSStyleDeclaration} for html
    """
    sheet = cssutils.parseString(css)
    view = {}
    specificities = {}  # needed temporarily
    # TODO: filter rules simpler?, add @media
    rules = (rule for rule in sheet if rule.type == rule.STYLE_RULE)
    for rule in rules:
        for selector in rule.selectorList:
            log(0, 'SELECTOR', selector.selectorText)
            # TODO: make this a callback to be able to use other stuff than lxml
            cssselector = CSSSelector(selector.selectorText)
            matching = cssselector.evaluate(document)
            for element in matching:
                #if element.tag in ('div',):
                # add styles for all matching DOM elements
                log(1, 'ELEMENT', id(element), element.text)
                if element not in view:
                    # add initial empty style declaration
                    view[element] = cssutils.css.CSSStyleDeclaration()
                    specificities[element] = {}
                    # and add inline @style if present
                    inlinestyle = styleCallback(element)
                    if inlinestyle:
                        for p in inlinestyle:
                            # set inline style specificity
                            view[element].setProperty(p)
                            specificities[element][p.name] = (1,0,0,0)
                for p in rule.style:
                    # update style declaration
                    if p not in view[element]:
                        # setProperty needs a new Property object and
                        # MUST NOT reuse the existing Property
                        # which would be the same for all elements!
                        # see Issue #23
                        view[element].setProperty(p.name, p.value, p.priority)
                        specificities[element][p.name] = selector.specificity
                        log(2, view[element].getProperty('color'))
                    else:
                        log(2, view[element].getProperty('color'))
                        sameprio = (p.priority ==
                                    view[element].getPropertyPriority(p.name))
                        if not sameprio and bool(p.priority) or (
                                sameprio and selector.specificity >=
                                specificities[element][p.name]):
                            # later, more specific or higher prio
                            view[element].setProperty(p.name, p.value, p.priority)
    #pprint(view)
    return view
示例15: require
def require(self,url):
hc= urlparse(url)[1].replace('.ganji.com',"")
hc2=citynameDict_sf.get(hc)
if hc2:
self.fd['house_city']=hc2
else:
self.fd['house_city']=hc
request = urllib2.Request(url, None, self.header)
response = urllib2.urlopen(request).read()
if self.mayGetIt(response):
self.fd={}
raise
tree = etree.HTML(response)
if re.search("<span class=\"city\"><a .*?>(.*?)</a>", response):
cityname=re.search("<span class=\"city\"><a .*?>(.*?)</a>", response).group(1)
self.fd['cityname'] = cityname
else:
raise
self.fd['house_flag'] = 4
self.fd['house_type'] = 6
self.fd['house_floor'] = 0
self.fd['house_topfloor'] = 0
self.fd['house_area']=0
self.fd['house_age'] = 0
self.fd['house_toward'] = 0
self.fd['house_fitment'] = 0
self.fd['house_deposit'] = 0
# self.fd['house_totalarea_max'] = 0
# self.fd['house_totalarea_min'] = 0
soup =BeautifulSoup(response)
detail_mer = soup.find('div',{'class':'detail_mer'})
#非个人房源 return
if u"个人房源" not in str(detail_mer):raise
Dname = detail_mer.find('span',{'class':'Dname'})
if Dname:
self.fd['owner_name'] = Dname.string
else:
self.fd['owner_name'] = None
ganji_phone_call_class = detail_mer.find('span',{'class':'ganji_phone_call_class'})
if ganji_phone_call_class:
self.fd['owner_phone_pic'] = ganji_phone_call_class.contents[0]
if str(ganji_phone_call_class).find('src='):
self.fd['owner_phone_pic'] = 'http://'+urlparse(url)[1]+ganji_phone_call_class.img['src']
else:
self.fd['owner_phone_pic'] = None
else:
self.fd['owner_phone_pic'] = None
#没有联系方式 return
if not self.fd['owner_phone_pic']:raise
if re.search(self.house_price_regex_zu, response):
house_price_zu = re.search(self.house_price_regex_zu, response).group(1)
house_price_zu = house_price_zu.replace('元/月','')
if house_price_zu.find("以上") != -1:
self.fd['house_price_max'] = 0
self.fd['house_price'] = int(house_price_zu.replace('以上',''))
elif house_price_zu.find("以下") != -1:
self.fd['house_price_max'] = int(house_price_zu.replace('以下',''))
self.fd['house_price'] = 0
elif house_price_zu.find("-") != -1:
self.fd['house_price_max'] = int(house_price_zu.split('-')[1])
self.fd['house_price'] = int(house_price_zu.split('-')[0])
else:
self.fd['house_price_max'] = 0
self.fd['house_price'] = 0
else:
self.fd['house_price_max'] = 0
self.fd['house_price'] = 0
posttime=CSSSelector('span.pub_time')(tree)!=None and CSSSelector('span.pub_time')(tree)[0].text.strip() or None
if posttime:
Y=int(time.strftime('%Y', time.localtime()))
M=int(posttime.split(' ')[0].split('-')[0])
D=int(posttime.split(' ')[0].split('-')[1])
H=int(time.strftime('%H',time.localtime(time.time())))
Min=int(time.strftime('%M',time.localtime(time.time())))
s = datetime.datetime(Y,M,D,H,Min)
posttime=str(int(time.mktime(s.timetuple())))
self.fd['house_posttime'] =posttime
else:
s=time.localtime(time.time())
self.fd['house_posttime'] =str(int(time.mktime(s)))
house_title=CSSSelector("div.detail_title h1")(tree)[0] !=None and CSSSelector("div.detail_title h1")(tree)[0].text.strip() or None
self.fd['house_title'] = house_title.replace("(求购)","").replace("(求租)","").replace("(出售)","")
if re.search(self.house_room_regex, response):
house_room=re.search(self.house_room_regex, response).group(1)
self.fd['house_room'] = int(house_room)
else:
self.fd['house_room'] = 0
#.........这里部分代码省略.........