本文整理汇总了Python中utils.lxmlize函数的典型用法代码示例。如果您正苦于以下问题:Python lxmlize函数的具体用法?Python lxmlize怎么用?Python lxmlize使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了lxmlize函数的4个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: get_people
def get_people(self):
    """Yield a ``Legislator`` for each councillor, then one for the mayor.

    Councillor data is read from ``COUNCIL_PAGE``: the rows of the table
    with ``width="484"`` are consumed in groups of four (district row,
    councillor row, contact row and one ignored row).  The mayor's name is
    taken from ``MAYOR_PAGE`` and the mayor's email address from
    ``MAYOR_CONTACT_URL``.  All three pages are parsed as ``iso-8859-1``.
    """
    page = lxmlize(COUNCIL_PAGE, 'iso-8859-1')
    nodes = page.xpath('//table[@width="484"]//tr')
    for rows in chunks(nodes, 4):
        # Guard only the unpacking, so that a ValueError raised while
        # building a record is not silently swallowed.
        try:
            district_row, councillor_row, contact_row, _ = rows
        except ValueError:
            # On the last run through there are fewer than 4 rows to unpack.
            break

        post_id = district_row.xpath('string(.//strong)')
        # Drop the leading "Councillor " title from the row text.
        name = councillor_row.xpath('string(.)')[len('Councillor '):]
        # TODO: phone numbers on the site (td[2] of the contact row) don't
        # include an area code, so no 'voice' contact is added. Add manually?
        # The site writes addresses with "[at]" in place of "@".
        email = contact_row.xpath('string(td[4]/a)').replace('[at]', '@')

        p = Legislator(name=name, post_id=post_id, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        p.add_contact('email', email, None)
        yield p

    mayor_page = lxmlize(MAYOR_PAGE, 'iso-8859-1')
    # The heading text ends with " Bio"; strip that suffix to get the name.
    name = mayor_page.xpath('string(//h1[contains(., "Bio")])')[:-len(' Bio')]
    contact_page = lxmlize(MAYOR_CONTACT_URL, 'iso-8859-1')
    # First link on the contact page whose text contains "@".
    email = contact_page.xpath('string(//a[contains(., "@")][1])')

    # NOTE(review): the mayor is recorded with role='Councillor' -- confirm
    # this is intended.
    p = Legislator(name=name, post_id='Halifax', role='Councillor')
    p.add_source(MAYOR_PAGE)
    p.add_source(MAYOR_CONTACT_URL)
    p.add_contact('email', email, None)
    yield p
示例2: get_people
def get_people(self):
    """Yield a ``Legislator`` for the mayor, then one per district councillor.

    The mayor's name and photo are read from ``MAYOR_URL``.  Councillor
    pages are discovered by running a regular expression over the raw text
    of ``COUNCIL_PAGE``; each match gives a district name and a relative
    URL, and the linked page supplies the name, email address and photo.
    """
    # Mayor first; no email address could be found for the mayor.
    page = lxmlize(MAYOR_URL)
    photo_url = page.xpath('string(//img/@src[contains(., "Maire")])')
    name = page.xpath('string(//td[@class="contenu"]/text()[last()])')
    p = Legislator(name=name, post_id=u"Trois-Rivières", role="Maire", image=photo_url)
    p.add_source(MAYOR_URL)
    yield p

    resp = requests.get(COUNCIL_PAGE)
    # The page is rendered through JS on the client, so the district names
    # and links are pulled out of the response text with a regex.
    page_re = re.compile(r'createItemNiv3.+"District (.+?)".+(index.+)\\"')
    # Raw string: "\A" in a plain string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    article_re = re.compile(r"\A(?:de(?: la)?|des|du) ")
    # Districts whose leading article is kept as part of the post name.
    keep_article = ("des Estacades", "des Plateaux", "des Terrasses", "du Sanctuaire")

    for district, url_rel in page_re.findall(resp.text):
        if district not in keep_article:
            district = article_re.sub("", district)

        url = urljoin(COUNCIL_PAGE, url_rel)
        page = lxmlize(url)
        name = page.xpath("string(//h2)")
        # Strip the "mailto:" scheme from the first mailto link's href.
        email = page.xpath('string(//a/@href[contains(., "mailto:")])')[len("mailto:"):]
        photo_url = page.xpath('string(//img/@src[contains(., "Conseiller")])')

        p = Legislator(name=name, post_id=district, role="Conseiller", image=photo_url)
        p.add_source(url)
        p.add_contact("email", email, None)
        yield p
示例3: scrape_mayor
def scrape_mayor(self, div):
    """Build and return the mayor's ``Legislator`` record.

    ``div`` is the element linking to the mayor's page: its ``href``
    attribute gives the page URL and its text content, with ``'Mayor '``
    removed, gives the name.  The "Contact" link in that page's
    ``navSecondary`` list leads to the contact page, whose second
    ``div.col`` supplies the address, phone and fax paragraphs.
    """
    profile_page = lxmlize(div.attrib['href'])
    name = div.text_content().replace('Mayor ', '')

    contact_links = profile_page.xpath(
        '//ul[@class="navSecondary"]//a[contains(text(),"Contact")]')
    contact_url = contact_links[0].attrib['href']
    contact_page = lxmlize(contact_url)
    contact_div = contact_page.xpath('//div[@class="col"][2]')[0]

    # Keep everything from "City of Greater ..." to the end of the paragraph
    # (DOTALL lets ".*" run across line breaks).
    address_text = contact_div.xpath('.//p[1]')[0].text_content()
    address = re.findall(r'(City of Greater .*)', address_text, flags=re.DOTALL)[0]

    phone = contact_div.xpath('.//p[2]')[0].text_content().replace('Phone: ', '')
    # The fax number is the last space-separated token of the paragraph.
    fax = contact_div.xpath('.//p[3]')[0].text_content().split(' ')[-1]
    # NOTE(review): this expression starts with "//", so it appears to match
    # against the whole document rather than only contact_div -- confirm
    # whether ".//a" was intended.
    email = contact_div.xpath('//a[contains(@href, "mailto:")]')[0].text_content()

    mayor = Legislator(name=name, post_id='Greater Sudbury', role='Mayor')
    for source in (COUNCIL_PAGE, contact_url):
        mayor.add_source(source)

    contacts = (
        ('address', address, 'legislature'),
        ('voice', phone, 'legislature'),
        ('fax', fax, 'legislature'),
        ('email', email, None),
    )
    for kind, value, note in contacts:
        mayor.add_contact(kind, value, note)
    return mayor
示例4: mayor_data
def mayor_data(url, name):
    """Build and return the mayor's ``Legislator`` record.

    ``url`` is the mayor's page; the ``src`` of the first image inside
    ``div#contentcontainer`` is resolved against it to form the photo URL.
    ``name`` is used as the record's name unchanged.  The email address is
    the text of the first link containing "@" on ``MAYOR_CONTACT_URL``.
    """
    profile_page = lxmlize(url)
    image_src = profile_page.xpath(
        'string((//div[@id="contentcontainer"]//img)[1]/@src)')
    photo_url = urljoin(url, image_src)

    contact_page = lxmlize(MAYOR_CONTACT_URL)
    email = contact_page.xpath('string(//a[contains(., "@")][1])')

    mayor = Legislator(name=name, post_id='Regina', role='Mayor')
    for source in (COUNCIL_PAGE, url, MAYOR_CONTACT_URL):
        mayor.add_source(source)
    mayor.add_contact('email', email, None)
    # The image is set as an attribute after construction.
    mayor.image = photo_url
    return mayor