This article collects typical usage examples of Person.extras['occupation'] from Python's pupa.scrape module. If you have been wondering what Person.extras['occupation'] is for, how to use it, or what code that sets it looks like in practice, the hand-picked examples below should help. You can also explore further usage examples of the class it belongs to, pupa.scrape.Person.
The following four code examples all set Person.extras['occupation']; by default they are listed in order of popularity.
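Person.extras is an ordinary dict that pupa attaches to every Person for arbitrary metadata, so recording an occupation is just a key assignment. A minimal sketch (all values below are made-up placeholders, not taken from the examples):
from pupa.scrape import Person

# Hypothetical legislator; name, district, party, and URL are placeholders
person = Person(name='Jane Doe', district='1', party='Independent', primary_org='upper')
person.extras['occupation'] = 'Rancher'  # extras is a plain dict of extra metadata
person.add_source('https://example.com/legislators/jane-doe')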
Example 1: scrape_legislator
# Required import: from pupa.scrape import Person [as alias]
# This snippet also assumes: import re, lxml.html
def scrape_legislator(self, name, chamber, url):
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    party = page.xpath("string(//span[contains(@id, 'Party')])")
    party = party.strip()
    if party == 'Democrat':
        party = 'Democratic'

    district = page.xpath("string(//span[contains(@id, 'District')])")
    district = district.strip().lstrip('0')

    occupation = page.xpath(
        "string(//span[contains(@id, 'Occupation')])")
    occupation = occupation.strip()

    (photo_url, ) = page.xpath('//img[contains(@id, "_imgMember")]/@src')

    office_phone = page.xpath(
        "string(//span[contains(@id, 'CapitolPhone')])").strip()

    email = None
    email_link = page.xpath('//a[@id="lnkMail"]')

    legislator = Person(primary_org=chamber,
                        image=photo_url,
                        name=name,
                        party=party,
                        district=district
                        )
    legislator.extras['occupation'] = occupation

    if office_phone.strip() != "":
        legislator.add_contact_detail(type='voice', value=office_phone, note='Capitol Office')

    if email_link:
        email = email_link[0].attrib['href'].split(":")[1]
        legislator.add_contact_detail(type='email', value=email, note='Capitol Office')

    # SD is hiding their email addresses entirely in JS now, so
    # search through <script> blocks looking for them
    for script in page.xpath('//script'):
        if script.text:
            match = re.search(r'([\w.]+@sdlegislature\.gov)', script.text)
            if match:
                legislator.add_contact_detail(type='email',
                                              value=match.group(0),
                                              note='Capitol Office')
                break

    home_address = [
        x.strip() for x in
        page.xpath('//td/span[contains(@id, "HomeAddress")]/text()')
        if x.strip()
    ]
    if home_address:
        home_address = "\n".join(home_address)
        home_phone = page.xpath(
            "string(//span[contains(@id, 'HomePhone')])").strip()
        legislator.add_contact_detail(type='address',
                                      value=home_address,
                                      note='District Office')
        if home_phone:
            legislator.add_contact_detail(type='voice',
                                          value=home_phone,
                                          note='District Office')

    legislator.add_source(url)

    comm_url = page.xpath("//a[. = 'Committees']")[0].attrib['href']
    yield from self.scrape_committees(legislator, comm_url, chamber)

    yield legislator
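In pupa, a generator method like the one above is normally driven from a Scraper subclass whose scrape() entry point yields every object to the framework. A rough sketch of how example 1 might be wired up (the class name, listing URL, and XPath are assumptions for illustration, not taken from the source):
import lxml.html
from pupa.scrape import Scraper

class LegislatorScraper(Scraper):  # hypothetical subclass
    def scrape(self, chamber='upper'):
        listing_url = 'https://example.com/legislators'  # placeholder listing page
        page = lxml.html.fromstring(self.get(listing_url).text)
        page.make_links_absolute(listing_url)
        for link in page.xpath('//a[@class="member"]'):  # assumed markup
            yield from self.scrape_legislator(link.text, chamber, link.get('href'))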
Example 2: scrape_legislator
# Required import: from pupa.scrape import Person [as alias]
# This snippet also assumes: import re, lxml.html
def scrape_legislator(self, name, chamber, url, contact_page):
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    party = page.xpath("string(//span[contains(@id, 'Party')])")
    party = party.strip()
    if party == 'Democrat':
        party = 'Democratic'

    district = page.xpath("string(//span[contains(@id, 'District')])")
    district = district.strip().lstrip('0')

    occupation = page.xpath(
        "string(//span[contains(@id, 'Occupation')])")
    occupation = occupation.strip()

    (photo_url, ) = page.xpath('//img[contains(@id, "_imgMember")]/@src')

    office_phone = page.xpath(
        "string(//span[contains(@id, 'CapitolPhone')])").strip()

    legislator = Person(primary_org=chamber,
                        image=photo_url,
                        name=name,
                        party=party,
                        district=district
                        )
    legislator.extras['occupation'] = occupation

    if office_phone.strip() != "":
        legislator.add_contact_detail(
            type='voice', value=office_phone, note='Capitol Office')

    # SD removed email from the detail pages but it's still in the
    # contact page, shared for all congress people
    member_id = re.search(r'Member=(\d+)', url).group(1)

    # find the profile block by finding a link inside it to their
    # detail page
    profile_link = contact_page.xpath(
        '//ul[@id="contact-list"]//a[contains(@href, "Member=%s")]' % (member_id,))
    if profile_link:
        # look for the adjacent email mailto link
        profile_link = profile_link[0]
        profile_block = profile_link.getparent().getparent().getparent()
        email_link = profile_block.xpath(
            './span/span/a[@class="mail-break"]')
        if email_link:
            email = email_link[0].text
            email = email.lstrip()
            email = email.rstrip()
            if email:
                legislator.add_contact_detail(type='email',
                                              value=email,
                                              note='Capitol Office')

    home_address = [
        x.strip() for x in
        page.xpath('//td/span[contains(@id, "HomeAddress")]/text()')
        if x.strip()
    ]
    if home_address:
        home_address = "\n".join(home_address)
        home_phone = page.xpath(
            "string(//span[contains(@id, 'HomePhone')])").strip()
        legislator.add_contact_detail(type='address',
                                      value=home_address,
                                      note='District Office')
        if home_phone:
            legislator.add_contact_detail(type='voice',
                                          value=home_phone,
                                          note='District Office')

    legislator.add_source(url)
    legislator.add_link(url)

    committees = page.xpath(
        '//div[@id="divCommittees"]/span/section/table/tbody/tr/td/a')
    for committee in committees:
        self.scrape_committee(legislator, url, committee, chamber)

    yield legislator
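Unlike example 1, this variant expects the caller to hand in an already-parsed shared contact page alongside the detail URL. The wiring in the calling scraper might look like this (the class name, URLs, and member values are assumptions, not taken from the source):
import lxml.html
from pupa.scrape import Scraper

class SDPersonScraper(Scraper):  # hypothetical subclass
    def scrape(self):
        contact_url = 'https://example.com/legislators/contact-list'  # placeholder URL
        contact_page = lxml.html.fromstring(self.get(contact_url).text)
        contact_page.make_links_absolute(contact_url)
        # member name, chamber, and detail URL would normally come from a listing page
        yield from self.scrape_legislator('Jane Doe', 'upper',
                                          'https://example.com/legislators/Detail?Member=123',
                                          contact_page)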
Example 3: scrape_member
# Required import: from pupa.scrape import Person [as alias]
# This snippet also assumes: import re, lxml.html
def scrape_member(self, chamber, member_url):
    page = self.get(member_url).text
    root = lxml.html.fromstring(page)

    name_and_party = root.xpath(
        'string(//td[@class="SiteNames"])').split()

    title = name_and_party[0]
    # Account for Representative-Elect and Senator-Elect, for incoming class
    if title.startswith('Representative'):
        chamber = 'lower'
    elif title.startswith('Senator'):
        chamber = 'upper'

    full_name = ' '.join(name_and_party[1:-1])

    party = name_and_party[-1]
    if party == '(R)':
        party = 'Republican'
    elif party == '(D)':
        party = 'Democratic'
    elif party == '(G)':
        party = 'Green'
    elif party == '(I)':
        party = 'Independent'
    elif '-Elect' in title and not party.startswith('('):
        self.warning('Member-elect is currently missing a party')
        full_name = ' '.join(name_and_party[1:])
        party = ''
    else:
        raise AssertionError(
            "Unknown party ({0}) for {1}".format(party, full_name))

    try:
        img = root.xpath('//img[@class="SitePhotos"]')[0]
        photo_url = img.attrib['src']
    except IndexError:
        self.warning("No member photo found")
        photo_url = ""

    # Need to figure out a cleaner method for this later
    info_box = root.xpath('string(//table[@class="InfoTable"])')

    try:
        district = re.search(r'District(.+)\r', info_box).group(1)
    except AttributeError:
        self.warning('Member has no district listed; skipping them')
        return

    person = Person(name=full_name, district=district,
                    party=party, primary_org=chamber, image=photo_url)
    person.add_link(member_url)
    person.add_source(member_url)

    try:
        phone = re.search(r'Phone(.+)\r', info_box).group(1)
    except AttributeError:
        phone = None
    try:
        email = re.search(r'Email(.+)\r', info_box).group(1)
    except AttributeError:
        email = None

    address = root.xpath('//nobr/text()')[0].replace(u'\xa0', ' ')
    person.add_contact_detail(type='address', value=address, note='District Office')
    # Guard against missing values; add_contact_detail expects a string value
    if phone:
        person.add_contact_detail(type='voice', value=phone, note='District Office')
    if email:
        person.add_contact_detail(type='email', value=email, note='District Office')
    try:
        person.extras['occupation'] = re.search(
            r'Occupation(.+)\r', info_box).group(1)
    except AttributeError:
        pass

    yield person
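The occupation, like the district, phone, and email above, is pulled out of the flattened InfoTable text by matching a label up to the next carriage return; when a label is missing the value simply stays unset. A standalone illustration with a made-up info_box string:
import re

# Made-up sample of the flattened InfoTable text: labels run straight into values
info_box = 'District87\rPhone(501) 555-0100\rOccupationFarmer\r'
match = re.search(r'Occupation(.+)\r', info_box)
occupation = match.group(1) if match else None
print(occupation)  # Farmer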
Example 4: scrape_chamber
# Required import: from pupa.scrape import Person [as alias]
# This snippet also assumes: import datetime, json
def scrape_chamber(self, chamber, session):
    chamber_abbrev = {'upper': 'S', 'lower': 'H'}[chamber]

    url = "https://wyoleg.gov/LsoService/api/legislator/2018/{}".format(
        chamber_abbrev)
    response = self.get(url)
    people_json = json.loads(response.content.decode('utf-8'))

    for row in people_json:
        # some fields are only available in the list json, some only in the details call
        details_url = 'https://wyoleg.gov/LsoService/api/legislator/{}'.format(
            row['legID'])
        details_response = self.get(details_url)
        details = json.loads(details_response.content.decode('utf-8'))

        party = self.party_map[row['party']]

        if details['dob'] is not None:
            dob = datetime.datetime.strptime(
                details['dob'], '%m/%d/%Y %I:%M:%S %p')
            dob_str = datetime.datetime.strftime(dob, "%Y-%m-%d")
        else:
            dob_str = ''

        photo_url = 'http://wyoleg.gov/LegislatorSummary/Photos/{}'.format(
            details['legPhoto'])

        person = Person(
            name=row['name'],
            district=row['district'].lstrip('SH0'),
            party=party,
            primary_org=chamber,
            birth_date=dob_str,
            image=photo_url,
        )

        if details['address']:
            address = '{}, {} {} {}'.format(
                details['address'],
                details['city'],
                details['state'],
                details['zip']
            )
            person.add_contact_detail(type='address', value=address)
        if row['eMail']:
            person.add_contact_detail(type='email', value=row['eMail'])
        if row['phone']:
            person.add_contact_detail(type='voice', value=row['phone'])

        person.extras['wy_leg_id'] = row['legID']
        person.extras['county'] = row['county']
        person.extras['given_name'] = row['firstName']
        person.extras['family_name'] = row['lastName']
        person.extras['religion'] = details['religion']
        person.extras['number_children'] = details['noChildren']
        person.extras['spouse_given_name'] = details['spouseName']
        person.extras['place_of_birth'] = details['birthPlace']
        person.extras['occupation'] = details['occupationDesc']

        if details['legEducation']:
            person.extras['education'] = details['legEducation']
        if details['civicOrgs']:
            person.extras['civic_organizations'] = details['civicOrgs']

        # http://wyoleg.gov/Legislators/2018/S/2032
        leg_url = 'http://wyoleg.gov/Legislators/{}/{}/{}'.format(
            session,
            row['party'],
            row['legID'])

        person.add_source(leg_url)
        person.add_link(leg_url)

        yield person
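Example 4 relies on a self.party_map lookup that is not part of the excerpt. Given that the Wyoming API reports single-letter party codes in row['party'], a plausible definition would be the following (an assumption, not taken from the source):
# Hypothetical mapping from API party codes to pupa party names
party_map = {
    'R': 'Republican',
    'D': 'Democratic',
    'I': 'Independent',
}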