This article collects typical usage examples of the pupa.scrape.Person.image attribute in Python. If you are wondering what Person.image does or how to use it, the curated examples below should help; you can also explore the wider usage of its containing class, pupa.scrape.Person.
The 15 code examples of Person.image shown below are sorted by popularity by default.
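Before diving into the examples, a minimal sketch of the attribute in isolation may help. Person.image is a plain string holding the URL of the person's portrait; it can be passed to the constructor or assigned afterwards. The name and URLs here are placeholders, not taken from any real scraper:

from pupa.scrape import Person

# Placeholder data; every pupa scrape object also needs at least one source.
person = Person(name='Jane Doe', image='http://example.com/jane-doe.jpg')
person.image = 'http://example.com/jane-doe.png'  # or assign after construction
person.add_source('http://example.com/jane-doe', note='profile')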
Example 1: test_same_name_people
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import image [as alias]
def test_same_name_people():
    # ensure two people with the same name don't import without birthdays
    o = Organization.objects.create(name='WWE', jurisdiction_id='jurisdiction-id')
    p1 = ScrapePerson('Dwayne Johnson', image='http://example.com/1')
    p2 = ScrapePerson('Dwayne Johnson', image='http://example.com/2')

    # the people have the same name but are apparently different
    with pytest.raises(SameNameError):
        PersonImporter('jurisdiction-id').import_data([p1.as_dict(), p2.as_dict()])

    # when we give them birth dates, all is well though
    p1.birth_date = '1970'
    p2.birth_date = '1930'
    resp = PersonImporter('jurisdiction-id').import_data([p1.as_dict(), p2.as_dict()])
    assert resp['person']['insert'] == 2
    assert resp['person']['noop'] == 0
    assert resp['person']['update'] == 0
    assert Person.objects.count() == 2

    # fake some memberships so future lookups work on these people
    for p in Person.objects.all():
        Membership.objects.create(person=p, organization=o)

    # now test that an update works and that we can insert a new person with the same name
    p1.image = 'http://example.com/1.jpg'
    p2.birth_date = '1931'  # changing birth_date means a new insert
    resp = PersonImporter('jurisdiction-id').import_data([p1.as_dict(), p2.as_dict()])
    assert Person.objects.count() == 3
    assert resp['person']['insert'] == 1
    assert resp['person']['noop'] == 0
    assert resp['person']['update'] == 1
Example 2: scrape_alderman
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import image [as alias]
def scrape_alderman(self, ward_num):
    ward_url = "{}/ward-{}".format(Utils.ALDERMEN_HOME, ward_num)
    alderman_url = self.alderman_url(ward_url)
    alderman_page = self.lxmlize(alderman_url)

    # person's name is the only <h1> tag on the page
    name = alderman_page.xpath("//h1/text()")[0]

    # initialize person object with appropriate data so that pupa can
    # automatically create a membership object linking this person to
    # a post in the jurisdiction's "Board of Aldermen" organization
    district = "Ward {} Alderman".format(ward_num)
    person = Person(name=name, district=district, role="Alderman",
                    primary_org="legislature")

    # set additional fields
    person.image = alderman_page.xpath("//div/img/@src")[0]
    phone_number = alderman_page.xpath("//strong[text()='Phone:']/../text()")[1].strip()
    person.add_contact_detail(type="voice", value=phone_number)

    # add sources
    person.add_source(alderman_url, note="profile")
    person.add_source(ward_url, note="ward")

    return person
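For context, here is a hypothetical driver for the helper above; the ward count, the Scraper base class, and the inherited lxmlize()/alderman_url() helpers are assumptions rather than part of the original example.

from pupa.scrape import Scraper

class AldermenScraper(Scraper):
    # scrape_alderman(self, ward_num) from Example 2 would be defined here,
    # along with the lxmlize() and alderman_url() helpers it relies on.

    def scrape(self):
        for ward_num in range(1, 29):  # assumed number of wards
            yield self.scrape_alderman(ward_num)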
Example 3: test_same_name_people
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import image [as alias]
def test_same_name_people():
    o = Organization.objects.create(name='WWE', jurisdiction_id='jurisdiction-id')

    # importing two people with the same name to a pristine database should error
    p1 = ScrapePerson('Dwayne Johnson', image='http://example.com/1')
    p2 = ScrapePerson('Dwayne Johnson', image='http://example.com/2')
    with pytest.raises(SameNameError):
        PersonImporter('jurisdiction-id').import_data([p1.as_dict(), p2.as_dict()])

    # importing one person should pass
    PersonImporter('jurisdiction-id').import_data([p1.as_dict()])

    # create fake memberships so that future lookups work on the imported people
    for p in Person.objects.all():
        Membership.objects.create(person=p, organization=o)

    # importing another person with the same name should fail
    with pytest.raises(SameNameError):
        PersonImporter('jurisdiction-id').import_data([p1.as_dict(), p2.as_dict()])

    # adding birth dates should pass
    p1.birth_date = '1970'
    p2.birth_date = '1930'
    resp = PersonImporter('jurisdiction-id').import_data([p1.as_dict(), p2.as_dict()])
    assert resp['person']['insert'] == 1
    assert resp['person']['noop'] == 0
    assert resp['person']['update'] == 1
    assert Person.objects.count() == 2

    # create fake memberships so that future lookups work on the imported people
    for p in Person.objects.all():
        Membership.objects.create(person=p, organization=o)

    # adding a third person with the same name but without a birthday should error
    p3 = ScrapePerson('Dwayne Johnson', image='http://example.com/3')
    with pytest.raises(SameNameError):
        PersonImporter('jurisdiction-id').import_data([p3.as_dict()])

    # now test that an update works and that we can insert a new person with the same name
    p1.image = 'http://example.com/1.jpg'
    p2.birth_date = '1931'  # changing birth_date means a new insert
    resp = PersonImporter('jurisdiction-id').import_data([p1.as_dict(), p2.as_dict()])
    assert Person.objects.count() == 3
    assert resp['person']['insert'] == 1
    assert resp['person']['noop'] == 0
    assert resp['person']['update'] == 1
Example 4: scrape
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import image [as alias]
def scrape(self):
    committee_d = {}
    non_committees = {'City Council', 'Office of the Mayor',
                      'Office of the City Clerk'}

    for councilman, committees in self.councilMembers() :
        if councilman['Ward/Office'] == "":
            continue

        ward = councilman['Ward/Office']
        if ward not in {"Mayor", "Clerk"} :
            ward = "Ward {}".format(int(ward))
            role = "Alderman"

        p = Person(councilman['Person Name']['label'],
                   district=ward,
                   primary_org="legislature",
                   role=role)

        if councilman['Photo'] :
            p.image = councilman['Photo']

        contact_types = {
            "City Hall Office": ("address", "City Hall Office"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }

        for contact_type, (type_, _note) in contact_types.items():
            if councilman[contact_type]:
                p.add_contact_detail(type=type_,
                                     value=councilman[contact_type],
                                     note=_note)

        if councilman["E-mail"]:
            p.add_contact_detail(type="email",
                                 value=councilman['E-mail']['label'],
                                 note='E-mail')

        if councilman['Website']:
            p.add_link(councilman['Website']['url'])

        p.add_source(councilman['Person Name']['url'], note='web')

        for committee, _, _ in committees:
            committee_name = committee['Legislative Body']['label']
            if committee_name and committee_name not in non_committees:
                o = committee_d.get(committee_name, None)
                if o is None:
                    o = Organization(committee_name,
                                     classification='committee',
                                     parent_id={'name' : 'Chicago City Council'})
                    o.add_source(committee['Legislative Body']['url'],
                                 note='web')
                    committee_d[committee_name] = o
                o.add_member(p, role=committee["Title"])

        yield p

    for name, term in FORMER_ALDERMEN.items() :
        p = Person(name=name,
                   primary_org="legislature",
                   start_date=term['term'][0],
                   end_date=term['term'][1],
                   district="Ward {}".format(term['ward']),
                   role='Alderman')
        if name == 'Chandler, Michael D.' :
            p.add_term('Alderman',
                       "legislature",
                       district="Ward {}".format(term['ward']),
                       start_date=datetime.date(2011, 5, 16),
                       end_date=datetime.date(2015, 5, 18))
        p.add_source(term['source'], note='web')
        yield p

    for o in committee_d.values() :
        yield o

    for committee_name in FORMER_COMMITTEES :
        o = Organization(committee_name,
                         classification='committee',
                         parent_id={'name' : 'Chicago City Council'})
        o.add_source("https://chicago.legistar.com/Departments.aspx",
                     note='web')
        yield o

    for joint_committee in JOINT_COMMITTEES :
        o = Organization(joint_committee,
                         classification='committee',
                         parent_id={'name' : 'Chicago City Council'})
        o.add_source("https://chicago.legistar.com/Departments.aspx",
                     note='web')
        yield o
Example 5: _scrape_upper_chamber
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import image [as alias]
def _scrape_upper_chamber(self):
    self.info('Scraping upper chamber for legislators.')
    chamber = 'upper'
    url = self._senators_url
    source_url = url
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    table = page.xpath('//*[@id="content-2"]//table//tr')
    rowcount = 0
    for tr in table:
        rowcount += 1
        # the first two rows are headers, skip:
        if rowcount <= 2:
            continue

        tds = tr.xpath('td')
        full_name = tds[0].xpath('div/a')[0].text_content().strip()
        if full_name.startswith('Vacant'):
            continue

        party_and_district = tds[1].xpath('div')[0].text_content() \
            .strip().split('-')
        if party_and_district[0] == 'D':
            party = 'Democratic'
        elif party_and_district[0] == 'R':
            party = 'Republican'
        district = party_and_district[1]
        phone = tds[3].xpath('div')[0].text_content().strip()

        url = self._senator_details_url.format(int(district))
        details_page = self.get(url).text
        if 'currently vacant' in details_page:
            continue

        person = Person(
            name=full_name,
            primary_org=chamber,
            district=district,
            party=party,
        )
        person.add_source(source_url)
        person.add_source(url)
        person.add_link(url)

        page = lxml.html.fromstring(details_page)
        photo_url = page.xpath('//*[@id="content-2"]//img[contains(@src, "uploads")]/@src')[0]

        contact_info = [
            line.strip()
            for line
            in page.xpath('//div[@class="textwidget"]/p[1]')[0]
                   .text_content().split('\n')
            if 'Capitol Office:' not in line
        ]
        address = '\n'.join(contact_info[:2])
        email = next((line for line in iter(contact_info) if '@' in line),
                     None)

        phone_pattern = re.compile(r'\(\d{3}\) \d{3}-\d{4}')
        phone_numbers = [line for line in contact_info
                         if phone_pattern.search(line) is not None]
        phone = phone_pattern.search(phone_numbers[0]).group()
        fax = next(
            (phone_pattern.search(phone_number).group()
             for phone_number in iter(phone_numbers)
             if 'fax' in phone_number.lower()),
            None
        )

        person.add_contact_detail(type='address', value=address, note='Capitol Office')
        person.add_contact_detail(type='voice', value=phone, note='Capitol Office')
        if fax:
            person.add_contact_detail(type='fax', value=fax, note='Capitol Office')
        if email:
            person.add_contact_detail(type='email', value=email, note='Capitol Office')

        person.image = photo_url
        yield person
Example 6: _scrape_lower_chamber
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import image [as alias]
def _scrape_lower_chamber(self):
    self.info('Scraping lower chamber for legislators.')
    chamber = 'lower'
    roster_url = self._reps_url
    page = self.get(roster_url).text
    page = lxml.html.fromstring(page)

    # This is the ASP.net table container
    table_xpath = ('id("ContentPlaceHolder1_'
                   'gridMembers_DXMainTable")')
    table = page.xpath(table_xpath)[0]

    for tr in table.xpath('tr')[1:]:
        # If a given term hasn't occurred yet, then ignore it
        # Eg, in 2017, the 2018 term page will have a blank table
        if tr.attrib.get('class') == 'dxgvEmptyDataRow':
            self.warning('No House members found')
            return

        tds = tr.xpath('td')
        last_name = tds[0].text_content().strip()
        first_name = tds[1].text_content().strip()
        full_name = '{} {}'.format(first_name, last_name)
        district = str(int(tds[2].text_content().strip()))
        party = tds[3].text_content().strip()
        if party == 'Democrat':
            party = 'Democratic'
        if party.strip() == "":  # Workaround for now.
            party = "Other"

        phone = tds[4].text_content().strip()
        room = tds[5].text_content().strip()
        address = self._assumed_address_fmt.format(room if room else '')

        if last_name == 'Vacant':
            person = Person(
                name=full_name,
                primary_org=chamber,
                district=district,
                party=party,
            )
            person.extras = {
                'first_name': first_name,
                'last_name': last_name,
            }
            person.add_contact_detail(type='address', value=address, note='Capitol Office')
            if phone.strip():
                person.add_contact_detail(type='voice', value=phone, note='Capitol Office')
            person.add_source(roster_url)

            self._save_vacant_legislator(person)
        else:
            party_override = {" Green": "Democratic",
                              " Sisco": "Republican"}
            if party == "" and full_name in party_override:
                party = party_override[full_name]

            details_url = self._rep_details_url.format(district)
            details_page = lxml.html.fromstring(self.get(details_url).text)

            person = Person(
                name=full_name,
                primary_org=chamber,
                district=district,
                party=party,
            )
            person.extras = {
                'first_name': first_name,
                'last_name': last_name,
            }
            person.add_source(roster_url)
            person.add_source(details_url)
            person.add_link(details_url)

            email = details_page.xpath(
                '//*[@id="ContentPlaceHolder1_lblAddresses"]'
                '/table/tr[4]/td/a/@href'
            )
            if len(email) > 0 and email[0].lower() != 'mailto:':
                email = email[0].split(':')[1]
            else:
                email = None

            person.add_contact_detail(type='address', value=address, note='Capitol Office')
            if phone:
                person.add_contact_detail(type='voice', value=phone, note='Capitol Office')
            if email:
                person.add_contact_detail(type='email', value=email, note='Capitol Office')

            picture = details_page.xpath(
                '//*[@id="ContentPlaceHolder1_imgPhoto"]/@src')
            if len(picture) > 0:
                person.image = picture[0]

            yield person
Example 7: _parse_person
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import image [as alias]
def _parse_person(self, row, chamber, seat_map):
    # Capture legislator vitals.
    first_name = row['FirstName']
    middle_name = row['MiddleName']
    last_name = row['LastName']
    full_name = '{} {} {}'.format(first_name, middle_name, last_name)
    full_name = re.sub(r'[\s]{2,}', ' ', full_name)

    if chamber == 'lower':
        district = '{} {}'.format(row['County'], int(row['District'])).strip()
    else:
        district = str(int(row['District'])).strip()

    party = self.party_map[row['party'].upper()]
    email = row['WorkEmail']
    print(district)

    person = Person(primary_org=chamber,
                    district=district,
                    name=full_name,
                    party=party)

    extras = {
        'first_name': first_name,
        'middle_name': middle_name,
        'last_name': last_name
    }
    person.extras = extras
    if email:
        person.add_contact_detail(type='email', value=email, note='District Office')

    # Capture legislator office contact information.
    district_address = '{}\n{}\n{}, {} {}'.format(row['Address'],
                                                  row['address2'],
                                                  row['city'], row['State'],
                                                  row['Zipcode']).strip()
    phone = row['Phone'].strip()
    if not phone:
        phone = None

    if district_address:
        person.add_contact_detail(type='address', value=district_address, note='Home Office')
    if phone:
        person.add_contact_detail(type='voice', value=phone, note='Home Office')

    # Retrieve legislator portrait.
    profile_url = None
    if chamber == 'upper':
        profile_url = self.senate_profile_url.format(row['District'])
    elif chamber == 'lower':
        try:
            seat_number = seat_map[row['seatno']]
            profile_url = self.house_profile_url.format(seat_number)
        except KeyError:
            pass

    if profile_url:
        person.image = self._get_photo(profile_url, chamber)
        person.add_source(profile_url)

    return person
Example 8: scrape
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import image [as alias]
def scrape(self):
    committee_d = {}
    non_committees = {'City Council', 'Office of the Mayor',
                      'Office of the City Clerk'}

    for councilman, committees in self.councilMembers() :
        if councilman['Ward/Office'] == "":
            continue

        ward = councilman['Ward/Office']
        if ward not in {"Mayor", "Clerk"} :
            ward = "Ward {}".format(int(ward))
            role = "Alderman"

        p = Person(councilman['Person Name']['label'],
                   district=ward,
                   primary_org="legislature",
                   role=role)

        if councilman['Photo'] :
            p.image = councilman['Photo']

        contact_types = {
            "City Hall Office": ("address", "City Hall Office"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }

        for contact_type, (type_, _note) in contact_types.items():
            if councilman[contact_type]:
                p.add_contact_detail(type=type_,
                                     value=councilman[contact_type],
                                     note=_note)

        if councilman["E-mail"]:
            p.add_contact_detail(type="email",
                                 value=councilman['E-mail']['label'],
                                 note='E-mail')

        if councilman['Website']:
            p.add_link(councilman['Website']['url'])

        p.add_source(MEMBERLIST)

        for committee, _, _ in committees:
            committee_name = committee['Legislative Body']['label']
            if committee_name and committee_name not in non_committees:
                o = committee_d.get(committee_name, None)
                if o is None:
                    o = Organization(committee_name,
                                     classification='committee',
                                     parent_id={'name' : 'Chicago City Council'})
                    o.add_source("https://chicago.legistar.com/Departments.aspx")
                    committee_d[committee_name] = o
                o.add_member(p, role=committee["Title"])

        yield p

    for o in committee_d.values() :
        yield o

    o = Organization('Council Office of Financial Analysis Oversight Committee',
                     classification='committee',
                     parent_id={'name' : 'Chicago City Council'})
    o.add_source("https://chicago.legistar.com/Departments.aspx")
    yield o

    o = Organization('Committee on Parks and Recreation',
                     classification='committee',
                     parent_id={'name' : 'Chicago City Council'})
    o.add_source("https://chicago.legistar.com/Departments.aspx")
    yield o
Example 9: scrape
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import image [as alias]
def scrape(self):
    web_scraper = LegistarPersonScraper(requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'http://legistar.council.nyc.gov/DepartmentDetail.aspx?ID=6897&GUID=CDC6E691-8A8C-4F25-97CB-86F31EDAB081&Mode=MainBody'

    if self.cache_storage:
        web_scraper.cache_storage = self.cache_storage

    if self.requests_per_minute == 0:
        web_scraper.cache_write_only = False

    web_info = {}
    for member, _ in web_scraper.councilMembers():
        name = member['Person Name']['label'].strip()
        web_info[name] = member

    city_council, = [body for body in self.bodies()
                     if body['BodyName'] == 'City Council']

    terms = collections.defaultdict(list)
    public_advocates = {  # Match casing to Bill De Blasio as council member
        'The Public Advocate (Mr. de Blasio)': 'Bill De Blasio',
        'The Public Advocate (Ms. James)': 'Letitia James',
    }
    for office in self.body_offices(city_council):
        name = office['OfficeRecordFullName']
        name = public_advocates.get(name, name).strip()
        terms[name].append(office)

        # Add past members (and public advocates)
        if name not in web_info:
            web_info[name] = collections.defaultdict(lambda: None)

    # Check that we have everyone we expect, formatted consistently, in
    # both information arrays. For instance, this will fail if we forget to
    # strip trailing spaces from names on one side or the other (which has
    # the effect of omitting information, such as post, from the scrape).
    assert set(web_info.keys()) == set(terms.keys())

    members = {}
    for member, offices in terms.items():
        p = Person(member)
        web = web_info[member]

        for term in offices:
            role = term['OfficeRecordTitle']
            if role == 'Public Advocate':
                role = 'Non-Voting Council Member'
            else:
                role = 'Council Member'
            district = web.get('District', '').replace(' 0', ' ')
            p.add_term(role,
                       'legislature',
                       district=district,
                       start_date=self.toDate(term['OfficeRecordStartDate']),
                       end_date=self.toDate(term['OfficeRecordEndDate']))

        party = web.get('Political Party')
        if party == 'Democrat':
            party = 'Democratic'
        if party:
            p.add_party(party)

        if web.get('Photo'):
            p.image = web['Photo']

        contact_types = {
            "City Hall Office": ("address", "City Hall Office"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }

        for contact_type, (type_, _note) in contact_types.items():
            if web.get(contact_type) and web[contact_type] != 'N/A':
                p.add_contact_detail(type=type_,
                                     value=web[contact_type],
                                     note=_note)

        if web.get('E-mail'):
            p.add_contact_detail(type="email",
                                 value=web['E-mail']['url'],
                                 note='E-mail')

        if web.get('Web site'):
            p.add_link(web['Web site']['url'], note='web site')
#......... remainder of this code omitted .........
Example 10: scrape_current_legislators
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import image [as alias]
def scrape_current_legislators(self, repos):
    for repo in repos:
        CURRENT_LEGISLATORS = self.get_url(repo)
        people = self.yamlize(CURRENT_LEGISLATORS)

        parties = set()
        posts = {}
        person_cache = defaultdict(lambda: defaultdict(lambda: None))

        for person in people:
            name = person['name'].get('official_full')
            if name is None:
                name = "{name[first]} {name[last]}".format(**person)

            if 'birthday' in person['bio']:
                birth_date = person['bio']['birthday']

            who = person_cache[name][birth_date]
            has_term = False
            if who is None:
                who = Person(name=name, birth_date=birth_date)
                who.add_source(url=CURRENT_LEGISLATORS, note="unitedstates project on GitHub")

            for term in person.get('terms', []):
                has_term = True
                start_date = term['start']
                end_date = term['end']
                state = term['state']
                type_ = term['type']
                district = term.get('district', None)
                party = term.get('party', None)

                chamber = {'rep': self.house,
                           'sen': self.senate}[type_]
                role = {'rep': 'Representative',
                        'sen': 'Senator'}[type_]

                if type_ == "rep" and district is not None:
                    label = "%s for District %s in %s" % (role, district, state)
                    division_id = ("ocd-division/country:us/state:{state}".format(state=state.lower()))
                    if district != 0:
                        division_id += "/cd:{district}".format(district=district)

                if type_ == "sen":
                    label = "Senator for %s" % state
                    division_id = ("ocd-division/country:us/state:{state}".format(state=state.lower()))

                post = posts.get(division_id)
                if post is None:
                    post = Post(organization_id=chamber._id,
                                division_id=division_id,
                                label=label, role=role)
                    posts[division_id] = post
                    yield post

                membership = Membership(
                    post_id=post._id,
                    role=role,
                    label=label,
                    start_date=start_date,
                    end_date=end_date,
                    person_id=who._id,
                    organization_id=chamber._id)
                yield membership

                if party == "Democrat":
                    party = "Democratic"
                if party:
                    membership = Membership(
                        role='member',
                        start_date=start_date,
                        end_date=end_date,
                        person_id=who._id,
                        organization_id=make_pseudo_id(
                            classification="party",
                            name=party))
                    yield membership

            for key, value in person.get('id', {}).items():
                if isinstance(value, list):
                    for v in value:
                        who.add_identifier(str(v), scheme=key)
                else:
                    who.add_identifier(str(value), scheme=key)
                    if key == 'bioguide':
                        who.image = self.get_image_url(str(value))

            if has_term:
                yield who
Example 11: _scrape_representative
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import image [as alias]
def _scrape_representative(self, url, parties):
    # logger.info(f'Generating representative person object from {url}')
    """
    Returns a Person object representing a member of the lower
    legislative chamber.
    """
    # url = self.get(url).text.replace('<br>', '')
    member_page = self.lxmlize(url)

    photo_url = member_page.xpath('//img[@class="member-photo"]/@src')[0]
    if photo_url.endswith('/.jpg'):
        photo_url = None

    scraped_name, district_text = member_page.xpath(
        '//div[@class="member-info"]/h2')
    scraped_name = scraped_name.text_content().strip().replace('Rep. ', '')
    scraped_name = ' '.join(scraped_name.split())
    name = ' '.join(scraped_name.split(', ')[::-1])

    district_text = district_text.text_content().strip()
    district = str(self.district_re.search(district_text).group(1))

    # Vacant house "members" are named after their district numbers:
    if re.match(r'^District \d+$', scraped_name):
        return None

    party = parties[district]

    person = Person(name=name, district=district, party=party,
                    primary_org='lower')

    if photo_url is not None:
        person.image = photo_url

    person.add_link(url)
    person.add_source(url)

    def office_name(element):
        """Returns the office address type."""
        return element.xpath('preceding-sibling::h4[1]/text()')[0] \
            .rstrip(':')

    offices_text = [{
        'name': office_name(p_tag),
        'type': office_name(p_tag).replace(' Address', '').lower(),
        'details': p_tag.text_content()
    } for p_tag in member_page.xpath(
        '//h4/following-sibling::p[@class="double-space"]')]

    for office_text in offices_text:
        details = office_text['details'].strip()

        # A few member pages have blank office listings:
        if details == '':
            continue

        # At the time of writing, this case of multiple district
        # offices occurs exactly once, for the representative at
        # District 43:
        if details.count('Office') > 1:
            district_offices = [
                district_office.strip()
                for district_office
                in re.findall(r'(\w+ Office.+?(?=\w+ Office|$))',
                              details, flags=re.DOTALL)
            ]
            offices_text += [{
                'name': re.match(r'\w+ Office', office).group(),
                'type': 'district',
                'details': re.search(
                    r'(?<=Office).+(?=\w+ Office|$)?', office,
                    re.DOTALL).group()
            } for office in district_offices]

        match = self.address_re.search(details)
        if match is not None:
            address = re.sub(
                ' +$', '',
                match.group().replace('\r', '').replace('\n\n', '\n'),
                flags=re.MULTILINE
            )
        else:
            # No valid address found in the details.
            continue

        phone_number = extract_phone(details)
        fax_number = extract_fax(details)

        if address:
            person.add_contact_detail(type='address', value=address,
                                      note=office_text['name'])
        if phone_number:
            person.add_contact_detail(type='voice', value=phone_number,
                                      note=office_text['name'])
        if fax_number:
            person.add_contact_detail(type='fax', value=fax_number,
                                      note=office_text['name'])

    yield person
Example 12: _scrape_senator
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import image [as alias]
def _scrape_senator(self, url, parties):
    # logger.info(f'Generating senator person object from {url}')
    """
    Returns a Person object representing a member of the upper
    legislative chamber.
    """
    # Scrape legislator information from roster URL
    # Example: view-source:https://senate.texas.gov/member.php?d=1
    member_page = self.lxmlize(url)

    photo_url = member_page.xpath('//img[@id="memhead"]/@src')[0]
    scraped_name_district_text = member_page.xpath(
        '//div[@class="pgtitle"]/text()')[0]
    scraped_name, district_text = scraped_name_district_text.split(':')
    name = ' '.join(scraped_name.replace('Senator ', '').split()).strip()
    district = str(district_text.split()[1]).strip()

    # Vacant house "members" are named after their district numbers:
    if re.match(r'^District \d+$', name):
        return None

    bio = ' '.join(member_page.xpath('//div[@class="bio"]/text()'))

    party = parties[district]

    person = Person(name=name,
                    district=district,
                    party=party,
                    primary_org='upper',
                    biography=bio)

    if photo_url is not None:
        person.image = photo_url

    person.add_link(url)
    person.add_source(url)

    office_ids = []
    # Get offices based on table headers
    for th_tag in member_page.xpath('//table[@class="memdir"]/tr/th'):
        # logger.warn([th_tag.xpath('text()'), th_tag.xpath('@id')])
        id = th_tag.xpath('@id')[0] if th_tag.xpath('@id') else ''
        label = th_tag.xpath('text()')[0].strip() if th_tag.xpath('text()') else ''
        if id != '' and label != '':
            office_ids.append({'id': id, 'label': label})
    # logger.warn(office_ids)

    for office in office_ids:
        # logger.warn(office)
        row = member_page.xpath(
            f'//table[@class="memdir"]/tr/td[@headers="{office["id"]}"]')
        # A few member pages have broken ids for office listings:
        if len(row) == 0:
            row = member_page.xpath(
                f'//table[@class="memdir"]/tr/td[@headers="dDA1"]')
        if len(row) > 0:
            details = " ".join(row[0].xpath('text()')).strip()
            details = details.replace('\r', '').replace('\n', '')
        # logger.warn(details)

        # A few member pages have blank office listings:
        if details == '':
            continue

        match = self.address_re.search(details)
        if match is not None:
            address = re.sub(
                ' +$', '',
                match.group().replace('\r', '').replace('\n', ''),
                flags=re.MULTILINE
            )
        else:
            # No valid address found in the details.
            continue

        phone_number = extract_phone(details)
        fax_number = extract_fax(details)

        if address:
            person.add_contact_detail(type='address', value=address,
                                      note=office['label'])
        if phone_number:
            person.add_contact_detail(type='voice', value=phone_number,
                                      note=office['label'])
        if fax_number:
            person.add_contact_detail(type='fax', value=fax_number,
                                      note=office['label'])

    yield person
Example 13: scrape
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import image [as alias]
def scrape(self):
    noncommittees = {'Committee of the Whole'}
    committee_d = {}

    people_d = {}
    for councilman, committees in self.councilMembers() :
        if 'url' in councilman['Person Name'] :
            councilman_url = councilman['Person Name']['url']
            if councilman_url in people_d :
                people_d[councilman_url][0].append(councilman)
            else :
                people_d[councilman_url] = [councilman], committees

    for person_entries, committees in people_d.values() :
        councilman = person_entries[-1]
        p = Person(councilman['Person Name']['label'])
        if p.name == 'Letitia James' :
            p.name = 'Letitia Ms. James'
            p.add_name('Letitia James')

        spans = [(self.toTime(entry['Start Date']).date(),
                  self.toTime(entry['End Date']).date(),
                  entry['District'])
                 for entry in person_entries]

        merged_spans = []
        last_end_date = None
        last_district = None
        for start_date, end_date, district in sorted(spans) :
            if last_end_date is None :
                span = [start_date, end_date, district]
            elif (start_date - last_end_date) == datetime.timedelta(1) and district == last_district :
                span[1] = end_date
            else :
                merged_spans.append(span)
                span = [start_date, end_date, district]
            last_end_date = end_date
            last_district = district
        merged_spans.append(span)

        for start_date, end_date, district in merged_spans :
            district = councilman['District'].replace(' 0', ' ')
            if end_date == datetime.date(2017, 12, 31) :
                end_date = ''
            else :
                end_date = end_date.isoformat()
            print(start_date, end_date)
            p.add_term('Council Member', 'legislature',
                       district=district,
                       start_date=start_date.isoformat(),
                       end_date=end_date)

        party = councilman['Political Party']
        if party == 'Democrat' :
            party = 'Democratic'
        if party :
            p.add_party(party)

        if councilman['Photo'] :
            p.image = councilman['Photo']

        if councilman["E-mail"]:
            p.add_contact_detail(type="email",
                                 value=councilman['E-mail']['url'],
                                 note='E-mail')

        if councilman['Web site']:
            p.add_link(councilman['Web site']['url'], note='web site')

        p.extras = {'Notes' : councilman['Notes']}

        p.add_source(councilman['Person Name']['url'], note='web')

        for committee, _, _ in committees:
            committee_name = committee['Department Name']['label']
            if committee_name not in noncommittees and 'committee' in committee_name.lower():
                o = committee_d.get(committee_name, None)
                if o is None:
                    parent_id = PARENT_ORGS.get(committee_name,
                                                'New York City Council')
                    o = Organization(committee_name,
                                     classification='committee',
                                     parent_id={'name' : parent_id})
                    o.add_source(committee['Department Name']['url'])
                    committee_d[committee_name] = o

                membership = o.add_member(p, role=committee["Title"])
                membership.start_date = self.mdY2Ymd(committee["Start Date"])
        yield p
#......... remainder of this code omitted .........
Example 14: legislators
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import image [as alias]
def legislators(self, latest_only):
    legs = {}

    for member, chamber, term, url in self._memberships(latest_only):
        name, _, _, district, party = member.xpath('td')
        district = district.text
        detail_url = name.xpath('a/@href')[0]
        if party.text_content().strip() == "":
            self.warning("Garbage party: Skipping!")
            continue
        party = {'D': 'Democratic', 'R': 'Republican', 'I': 'Independent'}[party.text]
        name = name.text_content().strip()

        # inactive legislator, skip them for now
        if name.endswith('*'):
            name = name.strip('*')
            continue

        name = AKA.get(name, name)

        if name in legs:
            p, terms = legs[name]
            terms.append((chamber, district, term, party))
        else:
            p = Person(name, party=party)
            legs[name] = p, [(chamber, district, term, party)]

        p.add_source(url)
        p.add_source(detail_url)
        p.add_link(detail_url)

        birth_date = BIRTH_DATES.get(name, None)
        if birth_date:
            p.birth_date = birth_date

        leg_html = self.get(detail_url).text
        leg_doc = lxml.html.fromstring(leg_html)
        leg_doc.make_links_absolute(detail_url)

        hotgarbage = (
            'Senate Biography Information for the 98th General '
            'Assembly is not currently available.')
        if hotgarbage in leg_html:
            # The legislator's bio isn't available yet.
            self.logger.warning('No legislator bio available for ' + name)
            continue

        photo_url = leg_doc.xpath('//img[contains(@src, "/members/")]/@src')[0]
        p.image = photo_url
        p.contact_details = []

        # email
        email = leg_doc.xpath('//b[text()="Email: "]')
        if email:
            p.add_contact_detail(type='email', value=email[0].tail.strip(), note='capitol')

        offices = {'capitol': '//table[contains(string(), "Springfield Office")]',
                   'district': '//table[contains(string(), "District Office")]'}

        for location, xpath in offices.items():
            table = leg_doc.xpath(xpath)
            if table:
                for type, value in self._table_to_office(table[3]):
                    if type in ('fax', 'voice') and not validate_phone_number(value):
                        continue
                    p.add_contact_detail(type=type, value=value, note=location)

    return legs
Example 15: scrape_chamber
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import image [as alias]
def scrape_chamber(self, chamber, session):
    url = 'https://docs.legis.wisconsin.gov/{}/legislators/{}'.format(
        session,
        {'upper': 'senate', 'lower': 'assembly'}[chamber],
    )
    body = self.get(url).text
    page = lxml.html.fromstring(body)
    page.make_links_absolute(url)

    for row in page.xpath(".//div[@class='box-content']/div[starts-with(@id,'district')]"):
        if row.xpath(".//a/@href") and not row.xpath(".//a[text()='Vacant']"):
            rep_url = row.xpath(".//a[text()='Details']/@href")[0].strip("https://")
            rep_url = "https://" + rep_url
            rep_doc = lxml.html.fromstring(self.get(rep_url).text)
            rep_doc.make_links_absolute(rep_url)

            full_name = rep_doc.xpath(
                './/div[@id="district"]/h1/text()'
            )[0].replace("Senator ", "").replace("Representative ", "")

            party = rep_doc.xpath('.//div[@id="district"]//small/text()')
            if len(party) > 0:
                party = PARTY_DICT[party[0].split("-")[0].strip("(").strip()]
            else:
                party = None

            district = rep_doc.xpath('.//div[@id="district"]/h3/a/@href')[1]
            district = district.split("/")[-1]
            district = str(int(district))

            # email
            email = rep_doc.xpath("//span[@class='info email']/a/text()")
            if email:
                email = email[0]
            else:
                email = ''

            assert party is not None, "{} is missing party".format(full_name)

            person = Person(
                name=full_name,
                district=district,
                primary_org=chamber,
                party=party,
            )

            img = rep_doc.xpath('.//div[@id="district"]/img/@src')
            if img:
                person.image = img[0]

            # office ####
            address_lines = rep_doc.xpath('.//span[@class="info office"]/text()')
            address = '\n'.join([line.strip() for line in address_lines if line.strip() != ""])
            person.add_contact_detail(type='address', value=address, note='Capitol Office')

            phone = rep_doc.xpath('.//span[@class="info telephone"]/text()')
            if phone:
                phone = re.sub(r'\s+', ' ', phone[1]).strip()
                person.add_contact_detail(type='voice', value=phone, note='Capitol Office')

            fax = rep_doc.xpath('.//span[@class="info fax"]/text()')
            if fax:
                fax = re.sub(r'\s+', ' ', fax[1]).strip()
                person.add_contact_detail(type='fax', value=fax, note='Capitol Office')

            if email:
                person.add_contact_detail(type='email', value=email, note='Capitol Office')

            person.add_link(rep_url)
            person.add_source(rep_url)

            yield person