本文整理汇总了 Python 中 pupa.scrape.Person.extras 属性的典型用法代码示例。如果您正苦于以下问题:Python Person.extras 的具体用法?Python Person.extras 怎么用?Python Person.extras 使用的例子?那么恭喜您,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该属性所在类 pupa.scrape.Person 的用法示例。
在下文中一共展示了 Person.extras 属性的 8 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: get_member
# 需要导入模块: from pupa.scrape import Person [as 别名]
# 或者: from pupa.scrape.Person import extras [as 别名]
def get_member(self, session, chamber, kpid):
    """Yield a ``Person`` built from one member record of the KS API.

    The member record is fetched from ``ksapi.url`` and the legislator's
    bio page on kslegislature.org is loaded (via ``self.lxmlize``) to find
    the profile photo.  If the bio page request raises
    ``scrapelib.HTTPError`` a warning is logged and the person is emitted
    without a photo and without a bio link.

    Args:
        session: Session key such as ``'2019-2020'``; it selects the URL
            slug of the bio page.
        chamber: Passed through unchanged as the person's ``primary_org``.
        kpid: Member identifier used in both the API URL and bio-page URL.

    Yields:
        A single populated ``Person``.

    Raises:
        KeyError: If ``session`` is not one of the known slug keys.
        ValueError: If the bio page loads but the photo XPath does not
            match exactly one node (single-target unpacking below).
    """
    url = '%smembers/%s' % (ksapi.url, kpid)
    content = json.loads(self.get(url).text)['content']

    party = content['PARTY']
    if party == 'Democrat':
        party = 'Democratic'

    slug = {
        '2013-2014': 'b2013_14',
        '2015-2016': 'b2015_16',
        '2017-2018': 'b2017_18',
        '2019-2020': 'b2019_20',
    }[session]
    leg_url = 'http://www.kslegislature.org/li/%s/members/%s/' % (slug, kpid)

    try:
        legislator_page = self.lxmlize(leg_url)
        # Deliberately strict: exactly one profile picture is expected.
        photo_url, = legislator_page.xpath(
            '//img[@class="profile-picture"]/@src')
    except scrapelib.HTTPError:
        self.warning("{}'s legislator bio page not found".format(content['FULLNAME']))
        leg_url = ''
        photo_url = ''

    person = Person(
        name=content['FULLNAME'],
        district=str(content['DISTRICT']),
        primary_org=chamber,
        party=party,
        image=photo_url,
    )
    person.extras = {'occupation': content['OCCUPATION']}

    address = '\n'.join([
        'Room {}'.format(content['OFFICENUM']),
        'Kansas State Capitol Building',
        '300 SW 10th St.',
        'Topeka, KS 66612',
    ])

    note = 'Capitol Office'
    person.add_contact_detail(type='address', value=address, note=note)
    # Only record contact details that actually carry a value; an empty
    # email is no more useful than an empty phone number.
    if content['EMAIL']:
        person.add_contact_detail(type='email', value=content['EMAIL'], note=note)
    if content['OFFPH']:
        person.add_contact_detail(type='voice', value=content['OFFPH'], note=note)

    person.add_source(url)
    # leg_url is blanked when the bio page could not be fetched; do not
    # attach an empty link in that case.
    if leg_url:
        person.add_link(leg_url)

    yield person
示例2: _scrape_lower_chamber
# 需要导入模块: from pupa.scrape import Person [as 别名]
# 或者: from pupa.scrape.Person import extras [as 别名]
def _scrape_lower_chamber(self):
    """Scrape the House roster and yield a ``Person`` per seated member.

    The roster page at ``self._reps_url`` is parsed row by row.  Rows whose
    last-name cell is ``'Vacant'`` are handed to
    ``self._save_vacant_legislator`` and are not yielded.  For every other
    row the member's detail page (``self._rep_details_url`` formatted with
    the district) is fetched to pick up an email address and a photo.

    If the roster contains the "empty data" placeholder row, a warning is
    logged and the generator stops.

    Yields:
        ``Person`` objects for non-vacant seats.
    """
    self.info('Scraping lower chamber for legislators.')
    chamber = 'lower'
    office_note = 'Capitol Office'

    # Parties for members whose roster row has a blank party cell.  Keys
    # are compared against the '<first> <last>' name built below.
    # NOTE(review): the leading space suggests these rows also have an
    # empty first-name cell - confirm against the live roster.
    party_override = {" Green": "Democratic",
                      " Sisco": "Republican"}

    roster_url = self._reps_url
    page = lxml.html.fromstring(self.get(roster_url).text)

    # This is the ASP.net table container
    table_xpath = ('id("ContentPlaceHolder1_'
                   'gridMembers_DXMainTable")')
    table = page.xpath(table_xpath)[0]

    # The first <tr> is skipped (presumably the header row).
    for tr in table.xpath('tr')[1:]:
        # If a given term hasn't occurred yet, then ignore it
        # Eg, in 2017, the 2018 term page will have a blank table
        if tr.attrib.get('class') == 'dxgvEmptyDataRow':
            self.warning('No House members found')
            return

        tds = tr.xpath('td')
        last_name = tds[0].text_content().strip()
        first_name = tds[1].text_content().strip()
        full_name = '{} {}'.format(first_name, last_name)
        district = str(int(tds[2].text_content().strip()))

        party = tds[3].text_content().strip()
        if party == 'Democrat':
            party = 'Democratic'
        elif not party:
            # Blank party cell: use the known override if there is one and
            # only then fall back to the generic "Other" workaround.  (The
            # override must be consulted before the fallback, otherwise it
            # can never apply.)
            party = party_override.get(full_name, 'Other')

        phone = tds[4].text_content().strip()
        room = tds[5].text_content().strip()
        address = self._assumed_address_fmt.format(room)

        person = Person(
            name=full_name,
            primary_org=chamber,
            district=district,
            party=party,
        )
        person.extras = {
            'first_name': first_name,
            'last_name': last_name,
        }
        person.add_source(roster_url)
        person.add_contact_detail(type='address', value=address, note=office_note)
        if phone:
            person.add_contact_detail(type='voice', value=phone, note=office_note)

        if last_name == 'Vacant':
            # Vacant seats are recorded separately and never yielded.
            self._save_vacant_legislator(person)
            continue

        details_url = self._rep_details_url.format(district)
        details_page = lxml.html.fromstring(self.get(details_url).text)
        person.add_source(details_url)
        person.add_link(details_url)

        email_hrefs = details_page.xpath(
            '//*[@id="ContentPlaceHolder1_lblAddresses"]'
            '/table/tr[4]/td/a/@href'
        )
        # A bare "mailto:" href means no address was published.
        if email_hrefs and email_hrefs[0].lower() != 'mailto:':
            email = email_hrefs[0].split(':')[1]
            if email:
                person.add_contact_detail(type='email', value=email, note=office_note)

        picture = details_page.xpath(
            '//*[@id="ContentPlaceHolder1_imgPhoto"]/@src')
        if picture:
            person.image = picture[0]

        yield person
示例3: _parse_person
# 需要导入模块: from pupa.scrape import Person [as 别名]
# 或者: from pupa.scrape.Person import extras [as 别名]
def _parse_person(self, row, chamber, seat_map):
    """Build a ``Person`` from one roster row.

    Args:
        row: Mapping with the keys read below (``FirstName``,
            ``MiddleName``, ``LastName``, ``County``, ``District``,
            ``party``, ``WorkEmail``, ``Address``, ``address2``, ``city``,
            ``State``, ``Zipcode``, ``Phone`` and, for the lower chamber,
            ``seatno``).
        chamber: ``'lower'`` or ``'upper'``; controls how the district name
            and the profile URL are derived.
        seat_map: Mapping from ``row['seatno']`` to the seat number used in
            the house profile URL.  A missing entry simply means no photo
            and no profile source are attached.

    Returns:
        The populated ``Person``.

    Raises:
        KeyError: If the upper-cased party code is not in ``self.party_map``.
    """
    def _field(key):
        # Normalise a row value to a stripped string; None becomes ''.
        value = row[key]
        return '' if value is None else str(value).strip()

    # Capture legislator vitals.
    first_name = row['FirstName']
    middle_name = row['MiddleName']
    last_name = row['LastName']
    full_name = '{} {} {}'.format(first_name, middle_name, last_name)
    # Collapse the double space left behind by an empty middle name.
    full_name = re.sub(r'\s{2,}', ' ', full_name)

    if chamber == 'lower':
        district = '{} {}'.format(row['County'], int(row['District'])).strip()
    else:
        district = str(int(row['District'])).strip()

    party = self.party_map[row['party'].upper()]
    email = row['WorkEmail']

    person = Person(primary_org=chamber,
                    district=district,
                    name=full_name,
                    party=party)
    person.extras = {
        'first_name': first_name,
        'middle_name': middle_name,
        'last_name': last_name,
    }

    if email:
        person.add_contact_detail(type='email', value=email, note='District Office')

    # Capture legislator office contact information.
    # Assemble the address from non-empty parts only, so that a missing
    # second address line does not leave a blank line and a completely
    # empty record does not produce a stray "," address.
    city_state = ', '.join(part for part in (_field('city'), _field('State')) if part)
    locality = ' '.join(part for part in (city_state, _field('Zipcode')) if part)
    address_lines = [
        line for line in (_field('Address'), _field('address2'), locality) if line
    ]
    district_address = '\n'.join(address_lines)

    phone = row['Phone'].strip() or None

    if district_address:
        person.add_contact_detail(type='address', value=district_address, note='Home Office')
    if phone:
        person.add_contact_detail(type='voice', value=phone, note='Home Office')

    # Retrieve legislator portrait.
    profile_url = None
    if chamber == 'upper':
        profile_url = self.senate_profile_url.format(row['District'])
    elif chamber == 'lower':
        try:
            seat_number = seat_map[row['seatno']]
        except KeyError:
            # No known seat for this member: leave profile_url unset.
            pass
        else:
            profile_url = self.house_profile_url.format(seat_number)

    if profile_url:
        person.image = self._get_photo(profile_url, chamber)
        person.add_source(profile_url)

    return person
示例4: scrape
# 需要导入模块: from pupa.scrape import Person [as 别名]
# 或者: from pupa.scrape.Person import extras [as 别名]
#.........这里部分代码省略.........
if party == 'Democrat':
party = 'Democratic'
if party:
p.add_party(party)
if web.get('Photo'):
p.image = web['Photo']
contact_types = {
"City Hall Office": ("address", "City Hall Office"),
"City Hall Phone": ("voice", "City Hall Phone"),
"Ward Office Phone": ("voice", "Ward Office Phone"),
"Ward Office Address": ("address", "Ward Office Address"),
"Fax": ("fax", "Fax")
}
for contact_type, (type_, _note) in contact_types.items():
if web.get(contact_type) and web(contact_type) != 'N/A':
p.add_contact_detail(type=type_,
value= web[contact_type],
note=_note)
if web.get('E-mail'):
p.add_contact_detail(type="email",
value=web['E-mail']['url'],
note='E-mail')
if web.get('Web site'):
p.add_link(web['Web site']['url'], note='web site')
if web.get('Notes'):
p.extras = {'Notes': web['Notes']}
if not p.sources: # Only add sources once
source_urls = self.person_sources_from_office(term)
person_api_url, person_web_url = source_urls
p.add_source(person_api_url, note='api')
p.add_source(person_web_url, note='web')
members[member] = p
committee_types = ['Committee',
'Inactive Committee',
'Select Committee',
'Subcommittee',
'Task Force',
'Land Use', # Committee on Land Use
]
body_types = {k: v for k, v in self.body_types().items()
if k in committee_types}
for body in self.bodies():
if body['BodyTypeName'] in body_types \
or body['BodyName'] in ('Legislative Documents Unit',
'Legal and Government Affairs Division'):
# Skip typo in API data
if body['BodyName'] == 'Committee on Mental Health, Developmental Disability, Alcoholism, Substance Abuse amd Disability Services':
continue
parent_org = PARENT_ORGS.get(body['BodyName'], 'New York City Council')
body_name = body['BodyName']
示例5: scrape
# 需要导入模块: from pupa.scrape import Person [as 别名]
# 或者: from pupa.scrape.Person import extras [as 别名]
def scrape(self):
noncommittees = {'Committee of the Whole'}
committee_d = {}
people_d = {}
for councilman, committees in self.councilMembers() :
if 'url' in councilman['Person Name'] :
councilman_url = councilman['Person Name']['url']
if councilman_url in people_d :
people_d[councilman_url][0].append(councilman)
else :
people_d[councilman_url] = [councilman], committees
for person_entries, committees in people_d.values() :
councilman = person_entries[-1]
p = Person(councilman['Person Name']['label'])
if p.name == 'Letitia James' :
p.name = 'Letitia Ms. James'
p.add_name('Letitia James')
spans = [(self.toTime(entry['Start Date']).date(),
self.toTime(entry['End Date']).date(),
entry['District'])
for entry in person_entries]
merged_spans = []
last_end_date = None
last_district = None
for start_date, end_date, district in sorted(spans) :
if last_end_date is None :
span = [start_date, end_date, district]
elif (start_date - last_end_date) == datetime.timedelta(1) and district == last_district :
span[1] = end_date
else :
merged_spans.append(span)
span = [start_date, end_date, district]
last_end_date = end_date
last_district = district
merged_spans.append(span)
for start_date, end_date, district in merged_spans :
district = councilman['District'].replace(' 0', ' ')
if end_date == datetime.date(2017, 12, 31) :
end_date = ''
else :
end_date = end_date.isoformat()
print(start_date, end_date)
p.add_term('Council Member', 'legislature',
district=district,
start_date=start_date.isoformat(),
end_date=end_date)
party = councilman['Political Party']
if party == 'Democrat' :
party = 'Democratic'
if party :
p.add_party(party)
if councilman['Photo'] :
p.image = councilman['Photo']
if councilman["E-mail"]:
p.add_contact_detail(type="email",
value=councilman['E-mail']['url'],
note='E-mail')
if councilman['Web site']:
p.add_link(councilman['Web site']['url'], note='web site')
p.extras = {'Notes' : councilman['Notes']}
p.add_source(councilman['Person Name']['url'], note='web')
for committee, _, _ in committees:
committee_name = committee['Department Name']['label']
if committee_name not in noncommittees and 'committee' in committee_name.lower():
o = committee_d.get(committee_name, None)
if o is None:
parent_id = PARENT_ORGS.get(committee_name,
'New York City Council')
o = Organization(committee_name,
classification='committee',
parent_id={'name' : parent_id})
o.add_source(committee['Department Name']['url'])
committee_d[committee_name] = o
membership = o.add_member(p, role=committee["Title"])
membership.start_date = self.mdY2Ymd(committee["Start Date"])
yield p
#.........这里部分代码省略.........
示例6: scrape_session
# 需要导入模块: from pupa.scrape import Person [as 别名]
# 或者: from pupa.scrape.Person import extras [as 别名]
def scrape_session(self, session, chambers):
sid = SESSION_SITE_IDS[session]
members = backoff(
self.sservice.GetMembersBySession,
sid
)['MemberListing']
seen_guids = []
for member in members:
guid = member['Id']
member_info = backoff(self.sservice.GetMember, guid)
# If a member switches chambers during the session, they may
# appear twice. Skip the duplicate record accordingly.
if guid in seen_guids:
self.warning('Skipping duplicate record of {}'.format(member_info['Name']['Last']))
continue
else:
seen_guids.append(guid)
# Check to see if the member has vacated; skip if so.
# A member can have multiple services for a given session,
# if they switched chambers. Filter these down to just the
# active service.
try:
(legislative_service, ) = [
service for service
in member_info['SessionsInService']['LegislativeService']
if service['Session']['Id'] == sid and service['DateVacated'] is None
]
except ValueError:
self.info('Skipping retired member {}'.format(member_info['Name']['Last']))
continue
nick_name, first_name, middle_name, last_name = (
member_info['Name'][x] for x in [
'Nickname', 'First', 'Middle', 'Last'
]
)
first_name = nick_name if nick_name else first_name
if middle_name:
full_name = "%s %s %s" % (first_name, middle_name, last_name)
else:
full_name = "%s %s" % (first_name, last_name)
party = legislative_service['Party']
if party == 'Democrat':
party = 'Democratic'
elif party.strip() == '':
party = 'other'
chamber, district = (
legislative_service['District'][x] for x in [
'Type', 'Number'
]
)
chamber = {
"House": 'lower',
"Senate": 'upper'
}[chamber]
url, photo = self.scrape_homepage(HOMEPAGE_URLS[chamber],
{"code": guid, "sid": sid})
legislator = Person(
name=full_name,
district=str(district),
party=party,
primary_org=chamber,
image=photo,
)
legislator.extras = {
'family_name': last_name,
'given_name': first_name,
'guid': guid,
}
if member_info['Address']['Street'] is not None and \
member_info['Address']['Street'].strip():
capitol_address_info = {
k: v.strip() for k, v
in dict(member_info['Address']).items()
if k in ['Street', 'City', 'State', 'Zip']
}
capitol_address = '{Street}\n{City}, {State} {Zip}'.format(**capitol_address_info)
legislator.add_contact_detail(
type='address', value=capitol_address, note='Capitol Address')
else:
self.warning('Could not find full capitol address for {}'.format(full_name))
capitol_contact_info = self.clean_list([
member_info['Address'][x] for x in [
'Email', 'Phone', 'Fax'
]
])
#.........这里部分代码省略.........
示例7: transform_parse
# 需要导入模块: from pupa.scrape import Person [as 别名]
# 或者: from pupa.scrape.Person import extras [as 别名]
def transform_parse(self, parsed_form, response):
_source = {
"url": response.url,
"note": "LDA Form LD-1"
}
# basic disclosure fields
_disclosure = Disclosure(
effective_date=datetime.strptime(
parsed_form['datetimes']['effective_date'],
'%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
timezone='America/New_York',
submitted_date=datetime.strptime(
parsed_form['datetimes']['signature_date'],
'%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
classification="lobbying"
)
_disclosure.add_authority(name=self.authority.name,
type=self.authority._type,
id=self.authority._id)
_disclosure.add_identifier(
identifier=parsed_form['_meta']['document_id'],
scheme="urn:sopr:filing"
)
# disclosure extras
_disclosure.extras = {}
_disclosure.extras['registrant'] = {
'self_employed_individual': parsed_form['registrant']['self_employed_individual'],
'general_description': parsed_form['registrant']['registrant_general_description'],
'signature': {
"signature_date": parsed_form['datetimes']['signature_date'],
"signature": parsed_form['signature']
}
}
_disclosure.extras['client'] = {
'same_as_registrant':
parsed_form['client']['client_self'],
'general_description':
parsed_form['client']['client_general_description']
}
_disclosure.extras['registration_type'] = {
'is_amendment':
parsed_form['registration_type']['is_amendment'],
'new_registrant':
parsed_form['registration_type']['new_registrant'],
'new_client_for_existing_registrant':
parsed_form['registration_type'][
'new_client_for_existing_registrant'],
}
# # Registrant
# build registrant
_registrant_self_employment = None
if parsed_form['registrant']['self_employed_individual']:
n = ' '.join([p for p in [
parsed_form['registrant']['registrant_individual_prefix'],
parsed_form['registrant']['registrant_individual_firstname'],
parsed_form['registrant']['registrant_individual_lastname']
] if len(p) > 0]).strip()
_registrant = Person(
name=n,
source_identified=True
)
_registrant_self_employment = Organization(
name='SELF-EMPLOYMENT of {n}'.format(n=n),
classification='company',
source_identified=True
)
_registrant.add_membership(
organization=_registrant_self_employment,
role='self_employed',
label='self-employment of {n}'.format(n=n),
start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
)
else:
_registrant = Organization(
name=parsed_form['registrant']['registrant_org_name'],
classification='company',
source_identified=True
)
if len(parsed_form['registrant']['registrant_house_id']) > 0:
_registrant.add_identifier(
identifier=parsed_form['registrant']['registrant_house_id'],
scheme='urn:house_clerk:registrant'
)
if len(parsed_form['registrant']['registrant_senate_id']) > 0:
_registrant.add_identifier(
identifier=parsed_form['registrant']['registrant_senate_id'],
#.........这里部分代码省略.........
示例8: scrape_session
# 需要导入模块: from pupa.scrape import Person [as 别名]
# 或者: from pupa.scrape.Person import extras [as 别名]
def scrape_session(self, session, chambers):
sid = SESSION_SITE_IDS[session]
members = backoff(
self.sservice.GetMembersBySession,
sid
)['MemberListing']
for member in members:
guid = member['Id']
member_info = backoff(self.sservice.GetMember, guid)
# Check to see if the member has vacated; skip if so:
try:
legislative_service = next(
service for service
in member_info['SessionsInService']['LegislativeService']
if service['Session']['Id'] == sid
)
except IndexError:
raise Exception("Something very bad is going on with the "
"Legislative service")
if legislative_service['DateVacated']:
continue
nick_name, first_name, middle_name, last_name = (
member_info['Name'][x] for x in [
'Nickname', 'First', 'Middle', 'Last'
]
)
first_name = nick_name if nick_name else first_name
if middle_name:
full_name = "%s %s %s" % (first_name, middle_name, last_name)
else:
full_name = "%s %s" % (first_name, last_name)
party = legislative_service['Party']
if party == 'Democrat':
party = 'Democratic'
elif party.strip() == '':
party = 'other'
chamber, district = (
legislative_service['District'][x] for x in [
'Type', 'Number'
]
)
chamber = {
"House": 'lower',
"Senate": 'upper'
}[chamber]
url, photo = self.scrape_homepage(HOMEPAGE_URLS[chamber],
{"code": guid, "sid": sid})
legislator = Person(
name=full_name,
district=str(district),
party=party,
primary_org=chamber,
image=photo,
)
legislator.extras = {
'last_name': last_name,
'first_name': first_name,
'guid': guid,
}
capitol_address = self.clean_list([
member_info['Address'][x] for x in [
'Street', 'City', 'State', 'Zip'
]
])
capitol_address = " ".join(
addr_component for addr_component
in capitol_address if addr_component
).strip()
capitol_contact_info = self.clean_list([
member_info['Address'][x] for x in [
'Email', 'Phone', 'Fax'
]
])
# Sometimes email is set to a long cryptic string.
# If it doesn't have a @ character, simply set it to None
# examples:
# 01X5dvct3G1lV6RQ7I9o926Q==&c=xT8jBs5X4S7ZX2TOajTx2W7CBprTaVlpcvUvHEv78GI=
# 01X5dvct3G1lV6RQ7I9o926Q==&c=eSH9vpfdy3XJ989Gpw4MOdUa3n55NTA8ev58RPJuzA8=
if capitol_contact_info[0] and '@' not in capitol_contact_info[0]:
capitol_contact_info[0] = None
# if we have more than 2 chars (eg state)
#.........这里部分代码省略.........