本文整理汇总了Python中pupa.scrape.Bill.add_abstract方法的典型用法代码示例。如果您正苦于以下问题:Python Bill.add_abstract方法的具体用法?Python Bill.add_abstract怎么用?Python Bill.add_abstract使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pupa.scrape.Bill
的用法示例。
在下文中一共展示了Bill.add_abstract方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: scrape_bill
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_abstract [as 别名]
def scrape_bill(self, row, chamber, session):
    """Build and yield a Bill (plus its vote events) from one Delaware API row.

    :param row: dict of legislation fields returned by the Delaware API
    :param chamber: 'upper' or 'lower'
    :param session: legislative session identifier
    """
    bill_id = row['LegislationNumber']

    # TODO: re-evaluate if these should be separate bills
    if 'SA' in bill_id or 'HA' in bill_id:
        self.warning('skipping amendment %s', bill_id)
        return

    bill_type = self.classify_bill(bill_id)
    bill = Bill(identifier=bill_id,
                legislative_session=session,
                chamber=chamber,
                title=row['LongTitle'],
                classification=bill_type)
    if row['Synopsis']:
        bill.add_abstract(row['Synopsis'], 'synopsis')
    if row['ShortTitle']:
        bill.add_title(row['ShortTitle'], 'short title')
    if row['SponsorPersonId']:
        self.add_sponsor_by_legislator_id(bill, row['SponsorPersonId'], 'primary')

    # TODO: Is there a way get additional sponsors and cosponsors, and versions/fns via API?
    html_url = 'https://legis.delaware.gov/BillDetail?LegislationId={}'.format(
        row['LegislationId']
    )
    bill.add_source(html_url, note='text/html')
    html = self.lxmlize(html_url)

    # Additional Sponsors: '//label[text()="Additional Sponsor(s):"]/following-sibling::div/a'
    additional_sponsors = html.xpath('//label[text()="Additional Sponsor(s):"]'
                                     '/following-sibling::div/a/@href')
    for sponsor_url in additional_sponsors:
        sponsor_id = sponsor_url.replace('https://legis.delaware.gov/LegislatorDetail?'
                                         'personId=', '')
        self.add_sponsor_by_legislator_id(bill, sponsor_id, 'primary')

    # CoSponsors: '//label[text()="Co-Sponsor(s):"]/following-sibling::div/a'
    # FIX: this query previously reused the "Additional Sponsor(s):" label
    # (copy-paste error), so links under Co-Sponsor(s) were never collected.
    cosponsors = html.xpath('//label[text()="Co-Sponsor(s):"]/'
                            'following-sibling::div/a/@href')
    for sponsor_url in cosponsors:
        sponsor_id = sponsor_url.replace('https://legis.delaware.gov/LegislatorDetail?'
                                         'personId=', '')
        self.add_sponsor_by_legislator_id(bill, sponsor_id, 'cosponsor')

    versions = html.xpath('//label[text()="Original Text:"]/following-sibling::div/a/@href')
    for version_url in versions:
        media_type = self.mime_from_link(version_url)
        version_name = 'Bill Text'
        # on_duplicate='error'
        bill.add_version_link(version_name, version_url, media_type=media_type)

    fiscals = html.xpath('//div[contains(@class,"fiscalNote")]/a/@href')
    for fiscal in fiscals:
        self.scrape_fiscal_note(bill, fiscal)

    self.scrape_actions(bill, row['LegislationId'])
    yield from self.scrape_votes(bill, row['LegislationId'], session)
    yield bill
示例2: scrape_bill
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_abstract [as 别名]
def scrape_bill(self, session, bill_id, chamber):
    """Scrape one Massachusetts bill detail page; yield vote events then the Bill.

    Returns False (instead of yielding anything) when the page cannot be
    fetched or does not look like a bill detail page.
    """
    # e.g. https://malegislature.gov/Bills/189/SD2739
    session_for_url = self.replace_non_digits(session)
    bill_url = 'https://malegislature.gov/Bills/{}/{}'.format(session_for_url, bill_id)

    try:
        response = requests.get(bill_url)
    except requests.exceptions.RequestException:
        # FIX: the exception was previously bound to an unused variable `e`.
        self.warning(u'Server Error on {}'.format(bill_url))
        return False

    html = response.text
    page = lxml.html.fromstring(html)

    # A real bill page carries a "followable" header; its absence means the
    # server returned an error page.
    if not page.xpath('//div[contains(@class, "followable")]/h1/text()'):
        self.warning(u'Server Error on {}'.format(bill_url))
        return False

    bill_title = page.xpath('//div[@id="contentContainer"]/div/div/h2/text()')[0]

    # Normalize the id down to chamber letter(s) and digits, e.g. "SD2739".
    bill_id = re.sub(r'[^S|H|D|\d]', '', bill_id)

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=bill_title, classification='bill')

    bill_summary = None
    if page.xpath('//p[@id="pinslip"]/text()'):
        bill_summary = page.xpath('//p[@id="pinslip"]/text()')[0]
    if bill_summary:
        bill.add_abstract(bill_summary, 'summary')

    bill.add_source(bill_url)

    # https://malegislature.gov/Bills/189/SD2739 has a presenter
    # https://malegislature.gov/Bills/189/S2168 no sponsor
    # Find the non-blank text of the dt following Sponsor or Presenter,
    # including any child link text.
    sponsor = page.xpath(
        '//dt[text()="Sponsor:" or text()="Presenter:"]/'
        'following-sibling::dd/descendant-or-self::*/text()[normalize-space()]')
    if sponsor:
        sponsor = sponsor[0].strip()
        bill.add_sponsorship(sponsor, classification='primary', primary=True,
                             entity_type='person')

    self.scrape_cosponsors(bill, bill_url)

    version = page.xpath("//div[contains(@class, 'modalBtnGroup')]/"
                         "a[contains(text(), 'Download PDF') and not(@disabled)]/@href")
    if version:
        version_url = "https://malegislature.gov{}".format(version[0])
        bill.add_version_link('Bill Text', version_url, media_type='application/pdf')

    # yield back votes and bill
    yield from self.scrape_actions(bill, bill_url, session)
    yield bill
示例3: scrape_bills
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_abstract [as 别名]
def scrape_bills(self, session):
    """Yield a Bill for every measure the Oregon (OLIS) API returns for
    ``session``, attaching summary, sponsors, a source link, version
    documents, and classified history actions."""
    session_key = SESSION_KEYS[session]
    measures_response = self.api_client.get('measures', page=500, session=session_key)
    legislators = index_legislators(self, session_key)
    for measure in measures_response:
        bid = '{} {}'.format(measure['MeasurePrefix'], measure['MeasureNumber'])
        # First character of the prefix maps to the originating chamber.
        chamber = self.chamber_code[bid[0]]
        bill = Bill(
            bid.replace(' ', ''),
            legislative_session=session,
            chamber=chamber,
            title=measure['RelatingTo'],
            classification=self.bill_types[measure['MeasurePrefix'][1:]]
        )
        bill.add_abstract(measure['MeasureSummary'].strip(), note='summary')

        for sponsor in measure['MeasureSponsors']:
            legislator_code = sponsor['LegislatoreCode']  # typo in API
            if legislator_code:
                try:
                    legislator = legislators[legislator_code]
                except KeyError:
                    # FIX: logger.warn() is a deprecated alias of warning().
                    logger.warning('Legislator {} not found in session {}'.format(
                        legislator_code, session))
                    # Fall back to the raw code so the sponsorship isn't lost.
                    legislator = legislator_code
                bill.add_sponsorship(
                    name=legislator,
                    classification={'Chief': 'primary', 'Regular': 'cosponsor'}[
                        sponsor['SponsorLevel']],
                    entity_type='person',
                    # FIX: simplified redundant `True if ... else False`.
                    primary=sponsor['SponsorLevel'] == 'Chief'
                )

        bill.add_source(
            "https://olis.leg.state.or.us/liz/{session}/Measures/Overview/{bid}".format(
                session=session_key, bid=bid.replace(' ', ''))
        )

        for document in measure['MeasureDocuments']:
            # TODO: probably mixing documents & versions here - should revisit
            try:
                bill.add_version_link(document['VersionDescription'], document['DocumentUrl'],
                                      media_type='application/pdf')
            except ValueError:
                logger.warning('Duplicate link found for {}'.format(document['DocumentUrl']))

        for action in measure['MeasureHistoryActions']:
            classifiers = self.determine_action_classifiers(action['ActionText'])
            when = datetime.datetime.strptime(action['ActionDate'], '%Y-%m-%dT%H:%M:%S')
            when = self.tz.localize(when)
            bill.add_action(action['ActionText'], when,
                            chamber=self.chamber_code[action['Chamber']],
                            classification=classifiers)
        yield bill
示例4: scrape_bill
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_abstract [as 别名]
def scrape_bill(self, session, session_slug, chamber, url):
    """Scrape a single Nevada (NELIS) bill overview page and yield a Bill."""
    overview = lxml.html.fromstring(self.get(url).text)
    bill_no = overview.xpath('//*[@id="item-header"]/text()')[0].strip()

    # NELIS's internal numeric key, taken from the overview URL.
    internal_id = re.search(r'\/Bill\/(\d+)\/Overview', url).group(1)

    # The interesting fields are filled in by a second AJAX endpoint; the
    # trailing millisecond timestamp mimics the site's cache-buster param.
    bill_data_url = (
        'https://www.leg.state.nv.us/App/NELIS/REL/{}/Bill/'
        'FillSelectedBillTab?selectedTab=Overview&billKey={}&_={}'
    ).format(session_slug, internal_id, time.time() * 1000)
    bill_page = lxml.html.fromstring(self.get(bill_data_url).text)

    short_title = self.get_header_field(bill_page, 'Summary:').text
    short_title = short_title.replace(u'\u00a0', ' ')

    bill = Bill(
        identifier=bill_no,
        legislative_session=session,
        title=short_title,
        chamber=chamber
    )

    long_title = self.get_header_field(bill_page, 'Title:').text
    if long_title is not None:
        bill.add_abstract(long_title, 'Summary')

    # Primary sponsors and co-sponsors are optional header sections.
    for label, kind in (('Primary Sponsor', 'primary'), ('Co-Sponsor', 'cosponsor')):
        section = self.get_header_field(bill_page, label)
        if section is not None:
            self.add_sponsors(section, bill, kind)

    self.add_actions(bill_page, bill, chamber)
    self.add_versions(session_slug, internal_id, bill)

    # De-duplicate subjects gathered earlier for this bill number.
    bill.subject = list(set(self.subject_mapping[bill_no]))

    bdr = self.extract_bdr(short_title)
    if bdr:
        bill.extras['BDR'] = bdr
    bill.extras['NV_ID'] = internal_id

    bill.add_source(url)
    yield bill
示例5: scrape_bill_2012
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_abstract [as 别名]
def scrape_bill_2012(self, chamber, session, bill_id, url):
    """Scrape one bill page in the 2012-era layout and save the Bill.

    Raises ValueError when the bill id doesn't indicate a known type.
    """
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    # find <a name="Title">, get parent dt, get parent dl, then dd n dl
    title = doc.xpath('//a[@name="Title"][1]/../../dd[1]/text()')[0].strip()
    summary = doc.xpath('//font[@size="3"]/p/text()')[0].strip()

    if 'B' in bill_id:
        _type = ['bill']
    elif 'J' in bill_id:
        _type = ['joint resolution']
    else:
        # FIX: previously this chain had no else-branch, leaving _type
        # unbound and raising a confusing NameError below. Fail explicitly.
        raise ValueError('unexpected bill id format: {}'.format(bill_id))

    bill = Bill(
        bill_id,
        legislative_session=session,
        classification=_type,
        chamber=chamber,
        title=title,
    )
    bill.add_abstract(summary, note='summary')
    bill.add_source(url)

    self.parse_bill_sponsors(doc, bill)   # sponsors
    self.parse_bill_actions(doc, bill)    # actions
    self.parse_bill_documents(doc, bill)  # documents and versions
    yield from self.parse_bill_votes(doc, bill)  # votes

    # subjects
    subjects = []
    for subj in doc.xpath('//a[contains(@href, "/subjects/")]'):
        subjects.append(subj.text.split('-see also-')[0])
    bill.subject = subjects

    # add bill to collection
    self.save_bill(bill)
示例6: _parse_senate_billpage
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_abstract [as 别名]
def _parse_senate_billpage(self, bill_url, year):
    """Parse a Senate bill detail page and yield the Bill.

    ``year`` is accepted but the session recorded on the bill is
    ``self._session_id``; subjects come from the pre-built ``self._subjects``
    index. Cosponsors, actions and full-text versions live on separate
    pages reached via links found on this one.
    """
    bill_page = self.lxmlize(bill_url)
    # get all the info needed to record the bill
    # TODO probably still needs to be fixed
    bill_id = bill_page.xpath('//*[@id="lblBillNum"]')[0].text_content()
    bill_title = bill_page.xpath('//*[@id="lblBillTitle"]')[0].text_content()
    bill_desc = bill_page.xpath('//*[@id="lblBriefDesc"]')[0].text_content()
    # bill_lr = bill_page.xpath('//*[@id="lblLRNum"]')[0].text_content()

    # Classification is looked up from the first three characters of the id.
    bill_type = "bill"
    triplet = bill_id[:3]
    if triplet in bill_types:
        bill_type = bill_types[triplet]

    # Subjects were indexed elsewhere, keyed on the id with spaces removed.
    subs = []
    bid = bill_id.replace(" ", "")
    if bid in self._subjects:
        subs = self._subjects[bid]
        self.info("With subjects for this bill")
        self.info(bid)
    # Placeholder/junk entries use this sentinel id; skip them entirely.
    if bid == 'XXXXXX':
        self.info("Skipping Junk Bill")
        return

    bill = Bill(
        bill_id,
        title=bill_desc,
        chamber='upper',
        legislative_session=self._session_id,
        classification=bill_type,
    )
    bill.subject = subs
    # The brief description serves as both the title and the abstract here.
    bill.add_abstract(bill_desc, note='abstract')
    bill.add_source(bill_url)
    if bill_title:
        bill.add_title(bill_title)

    # Get the primary sponsor
    sponsor = bill_page.xpath('//a[@id="hlSponsor"]')[0]
    bill_sponsor = sponsor.text_content()
    # bill_sponsor_link = sponsor.attrib.get('href')
    bill.add_sponsorship(
        bill_sponsor,
        entity_type='person',
        classification='primary',
        primary=True,
    )

    # cosponsors show up on their own page, if they exist
    cosponsor_tag = bill_page.xpath('//a[@id="hlCoSponsors"]')
    if len(cosponsor_tag) > 0 and cosponsor_tag[0].attrib.get('href'):
        self._parse_senate_cosponsors(bill, cosponsor_tag[0].attrib['href'])

    # get the actions
    action_url = bill_page.xpath('//a[@id="hlAllActions"]')
    if len(action_url) > 0:
        action_url = action_url[0].attrib['href']
        self._parse_senate_actions(bill, action_url)

    # full bill text is stored on a separate page
    versions_url = bill_page.xpath('//a[@id="hlFullBillText"]')
    if len(versions_url) > 0 and versions_url[0].attrib.get('href'):
        self._parse_senate_bill_versions(bill, versions_url[0].attrib['href'])

    # Adopted amendments are linked inline; attach them as extra versions.
    amendment_links = bill_page.xpath('//a[contains(@href,"ShowAmendment.asp")]')
    for link in amendment_links:
        link_text = link.xpath('string(.)').strip()
        if 'adopted' in link_text.lower():
            link_url = link.xpath('@href')[0]
            bill.add_version_link(link_text, link_url, media_type='application/pdf',
                                  on_duplicate='ignore')
    yield bill
示例7: scrape
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_abstract [as 别名]
def scrape(self):
    """Yield a Bill for each council agenda item in the configured date range."""
    # FIX: compiled once here — previously this pattern was recompiled on
    # every loop iteration. Parses titles of the form
    # "<title> - by Councillor X, seconded by Councillor Y".
    title_re = re.compile(
        "^(.+?)(?: - (?:by )?((?:Deputy )?Mayor|Councillor) (.+), seconded by ((?:Deputy )?Mayor|Councillor) (.+))?$"
    )

    def is_recommendation(version):
        # True when any section heading of this version mentions recommendations.
        # FIX: hoisted — previously re-defined inside the loop every iteration.
        return any("Recommendations" in s for s in version["sections"].keys())

    for agenda_item in self.agendaItems(date_from=self.start_date, date_to=self.end_date):
        # TODO: Add agenda_item type to OCD
        leg_type = "bill"

        title = agenda_item["Title"].replace("\n", " ")
        title, primary_role, primary_sponsor, secondary_role, secondary_sponsor = re.match(title_re, title).groups()

        b = Bill(
            identifier=agenda_item["Item No."],
            title=title,
            legislative_session=None,
            classification=leg_type,
            from_organization={"name": self.jurisdiction.name},
        )
        b.add_source(agenda_item["url"], note="web")

        if primary_sponsor and secondary_sponsor:
            b.add_sponsorship(primary_sponsor, "mover", "person", True)
            b.add_sponsorship(secondary_sponsor, "seconder", "person", False)

        # TODO: Fake session for now
        b.legislative_session = "2014-2018"

        agenda_item_versions = self.agendaItemVersions(agenda_item["url"])

        # Use one version's full_text (will be most recent)
        b.extras["full_text"] = agenda_item_versions[0]["full_text"]

        for version in agenda_item_versions:
            action_date = self.toDate(version["date"])

            if "Summary" in version["sections"]:
                # TODO: Investigate whether these vary between versions, as
                # we perhaps don't need to add one for each
                b.add_abstract(version["sections"]["Summary"], note="", date=action_date)

            if not version["action"]:
                continue
            # Skip bare timestamps masquerading as actions (e.g. "2:30 PM").
            if re.match(r"\d+:\d+ [A|P]M", version["action"]):
                continue

            action_description = version["action"]
            responsible_org = version["responsible_org"]
            action_class = ACTION_CLASSIFICATION.get(version["action"])

            if responsible_org == "City Council":
                responsible_org = self.jurisdiction.name
            else:
                # Committee-level actions get downgraded/refined classifications.
                if action_class == "passage":
                    action_class = "committee-passage"
                if is_recommendation(version):
                    action_class = "committee-passage-favorable"

            b.add_action(
                action_description, action_date, organization={"name": responsible_org}, classification=action_class
            )
        yield b
示例8: _scrape_bill
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_abstract [as 别名]
def _scrape_bill(self, session, bill_data):
details = self._parse_bill_details(bill_data)
(senate_url, assembly_url, bill_chamber, bill_type, bill_id,
title, (prefix, number, active_version)) = details
bill = Bill(
bill_id,
legislative_session=session,
chamber=bill_chamber,
title=title or bill_data['summary'],
classification=bill_type,
)
if bill_data['summary']:
bill.add_abstract(bill_data['summary'], note='')
bill_active_version = bill_data['amendments']['items'][active_version]
# Parse sponsors.
if bill_data['sponsor'] is not None:
if bill_data['sponsor']['rules'] is True:
bill.add_sponsorship(
'Rules Committee',
entity_type='organization',
classification='primary',
primary=True,
)
elif not bill_data['sponsor']['budget']:
primary_sponsor = bill_data['sponsor']['member']
bill.add_sponsorship(
primary_sponsor['shortName'],
entity_type='person',
classification='primary',
primary=True,
)
# There *shouldn't* be cosponsors if there is no sponsor.
cosponsors = bill_active_version['coSponsors']['items']
for cosponsor in cosponsors:
bill.add_sponsorship(
cosponsor['shortName'],
entity_type='person',
classification='cosponsor',
primary=False,
)
# List companion bill.
same_as = bill_active_version.get('sameAs', {})
# Check whether "sameAs" property is populated with at least one bill.
if same_as['items']:
# Get companion bill ID.
companion_bill_id = same_as['items'][0]['basePrintNo']
# Build companion bill session.
start_year = same_as['items'][0]['session']
end_year = start_year + 1
companion_bill_session = '-'.join([str(start_year), str(end_year)])
# Attach companion bill data.
bill.add_related_bill(
companion_bill_id,
companion_bill_session,
relation_type='companion',
)
# Parse actions.
chamber_map = {
'senate': 'upper',
'assembly': 'lower',
}
for action in bill_data['actions']['items']:
chamber = chamber_map[action['chamber'].lower()]
action_datetime = datetime.datetime.strptime(action['date'], '%Y-%m-%d')
action_date = action_datetime.date()
types, _ = NYBillScraper.categorizer.categorize(action['text'])
bill.add_action(
action['text'],
action_date.strftime('%Y-%m-%d'),
chamber=chamber,
classification=types,
)
# Handling of sources follows. Sources serving either chamber
# maintain duplicate data, so we can see certain bill data
# through either chamber's resources. However, we have to refer
# to a specific chamber's resources if we want to grab certain
# specific information such as vote data.
#
# As such, I'm placing all potential sources in the interest of
# thoroughness. - Andy Lo
# List Open Legislation API endpoint as a source.
api_url = self.api_client.root + self.api_client.resources['bill'].format(
session_year=session,
bill_id=bill_id,
summary='',
detail='')
#.........这里部分代码省略.........
示例9: scrape_bill_type
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_abstract [as 别名]
#.........这里部分代码省略.........
if len(version.title) < len(version.short_title) and \
not version.title.lower().startswith('an act'):
title = clean_title(version.short_title)
else:
title = clean_title(version.title)
if title:
all_titles.add(title)
type_ = [bill_type]
if version.appropriation == 'Yes':
type_.append('appropriation')
tags = []
if version.fiscal_committee == 'Yes':
tags.append('fiscal committee')
if version.local_program == 'Yes':
tags.append('local program')
if version.urgency == 'Yes':
tags.append('urgency')
if version.taxlevy == 'Yes':
tags.append('tax levy')
if version.subject:
subject = clean_title(version.subject)
if not title:
self.warning("Couldn't find title for %s, skipping" % bill_id)
continue
fsbill.title = title
if summary:
fsbill.add_abstract(summary, note='summary')
fsbill.classification = type_
fsbill.subject = [subject] if subject else []
fsbill.extras['impact_clause'] = impact_clause
fsbill.extras['tags'] = tags
# We don't want the current title in alternate_titles
all_titles.remove(title)
for title in all_titles:
fsbill.add_title(title)
for author in version.authors:
fsbill.add_sponsorship(
author.name,
classification=SPONSOR_TYPES[author.contribution],
primary=author.primary_author_flg == 'Y',
entity_type='person',
)
# fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}
seen_actions = set()
for action in bill.actions:
if not action.action:
# NULL action text seems to be an error on CA's part,
# unless it has some meaning I'm missing
continue
actor = action.actor or chamber
actor = actor.strip()
match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
if match:
actor = {'Assembly': 'lower',
'Senate': 'upper'}[match.group(1)]
示例10: scrape
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_abstract [as 别名]
#.........这里部分代码省略.........
instrument = backoff(self.lservice.GetLegislationDetail, lid)
history = [x for x in instrument['StatusHistory'][0]]
actions = reversed([{
'code': x['Code'],
'action': x['Description'],
'_guid': x['Id'],
'date': x['Date']
} for x in history])
guid = instrument['Id']
# A little bit hacky.
bill_prefix = instrument['DocumentType']
bill_chamber = chamber_map[bill_prefix[0]]
bill_type = bill_type_map[bill_prefix[1:]]
bill_id = '%s %s' % (
bill_prefix,
instrument['Number'],
)
if instrument['Suffix']:
bill_id += instrument['Suffix']
title = instrument['Caption']
description = instrument['Summary']
if title is None:
continue
bill = Bill(
bill_id, legislative_session=session, chamber=bill_chamber, title=title,
classification=bill_type)
bill.add_abstract(description, note='description')
bill.extras = {'guid': guid}
if instrument['Votes']:
for vote_ in instrument['Votes']:
_, vote_ = vote_
vote_ = backoff(self.vservice.GetVote, vote_[0]['VoteId'])
vote = VoteEvent(
start_date=vote_['Date'].strftime('%Y-%m-%d'),
motion_text=vote_['Caption'] or 'Vote on Bill',
chamber={'House': 'lower', 'Senate': 'upper'}[vote_['Branch']],
result='pass' if vote_['Yeas'] > vote_['Nays'] else 'fail',
classification='passage',
bill=bill,
)
vote.set_count('yes', vote_['Yeas'])
vote.set_count('no', vote_['Nays'])
vote.set_count('other', vote_['Excused'] + vote_['NotVoting'])
vote.add_source(self.vsource)
methods = {'Yea': 'yes', 'Nay': 'no'}
for vdetail in vote_['Votes'][0]:
whom = vdetail['Member']
how = vdetail['MemberVoted']
vote.vote(methods.get(how, 'other'), whom['Name'])
yield vote
ccommittees = defaultdict(list)
committees = instrument['Committees']
示例11: scrape_bill
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_abstract [as 别名]
def scrape_bill(self, bill_url, bill_id, session_id):
    """Scrape one bill detail page and return the populated Bill.

    :param bill_url: URL of the bill detail page
    :param bill_id: identifier to record on the Bill
    :param session_id: legislative session identifier
    """
    page = self.lxmlize(bill_url)

    # create bill
    title = page.xpath("//h1/text()")[0]
    bill = Bill(identifier=bill_id,
                legislative_session=session_id,
                title=title)
    bill.add_source(bill_url, note="detail")

    # abstract: the paragraph directly above <h2>Legislative History</h2>
    try:
        leg_his = page.xpath("//h2[text()='Legislative History']")[0]
        abstract = leg_his.xpath("preceding-sibling::p/text()")[0]
        bill.add_abstract(abstract=abstract.strip(), note="summary")
    except IndexError:
        print("No abstract for bill {} in session {}".format(bill_id, session_id))

    # the rest of the fields are found inside this <table>
    data_table = page.xpath("//table[contains(@class, 'data')]")[0]

    # sponsor
    sponsor_name = data_table.xpath(self.bill_table_query("Sponsor") + "/text()")[0]
    bill.add_sponsorship(name=sponsor_name,
                         classification="Primary",
                         entity_type="person",
                         primary=True
                         )

    # actions
    action_lines = data_table.xpath(self.bill_table_query("Actions") + "/text()")
    for line in action_lines:
        # FIX: removed `line = line.join('')`, which always evaluated to ''
        # (it joins the characters of the empty string using `line` as the
        # separator), so every action line was blanked out before parsing.
        try:
            for date_str, action_type in self.parse_actions(line):
                bill.add_action(date=date_str,
                                description=action_type,
                                classification=action_type)
        except ValueError:
            print("failed to parse these actions: {}".format([line]))

    # co-sponsors
    co_sponsors = data_table.xpath(self.bill_table_query("Co-Sponsors") + "/text()")
    co_sponsors = [name.strip() for name in co_sponsors if name.strip()]
    for name in co_sponsors:
        bill.add_sponsorship(name=name,
                             classification="co-sponsor",
                             entity_type="person",
                             primary=False)

    # committee (stored as another sponsorship in OCD)
    committees = data_table.xpath(self.bill_table_query("Committee") + "/a/text()")
    for comm in committees:
        bill.add_sponsorship(name=comm,
                             classification="secondary",  # classification ?
                             entity_type="organization",
                             primary=False)

    return bill
示例12: _scrape_bills
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_abstract [as 别名]
def _scrape_bills(self):
"""
Does the following
1) Scrapes bill data from unitedstates project and saves the data to path specified in UnitedStates module
2) Iterates over bill data and converts each one to an OCD-compliant bill model.
3) Yields the OCD-compliant bill model instance
@return: generator for federal US bills in OCD-compliant format
@rtype: generator
"""
# run scraper first to pull in all the bill data
self._run_unitedstates_bill_scraper()
# iterate over all the files and build and yield Bill objects
for filename in find_files(settings.SCRAPED_DATA_DIR, '.*/data/[0-9]+/bills/[^\/]+/[^\/]+/data.json'):
try:
with open(filename) as json_file:
json_data = json.load(json_file)
# Initialize Object
bill = Bill(constants.TYPE_MAP[json_data['bill_type']]['canonical'] + ' ' + json_data['number'],
json_data['congress'],
json_data['official_title'],
chamber=constants.TYPE_MAP[json_data['bill_type']]['chamber']
)
# add source of data
bill.add_source(json_data['url'], note='all')
# add subjects
for subject in json_data['subjects']:
bill.add_subject(subject)
# add summary
if 'summary' in json_data and json_data['summary'] is not None:
bill.add_abstract(json_data['summary']['text'],
json_data['summary']['as'],
json_data['summary']['date'])
# add titles
for item in json_data['titles']:
bill.add_title(item['title'], item['type'])
# add other/related Bills
for b in json_data['related_bills']:
if 'type' in b and b['type'] == 'bill':
split = b['bill_id'].split('-')
m = UnitedStatesBillScraper.BILL_SPLIT.match(split[0])
bill.add_related_bill(constants.TYPE_MAP[m.group(1)]['canonical'] + ' ' + m.group(2),
legislative_session=split[1],
relation_type='companion')
# add sponsor
bill.add_sponsorship_by_identifier(json_data['sponsor']['name'], 'person', 'person', True,
scheme='thomas_id', identifier=json_data['sponsor']['thomas_id'],
chamber=constants.TYPE_MAP[json_data['bill_type']]['chamber'])
# add cosponsors
for cs in json_data['cosponsors']:
bill.add_sponsorship_by_identifier(cs['name'], 'person', 'person', False,
scheme='thomas_id', identifier=cs['thomas_id'],
chamber=constants.TYPE_MAP[json_data['bill_type']]['chamber'])
# add introduced_at and actions
bill.add_action('date of introduction', datetime_to_date(json_data['introduced_at']),
chamber=constants.TYPE_MAP[json_data['bill_type']]['chamber'],
related_entities=[])
# add other actions
for action in json_data['actions']:
bill.actions.append({'date': datetime_to_date(action['acted_at']),
'type': [action['type']],
'description': action['text'],
'actor': constants.TYPE_MAP[json_data['bill_type']]['chamber'],
'related_entities': []
})
# add bill versions
for version_path in find_files(os.path.join(settings.SCRAPED_DATA_DIR,
'data', bill.legislative_session, 'bills', json_data['bill_type'],
json_data['bill_type'] + json_data['number'],
'text-versions'), '/.*/*\.json'):
try:
with open(version_path) as version_file:
version_json_data = json.load(version_file)
for k, v in version_json_data['urls'].items():
bill.versions.append({'date': datetime_to_date(version_json_data['issued_on']),
'type': version_json_data['version_code'],
'name': constants.VERSION_MAP[version_json_data['version_code']],
'links': [{'mimetype': k, 'url': v}]})
except IOError:
print("Unable to open or parse file with path " + version_path)
continue
# finally yield bill object
yield bill
except IOError:
#.........这里部分代码省略.........
示例13: test_full_bill
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_abstract [as 别名]
def test_full_bill():
    """End-to-end import test: build two scrape-layer Bills carrying every
    kind of attached data (titles, identifiers, actions, sponsorships,
    abstract, documents, versions, related bills, sources), run them through
    the importers, and assert each piece round-trips into the DB models."""
    create_jurisdiction()
    person = Person.objects.create(id='person-id', name='Adam Smith')
    org = ScrapeOrganization(name='House', classification='lower')
    com = ScrapeOrganization(name='Arbitrary Committee', classification='committee',
                             parent_id=org._id)

    # An earlier-session bill so the related-bill reference below can resolve.
    oldbill = ScrapeBill('HB 99', '1899', 'Axe & Tack Tax Act',
                         classification='tax bill', from_organization=org._id)

    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', from_organization=org._id)
    bill.subject = ['taxes', 'axes']
    bill.add_identifier('SB 9')
    bill.add_title('Tack & Axe Tax Act')
    bill.add_action('introduced in house', '1900-04-01', chamber='lower')
    act = bill.add_action('sent to arbitrary committee', '1900-04-04', chamber='lower')
    act.add_related_entity('arbitrary committee', 'organization', com._id)
    bill.add_related_bill("HB 99", legislative_session="1899", relation_type="prior-session")
    # One linked sponsorship (entity_id supplied) and one unlinked (name only).
    bill.add_sponsorship('Adam Smith', classification='extra sponsor', entity_type='person',
                         primary=False, entity_id=person.id)
    bill.add_sponsorship('Jane Smith', classification='lead sponsor', entity_type='person',
                         primary=True)
    bill.add_abstract('This is an act about axes and taxes and tacks.', note="official")
    bill.add_document_link('Fiscal Note', 'http://example.com/fn.pdf',
                           media_type='application/pdf')
    bill.add_document_link('Fiscal Note', 'http://example.com/fn.html', media_type='text/html')
    bill.add_version_link('Fiscal Note', 'http://example.com/v/1', media_type='text/html')
    bill.add_source('http://example.com/source')

    # import bill
    oi = OrganizationImporter('jid')
    oi.import_data([org.as_dict(), com.as_dict()])

    pi = PersonImporter('jid')
    pi.json_to_db_id['person-id'] = 'person-id'
    # Since we have to create this person behind the back of the import
    # transaction, we'll fake the json-id to db-id, since they match in this
    # case. This is *really* getting at some implementation detail, but it's
    # the cleanest way to ensure we short-circut the json id lookup.
    BillImporter('jid', oi, pi).import_data([oldbill.as_dict(), bill.as_dict()])

    # get bill from db and assert it imported correctly
    b = Bill.objects.get(identifier='HB 1')
    assert b.from_organization.classification == 'lower'
    assert b.identifier == bill.identifier
    assert b.title == bill.title
    assert b.classification == bill.classification
    assert b.subject == ['taxes', 'axes']
    assert b.abstracts.get().note == 'official'

    # other_title, other_identifier added
    assert b.other_titles.get().title == 'Tack & Axe Tax Act'
    assert b.other_identifiers.get().identifier == 'SB 9'

    # actions
    actions = list(b.actions.all())
    assert len(actions) == 2
    # ensure order was preserved (if this breaks it'll be intermittent)
    assert actions[0].organization == Organization.objects.get(classification='lower')
    assert actions[0].description == "introduced in house"
    assert actions[1].description == "sent to arbitrary committee"
    assert (actions[1].related_entities.get().organization ==
            Organization.objects.get(classification='committee'))

    # related_bills were added
    rb = b.related_bills.get()
    assert rb.identifier == 'HB 99'
    # and bill got resolved
    assert rb.related_bill.identifier == 'HB 99'

    # sponsors added, linked & unlinked
    sponsorships = b.sponsorships.all()
    assert len(sponsorships) == 2
    for ss in sponsorships:
        if ss.primary:
            # The 'Jane Smith' sponsorship had no entity_id, so no links.
            assert ss.person is None
            assert ss.organization is None
        else:
            assert ss.person == person

    # versions & documents with their links
    versions = b.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = b.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # sources
    assert b.sources.count() == 1
示例14: _parse_senate_billpage
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_abstract [as 别名]
def _parse_senate_billpage(self, bill_url, year):
    """Scrape a single Senate bill detail page and yield the Bill.

    Subjects come from the pre-built ``self._subjects`` index; cosponsors,
    actions and full-text versions are parsed from linked sub-pages.
    """
    page = self.lxmlize(bill_url)

    # Identifying fields straight off the page.
    # NOTE(review): parser may still need fixes (carried over from original).
    bill_id = page.xpath('//*[@id="lblBillNum"]')[0].text_content()
    bill_title = page.xpath('//*[@id="lblBillTitle"]')[0].text_content()
    bill_desc = page.xpath('//*[@id="lblBriefDesc"]')[0].text_content()

    # Classify from the first three characters of the bill number.
    prefix = bill_id[:3]
    bill_type = bill_types[prefix] if prefix in bill_types else "bill"

    # Subjects were indexed earlier, keyed by the bill id with spaces removed.
    bid = bill_id.replace(" ", "")
    subjects = []
    if bid in self._subjects:
        subjects = self._subjects[bid]
        self.info("With subjects for this bill")
        self.info(bid)

    bill = Bill(
        bill_id,
        title=bill_desc,
        legislative_session=year,
        classification=bill_type,
    )
    bill.subject = subjects
    # The brief description serves as both the title and the abstract.
    bill.add_abstract(bill_desc, note='abstract')
    bill.add_source(bill_url)
    if bill_title:
        bill.add_title(bill_title)

    # The primary sponsor is linked directly on this page.
    sponsor_name = page.xpath('//a[@id="hlSponsor"]')[0].text_content()
    bill.add_sponsorship(
        sponsor_name,
        entity_type='person',
        classification='primary',
        primary=True,
    )

    # Cosponsors live on their own page, if one is linked.
    cosponsor_links = page.xpath('//a[@id="hlCoSponsors"]')
    if cosponsor_links and cosponsor_links[0].attrib.get('href'):
        self._parse_senate_cosponsors(bill, cosponsor_links[0].attrib['href'])

    # Actions, likewise, are parsed from a linked page.
    action_links = page.xpath('//a[@id="hlAllActions"]')
    if action_links:
        self._parse_senate_actions(bill, action_links[0].attrib['href'])

    # Full bill text versions are stored on a separate page.
    version_links = page.xpath('//a[@id="hlFullBillText"]')
    if version_links and version_links[0].attrib.get('href'):
        self._parse_senate_bill_versions(bill, version_links[0].attrib['href'])

    yield bill
示例15: scrape_bill
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_abstract [as 别名]
def scrape_bill(self, bill_id):
old = self.api('bills/' + bill_id + '?')
# not needed
old.pop('id')
old.pop('state')
old.pop('level', None)
old.pop('country', None)
old.pop('created_at')
old.pop('updated_at')
old.pop('action_dates')
old.pop('+bill_type',None)
old.pop('+subject', None)
old.pop('+scraped_subjects', None)
old.pop('subjects', [])
classification = old.pop('type')
# ca weirdness
if 'fiscal committee' in classification:
classification.remove('fiscal committee')
if 'urgency' in classification:
classification.remove('urgency')
if 'local program' in classification:
classification.remove('local program')
if 'tax levy' in classification:
classification.remove('tax levy')
if classification[0] in ['miscellaneous', 'jres', 'cres']:
return
if classification == ['memorial resolution'] and self.state == 'ar':
classification = ['memorial']
if classification == ['concurrent memorial resolution'] and self.state == 'ar':
classification = ['concurrent memorial']
if classification == ['joint session resolution'] and self.state == 'il':
classification = ['joint resolution']
if classification == ['legislative resolution'] and self.state == 'ny':
classification = ['resolution']
if classification == ['address'] and self.state == 'nh':
classification = ['resolution']
if not old['title'] and self.state == 'me':
old['title'] = '(unknown)'
chamber = old.pop('chamber')
if self.state in ('ne', 'dc'):
chamber = 'legislature'
elif chamber in ('joint', 'conference'):
chamber = 'legislature'
new = Bill(old.pop('bill_id'), old.pop('session'), old.pop('title'),
chamber=chamber, classification=classification)
abstract = old.pop('summary', None)
if abstract:
new.add_abstract(abstract, note='')
for title in old.pop('alternate_titles'):
new.add_title(title)
for doc in old.pop('documents'):
new.add_document_link(doc['name'], doc['url'], on_duplicate='ignore')
for doc in old.pop('versions'):
new.add_version_link(doc['name'], doc['url'], media_type=doc.pop('mimetype', ''))
for subj in old.pop('scraped_subjects', []):
if subj:
new.add_subject(subj)
for spon in old.pop('sponsors'):
if spon.get('committee_id') is not None:
entity_type = 'organization'
elif spon.get('leg_id') is not None:
entity_type = 'person'
else:
entity_type = ''
new.add_sponsorship(spon['name'], spon['type'], entity_type,
spon['type'] == 'primary')
for act in old.pop('actions'):
actor = act['actor']
if actor.lower() in ('governor', 'mayor', 'secretary of state'):
actor = 'executive'
elif actor.lower() == 'house' or (actor.lower().startswith('lower (') and self.state == 'ca'):
actor = 'lower'
elif actor.lower() in ('senate', 'upper`') or (actor.lower().startswith('upper (') and self.state == 'ca'):
actor = 'upper'
elif actor in ('joint', 'other', 'Data Systems', 'Speaker', 'clerk',
'Office of the Legislative Fiscal Analyst', 'Became Law w',
'conference') or (actor.lower().startswith('legislature (') and self.state == 'ca'):
actor = 'legislature'
if actor in ('committee', 'sponsor') and self.state == 'pr':
actor = 'legislature'
# nebraska & DC
if actor in ('upper','council') and self.state in ('ne', 'dc'):
actor = 'legislature'
#.........这里部分代码省略.........