This article collects typical usage examples of the Python method pupa.scrape.Bill.add_version_link. If you are wondering what Bill.add_version_link does, how to call it, or what working uses look like, the curated examples below should help. You can also explore the containing class, pupa.scrape.Bill, for further context.
The following 15 code examples of Bill.add_version_link are shown, sorted by popularity by default.
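Before the full scrapers, here is a minimal, self-contained sketch of the method in isolation. It is an illustration only: the bill identifier, session, and URLs are hypothetical placeholders, while the keyword names (note, url, media_type, on_duplicate) mirror those used in the examples below.

from pupa.scrape import Bill

# A minimal sketch; the identifier, session, and URLs are made up.
bill = Bill(identifier='HB 1',
            legislative_session='2017',
            chamber='lower',
            title='An example bill',
            classification='bill')
bill.add_source('http://example.com/bills/hb1')  # bills need at least one source

# Attach a version document; media_type should describe the linked file.
bill.add_version_link(note='Introduced',
                      url='http://example.com/bills/hb1.pdf',
                      media_type='application/pdf')

# Adding a duplicate link raises ValueError by default (on_duplicate='error');
# several examples below either catch that error or pass on_duplicate='ignore'.
bill.add_version_link(note='Introduced',
                      url='http://example.com/bills/hb1.pdf',
                      media_type='application/pdf',
                      on_duplicate='ignore')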
Example 1: scrape_bill
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_version_link [as alias]
def scrape_bill(self, row, chamber, session):
    bill_id = row['LegislationNumber']

    # TODO: re-evaluate if these should be separate bills
    if 'SA' in bill_id or 'HA' in bill_id:
        self.warning('skipping amendment %s', bill_id)
        return

    bill_type = self.classify_bill(bill_id)
    bill = Bill(identifier=bill_id,
                legislative_session=session,
                chamber=chamber,
                title=row['LongTitle'],
                classification=bill_type)
    if row['Synopsis']:
        bill.add_abstract(row['Synopsis'], 'synopsis')
    if row['ShortTitle']:
        bill.add_title(row['ShortTitle'], 'short title')
    if row['SponsorPersonId']:
        self.add_sponsor_by_legislator_id(bill, row['SponsorPersonId'], 'primary')

    # TODO: Is there a way to get additional sponsors and cosponsors, and versions/fns via API?
    html_url = 'https://legis.delaware.gov/BillDetail?LegislationId={}'.format(
        row['LegislationId']
    )
    bill.add_source(html_url, note='text/html')
    html = self.lxmlize(html_url)

    # Additional Sponsors: '//label[text()="Additional Sponsor(s):"]/following-sibling::div/a'
    additional_sponsors = html.xpath('//label[text()="Additional Sponsor(s):"]'
                                     '/following-sibling::div/a/@href')
    for sponsor_url in additional_sponsors:
        sponsor_id = sponsor_url.replace('https://legis.delaware.gov/LegislatorDetail?'
                                         'personId=', '')
        self.add_sponsor_by_legislator_id(bill, sponsor_id, 'primary')

    # CoSponsors: '//label[text()="Co-Sponsor(s):"]/following-sibling::div/a'
    cosponsors = html.xpath('//label[text()="Co-Sponsor(s):"]/'
                            'following-sibling::div/a/@href')
    for sponsor_url in cosponsors:
        sponsor_id = sponsor_url.replace('https://legis.delaware.gov/LegislatorDetail?'
                                         'personId=', '')
        self.add_sponsor_by_legislator_id(bill, sponsor_id, 'cosponsor')

    versions = html.xpath('//label[text()="Original Text:"]/following-sibling::div/a/@href')
    for version_url in versions:
        media_type = self.mime_from_link(version_url)
        version_name = 'Bill Text'
        # on_duplicate='error'
        bill.add_version_link(version_name, version_url, media_type=media_type)

    fiscals = html.xpath('//div[contains(@class,"fiscalNote")]/a/@href')
    for fiscal in fiscals:
        self.scrape_fiscal_note(bill, fiscal)

    self.scrape_actions(bill, row['LegislationId'])
    yield from self.scrape_votes(bill, row['LegislationId'], session)
    yield bill
Example 2: scrape_bill
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_version_link [as alias]
def scrape_bill(self, session, bill_id, chamber):
    # https://malegislature.gov/Bills/189/SD2739
    session_for_url = self.replace_non_digits(session)
    bill_url = 'https://malegislature.gov/Bills/{}/{}'.format(session_for_url, bill_id)

    try:
        response = requests.get(bill_url)
    except requests.exceptions.RequestException as e:
        self.warning(u'Server Error on {}'.format(bill_url))
        return False

    html = response.text
    page = lxml.html.fromstring(html)

    if not page.xpath('//div[contains(@class, "followable")]/h1/text()'):
        self.warning(u'Server Error on {}'.format(bill_url))
        return False

    bill_title = page.xpath('//div[@id="contentContainer"]/div/div/h2/text()')[0]
    bill_id = re.sub(r'[^S|H|D|\d]', '', bill_id)

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=bill_title, classification='bill')

    bill_summary = None
    if page.xpath('//p[@id="pinslip"]/text()'):
        bill_summary = page.xpath('//p[@id="pinslip"]/text()')[0]
    if bill_summary:
        bill.add_abstract(bill_summary, 'summary')

    bill.add_source(bill_url)

    # https://malegislature.gov/Bills/189/SD2739 has a presenter
    # https://malegislature.gov/Bills/189/S2168 has no sponsor
    # Find the non-blank text of the dt following Sponsor or Presenter,
    # including any child link text.
    sponsor = page.xpath(
        '//dt[text()="Sponsor:" or text()="Presenter:"]/'
        'following-sibling::dd/descendant-or-self::*/text()[normalize-space()]')
    if sponsor:
        sponsor = sponsor[0].strip()
        bill.add_sponsorship(sponsor, classification='primary', primary=True,
                             entity_type='person')

    self.scrape_cosponsors(bill, bill_url)

    version = page.xpath("//div[contains(@class, 'modalBtnGroup')]/"
                         "a[contains(text(), 'Download PDF') and not(@disabled)]/@href")
    if version:
        version_url = "https://malegislature.gov{}".format(version[0])
        bill.add_version_link('Bill Text', version_url, media_type='application/pdf')

    # yield back votes and bill
    yield from self.scrape_actions(bill, bill_url, session)
    yield bill
Example 3: scrape_bills
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_version_link [as alias]
def scrape_bills(self, session):
    session_key = SESSION_KEYS[session]
    measures_response = self.api_client.get('measures', page=500, session=session_key)
    legislators = index_legislators(self, session_key)

    for measure in measures_response:
        bid = '{} {}'.format(measure['MeasurePrefix'], measure['MeasureNumber'])
        chamber = self.chamber_code[bid[0]]
        bill = Bill(
            bid.replace(' ', ''),
            legislative_session=session,
            chamber=chamber,
            title=measure['RelatingTo'],
            classification=self.bill_types[measure['MeasurePrefix'][1:]]
        )
        bill.add_abstract(measure['MeasureSummary'].strip(), note='summary')

        for sponsor in measure['MeasureSponsors']:
            legislator_code = sponsor['LegislatoreCode']  # typo in API
            if legislator_code:
                try:
                    legislator = legislators[legislator_code]
                except KeyError:
                    logger.warn('Legislator {} not found in session {}'.format(
                        legislator_code, session))
                    legislator = legislator_code
                bill.add_sponsorship(
                    name=legislator,
                    classification={'Chief': 'primary', 'Regular': 'cosponsor'}[
                        sponsor['SponsorLevel']],
                    entity_type='person',
                    primary=True if sponsor['SponsorLevel'] == 'Chief' else False
                )

        bill.add_source(
            "https://olis.leg.state.or.us/liz/{session}/Measures/Overview/{bid}".format(
                session=session_key, bid=bid.replace(' ', ''))
        )

        for document in measure['MeasureDocuments']:
            # TODO: probably mixing documents & versions here - should revisit
            try:
                bill.add_version_link(document['VersionDescription'], document['DocumentUrl'],
                                      media_type='application/pdf')
            except ValueError:
                logger.warn('Duplicate link found for {}'.format(document['DocumentUrl']))

        for action in measure['MeasureHistoryActions']:
            classifiers = self.determine_action_classifiers(action['ActionText'])
            when = datetime.datetime.strptime(action['ActionDate'], '%Y-%m-%dT%H:%M:%S')
            when = self.tz.localize(when)
            bill.add_action(action['ActionText'], when,
                            chamber=self.chamber_code[action['Chamber']],
                            classification=classifiers)

        yield bill
Example 4: scrape_bill
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_version_link [as alias]
def scrape_bill(self, chamber, session, bill_id):
    bill_num = bill_id.split()[1]

    url = ("%s/GetLegislation?biennium=%s&billNumber"
           "=%s" % (self._base_url, self.biennium, bill_num))

    page = self.get(url)
    page = lxml.etree.fromstring(page.content)
    page = xpath(page, "//wa:Legislation")[0]

    title = xpath(page, "string(wa:LongDescription)")

    bill_type = xpath(
        page,
        "string(wa:ShortLegislationType/wa:LongLegislationType)")
    bill_type = bill_type.lower()

    if bill_type == 'gubernatorial appointment':
        return

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=[bill_type])

    fake_source = ("http://apps.leg.wa.gov/billinfo/"
                   "summary.aspx?bill=%s&year=%s" % (
                       bill_num, session[0:4]))
    bill.add_source(fake_source)

    try:
        for version in self.versions[bill_id]:
            bill.add_version_link(note=version['note'],
                                  url=version['url'],
                                  media_type=version['media_type'])
    except KeyError:
        self.warning("No versions were found for {}".format(bill_id))

    try:
        for document in self.documents[bill_num]:
            bill.add_document_link(note=document['note'],
                                   url=document['url'],
                                   media_type=document['media_type'])
    except KeyError:
        pass

    self.scrape_sponsors(bill)
    self.scrape_actions(bill, bill_num)
    self.scrape_hearings(bill, bill_num)
    yield from self.scrape_votes(bill)
    bill.subject = list(set(self._subjects[bill_id]))
    yield bill
Example 5: scrape
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_version_link [as alias]
def scrape(self):
    self.session = '2011'

    # NOTE: this example appears to use an older pupa API: Bill(name=...,
    # session=..., type=..., organization=...), add_sponsor(), and the
    # mimetype= keyword correspond to identifier/legislative_session/
    # classification, add_sponsorship(), and media_type= in later releases.
    for i, page in enumerate(self.searchLegislation()):
        for legislation_summary in self.parseSearchResults(page):
            title = legislation_summary['Title'].strip()
            if title == "":
                continue

            bill = Bill(name=legislation_summary['Record #'],
                        session=self.session,
                        title=title,
                        type=[legislation_summary['Type'].lower()],
                        organization=self.jurisdiction.name)
            bill.add_source(legislation_summary['URL'])

            legislation_details = self.expandLegislationSummary(legislation_summary)

            for related_bill in legislation_details.get('Related files', []):
                bill.add_related_bill(name=related_bill,
                                      session=self.session,
                                      relation='other-session',
                                      chamber=None)

            for i, sponsor in enumerate(legislation_details.get('Sponsors', [])):
                if i == 0:
                    primary = True
                    sponsorship_type = "Primary"
                else:
                    primary = False
                    sponsorship_type = "Regular"
                bill.add_sponsor(sponsor, sponsorship_type,
                                 'person', primary)

            for subject in legislation_details.get(u'Topics', []):
                bill.add_subject(subject)

            for attachment in legislation_details.get(u'Attachments', []):
                bill.add_version_link('PDF',
                                      attachment['url'],
                                      mimetype="application/pdf")

            yield bill
Example 6: scrape_chamber
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_version_link [as alias]
def scrape_chamber(self, chamber, session):
    chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber]
    url = ("http://legisweb.state.wy.us/%s/billreference/"
           "BillReference.aspx?type=%s" % (session, chamber_abbrev))
    page = self.lxmlize(url)

    for tr in page.xpath("//table[contains(@id,'cphContent_gvBills')]//tr")[1:]:
        bill_id = tr.xpath("string(td[1])").strip()
        title = tr.xpath("string(td[2])").strip()

        if bill_id[0:2] in ['SJ', 'HJ']:
            bill_type = 'joint resolution'
        else:
            bill_type = 'bill'

        bill = Bill(bill_id, legislative_session=session, title=title, chamber=chamber,
                    classification=bill_type)

        yield from self.scrape_digest(bill, chamber)

        # versions
        for a in (tr.xpath('td[8]//a') + tr.xpath('td[11]//a') +
                  tr.xpath('td[12]//a')):
            # skip references to other bills
            if a.text.startswith('See'):
                continue
            bill.add_version_link(a.text, a.get('href'),
                                  media_type='application/pdf')

        # documents
        fnote = tr.xpath('td[9]//a')
        if fnote:
            bill.add_document_link('Fiscal Note', fnote[0].get('href'))
        summary = tr.xpath('td[14]//a')
        if summary:
            bill.add_document_link('Summary', summary[0].get('href'))

        bill.add_source(url)
        yield bill
Example 7: scrape_bill_type
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_version_link [as alias]
def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
                     committee_abbr_regex=get_committee_name_regex()):
    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(
        measure_type=type_abbr)

    for bill in bills:
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id
        fsbill = Bill(bill_id, session, title='', chamber=chamber)

        if ((bill_id.startswith('S') and chamber == 'lower') or
                (bill_id.startswith('A') and chamber == 'upper')):
            print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
            continue

        # # Construct session for web query, going from '20092010' to '0910'
        # source_session = session[2:4] + session[6:8]

        # # Turn 'AB 10' into 'ab_10'
        # source_num = "%s_%s" % (bill.measure_type.lower(),
        #                         bill.measure_num)

        # Construct a fake source url
        source_url = ('http://leginfo.legislature.ca.gov/faces/'
                      'billNavClient.xhtml?bill_id=%s') % bill.bill_id

        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type='text/html')

        title = ''
        type_ = ['bill']
        subject = ''
        all_titles = set()

        # Get digest text (aka "summary") from latest version.
        if bill.versions:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = '//caml:DigestText/xhtml:p'
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r'\s+', ' ', t)
                t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                chunks.append(t)
            summary = '\n\n'.join(chunks)

        for version in bill.versions:
            if not version.bill_xml:
                continue

            version_date = self._tz.localize(version.bill_version_action_date)

            # create a version name to match the state's format
            # 02/06/17 - Enrolled
            version_date_human = version_date.strftime('%m/%d/%y')
            version_name = "{} - {}".format(
                version_date_human, version.bill_version_action)

            version_base = "https://leginfo.legislature.ca.gov/faces"
            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)

            fsbill.add_version_link(
                version_name,
                version_url_pdf,
                media_type='application/pdf',
                date=version_date.date())

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ('AB', 'SB'):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(version.short_title) and \
                        not version.title.lower().startswith('an act'):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]
            if version.appropriation == 'Yes':
                type_.append('appropriation')

            tags = []
            if version.fiscal_committee == 'Yes':
                tags.append('fiscal committee')
        # ... (the rest of this example is omitted) ...
Example 8: scrape_bills
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_version_link [as alias]
def scrape_bills(self, session, year_abr):
    # Main Bill information
    main_bill_csv = self.access_to_csv('MainBill')

    # keep a dictionary of bills (mapping bill_id to Bill obj)
    bill_dict = {}

    for rec in main_bill_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        title = rec["Synopsis"]
        if bill_type[0] == 'A':
            chamber = "lower"
        else:
            chamber = "upper"

        # some bills have a blank title.. just skip it
        if not title:
            continue

        bill = Bill(
            bill_id,
            title=title,
            chamber=chamber,
            legislative_session=session,
            classification=self._bill_types[bill_type[1:]],
        )
        if rec['IdenticalBillNumber'].strip():
            bill.add_related_bill(
                rec['IdenticalBillNumber'].split()[0],
                legislative_session=session,
                relation_type='companion',
            )

        # TODO: last session info is in there too
        bill_dict[bill_id] = bill

    # Sponsors
    bill_sponsors_csv = self.access_to_csv('BillSpon')
    for rec in bill_sponsors_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in sponsor database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        name = rec["Sponsor"]
        sponsor_type = rec["Type"]
        if sponsor_type == 'P':
            sponsor_type = "primary"
        else:
            sponsor_type = "cosponsor"
        bill.add_sponsorship(name, classification=sponsor_type, entity_type='person',
                             primary=sponsor_type == 'primary')

    # Documents
    bill_document_csv = self.access_to_csv('BillWP')
    for rec in bill_document_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in document database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        document = rec["Document"]
        document = document.split('\\')
        document = document[-2] + "/" + document[-1]

        # doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
        htm_url = 'http://www.njleg.state.nj.us/{}/Bills/{}'.format(
            year_abr,
            document.replace('.DOC', '.HTM'),
        )

        # name document based on _doctypes
        try:
            doc_name = self._doctypes[rec['DocType']]
        except KeyError:
            raise Exception('unknown doctype %s on %s' %
                            (rec['DocType'], bill_id))
        if rec['Comment']:
            doc_name += ' ' + rec['Comment']

        # Clean HTMX links.
        if htm_url.endswith('HTMX'):
            htm_url = re.sub('X$', '', htm_url)

        if rec['DocType'] in self._version_types:
            if htm_url.endswith('HTM'):
                mimetype = 'text/html'
            elif htm_url.endswith('wpd'):
                mimetype = 'application/vnd.wordperfect'
            try:
                bill.add_version_link(doc_name, htm_url, media_type=mimetype)
            except ValueError:
                # ... (the rest of this example is omitted) ...
Example 9: bill_info
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_version_link [as alias]
def bill_info(self, bill_link, session, main_url):
    bill_page = self.lxmlize(bill_link)

    long_title = self.get_node(
        bill_page,
        '//div[@class="main-content"]/div[1]/div/h2').text.split()

    bill_number = long_title[0]
    title = ''
    for x in range(2, len(long_title)):
        title += long_title[x] + ' '
    title = title[0:-1]

    if not title:
        self.error('no title, skipping %s', bill_number)
        return

    bill_type = 'resolution' if 'LR' in bill_number else 'bill'

    bill = Bill(bill_number, session, title, classification=bill_type)

    bill.add_source(main_url)
    bill.add_source(bill_link)

    introduced_by = self.get_node(
        bill_page,
        '//div[@class="main-content"]/div[3]/div[1]/ul/li[1]/a[1]/text()')

    if not introduced_by:
        introduced_by = self.get_node(
            bill_page,
            '//div[@class="main-content"]/div[3]/div[1]/ul/li[1]/text()')
        introduced_by = introduced_by.split('Introduced By:')[1].strip()

    bill.add_sponsorship(
        name=introduced_by,
        entity_type='person',
        primary=True,
        classification='primary',
    )

    action_nodes = self.get_nodes(
        bill_page,
        '//div[@class="main-content"]/div[5]//table/tbody/tr')

    for action_node in action_nodes:
        date = self.get_node(
            action_node,
            './td[1]').text
        date = datetime.strptime(date, '%b %d, %Y')

        # The action node may have an anchor element within it, so
        # we grab all the text within.
        action = self.get_node(
            action_node,
            './td[2]').text_content()

        if 'Governor' in action:
            actor = 'executive'
        elif 'Speaker' in action:
            actor = 'legislature'
        else:
            actor = 'legislature'

        action_type = self.action_types(action)

        bill.add_action(
            action,
            date.strftime('%Y-%m-%d'),
            chamber=actor,
            classification=action_type,
        )

    # We're in reverse chronological order.
    bill.actions.reverse()

    # Grabs bill version documents.
    version_links = self.get_nodes(
        bill_page,
        '//div[@class="main-content"]/div[3]/div[2]/'
        'div[@class="hidden-xs"]/ul[1]/li/a')

    for version_link in version_links:
        version_name = version_link.text
        version_url = version_link.attrib['href']
        # replace Current w/ session number
        version_url = version_url.replace('Current', session)
        bill.add_version_link(version_name, version_url, media_type='application/pdf')

    # Adds any documents related to amendments.
    amendment_links = self.get_nodes(
        bill_page,
        '//div[@class="main-content"]/div[5]/div[2]/table/tr/td[1]/a')

    for amendment_link in amendment_links:
        amendment_name = amendment_link.text
        amendment_url = amendment_link.attrib['href']
        bill.add_document_link(amendment_name, amendment_url)

    # Related transcripts.
    transcript_links = self.get_nodes(
        # ... (the rest of this example is omitted) ...
Example 10: scrape_bills
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_version_link [as alias]
def scrape_bills(self, chamber, session, subjects):
    idex = bill_start_numbers(session)[chamber]

    FROM = "ctl00$rilinContent$txtBillFrom"
    TO = "ctl00$rilinContent$txtBillTo"
    YEAR = "ctl00$rilinContent$cbYear"

    blocks = "FOO"  # Ugh.
    while len(blocks) > 0:
        default_headers = get_default_headers(SEARCH_URL)
        default_headers[FROM] = idex
        default_headers[TO] = idex + MAXQUERY
        default_headers[YEAR] = session
        idex += MAXQUERY

        blocks = self.parse_results_page(self.post(SEARCH_URL,
                                                   data=default_headers).text)
        blocks = blocks[1:-1]
        blocks = self.digest_results_page(blocks)

        for block in blocks:
            bill = blocks[block]
            subs = []
            try:
                subs = subjects[bill['bill_id']]
            except KeyError:
                pass

            title = bill['title'][len("ENTITLED, "):]
            billid = bill['bill_id']
            try:
                subs = subjects[bill['bill_id']]
            except KeyError:
                subs = []

            for b in BILL_NAME_TRANSLATIONS:
                if billid[:len(b)] == b:
                    billid = BILL_NAME_TRANSLATIONS[b] + billid[len(b) + 1:].split()[0]

            b = Bill(
                billid,
                title=title,
                chamber=chamber,
                legislative_session=session,
                classification=self.get_type_by_name(bill['bill_id']),
            )
            b.subject = subs

            # keep bill ID around
            self._bill_id_by_type[(chamber, re.findall(r'\d+', billid)[0])] = billid

            self.process_actions(bill['actions'], b)

            sponsors = bill['sponsors'][len("BY"):].strip()
            sponsors = sponsors.split(",")
            sponsors = [s.strip() for s in sponsors]

            for href in bill['bill_id_hrefs']:
                b.add_version_link(
                    href.text, href.attrib['href'],
                    media_type="application/pdf")

            for sponsor in sponsors:
                b.add_sponsorship(
                    sponsor, entity_type='person', classification='primary', primary=True)

            b.add_source(SEARCH_URL)
            yield b
Example 11: scrape
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_version_link [as alias]
# ... (the beginning of this example is omitted) ...
bill_id = number_link.text_content().replace('No.', '')
bill_id = bill_id.replace('.', '').replace(' ', '')
# put one space back in between type and number
bill_id = re.sub(r'([a-zA-Z]+)(\d+)', r'\1 \2', bill_id)

title = title.text_content().strip()
title = re.sub(r'^Title', '', title)

chamber = 'lower' if 'H' in bill_id else 'upper'
classification = 'bill' if 'B' in bill_id else 'resolution'

bill = Bill(bill_id, legislative_session=session, chamber=chamber,
            title=title, classification=classification)
bill.add_source(number_link.xpath('a/@href')[0])

# get bill from API
bill_api_url = ('http://search-prod.lis.state.oh.us/solarapi/v1/'
                'general_assembly_{}/{}/{}/'.format(
                    session,
                    'bills' if 'B' in bill_id else 'resolutions',
                    bill_id.lower().replace(' ', '')
                ))
data = self.get(bill_api_url).json()

# add title if no short title
if not bill.title:
    bill.title = data['items'][0]['longtitle']
bill.add_title(data['items'][0]['longtitle'], 'long title')

# this stuff is version-specific
for version in data['items']:
    version_name = version["version"]
    version_link = base_url + version["pdfDownloadLink"]
    bill.add_version_link(version_name, version_link, media_type='application/pdf')

# we'll use latest bill_version for everything else
bill_version = data['items'][0]
bill.add_source(bill_api_url)

# subjects
for subj in bill_version["subjectindexes"]:
    try:
        bill.add_subject(subj["primary"])
    except KeyError:
        pass
    try:
        secondary_subj = subj["secondary"]
    except KeyError:
        secondary_subj = ""
    if secondary_subj:
        bill.add_subject(secondary_subj)

# sponsors
sponsors = bill_version["sponsors"]
for sponsor in sponsors:
    sponsor_name = self.get_sponsor_name(sponsor)
    bill.add_sponsorship(
        sponsor_name,
        classification='primary',
        entity_type='person',
        primary=True
    )

cosponsors = bill_version["cosponsors"]
for sponsor in cosponsors:
    sponsor_name = self.get_sponsor_name(sponsor)
Example 12: scrape_bill
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_version_link [as alias]
def scrape_bill(self, bill_id):
    old = self.api('bills/' + bill_id + '?')

    # not needed
    old.pop('id')
    old.pop('state')
    old.pop('level', None)
    old.pop('country', None)
    old.pop('created_at')
    old.pop('updated_at')
    old.pop('action_dates')
    old.pop('+bill_type', None)
    old.pop('+subject', None)
    old.pop('+scraped_subjects', None)
    old.pop('subjects', [])

    classification = old.pop('type')

    # ca weirdness
    if 'fiscal committee' in classification:
        classification.remove('fiscal committee')
    if 'urgency' in classification:
        classification.remove('urgency')
    if 'local program' in classification:
        classification.remove('local program')
    if 'tax levy' in classification:
        classification.remove('tax levy')

    if classification[0] in ['miscellaneous', 'jres', 'cres']:
        return

    if classification == ['memorial resolution'] and self.state == 'ar':
        classification = ['memorial']
    if classification == ['concurrent memorial resolution'] and self.state == 'ar':
        classification = ['concurrent memorial']
    if classification == ['joint session resolution'] and self.state == 'il':
        classification = ['joint resolution']
    if classification == ['legislative resolution'] and self.state == 'ny':
        classification = ['resolution']
    if classification == ['address'] and self.state == 'nh':
        classification = ['resolution']

    if not old['title'] and self.state == 'me':
        old['title'] = '(unknown)'

    chamber = old.pop('chamber')
    if self.state in ('ne', 'dc'):
        chamber = 'legislature'
    elif chamber in ('joint', 'conference'):
        chamber = 'legislature'

    new = Bill(old.pop('bill_id'), old.pop('session'), old.pop('title'),
               chamber=chamber, classification=classification)

    abstract = old.pop('summary', None)
    if abstract:
        new.add_abstract(abstract, note='')

    for title in old.pop('alternate_titles'):
        new.add_title(title)

    for doc in old.pop('documents'):
        new.add_document_link(doc['name'], doc['url'], on_duplicate='ignore')

    for doc in old.pop('versions'):
        new.add_version_link(doc['name'], doc['url'], media_type=doc.pop('mimetype', ''))

    for subj in old.pop('scraped_subjects', []):
        if subj:
            new.add_subject(subj)

    for spon in old.pop('sponsors'):
        if spon.get('committee_id') is not None:
            entity_type = 'organization'
        elif spon.get('leg_id') is not None:
            entity_type = 'person'
        else:
            entity_type = ''
        new.add_sponsorship(spon['name'], spon['type'], entity_type,
                            spon['type'] == 'primary')

    for act in old.pop('actions'):
        actor = act['actor']
        if actor.lower() in ('governor', 'mayor', 'secretary of state'):
            actor = 'executive'
        elif actor.lower() == 'house' or (actor.lower().startswith('lower (') and
                                          self.state == 'ca'):
            actor = 'lower'
        elif actor.lower() in ('senate', 'upper') or (actor.lower().startswith('upper (') and
                                                      self.state == 'ca'):
            actor = 'upper'
        elif actor in ('joint', 'other', 'Data Systems', 'Speaker', 'clerk',
                       'Office of the Legislative Fiscal Analyst', 'Became Law w',
                       'conference') or (actor.lower().startswith('legislature (') and
                                         self.state == 'ca'):
            actor = 'legislature'

        if actor in ('committee', 'sponsor') and self.state == 'pr':
            actor = 'legislature'

        # nebraska & DC
        if actor in ('upper', 'council') and self.state in ('ne', 'dc'):
            actor = 'legislature'
        # ... (the rest of this example is omitted) ...
Example 13: scrape
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_version_link [as alias]
def scrape(self):
    three_days_ago = datetime.datetime.now() - datetime.timedelta(3)
    for matter in self.matters(three_days_ago):
        matter_id = matter['MatterId']

        date = matter['MatterIntroDate']
        title = matter['MatterTitle']
        identifier = matter['MatterFile']

        if not all((date, title, identifier)):
            continue

        bill_session = self.session(self.toTime(date))
        bill_type = BILL_TYPES[matter['MatterTypeName']]

        if identifier.startswith('S'):
            alternate_identifiers = [identifier]
            identifier = identifier[1:]
        else:
            alternate_identifiers = []

        bill = Bill(identifier=identifier,
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Chicago City Council"})

        legistar_web = self.legislation_detail_url(matter_id)
        legistar_api = 'http://webapi.legistar.com/v1/chicago/matters/{0}'.format(matter_id)

        bill.add_source(legistar_web, note='web')
        bill.add_source(legistar_api, note='api')

        for identifier in alternate_identifiers:
            bill.add_identifier(identifier)

        for action, vote in self.actions(matter_id):
            act = bill.add_action(**action)

            if action['description'] == 'Referred':
                body_name = matter['MatterBodyName']
                if body_name != 'City Council':
                    act.add_related_entity(body_name,
                                           'organization',
                                           entity_id=_make_pseudo_id(name=body_name))

            result, votes = vote
            if result:
                vote_event = VoteEvent(legislative_session=bill.legislative_session,
                                       motion_text=action['description'],
                                       organization=action['organization'],
                                       classification=None,
                                       start_date=action['date'],
                                       result=result,
                                       bill=bill)

                vote_event.add_source(legistar_web)
                vote_event.add_source(legistar_api + '/histories')

                for vote in votes:
                    raw_option = vote['VoteValueName'].lower()
                    clean_option = self.VOTE_OPTIONS.get(raw_option,
                                                         raw_option)
                    vote_event.vote(clean_option,
                                    vote['VotePersonName'].strip())

                yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for attachment in self.attachments(matter_id):
            if attachment['MatterAttachmentName']:
                bill.add_version_link(attachment['MatterAttachmentName'],
                                      attachment['MatterAttachmentHyperlink'],
                                      media_type="application/pdf")

        bill.extras = {'local_classification': matter['MatterTypeName']}

        text = self.text(matter_id)
        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain']
            if text['MatterTextRtf']:
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '')

        yield bill
Example 14: _scrape_bill
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_version_link [as alias]
# ... (the beginning of this example is omitted) ...
    cosponsor['shortName'],
    entity_type='person',
    classification='cosponsor',
    primary=False,
)

# List companion bill.
same_as = bill_active_version.get('sameAs', {})
# Check whether "sameAs" property is populated with at least one bill.
if same_as['items']:
    # Get companion bill ID.
    companion_bill_id = same_as['items'][0]['basePrintNo']

    # Build companion bill session.
    start_year = same_as['items'][0]['session']
    end_year = start_year + 1
    companion_bill_session = '-'.join([str(start_year), str(end_year)])

    # Attach companion bill data.
    bill.add_related_bill(
        companion_bill_id,
        companion_bill_session,
        relation_type='companion',
    )

# Parse actions.
chamber_map = {
    'senate': 'upper',
    'assembly': 'lower',
}

for action in bill_data['actions']['items']:
    chamber = chamber_map[action['chamber'].lower()]
    action_datetime = datetime.datetime.strptime(action['date'], '%Y-%m-%d')
    action_date = action_datetime.date()
    types, _ = NYBillScraper.categorizer.categorize(action['text'])

    bill.add_action(
        action['text'],
        action_date.strftime('%Y-%m-%d'),
        chamber=chamber,
        classification=types,
    )

# Handling of sources follows. Sources serving either chamber
# maintain duplicate data, so we can see certain bill data
# through either chamber's resources. However, we have to refer
# to a specific chamber's resources if we want to grab certain
# specific information such as vote data.
#
# As such, I'm placing all potential sources in the interest of
# thoroughness. - Andy Lo

# List Open Legislation API endpoint as a source.
api_url = self.api_client.root + self.api_client.resources['bill'].format(
    session_year=session,
    bill_id=bill_id,
    summary='',
    detail='')
bill.add_source(api_url)
bill.add_source(senate_url)
bill.add_source(assembly_url)

# Chamber-specific processing.
if bill_chamber == 'upper':
    # Collect votes.
    for vote_data in bill_data['votes']['items']:
        yield self._parse_senate_votes(vote_data, bill, api_url)
elif bill_chamber == 'lower':
    assembly = AssemblyBillPage(self, session, bill, details)
    assembly.build()

# A little strange the way it works out, but the Assembly
# provides the HTML version documents and the Senate provides
# the PDF version documents.
amendments = bill_data['amendments']['items']
for key, amendment in amendments.items():
    version = amendment['printNo']

    html_version = version + ' HTML'
    html_url = 'http://assembly.state.ny.us/leg/?sh=printbill&bn=' \
               '{}&term={}'.format(bill_id, self.term_start_year)
    bill.add_version_link(
        html_version,
        html_url,
        on_duplicate='ignore',
        media_type='text/html',
    )

    pdf_version = version + ' PDF'
    pdf_url = 'http://legislation.nysenate.gov/pdf/bills/{}/{}' \
        .format(self.term_start_year, bill_id)
    bill.add_version_link(
        pdf_version,
        pdf_url,
        on_duplicate='ignore',
        media_type='application/pdf',
    )

yield bill
Example 15: scrape_senate_bills
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_version_link [as alias]
def scrape_senate_bills(self, chamber, insert, session, year):
    doc_type = {2: 'bill', 4: 'resolution', 7: 'concurrent resolution',
                8: 'joint resolution'}

    for docnum, bill_type in doc_type.items():
        parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/' \
                         'HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
        links = self.scrape_links(parentpage_url)
        count = 0
        for link in links:
            count += 1
            page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)

            page = self.get(page_path).text
            page = page.replace(u"\xa0", " ")
            root = lxml.html.fromstring(page)

            bill_id = root.xpath('string(/html/body/div[@id="content"]' +
                                 '/table[1]/tr[1]/td[1]/font)')
            title = self.get_node(
                root,
                '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                'b[contains(text(), "By:")]]/td/em/text()')

            bill = Bill(bill_id,
                        legislative_session=session,
                        chamber=chamber,
                        title=title,
                        classification=bill_type)
            bill.subject = list(set(self.subject_mapping[bill_id]))

            for table in root.xpath('//div[@id="content"]/table'):
                if 'Bill Text' in table.text_content():
                    bill_text = table.xpath("string(tr/td[2]/a/@href)")
                    text_url = "http://www.leg.state.nv.us" + bill_text
                    bill.add_version_link(note="Bill Text",
                                          url=text_url,
                                          media_type='application/pdf')

            primary, secondary = self.scrape_sponsors(page)

            for leg in primary:
                bill.add_sponsorship(name=leg,
                                     classification='primary',
                                     entity_type='person',
                                     primary=True)
            for leg in secondary:
                bill.add_sponsorship(name=leg,
                                     classification='cosponsor',
                                     entity_type='person',
                                     primary=False)

            minutes_count = 2
            for mr in root.xpath('//table[4]/tr/td[3]/a'):
                minutes = mr.xpath("string(@href)")
                minutes_url = "http://www.leg.state.nv.us" + minutes
                minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                minutes_date = mr.xpath(minutes_date_path).split()
                minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Agenda"
                # bill.add_document(minutes_date, minutes_url)
                bill.add_document_link(note=minutes_date,
                                       url=minutes_url)
                minutes_count = minutes_count + 1

            self.scrape_actions(root, bill, "upper")
            yield from self.scrape_votes(page, page_path, bill, insert, year)
            bill.add_source(page_path)
            yield bill