This article collects typical usage examples of pupa.scrape.Bill.subject in Python. If you are unsure what Bill.subject is for, how to use it, or simply want to see it in working code, the curated examples below should help. You can also read more about its containing class, pupa.scrape.Bill.
The following 15 code examples of Bill.subject are sorted by popularity by default. You can upvote any example you like or find useful; your feedback helps the system recommend better Python code samples.
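Before the examples, here is a minimal sketch of the pattern they all share: construct a Bill, then assign a list of subject strings to its subject attribute, and register a source. The bill ID, session, title, URL, and subject strings below are illustrative placeholders, not values taken from the examples.
from pupa.scrape import Bill

# Hypothetical values throughout; only the call pattern mirrors the examples.
bill = Bill(
    'HB 1',                  # bill identifier
    '2017',                  # legislative session (positional, as in Example 1)
    'An example bill',       # title
    chamber='lower',
    classification='bill',
)
bill.subject = ['EDUCATION', 'TAXATION']   # always a list of subject strings
bill.add_source('http://example.com/hb1')  # placeholder source URL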
Example 1: handle_list_item
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import subject [as alias]
def handle_list_item(self, item):
    bill_id = item.text.strip()
    title = item.xpath("string(../following-sibling::td[1])").strip()
    sponsor = item.xpath("string(../following-sibling::td[2])").strip()
    bill_url = item.attrib['href'] + '/ByCategory'

    if bill_id.startswith(('SB ', 'HB ', 'SPB ', 'HPB ')):
        bill_type = 'bill'
    elif bill_id.startswith(('HR ', 'SR ')):
        bill_type = 'resolution'
    elif bill_id.startswith(('HJR ', 'SJR ')):
        bill_type = 'joint resolution'
    elif bill_id.startswith(('SCR ', 'HCR ')):
        bill_type = 'concurrent resolution'
    elif bill_id.startswith(('SM ', 'HM ')):
        bill_type = 'memorial'
    else:
        raise ValueError('Failed to identify bill type.')

    bill = Bill(bill_id, self.kwargs['session'], title,
                chamber='lower' if bill_id[0] == 'H' else 'upper',
                classification=bill_type)
    bill.add_source(bill_url)

    # normalize id from HB 0004 to H4
    subj_bill_id = re.sub(r'(H|S)\w+ 0*(\d+)', r'\1\2', bill_id)
    bill.subject = list(self.kwargs['subjects'][subj_bill_id])

    sponsor = re.sub(r'^(?:Rep|Sen)\.\s', "", sponsor)
    for sp in sponsor.split(', '):
        bill.add_sponsorship(sp, 'primary', 'person', True)

    yield from self.scrape_page_items(BillDetail, url=bill_url, obj=bill)
    yield bill
Example 2: scrape_bill
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import subject [as alias]
def scrape_bill(self, chamber, session, bill_id, url):
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    title = doc.xpath('//h3[@class="h3billright"]')[0].text_content()
    # TODO: grab summary (none present at time of writing)

    if 'B' in bill_id:
        _type = ['bill']
    elif 'J' in bill_id:
        _type = ['joint resolution']
    else:
        raise ValueError('unknown bill type ' + bill_id)

    bill = Bill(
        bill_id, legislative_session=session, chamber=chamber, title=title,
        classification=_type)
    bill.add_source(url)

    # process sponsors
    sponsors = _get_td(doc, 'All Sponsors:').text_content()
    sponsors = sponsors.replace('Delegates ', '')
    sponsors = sponsors.replace('Delegate ', '')
    sponsors = sponsors.replace('Senator ', '')
    sponsors = sponsors.replace('Senators ', '')
    sponsor_type = 'primary'
    for sponsor in re.split(', (?:and )?', sponsors):
        sponsor = sponsor.strip()
        if not sponsor:
            continue
        bill.add_sponsorship(
            sponsor,
            sponsor_type,
            primary=sponsor_type == 'primary',
            entity_type='person',
        )
        sponsor_type = 'cosponsor'

    # subjects
    subject_list = []
    for heading in ('Broad Subject(s):', 'Narrow Subject(s):'):
        subjects = _get_td(doc, heading).xpath('a/text()')
        subject_list += [s.split(' -see also-')[0] for s in subjects if s]
    bill.subject = subject_list

    # documents
    yield from self.scrape_documents(bill, url.replace('stab=01', 'stab=02'))
    # actions
    self.scrape_actions(bill, url.replace('stab=01', 'stab=03'))
    yield bill
Example 3: scrape_bill
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import subject [as alias]
def scrape_bill(self, chamber, session, bill_id):
    bill_num = bill_id.split()[1]

    url = ("%s/GetLegislation?biennium=%s&billNumber"
           "=%s" % (self._base_url, self.biennium, bill_num))

    page = self.get(url)
    page = lxml.etree.fromstring(page.content)
    page = xpath(page, "//wa:Legislation")[0]

    title = xpath(page, "string(wa:LongDescription)")

    bill_type = xpath(
        page,
        "string(wa:ShortLegislationType/wa:LongLegislationType)")
    bill_type = bill_type.lower()

    if bill_type == 'gubernatorial appointment':
        return

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=[bill_type])

    fake_source = ("http://apps.leg.wa.gov/billinfo/"
                   "summary.aspx?bill=%s&year=%s" % (
                       bill_num, session[0:4]))
    bill.add_source(fake_source)

    try:
        for version in self.versions[bill_id]:
            bill.add_version_link(note=version['note'],
                                  url=version['url'],
                                  media_type=version['media_type'])
    except KeyError:
        self.warning("No versions were found for {}".format(bill_id))

    try:
        for document in self.documents[bill_num]:
            bill.add_document_link(note=document['note'],
                                   url=document['url'],
                                   media_type=document['media_type'])
    except KeyError:
        pass

    self.scrape_sponsors(bill)
    self.scrape_actions(bill, bill_num)
    self.scrape_hearings(bill, bill_num)
    yield from self.scrape_votes(bill)
    bill.subject = list(set(self._subjects[bill_id]))
    yield bill
Example 4: handle_page
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import subject [as alias]
def handle_page(self):
    bills = self.doc.xpath('//ul[@class="linkSect"]/li')
    for bill in bills:
        link = bill.getchildren()[0]
        bill_id = str(link.text_content())

        if not bill_id.startswith(('S', 'H')):
            continue

        # create a bill
        desc = bill.xpath('text()')[0].strip()
        chamber = {
            'H': 'lower',
            'S': 'upper',
        }[bill_id[0]]
        bill_type = {
            'B': 'bill',
            'J': 'joint resolution',
            'R': 'resolution'
        }[bill_id[1]]
        bill = Bill(bill_id, self.kwargs['session'], desc,
                    chamber=chamber, classification=bill_type)

        bill_url = link.get('href')
        sponsor_url = BASE_URL + URL_PATTERNS['sponsors'].format(
            self.kwargs['session_id'],
            bill_id.replace(' ', ''),
        )

        list(self.scrape_page_items(BillSponsorPage, url=sponsor_url, obj=bill))
        yield from self.scrape_page_items(BillDetailPage, url=bill_url, obj=bill)
        bill.subject = self.kwargs['subjects'][bill_id]
        bill.add_source(bill_url)
        yield bill

    next_url = self.doc.xpath('//a/b[text()="More..."]/../@href')
    if next_url:
        yield from self.scrape_page_items(BillListPage, url=next_url[0], **self.kwargs)
Example 5: scrape_bill_2012
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import subject [as alias]
def scrape_bill_2012(self, chamber, session, bill_id, url):
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    # find <a name="Title">, get parent dt, get parent dl, then dd in dl
    title = doc.xpath('//a[@name="Title"][1]/../../dd[1]/text()')[0].strip()

    summary = doc.xpath('//font[@size="3"]/p/text()')[0].strip()

    if 'B' in bill_id:
        _type = ['bill']
    elif 'J' in bill_id:
        _type = ['joint resolution']
    else:
        raise ValueError('unknown bill type ' + bill_id)

    bill = Bill(
        bill_id,
        legislative_session=session,
        classification=_type,
        chamber=chamber,
        title=title,
    )
    bill.add_abstract(summary, note='summary')
    bill.add_source(url)

    self.parse_bill_sponsors(doc, bill)           # sponsors
    self.parse_bill_actions(doc, bill)            # actions
    self.parse_bill_documents(doc, bill)          # documents and versions
    yield from self.parse_bill_votes(doc, bill)   # votes

    # subjects
    subjects = []
    for subj in doc.xpath('//a[contains(@href, "/subjects/")]'):
        subjects.append(subj.text.split('-see also-')[0])
    bill.subject = subjects

    # add bill to collection
    self.save_bill(bill)
Example 6: scrape_bill_list
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import subject [as alias]
def scrape_bill_list(self, chamber, session, url):
    if 'joint_resolution' in url:
        bill_type = 'joint resolution'
    elif 'resolution' in url:
        bill_type = 'resolution'
    elif 'bill' in url:
        bill_type = 'bill'

    try:
        data = self.get(url).text
    except scrapelib.HTTPError:
        self.warning('skipping URL %s' % url)
        return
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(url)

    bill_list = doc.xpath('//ul[@class="infoLinks"]/li/div[@class="row-fluid"]')
    for b in bill_list:
        bill_url = b.xpath('./div[@class="span3"]/a/@href')[0]
        bill_id = bill_url.rsplit('/', 1)[-1]
        bill_id = bill_id.upper()

        title = b.xpath(
            './div[@class="span6"]/text()'
        )[0].replace(' - Relating to: ', '').strip()

        bill = Bill(
            bill_id,
            legislative_session=session,
            title=title,
            chamber=chamber,
            classification=bill_type,
        )
        bill.subject = list(set(self.subjects[bill_id]))
        yield from self.scrape_bill_history(bill, bill_url, chamber)
        yield bill
Example 7: scrape_bill_type
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import subject [as alias]
def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
                     committee_abbr_regex=get_committee_name_regex()):
    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(
        measure_type=type_abbr)

    for bill in bills:
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id

        fsbill = Bill(bill_id, session, title='', chamber=chamber)

        if ((bill_id.startswith('S') and chamber == 'lower') or
                (bill_id.startswith('A') and chamber == 'upper')):
            print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
            continue

        # # Construct session for web query, going from '20092010' to '0910'
        # source_session = session[2:4] + session[6:8]

        # # Turn 'AB 10' into 'ab_10'
        # source_num = "%s_%s" % (bill.measure_type.lower(),
        #                         bill.measure_num)

        # Construct a fake source url
        source_url = ('http://leginfo.legislature.ca.gov/faces/'
                      'billNavClient.xhtml?bill_id=%s') % bill.bill_id

        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type='text/html')

        title = ''
        type_ = ['bill']
        subject = ''
        all_titles = set()

        # Get digest text (aka "summary") from latest version.
        if bill.versions:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = '//caml:DigestText/xhtml:p'
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r'\s+', ' ', t)
                t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                chunks.append(t)
            summary = '\n\n'.join(chunks)

        for version in bill.versions:
            if not version.bill_xml:
                continue

            version_date = self._tz.localize(version.bill_version_action_date)

            # create a version name to match the state's format
            # 02/06/17 - Enrolled
            version_date_human = version_date.strftime('%m/%d/%y')
            version_name = "{} - {}".format(
                version_date_human, version.bill_version_action)

            version_base = "https://leginfo.legislature.ca.gov/faces"
            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)

            fsbill.add_version_link(
                version_name,
                version_url_pdf,
                media_type='application/pdf',
                date=version_date.date())

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ('AB', 'SB'):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(version.short_title) and \
                        not version.title.lower().startswith('an act'):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]

            if version.appropriation == 'Yes':
                type_.append('appropriation')

            tags = []
            if version.fiscal_committee == 'Yes':
                tags.append('fiscal committee')
#......... remainder of this method omitted .........
Example 8: scrape_bill_list
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import subject [as alias]
def scrape_bill_list(self, url):
    bill_list = self._get_bill_list(url)

    for bill_info in bill_list:
        (bill_id, ) = bill_info.xpath('td[1]/font/input/@value')
        (sponsor, ) = bill_info.xpath('td[2]/font/input/@value')
        (subject, ) = bill_info.xpath('td[3]//text()')
        subject = subject.strip()
        chamber = self.CHAMBERS[bill_id[0]]

        if 'B' in bill_id:
            bill_type = 'bill'
        elif 'JR' in bill_id:
            bill_type = 'joint resolution'
        elif 'R' in bill_id:
            bill_type = 'resolution'
        else:
            raise AssertionError(
                "Unknown bill type for bill '{}'".format(bill_id))

        bill = Bill(
            bill_id,
            legislative_session=self.session,
            chamber=chamber,
            title='',
            classification=bill_type,
        )
        if subject:
            bill.subject = [subject]
        if sponsor:
            bill.add_sponsorship(
                name=sponsor,
                entity_type='person',
                classification='primary',
                primary=True,
            )
        bill.add_source(url)

        bill_url = ('http://alisondb.legislature.state.al.us/Alison/'
                    'SESSBillStatusResult.aspx?BILL={}'.format(bill_id))
        bill.add_source(bill_url)

        bill_html = self._get_bill_response(bill_url)
        if bill_html is None:
            self.warning("Bill {} has no webpage, and will be skipped".
                         format(bill_id))
            continue
        bill_doc = lxml.html.fromstring(bill_html)

        if (bill_doc.xpath('//span[@id="ContentPlaceHolder1_lblShotTitle"]')):
            title = bill_doc.xpath(
                '//span[@id="ContentPlaceHolder1_lblShotTitle"]'
            )[0].text_content().strip()
            if not title:
                title = "[No title given by state]"
            bill.title = title

        version_url_base = (
            'http://alisondb.legislature.state.al.us/ALISON/'
            'SearchableInstruments/{0}/PrintFiles/{1}-'.
            format(self.session, bill_id))
        versions = bill_doc.xpath(
            '//table[@class="box_versions"]/tr/td[2]/font/text()')
        for version in versions:
            name = version
            if version == "Introduced":
                version_url = version_url_base + 'int.pdf'
            elif version == "Engrossed":
                version_url = version_url_base + 'eng.pdf'
            elif version == "Enrolled":
                version_url = version_url_base + 'enr.pdf'
            else:
                raise NotImplementedError(
                    "Unknown version type found: '{}'".format(name))
            bill.add_version_link(
                name,
                version_url,
                media_type='application/pdf',
                on_duplicate='ignore',
            )

        # Fiscal notes exist, but I can't figure out how to build their URL
        fiscal_notes = bill_doc.xpath(
            '//table[@class="box_fiscalnote"]')[1:]
        for fiscal_note in fiscal_notes:
            pass

        # Budget Isolation Resolutions are handled as extra actions/votes
        birs = bill_doc.xpath(
            '//div[@class="box_bir"]//table//table/tr')[1:]
        for bir in birs:
            bir_action = bir.xpath('td[1]')[0].text_content().strip()
            # Sometimes ALISON's database puts another bill's
            # actions into the BIR action list; ignore these
            if bill_id not in bir_action:
                self.warning(
                    "BIR action found ({}) ".format(bir_action) +
                    "that doesn't match the bill ID ({})".format(bill_id))
#......... remainder of this method omitted .........
Example 9: scrape_senate_bills
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import subject [as alias]
def scrape_senate_bills(self, chamber, insert, session, year):
    doc_type = {2: 'bill', 4: 'resolution', 7: 'concurrent resolution',
                8: 'joint resolution'}

    for docnum, bill_type in doc_type.items():
        parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/' \
                         'HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
        links = self.scrape_links(parentpage_url)
        count = 0
        for link in links:
            count += 1
            page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)
            page = self.get(page_path).text
            page = page.replace(u"\xa0", " ")
            root = lxml.html.fromstring(page)

            bill_id = root.xpath('string(/html/body/div[@id="content"]' +
                                 '/table[1]/tr[1]/td[1]/font)')
            title = self.get_node(
                root,
                '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                'b[contains(text(), "By:")]]/td/em/text()')

            bill = Bill(bill_id,
                        legislative_session=session,
                        chamber=chamber,
                        title=title,
                        classification=bill_type
                        )
            bill.subject = list(set(self.subject_mapping[bill_id]))

            for table in root.xpath('//div[@id="content"]/table'):
                if 'Bill Text' in table.text_content():
                    bill_text = table.xpath("string(tr/td[2]/a/@href)")
                    text_url = "http://www.leg.state.nv.us" + bill_text
                    bill.add_version_link(note="Bill Text",
                                          url=text_url,
                                          media_type='application/pdf')

            primary, secondary = self.scrape_sponsors(page)
            for leg in primary:
                bill.add_sponsorship(name=leg,
                                     classification='primary',
                                     entity_type='person',
                                     primary=True)
            for leg in secondary:
                bill.add_sponsorship(name=leg,
                                     classification='cosponsor',
                                     entity_type='person',
                                     primary=False)

            minutes_count = 2
            for mr in root.xpath('//table[4]/tr/td[3]/a'):
                minutes = mr.xpath("string(@href)")
                minutes_url = "http://www.leg.state.nv.us" + minutes
                minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                minutes_date = mr.xpath(minutes_date_path).split()
                minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Agenda"
                # bill.add_document(minutes_date, minutes_url)
                bill.add_document_link(note=minutes_date,
                                       url=minutes_url)
                minutes_count = minutes_count + 1

            self.scrape_actions(root, bill, "upper")
            yield from self.scrape_votes(page, page_path, bill, insert, year)
            bill.add_source(page_path)
            yield bill
Example 10: scrape_assem_bills
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import subject [as alias]
def scrape_assem_bills(self, chamber, insert, session, year):
    doc_type = {1: 'bill', 3: 'resolution', 5: 'concurrent resolution',
                6: 'joint resolution', 9: 'petition'}

    for docnum, bill_type in doc_type.items():
        parentpage_url = 'http://www.leg.state.nv.us/Session/%s/' \
                         'Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
        links = self.scrape_links(parentpage_url)
        count = 0
        for link in links:
            count = count + 1
            page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)
            page = self.get(page_path).text
            page = page.replace(u"\xa0", " ")
            root = lxml.html.fromstring(page)
            root.make_links_absolute("http://www.leg.state.nv.us/")

            bill_id = root.xpath('string(/html/body/div[@id="content"]'
                                 '/table[1]/tr[1]/td[1]/font)')
            title = self.get_node(
                root,
                '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                'b[contains(text(), "By:")]]/td/em/text()')

            bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                        title=title, classification=bill_type)
            bill.subject = list(set(self.subject_mapping[bill_id]))

            billtext = root.xpath("//b[text()='Bill Text']")[0].getparent().getnext()
            text_urls = billtext.xpath("./a")
            for text_url in text_urls:
                version_name = text_url.text.strip()
                version_url = text_url.attrib['href']
                bill.add_version_link(note=version_name, url=version_url,
                                      media_type='application/pdf')

            primary, secondary = self.scrape_sponsors(page)
            for leg in primary:
                bill.add_sponsorship(classification='primary',
                                     name=leg, entity_type='person',
                                     primary=True)
            for leg in secondary:
                bill.add_sponsorship(classification='cosponsor',
                                     name=leg, entity_type='person',
                                     primary=False)

            minutes_count = 2
            for mr in root.xpath('//table[4]/tr/td[3]/a'):
                minutes = mr.xpath("string(@href)")
                minutes_url = "http://www.leg.state.nv.us" + minutes
                minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                minutes_date = mr.xpath(minutes_date_path).split()
                minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Minutes"
                bill.add_document_link(note=minutes_date, url=minutes_url)
                minutes_count += 1

            self.scrape_actions(root, bill, "lower")
            yield from self.scrape_votes(page, page_path, bill, insert, year)
            bill.add_source(page_path)
            yield bill
Example 11: parse_bill
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import subject [as alias]
def parse_bill(self, chamber, session, bill_id, url):
    try:
        page = self.lxmlize(url)
    except scrapelib.HTTPError as e:
        self.logger.warning(e)
        return

    last_action = self.parse_bill_field(
        page, 'Last Action').xpath('text()')[0]
    if 'WITHDRAWN' in last_action.upper():
        self.info("{} Withdrawn, skipping".format(bill_id))
        return

    version = self.parse_bill_field(page, 'Bill Documents')
    if version is None:
        # Bill withdrawn
        self.logger.warning('Bill withdrawn.')
        return

    source_url = version.xpath('a[1]/@href')[0]
    version_title = version.xpath('a[1]/text()')[0].strip()

    if source_url.endswith('.doc'):
        mimetype = 'application/msword'
    elif source_url.endswith('.pdf'):
        mimetype = 'application/pdf'

    title = self.parse_bill_field(page, 'Title').text_content()

    # actions = self.get_nodes(
    #     page,
    #     '//div[@class="StandardText leftDivMargin"]/'
    #     'div[@class="StandardText"][last()]//text()[normalize-space()]')

    if 'CR' in bill_id:
        bill_type = 'concurrent resolution'
    elif 'JR' in bill_id:
        bill_type = 'joint resolution'
    elif 'R' in bill_id:
        bill_type = 'resolution'
    else:
        bill_type = 'bill'

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=bill_type)
    bill.subject = self._subjects[bill_id]
    bill.add_source(url)

    bill.add_version_link(version_title, source_url, media_type=mimetype)

    self.parse_actions(page, bill, chamber)
    self.parse_subjects(page, bill)

    # LM is "Locally Mandated fiscal impact"
    fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
    for fiscal_note in fiscal_notes:
        source_url = fiscal_note.attrib['href']
        if source_url.endswith('.doc'):
            mimetype = 'application/msword'
        elif source_url.endswith('.pdf'):
            mimetype = 'application/pdf'

        bill.add_document_link(
            "Fiscal Note", source_url, media_type=mimetype)

    for link in page.xpath("//td/span/a[contains(@href, 'Legislator-Profile')]"):
        bill.add_sponsorship(link.text.strip(), classification='primary',
                             entity_type='person', primary=True)

    bdr_no = self.parse_bill_field(page, 'Bill Request Number')
    if bdr_no.xpath('text()'):
        bdr = bdr_no.xpath('text()')[0].strip()
        bill.extras["BDR"] = bdr

    yield bill
Example 12: _parse_senate_billpage
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import subject [as alias]
def _parse_senate_billpage(self, bill_url, year):
    bill_page = self.lxmlize(bill_url)

    # get all the info needed to record the bill
    # TODO probably still needs to be fixed
    bill_id = bill_page.xpath('//*[@id="lblBillNum"]')[0].text_content()
    bill_title = bill_page.xpath('//*[@id="lblBillTitle"]')[0].text_content()
    bill_desc = bill_page.xpath('//*[@id="lblBriefDesc"]')[0].text_content()
    # bill_lr = bill_page.xpath('//*[@id="lblLRNum"]')[0].text_content()

    bill_type = "bill"
    triplet = bill_id[:3]
    if triplet in bill_types:
        bill_type = bill_types[triplet]

    subs = []
    bid = bill_id.replace(" ", "")
    if bid in self._subjects:
        subs = self._subjects[bid]
        self.info("With subjects for this bill")

    self.info(bid)

    bill = Bill(
        bill_id,
        title=bill_desc,
        legislative_session=year,
        classification=bill_type,
    )
    bill.subject = subs
    bill.add_abstract(bill_desc, note='abstract')
    bill.add_source(bill_url)

    if bill_title:
        bill.add_title(bill_title)

    # Get the primary sponsor
    sponsor = bill_page.xpath('//a[@id="hlSponsor"]')[0]
    bill_sponsor = sponsor.text_content()
    # bill_sponsor_link = sponsor.attrib.get('href')
    bill.add_sponsorship(
        bill_sponsor,
        entity_type='person',
        classification='primary',
        primary=True,
    )

    # cosponsors show up on their own page, if they exist
    cosponsor_tag = bill_page.xpath('//a[@id="hlCoSponsors"]')
    if len(cosponsor_tag) > 0 and cosponsor_tag[0].attrib.get('href'):
        self._parse_senate_cosponsors(bill, cosponsor_tag[0].attrib['href'])

    # get the actions
    action_url = bill_page.xpath('//a[@id="hlAllActions"]')
    if len(action_url) > 0:
        action_url = action_url[0].attrib['href']
        self._parse_senate_actions(bill, action_url)

    # stored on a separate page
    versions_url = bill_page.xpath('//a[@id="hlFullBillText"]')
    if len(versions_url) > 0 and versions_url[0].attrib.get('href'):
        self._parse_senate_bill_versions(bill, versions_url[0].attrib['href'])

    yield bill
Example 13: scrape_bills
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import subject [as alias]
def scrape_bills(self, chamber, session, subjects):
    idex = bill_start_numbers(session)[chamber]
    FROM = "ctl00$rilinContent$txtBillFrom"
    TO = "ctl00$rilinContent$txtBillTo"
    YEAR = "ctl00$rilinContent$cbYear"
    blocks = "FOO"  # Ugh.
    while len(blocks) > 0:
        default_headers = get_default_headers(SEARCH_URL)
        default_headers[FROM] = idex
        default_headers[TO] = idex + MAXQUERY
        default_headers[YEAR] = session
        idex += MAXQUERY
        blocks = self.parse_results_page(self.post(SEARCH_URL,
                                                   data=default_headers).text)
        blocks = blocks[1:-1]
        blocks = self.digest_results_page(blocks)

        for block in blocks:
            bill = blocks[block]
            subs = []
            try:
                subs = subjects[bill['bill_id']]
            except KeyError:
                pass

            title = bill['title'][len("ENTITLED, "):]
            billid = bill['bill_id']
            try:
                subs = subjects[bill['bill_id']]
            except KeyError:
                subs = []

            for b in BILL_NAME_TRANSLATIONS:
                if billid[:len(b)] == b:
                    billid = BILL_NAME_TRANSLATIONS[b] + billid[len(b) + 1:].split()[0]

            b = Bill(
                billid,
                title=title,
                chamber=chamber,
                legislative_session=session,
                classification=self.get_type_by_name(bill['bill_id']),
            )
            b.subject = subs

            # keep bill ID around
            self._bill_id_by_type[(chamber, re.findall(r'\d+', billid)[0])] = billid

            self.process_actions(bill['actions'], b)
            sponsors = bill['sponsors'][len("BY"):].strip()
            sponsors = sponsors.split(",")
            sponsors = [s.strip() for s in sponsors]

            for href in bill['bill_id_hrefs']:
                b.add_version_link(
                    href.text, href.attrib['href'],
                    media_type="application/pdf")

            for sponsor in sponsors:
                b.add_sponsorship(
                    sponsor, entity_type='person', classification='primary', primary=True)

            b.add_source(SEARCH_URL)
            yield b
Example 14: scrape_actions
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import subject [as alias]
def scrape_actions(self, session, href):
    page = self.lxmlize(href)

    (bid, ) = page.xpath('//h1[@id="page-title"]/text()')
    bid = re.sub(r"^Bill Actions for ", "", bid)
    subjects = self.subjects.get(bid, [])

    # some pages say "Measure Number Breakdown", others "Bill..."
    table = page.xpath("//table[contains(@summary, 'Number Breakdown')]")
    table = table[0]
    ttrows = page.xpath("//div[@id='application']/p")
    descr = ttrows[-2]

    title = re.sub(r"\s+", " ", descr.text_content()).strip()
    ttrows = ttrows[:-1]

    chamber = {
        "H": "lower",
        "S": "upper"
    }[bid[0]]

    type_ = bid[1:3]
    bill_type = "bill"
    if type_.startswith("B"):
        bill_type = "bill"
    if type_.startswith("R"):
        bill_type = "resolution"
    if type_ == "CR":
        bill_type = "concurrent resolution"

    bill = Bill(bid,
                legislative_session=session,
                chamber=chamber,
                title=title,
                classification=bill_type)
    bill.subject = subjects
    bill.add_source(href)

    for row in ttrows:
        if isinstance(row, lxml.html.HtmlComment):
            continue  # ignore HTML comments, no text_content()

        sponsors = row.text_content().strip()
        sinf = re.match(
            r"(?i)introduced by( (rep\.|sen\.))? (?P<sponsors>.*)",
            sponsors
        )
        if sinf:
            sponsors = sinf.groupdict()
            for sponsor in [
                x.strip() for x in sponsors['sponsors'].split(",")
            ]:
                bill.add_sponsorship(sponsor, classification='primary',
                                     entity_type='person', primary=True)

    dt = None
    oldchamber = 'other'
    for row in table.xpath(".//tr"):
        if row.text_content().strip() == '':
            continue

        if "Meeting Description" in [
            x.strip() for x in row.xpath(".//th/text()")
        ]:
            continue

        row = row.xpath("./*")
        row = [x.text_content().strip() for x in row]

        if len(row) > 3:
            row = row[:3]

        date, chamber, action = row

        try:
            chamber = {
                "House": "lower",
                "Senate": "upper"
            }[chamber]
            oldchamber = chamber
        except KeyError:
            chamber = oldchamber

        if date != '':
            dt = datetime.strptime("%s %s" % (date, self.year), "%m/%d %Y")

        classif = self.categorizer.categorize(action)

        bill.add_action(chamber=chamber, description=action,
                        date=dt.strftime('%Y-%m-%d'),
                        classification=classif['classification'])

    version_url = page.xpath("//a[contains(text(), 'Versions')]")
    if len(version_url) == 1:
        href = version_url[0].attrib['href']
        bill = self.scrape_versions(bill, href)

    yield bill
Example 15: _parse_senate_billpage
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import subject [as alias]
def _parse_senate_billpage(self, bill_url, year):
    bill_page = self.lxmlize(bill_url)

    # get all the info needed to record the bill
    # TODO probably still needs to be fixed
    bill_id = bill_page.xpath('//*[@id="lblBillNum"]')[0].text_content()
    bill_title = bill_page.xpath('//*[@id="lblBillTitle"]')[0].text_content()
    bill_desc = bill_page.xpath('//*[@id="lblBriefDesc"]')[0].text_content()
    # bill_lr = bill_page.xpath('//*[@id="lblLRNum"]')[0].text_content()

    bill_type = "bill"
    triplet = bill_id[:3]
    if triplet in bill_types:
        bill_type = bill_types[triplet]

    subs = []
    bid = bill_id.replace(" ", "")
    if bid in self._subjects:
        subs = self._subjects[bid]
        self.info("With subjects for this bill")

    self.info(bid)

    if bid == 'XXXXXX':
        self.info("Skipping Junk Bill")
        return

    bill = Bill(
        bill_id,
        title=bill_desc,
        chamber='upper',
        legislative_session=self._session_id,
        classification=bill_type,
    )
    bill.subject = subs
    bill.add_abstract(bill_desc, note='abstract')
    bill.add_source(bill_url)

    if bill_title:
        bill.add_title(bill_title)

    # Get the primary sponsor
    sponsor = bill_page.xpath('//a[@id="hlSponsor"]')[0]
    bill_sponsor = sponsor.text_content()
    # bill_sponsor_link = sponsor.attrib.get('href')
    bill.add_sponsorship(
        bill_sponsor,
        entity_type='person',
        classification='primary',
        primary=True,
    )

    # cosponsors show up on their own page, if they exist
    cosponsor_tag = bill_page.xpath('//a[@id="hlCoSponsors"]')
    if len(cosponsor_tag) > 0 and cosponsor_tag[0].attrib.get('href'):
        self._parse_senate_cosponsors(bill, cosponsor_tag[0].attrib['href'])

    # get the actions
    action_url = bill_page.xpath('//a[@id="hlAllActions"]')
    if len(action_url) > 0:
        action_url = action_url[0].attrib['href']
        self._parse_senate_actions(bill, action_url)

    # stored on a separate page
    versions_url = bill_page.xpath('//a[@id="hlFullBillText"]')
    if len(versions_url) > 0 and versions_url[0].attrib.get('href'):
        self._parse_senate_bill_versions(bill, versions_url[0].attrib['href'])

    amendment_links = bill_page.xpath('//a[contains(@href,"ShowAmendment.asp")]')
    for link in amendment_links:
        link_text = link.xpath('string(.)').strip()
        if 'adopted' in link_text.lower():
            link_url = link.xpath('@href')[0]
            bill.add_version_link(link_text, link_url, media_type='application/pdf',
                                  on_duplicate='ignore')

    yield bill