This article collects typical usage examples of the Python method pupa.scrape.Bill.add_document_link: what the method is for, how to call it, and how it is used in real projects. The curated examples below should answer all three questions, and you can also explore further usage of the class the method belongs to, pupa.scrape.Bill.
The sections below present 15 code examples of the Bill.add_document_link method, sorted by popularity by default.
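Before the full scrapers, a minimal sketch of the shared pattern may help: build a Bill, record a source, then attach supporting documents with add_document_link. The bill identifier, session, and URLs below are placeholder values invented for illustration; only the call signatures mirror the examples that follow.

from pupa.scrape import Bill

# Placeholder data throughout; the signatures match the scrapers below.
bill = Bill('HB 1',
            legislative_session='2018',
            chamber='lower',
            title='Example Act',
            classification='bill')
bill.add_source('http://example.com/bills/hb1')

# note and url identify the document; media_type is optional but usually supplied.
bill.add_document_link(note='Fiscal Note',
                       url='http://example.com/bills/hb1/fiscal_note.pdf',
                       media_type='application/pdf')

# Positional form, with on_duplicate='ignore' to tolerate repeated links,
# as several of the examples below do.
bill.add_document_link('Fiscal Note',
                       'http://example.com/bills/hb1/fiscal_note.html',
                       media_type='text/html',
                       on_duplicate='ignore')

The same note/url/media_type signature is used by add_version_link in the examples below, which links bill texts rather than supporting documents.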
Example 1: scrape_bill
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_document_link [as alias]
def scrape_bill(self, chamber, session, bill_id):
    bill_num = bill_id.split()[1]
    url = ("%s/GetLegislation?biennium=%s&billNumber"
           "=%s" % (self._base_url, self.biennium, bill_num))
    page = self.get(url)
    page = lxml.etree.fromstring(page.content)
    page = xpath(page, "//wa:Legislation")[0]
    title = xpath(page, "string(wa:LongDescription)")
    bill_type = xpath(
        page,
        "string(wa:ShortLegislationType/wa:LongLegislationType)")
    bill_type = bill_type.lower()
    if bill_type == 'gubernatorial appointment':
        return
    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=[bill_type])
    fake_source = ("http://apps.leg.wa.gov/billinfo/"
                   "summary.aspx?bill=%s&year=%s" % (
                       bill_num, session[0:4]))
    bill.add_source(fake_source)
    try:
        for version in self.versions[bill_id]:
            bill.add_version_link(note=version['note'],
                                  url=version['url'],
                                  media_type=version['media_type'])
    except KeyError:
        self.warning("No versions were found for {}".format(bill_id))
    try:
        for document in self.documents[bill_num]:
            bill.add_document_link(note=document['note'],
                                   url=document['url'],
                                   media_type=document['media_type'])
    except KeyError:
        pass
    self.scrape_sponsors(bill)
    self.scrape_actions(bill, bill_num)
    self.scrape_hearings(bill, bill_num)
    yield from self.scrape_votes(bill)
    bill.subject = list(set(self._subjects[bill_id]))
    yield bill
Example 2: scrape_chamber
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_document_link [as alias]
def scrape_chamber(self, chamber, session):
    chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber]
    url = ("http://legisweb.state.wy.us/%s/billreference/"
           "BillReference.aspx?type=%s" % (session, chamber_abbrev))
    page = self.lxmlize(url)
    for tr in page.xpath("//table[contains(@id,'cphContent_gvBills')]//tr")[1:]:
        bill_id = tr.xpath("string(td[1])").strip()
        title = tr.xpath("string(td[2])").strip()
        if bill_id[0:2] in ['SJ', 'HJ']:
            bill_type = 'joint resolution'
        else:
            bill_type = 'bill'
        bill = Bill(bill_id, legislative_session=session, title=title, chamber=chamber,
                    classification=bill_type)
        yield from self.scrape_digest(bill, chamber)
        # versions
        for a in (tr.xpath('td[8]//a') + tr.xpath('td[11]//a') +
                  tr.xpath('td[12]//a')):
            # skip references to other bills
            if a.text.startswith('See'):
                continue
            bill.add_version_link(a.text, a.get('href'),
                                  media_type='application/pdf')
        # documents
        fnote = tr.xpath('td[9]//a')
        if fnote:
            bill.add_document_link('Fiscal Note', fnote[0].get('href'))
        summary = tr.xpath('td[14]//a')
        if summary:
            bill.add_document_link('Summary', summary[0].get('href'))
        bill.add_source(url)
        yield bill
Example 3: scrape_bill
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_document_link [as alias]
# ... (part of the code omitted here) ...
        bill.add_sponsorship(
            sponsors[0],
            entity_type='person',
            classification='primary',
            primary=True,
        )
        for sponsor in sponsors[1:]:
            sponsor = sponsor.strip()
            if sponsor:
                bill.add_sponsorship(
                    sponsor,
                    entity_type='person',
                    classification='cosponsor',
                    primary=False,
                )
    else:
        # Committee sponsorship
        spons_str = spons_str.strip()
        if re.match(r' BY REQUEST OF THE GOVERNOR$', spons_str):
            spons_str = re.sub(r' BY REQUEST OF THE GOVERNOR$',
                               '', spons_str).title()
            spons_str = (spons_str +
                         " Committee (by request of the governor)")
        if spons_str:
            bill.add_sponsorship(
                spons_str,
                entity_type='person',
                classification='primary',
                primary=True,
            )
    # Get actions from second myth table
    self._current_comm = None
    act_rows = doc.xpath('(//table[@class="myth"])[2]//tr')[1:]
    for row in act_rows:
        date, journal, raw_chamber, action = row.xpath('td')
        act_date = datetime.datetime.strptime(date.text_content().strip(),
                                              '%m/%d/%y')
        raw_chamber = raw_chamber.text_content().strip()
        action = action.text_content().strip()
        if raw_chamber == "(H)":
            act_chamber = "lower"
        elif raw_chamber == "(S)":
            act_chamber = "upper"
        if re.match(r"\w+ Y(\d+)", action):
            vote_href = journal.xpath('.//a/@href')
            if vote_href:
                yield from self.parse_vote(bill, action, act_chamber, act_date,
                                           vote_href[0])
        action, atype = self.clean_action(action)
        match = re.match(r'^Prefile released (\d+/\d+/\d+)$', action)
        if match:
            action = 'Prefile released'
            act_date = datetime.datetime.strptime(match.group(1), '%m/%d/%y')
        bill.add_action(
            action, chamber=act_chamber, date=act_date.strftime('%Y-%m-%d'),
            classification=atype)
    # Get subjects
    for subj in doc.xpath('//a[contains(@href, "subject")]/text()'):
        bill.add_subject(subj.strip())
    # Get versions
    text_list_url = (
        "http://www.legis.state.ak.us/"
        "basis/get_fulltext.asp?session=%s&bill=%s"
    ) % (session, bill_id)
    bill.add_source(text_list_url)
    text_doc = lxml.html.fromstring(self.get(text_list_url).text)
    text_doc.make_links_absolute(text_list_url)
    for link in text_doc.xpath('//a[contains(@href, "get_bill_text")]'):
        name = link.xpath('../preceding-sibling::td/text()')[0].strip()
        text_url = link.get('href')
        bill.add_version_link(name, text_url, media_type="text/html")
    # Get documents
    doc_list_url = (
        "http://www.legis.state.ak.us/"
        "basis/get_documents.asp?session=%s&bill=%s"
    ) % (session, bill_id)
    doc_list = lxml.html.fromstring(self.get(doc_list_url).text)
    doc_list.make_links_absolute(doc_list_url)
    bill.add_source(doc_list_url)
    for href in doc_list.xpath('//a[contains(@href, "get_documents")][@onclick]'):
        h_name = href.text_content()
        h_href = href.attrib['href']
        if h_name.strip():
            bill.add_document_link(h_name, h_href)
    yield bill
Example 4: bill_info
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_document_link [as alias]
def bill_info(self, bill_link, session, main_url):
    bill_page = self.lxmlize(bill_link)
    long_title = self.get_node(
        bill_page,
        '//div[@class="main-content"]/div[1]/div/h2').text.split()
    bill_number = long_title[0]
    title = ''
    for x in range(2, len(long_title)):
        title += long_title[x] + ' '
    title = title[0:-1]
    if not title:
        self.error('no title, skipping %s', bill_number)
        return
    bill_type = 'resolution' if 'LR' in bill_number else 'bill'
    bill = Bill(bill_number, session, title, classification=bill_type)
    bill.add_source(main_url)
    bill.add_source(bill_link)
    introduced_by = self.get_node(
        bill_page,
        '//div[@class="main-content"]/div[3]/div[1]/ul/li[1]/a[1]/text()')
    if not introduced_by:
        introduced_by = self.get_node(
            bill_page,
            '//div[@class="main-content"]/div[3]/div[1]/ul/li[1]/text()')
        introduced_by = introduced_by.split('Introduced By:')[1].strip()
    bill.add_sponsorship(
        name=introduced_by,
        entity_type='person',
        primary=True,
        classification='primary',
    )
    action_nodes = self.get_nodes(
        bill_page,
        '//div[@class="main-content"]/div[5]//table/tbody/tr')
    for action_node in action_nodes:
        date = self.get_node(
            action_node,
            './td[1]').text
        date = datetime.strptime(date, '%b %d, %Y')
        # The action node may have an anchor element within it, so
        # we grab all the text within.
        action = self.get_node(
            action_node,
            './td[2]').text_content()
        if 'Governor' in action:
            actor = 'executive'
        elif 'Speaker' in action:
            actor = 'legislature'
        else:
            actor = 'legislature'
        action_type = self.action_types(action)
        bill.add_action(
            action,
            date.strftime('%Y-%m-%d'),
            chamber=actor,
            classification=action_type,
        )
    # We're in reverse chronological order.
    bill.actions.reverse()
    # Grabs bill version documents.
    version_links = self.get_nodes(
        bill_page,
        '//div[@class="main-content"]/div[3]/div[2]/'
        'div[@class="hidden-xs"]/ul[1]/li/a')
    for version_link in version_links:
        version_name = version_link.text
        version_url = version_link.attrib['href']
        # replace Current w/ session number
        version_url = version_url.replace('Current', session)
        bill.add_version_link(version_name, version_url, media_type='application/pdf')
    # Adds any documents related to amendments.
    amendment_links = self.get_nodes(
        bill_page,
        '//div[@class="main-content"]/div[5]/div[2]/table/tr/td[1]/a')
    for amendment_link in amendment_links:
        amendment_name = amendment_link.text
        amendment_url = amendment_link.attrib['href']
        bill.add_document_link(amendment_name, amendment_url)
    # Related transcripts.
    transcript_links = self.get_nodes(
# ... (the rest of the code omitted here) ...
Example 5: _parse_house_bill
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_document_link [as alias]
# ... (part of the code omitted here) ...
    bill_sponsor = clean_text(table_rows[0][1].text_content())
    # try:
    # bill_sponsor_link = table_rows[0][1][0].attrib['href']
    # except IndexError:
    # return
    bill.add_sponsorship(
        bill_sponsor,
        entity_type='person',
        classification='primary',
        primary=True,
    )
    # check for cosponsors
    sponsors_url, = bill_page.xpath(
        "//a[contains(@href, 'CoSponsors.aspx')]/@href")
    self._parse_cosponsors_from_bill(bill, sponsors_url)
    # actions_link_tag = bill_page.xpath('//div[@class="Sections"]/a')[0]
    # actions_link = '%s/%s' % (self._house_base_url,actions_link_tag.attrib['href'])
    # actions_link = re.sub("content", "print", actions_link)
    actions_link, = bill_page.xpath(
        "//a[contains(@href, 'BillActions.aspx')]/@href")
    yield from self._parse_house_actions(bill, actions_link)
    # get bill versions
    doc_tags = bill_page.xpath('//div[@class="BillDocuments"][1]/span')
    for doc_tag in reversed(doc_tags):
        doc = clean_text(doc_tag.text_content())
        text_url = '%s%s' % (
            self._house_base_url,
            doc_tag[0].attrib['href']
        )
        bill.add_document_link(doc, text_url, media_type='text/html')
    # get bill versions
    version_tags = bill_page.xpath('//div[@class="BillDocuments"][2]/span')
    for version_tag in reversed(version_tags):
        version = clean_text(version_tag.text_content())
        for vurl in version_tag.xpath(".//a"):
            if vurl.text == 'PDF':
                mimetype = 'application/pdf'
            else:
                mimetype = 'text/html'
            bill.add_version_link(version, vurl.attrib['href'], media_type=mimetype,
                                  on_duplicate='ignore')
    # house bill versions
    # everything between the row containing "Bill Text" and the next div.DocHeaderRow
    version_rows = bill_page.xpath(
        '//div[contains(text(),"Bill Text")]/'
        'following-sibling::div[contains(@class,"DocRow") '
        'and count(preceding-sibling::div[contains(@class,"DocHeaderRow")])=1]')
    for row in version_rows:
        # some rows are just broken links, not real versions
        if row.xpath('.//div[contains(@class,"textType")]/a/@href'):
            version = row.xpath('.//div[contains(@class,"textType")]/a/text()')[0].strip()
            path = row.xpath('.//div[contains(@class,"textType")]/a/@href')[0].strip()
            if '.pdf' in path:
                mimetype = 'application/pdf'
            else:
                mimetype = 'text/html'
            bill.add_version_link(version, path, media_type=mimetype,
                                  on_duplicate='ignore')
    # house bill summaries
Example 6: scrape_senate_bills
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_document_link [as alias]
def scrape_senate_bills(self, chamber, insert, session, year):
    doc_type = {2: 'bill', 4: 'resolution', 7: 'concurrent resolution',
                8: 'joint resolution'}
    for docnum, bill_type in doc_type.items():
        parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/' \
                         'HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
        links = self.scrape_links(parentpage_url)
        count = 0
        for link in links:
            count += 1
            page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)
            page = self.get(page_path).text
            page = page.replace(u"\xa0", " ")
            root = lxml.html.fromstring(page)
            bill_id = root.xpath('string(/html/body/div[@id="content"]' +
                                 '/table[1]/tr[1]/td[1]/font)')
            title = self.get_node(
                root,
                '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                'b[contains(text(), "By:")]]/td/em/text()')
            bill = Bill(bill_id,
                        legislative_session=session,
                        chamber=chamber,
                        title=title,
                        classification=bill_type
                        )
            bill.subject = list(set(self.subject_mapping[bill_id]))
            for table in root.xpath('//div[@id="content"]/table'):
                if 'Bill Text' in table.text_content():
                    bill_text = table.xpath("string(tr/td[2]/a/@href)")
                    text_url = "http://www.leg.state.nv.us" + bill_text
                    bill.add_version_link(note="Bill Text",
                                          url=text_url,
                                          media_type='application/pdf')
            primary, secondary = self.scrape_sponsors(page)
            for leg in primary:
                bill.add_sponsorship(name=leg,
                                     classification='primary',
                                     entity_type='person',
                                     primary=True)
            for leg in secondary:
                bill.add_sponsorship(name=leg,
                                     classification='cosponsor',
                                     entity_type='person',
                                     primary=False)
            minutes_count = 2
            for mr in root.xpath('//table[4]/tr/td[3]/a'):
                minutes = mr.xpath("string(@href)")
                minutes_url = "http://www.leg.state.nv.us" + minutes
                minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                minutes_date = mr.xpath(minutes_date_path).split()
                minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Agenda"
                # bill.add_document(minutes_date, minutes_url)
                bill.add_document_link(note=minutes_date,
                                       url=minutes_url)
                minutes_count = minutes_count + 1
            self.scrape_actions(root, bill, "upper")
            yield from self.scrape_votes(page, page_path, bill, insert, year)
            bill.add_source(page_path)
            yield bill
Example 7: scrape
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_document_link [as alias]
def scrape(self):
    for leg_summary in self.legislation(created_after=datetime.datetime(2014, 1, 1)) :
        leg_type = BILL_TYPES[leg_summary['Type']]
        bill = Bill(identifier=leg_summary['File\xa0#'],
                    title=leg_summary['Title'],
                    legislative_session=None,
                    classification=leg_type,
                    from_organization={"name":"New York City Council"})
        bill.add_source(leg_summary['url'])
        leg_details = self.legDetails(leg_summary['url'])
        history = self.history(leg_summary['url'])
        bill.add_title(leg_details['Name'],
                       note='created by administrative staff')
        if 'Summary' in leg_details :
            bill.add_abstract(leg_details['Summary'], note='')
        if leg_details['Law number'] :
            bill.add_identifier(leg_details['Law number'],
                                note='law number')
        for sponsorship in self._sponsors(leg_details.get('Sponsors', [])) :
            sponsor, sponsorship_type, primary = sponsorship
            bill.add_sponsorship(sponsor, sponsorship_type,
                                 'person', primary,
                                 entity_id = make_pseudo_id(name=sponsor))
        for attachment in leg_details.get('Attachments', []) :
            bill.add_document_link(attachment['label'],
                                   attachment['url'],
                                   media_type="application/pdf")
        history = list(history)
        if history :
            earliest_action = min(self.toTime(action['Date'])
                                  for action in history)
            bill.legislative_session = self.sessions(earliest_action)
        else :
            bill.legislative_session = str(self.SESSION_STARTS[0])
        for action in history :
            action_description = action['Action']
            if not action_description :
                continue
            action_class = ACTION_CLASSIFICATION[action_description]
            action_date = self.toDate(action['Date'])
            responsible_org = action['Action\xa0By']
            if responsible_org == 'City Council' :
                responsible_org = 'New York City Council'
            elif responsible_org == 'Administration' :
                responsible_org = 'Mayor'
            if responsible_org == 'Town Hall Meeting' :
                continue
            else :
                act = bill.add_action(action_description,
                                      action_date,
                                      organization={'name': responsible_org},
                                      classification=action_class)
            if 'url' in action['Action\xa0Details'] :
                action_detail_url = action['Action\xa0Details']['url']
                if action_class == 'committee-referral' :
                    action_details = self.actionDetails(action_detail_url)
                    referred_committee = action_details['Action text'].rsplit(' to the ', 1)[-1]
                    act.add_related_entity(referred_committee,
                                           'organization',
                                           entity_id = make_pseudo_id(name=referred_committee))
                result, votes = self.extractVotes(action_detail_url)
                if votes :
                    action_vote = VoteEvent(legislative_session=bill.legislative_session,
                                            motion_text=action_description,
                                            organization={'name': responsible_org},
                                            classification=action_class,
                                            start_date=action_date,
                                            result=result,
                                            bill=bill)
                    action_vote.add_source(action_detail_url)
                    for option, voter in votes :
                        action_vote.vote(option, voter)
                    yield action_vote
        text = self.text(leg_summary['url'])
        if text :
            bill.extras = {'local_classification' : leg_summary['Type'],
                           'full_text' : text}
        else :
            bill.extras = {'local_classification' : leg_summary['Type']}
# ... (the rest of the code omitted here) ...
Example 8: test_full_bill
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_document_link [as alias]
def test_full_bill():
    create_jurisdiction()
    person = Person.objects.create(id='person-id', name='Adam Smith')
    org = ScrapeOrganization(name='House', classification='lower')
    com = ScrapeOrganization(name='Arbitrary Committee', classification='committee',
                             parent_id=org._id)
    oldbill = ScrapeBill('HB 99', '1899', 'Axe & Tack Tax Act',
                         classification='tax bill', from_organization=org._id)
    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', from_organization=org._id)
    bill.subject = ['taxes', 'axes']
    bill.add_identifier('SB 9')
    bill.add_title('Tack & Axe Tax Act')
    bill.add_action('introduced in house', '1900-04-01', chamber='lower')
    act = bill.add_action('sent to arbitrary committee', '1900-04-04', chamber='lower')
    act.add_related_entity('arbitrary committee', 'organization', com._id)
    bill.add_related_bill("HB 99", legislative_session="1899", relation_type="prior-session")
    bill.add_sponsorship('Adam Smith', classification='extra sponsor', entity_type='person',
                         primary=False, entity_id=person.id)
    bill.add_sponsorship('Jane Smith', classification='lead sponsor', entity_type='person',
                         primary=True)
    bill.add_abstract('This is an act about axes and taxes and tacks.', note="official")
    bill.add_document_link('Fiscal Note', 'http://example.com/fn.pdf',
                           media_type='application/pdf')
    bill.add_document_link('Fiscal Note', 'http://example.com/fn.html', media_type='text/html')
    bill.add_version_link('Fiscal Note', 'http://example.com/v/1', media_type='text/html')
    bill.add_source('http://example.com/source')
    # import bill
    oi = OrganizationImporter('jid')
    oi.import_data([org.as_dict(), com.as_dict()])
    pi = PersonImporter('jid')
    pi.json_to_db_id['person-id'] = 'person-id'
    # Since we have to create this person behind the back of the import
    # transaction, we'll fake the json-id to db-id, since they match in this
    # case. This is *really* getting at some implementation detail, but it's
    # the cleanest way to ensure we short-circuit the json id lookup.
    BillImporter('jid', oi, pi).import_data([oldbill.as_dict(), bill.as_dict()])
    # get bill from db and assert it imported correctly
    b = Bill.objects.get(identifier='HB 1')
    assert b.from_organization.classification == 'lower'
    assert b.identifier == bill.identifier
    assert b.title == bill.title
    assert b.classification == bill.classification
    assert b.subject == ['taxes', 'axes']
    assert b.abstracts.get().note == 'official'
    # other_title, other_identifier added
    assert b.other_titles.get().title == 'Tack & Axe Tax Act'
    assert b.other_identifiers.get().identifier == 'SB 9'
    # actions
    actions = list(b.actions.all())
    assert len(actions) == 2
    # ensure order was preserved (if this breaks it'll be intermittent)
    assert actions[0].organization == Organization.objects.get(classification='lower')
    assert actions[0].description == "introduced in house"
    assert actions[1].description == "sent to arbitrary committee"
    assert (actions[1].related_entities.get().organization ==
            Organization.objects.get(classification='committee'))
    # related_bills were added
    rb = b.related_bills.get()
    assert rb.identifier == 'HB 99'
    # and bill got resolved
    assert rb.related_bill.identifier == 'HB 99'
    # sponsors added, linked & unlinked
    sponsorships = b.sponsorships.all()
    assert len(sponsorships) == 2
    for ss in sponsorships:
        if ss.primary:
            assert ss.person is None
            assert ss.organization is None
        else:
            assert ss.person == person
    # versions & documents with their links
    versions = b.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = b.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2
    # sources
    assert b.sources.count() == 1
Example 9: scrape_bill_page
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_document_link [as alias]
def scrape_bill_page(self, chamber, session, bill_url, bill_abbreviation):
    page = self.lxmlize(bill_url)
    author = self.get_one_xpath(
        page,
        "//a[@id='ctl00_PageBody_LinkAuthor']/text()")
    def sbp(x): return self.scrape_bare_page(page.xpath(
        "//a[contains(text(), '%s')]" % (x))[0].attrib['href'])
    authors = [x.text for x in sbp("Authors")]
    try:
        digests = sbp("Digests")
    except IndexError:
        digests = []
    try:
        versions = sbp("Text")
    except IndexError:
        versions = []
    try:
        amendments = sbp("Amendments")
    except IndexError:
        amendments = []
    title = page.xpath(
        "//span[@id='ctl00_PageBody_LabelShortTitle']/text()")[0]
    actions = page.xpath(
        "//div[@id='ctl00_PageBody_PanelBillInfo']/"
        "/table[@style='font-size:small']/tr")
    bill_id = page.xpath(
        "//span[@id='ctl00_PageBody_LabelBillID']/text()")[0]
    bill_type = self._bill_types[bill_abbreviation[1:]]
    bill = Bill(bill_id,
                legislative_session=session,
                chamber=chamber,
                title=title,
                classification=bill_type)
    bill.add_source(bill_url)
    authors.remove(author)
    bill.add_sponsorship(author,
                         classification='primary',
                         entity_type='person',
                         primary=True)
    for author in authors:
        bill.add_sponsorship(author,
                             classification='cosponsor',
                             entity_type='person',
                             primary=False)
    for digest in digests:
        bill.add_document_link(note=digest.text,
                               url=digest.attrib['href'],
                               media_type="application/pdf")
    for version in versions:
        bill.add_version_link(note=version.text,
                              url=version.attrib['href'],
                              media_type="application/pdf")
    for amendment in amendments:
        bill.add_version_link(note=amendment.text,
                              url=amendment.attrib['href'],
                              media_type="application/pdf")
    flags = {
        "prefiled": ["filing"],
        "referred to the committee": ["referral-committee"],
        "sent to the house": ['passage'],
        "ordered returned to the house": ['passage'],
        "ordered to the senate": ['passage'],
        "signed by the governor": ['executive-signature'],
        "sent to the governor": ['executive-receipt'],
    }
    try:
        votes_link = page.xpath("//a[text() = 'Votes']")[0]
        yield from self.scrape_votes(bill, votes_link.attrib['href'])
    except IndexError:
        # Some bills don't have any votes
        pass
    for action in actions:
        date, chamber, page, text = [x.text for x in action.xpath(".//td")]
        session_year = self.jurisdiction.legislative_sessions[-1]['start_date'][0:4]
        # Session is April -> June. Prefiles look like they're in
        # January at earliest.
        date += '/{}'.format(session_year)
        date = dt.datetime.strptime(date, '%m/%d/%Y')
        chamber = self._chambers[chamber]
        cat = []
        for flag in flags:
            if flag in text.lower():
                cat += flags[flag]
# ... (the rest of the code omitted here) ...
Example 10: scrape_bills
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_document_link [as alias]
# ... (part of the code omitted here) ...
                          media_type='text/html')
    # amendments
    # ex: http://billstatus.ls.state.ms.us/2018/pdf/history/HB/HB1040.xml
    for amd in details_root.xpath('//AMENDMENTS/*'):
        if amd.tag == 'HAM':
            name = amd.xpath('HAM_DESC[1]/text()')[0]
            name = append_parens(amd, 'HAM_DISP', name)
            name = append_parens(amd, 'HAM_VDESC', name)
            pdf_url = amd.xpath('string(HAM_PDF'
                                ')').replace("../", "")
            html_url = amd.xpath('string(HAM_OTHER'
                                 ')').replace("../", "")
        elif amd.tag == 'SAM':
            name = amd.xpath('SAM_DESC[1]/text()')[0]
            name = append_parens(amd, 'SAM_DISP', name)
            name = append_parens(amd, 'SAM_VDESC', name)
            pdf_url = amd.xpath('string(SAM_PDF'
                                ')').replace("../", "")
            html_url = amd.xpath('string(SAM_OTHER'
                                 ')').replace("../", "")
        elif amd.tag == 'AMRPT':
            name = amd.xpath('AMRPT_DESC[1]/text()')[0]
            pdf_url = amd.xpath('string(AMRPT_PDF'
                                ')').replace("../", "")
            html_url = amd.xpath('string(AMRPT_OTHER'
                                 ')').replace("../", "")
        pdf_url = 'http://billstatus.ls.state.ms.us/' + pdf_url
        html_url = 'http://billstatus.ls.state.ms.us/' + html_url
        if 'adopted' in name.lower() or 'amendment report' in name.lower():
            bill.add_version_link(name, pdf_url,
                                  on_duplicate='ignore',
                                  media_type='application/pdf')
            bill.add_version_link(name, html_url,
                                  on_duplicate='ignore',
                                  media_type='text/html')
    # avoid duplicate votes
    seen_votes = set()
    # Actions
    for action in details_root.xpath('//HISTORY/ACTION'):
        # action_num = action.xpath('string(ACT_NUMBER)').strip()
        # action_num = int(action_num)
        act_vote = action.xpath('string(ACT_VOTE)').replace("../../../..", "")
        action_desc = action.xpath('string(ACT_DESC)')
        date, action_desc = action_desc.split(" ", 1)
        date = date + "/" + session[0:4]
        date = datetime.strptime(date, "%m/%d/%Y")
        if action_desc.startswith("(H)"):
            actor = "lower"
            action = action_desc[4:]
        elif action_desc.startswith("(S)"):
            actor = "upper"
            action = action_desc[4:]
        else:
            actor = "executive"
            action = action_desc
        if "Veto" in action and actor == 'executive':
            version_path = details_root.xpath("string(//VETO_OTHER)")
            version_path = version_path.replace("../../../../", "")
            version_url = "http://billstatus.ls.state.ms.us/" + version_path
            bill.add_document_link("Veto", version_url)
        atype = 'other'
        for prefix, prefix_type in self._action_types:
            if action.startswith(prefix):
                atype = prefix_type
                break
        bill.add_action(action, self._tz.localize(date),
                        chamber=actor,
                        classification=atype if atype != 'other' else None)
        # use committee names as scraped subjects
        subjects = details_root.xpath('//H_NAME/text()')
        subjects += details_root.xpath('//S_NAME/text()')
        for subject in subjects:
            if subject not in bill.subject:
                bill.add_subject(subject)
        if act_vote:
            vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote
            if vote_url not in seen_votes:
                seen_votes.add(vote_url)
                yield from self.scrape_votes(vote_url, action,
                                             date, actor, bill)
    bill.add_source(bill_details_url)
    yield bill
Example 11: get_bill
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_document_link [as alias]
def get_bill(self, matter):
    '''Make Bill object from given matter.'''
    '''
    Currently, NYC Legistar does not have conventional "Types" for
    three newly added committees: https://legistar.council.nyc.gov/Departments.aspx
    We communicated the issue to NYC, and until we learn more, we will
    skip the bills attached to those committees.
    '''
    orgs_without_type = ['Charter Revision Commission 2019',
                         'New York City Advisory Commission on Property Tax Reform',
                         'Democratic Conference of the Council of the City of New York']
    if matter['MatterBodyName'].strip() in orgs_without_type:
        return None
    matter_id = matter['MatterId']
    if matter_id in DUPLICATED_ACTIONS:
        return None
    date = matter['MatterIntroDate']
    title = matter['MatterName']
    identifier = matter['MatterFile']
    if not all((date, title, identifier)):
        return None
    leg_type = BILL_TYPES[matter['MatterTypeName']]
    bill_session = self.sessions(self.toTime(date))
    bill = Bill(identifier=identifier,
                title=title,
                classification=leg_type,
                legislative_session=bill_session,
                from_organization={"name": "New York City Council"})
    legistar_web = matter['legistar_url']
    legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)
    bill.add_source(legistar_web, note='web')
    bill.add_source(legistar_api, note='api')
    if matter['MatterTitle']:
        bill.add_title(matter['MatterTitle'])
    if matter['MatterEXText5']:
        bill.add_abstract(matter['MatterEXText5'], note='')
    try:
        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)
    except KeyError:
        self.version_errors.append(legistar_web)
        return None
    for attachment in self.attachments(matter_id):
        if attachment['MatterAttachmentId'] == 103315:  # Duplicate
            return None
        if attachment['MatterAttachmentName']:
            bill.add_document_link(attachment['MatterAttachmentName'],
                                   attachment['MatterAttachmentHyperlink'],
                                   media_type='application/pdf')
    for topic in self.topics(matter_id) :
        bill.add_subject(topic['MatterIndexName'].strip())
    for relation in self.relations(matter_id):
        try:
            related_bill = self.endpoint('/matters/{0}', relation['MatterRelationMatterId'])
        except scrapelib.HTTPError:
            return None
        else:
            date = related_bill['MatterIntroDate']
            related_bill_session = self.session(self.toTime(date))
            identifier = related_bill['MatterFile']
            bill.add_related_bill(identifier=identifier,
                                  legislative_session=related_bill_session,
                                  relation_type='companion')
    try:
        text = self.text(matter_id)
    except KeyError:
        self.version_errors.append(legistar_web)
        return None
    bill.extras['local_classification'] = matter['MatterTypeName']
    if text:
        if text['MatterTextPlain']:
            bill.extras['plain_text'] = text['MatterTextPlain'].replace(u'\u0000', '')
        if text['MatterTextRtf']:
            bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '')
    return bill
Example 12: scrape_bill
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_document_link [as alias]
def scrape_bill(self, bill_id):
    old = self.api('bills/' + bill_id + '?')
    # not needed
    old.pop('id')
    old.pop('state')
    old.pop('level', None)
    old.pop('country', None)
    old.pop('created_at')
    old.pop('updated_at')
    old.pop('action_dates')
    old.pop('+bill_type', None)
    old.pop('+subject', None)
    old.pop('+scraped_subjects', None)
    old.pop('subjects', [])
    classification = old.pop('type')
    # ca weirdness
    if 'fiscal committee' in classification:
        classification.remove('fiscal committee')
    if 'urgency' in classification:
        classification.remove('urgency')
    if 'local program' in classification:
        classification.remove('local program')
    if 'tax levy' in classification:
        classification.remove('tax levy')
    if classification[0] in ['miscellaneous', 'jres', 'cres']:
        return
    if classification == ['memorial resolution'] and self.state == 'ar':
        classification = ['memorial']
    if classification == ['concurrent memorial resolution'] and self.state == 'ar':
        classification = ['concurrent memorial']
    if classification == ['joint session resolution'] and self.state == 'il':
        classification = ['joint resolution']
    if classification == ['legislative resolution'] and self.state == 'ny':
        classification = ['resolution']
    if classification == ['address'] and self.state == 'nh':
        classification = ['resolution']
    if not old['title'] and self.state == 'me':
        old['title'] = '(unknown)'
    chamber = old.pop('chamber')
    if self.state in ('ne', 'dc'):
        chamber = 'legislature'
    elif chamber in ('joint', 'conference'):
        chamber = 'legislature'
    new = Bill(old.pop('bill_id'), old.pop('session'), old.pop('title'),
               chamber=chamber, classification=classification)
    abstract = old.pop('summary', None)
    if abstract:
        new.add_abstract(abstract, note='')
    for title in old.pop('alternate_titles'):
        new.add_title(title)
    for doc in old.pop('documents'):
        new.add_document_link(doc['name'], doc['url'], on_duplicate='ignore')
    for doc in old.pop('versions'):
        new.add_version_link(doc['name'], doc['url'], media_type=doc.pop('mimetype', ''))
    for subj in old.pop('scraped_subjects', []):
        if subj:
            new.add_subject(subj)
    for spon in old.pop('sponsors'):
        if spon.get('committee_id') is not None:
            entity_type = 'organization'
        elif spon.get('leg_id') is not None:
            entity_type = 'person'
        else:
            entity_type = ''
        new.add_sponsorship(spon['name'], spon['type'], entity_type,
                            spon['type'] == 'primary')
    for act in old.pop('actions'):
        actor = act['actor']
        if actor.lower() in ('governor', 'mayor', 'secretary of state'):
            actor = 'executive'
        elif actor.lower() == 'house' or (actor.lower().startswith('lower (') and self.state == 'ca'):
            actor = 'lower'
        elif actor.lower() in ('senate', 'upper`') or (actor.lower().startswith('upper (') and self.state == 'ca'):
            actor = 'upper'
        elif actor in ('joint', 'other', 'Data Systems', 'Speaker', 'clerk',
                       'Office of the Legislative Fiscal Analyst', 'Became Law w',
                       'conference') or (actor.lower().startswith('legislature (') and self.state == 'ca'):
            actor = 'legislature'
        if actor in ('committee', 'sponsor') and self.state == 'pr':
            actor = 'legislature'
        # nebraska & DC
        if actor in ('upper','council') and self.state in ('ne', 'dc'):
            actor = 'legislature'
# ... (the rest of the code omitted here) ...
Example 13: parse_bill
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_document_link [as alias]
def parse_bill(self, chamber, session, bill_id, url):
    try:
        page = self.lxmlize(url)
    except scrapelib.HTTPError as e:
        self.logger.warning(e)
        return
    last_action = self.parse_bill_field(
        page, 'Last Action').xpath('text()')[0]
    if 'WITHDRAWN' in last_action.upper():
        self.info("{} Withdrawn, skipping".format(bill_id))
        return
    version = self.parse_bill_field(page, 'Bill Documents')
    source_url = version.xpath('a[1]/@href')[0]
    version_title = version.xpath('a[1]/text()')[0].strip()
    if version is None:
        # Bill withdrawn
        self.logger.warning('Bill withdrawn.')
        return
    else:
        if source_url.endswith('.doc'):
            mimetype = 'application/msword'
        elif source_url.endswith('.pdf'):
            mimetype = 'application/pdf'
    title = self.parse_bill_field(page, 'Title').text_content()
    # actions = self.get_nodes(
    #     page,
    #     '//div[@class="StandardText leftDivMargin"]/'
    #     'div[@class="StandardText"][last()]//text()[normalize-space()]')
    if 'CR' in bill_id:
        bill_type = 'concurrent resolution'
    elif 'JR' in bill_id:
        bill_type = 'joint resolution'
    elif 'R' in bill_id:
        bill_type = 'resolution'
    else:
        bill_type = 'bill'
    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=bill_type)
    bill.subject = self._subjects[bill_id]
    bill.add_source(url)
    bill.add_version_link(version_title, source_url, media_type=mimetype)
    self.parse_actions(page, bill, chamber)
    self.parse_subjects(page, bill)
    # LM is "Locally Mandated fiscal impact"
    fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
    for fiscal_note in fiscal_notes:
        source_url = fiscal_note.attrib['href']
        if source_url.endswith('.doc'):
            mimetype = 'application/msword'
        elif source_url.endswith('.pdf'):
            mimetype = 'application/pdf'
        bill.add_document_link(
            "Fiscal Note", source_url, media_type=mimetype)
    for link in page.xpath("//td/span/a[contains(@href, 'Legislator-Profile')]"):
        bill.add_sponsorship(link.text.strip(), classification='primary',
                             entity_type='person', primary=True)
    bdr_no = self.parse_bill_field(page, 'Bill Request Number')
    if bdr_no.xpath('text()'):
        bdr = bdr_no.xpath('text()')[0].strip()
        bill.extras["BDR"] = bdr
    yield bill
Example 14: scrape_assem_bills
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_document_link [as alias]
def scrape_assem_bills(self, chamber, insert, session, year):
    doc_type = {1: 'bill', 3: 'resolution', 5: 'concurrent resolution',
                6: 'joint resolution', 9: 'petition'}
    for docnum, bill_type in doc_type.items():
        parentpage_url = 'http://www.leg.state.nv.us/Session/%s/' \
                         'Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
        links = self.scrape_links(parentpage_url)
        count = 0
        for link in links:
            count = count + 1
            page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)
            page = self.get(page_path).text
            page = page.replace(u"\xa0", " ")
            root = lxml.html.fromstring(page)
            root.make_links_absolute("http://www.leg.state.nv.us/")
            bill_id = root.xpath('string(/html/body/div[@id="content"]'
                                 '/table[1]/tr[1]/td[1]/font)')
            title = self.get_node(
                root,
                '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                'b[contains(text(), "By:")]]/td/em/text()')
            bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                        title=title, classification=bill_type)
            bill.subject = list(set(self.subject_mapping[bill_id]))
            billtext = root.xpath("//b[text()='Bill Text']")[0].getparent().getnext()
            text_urls = billtext.xpath("./a")
            for text_url in text_urls:
                version_name = text_url.text.strip()
                version_url = text_url.attrib['href']
                bill.add_version_link(note=version_name, url=version_url,
                                      media_type='application/pdf')
            primary, secondary = self.scrape_sponsors(page)
            for leg in primary:
                bill.add_sponsorship(classification='primary',
                                     name=leg, entity_type='person',
                                     primary=True)
            for leg in secondary:
                bill.add_sponsorship(classification='cosponsor',
                                     name=leg, entity_type='person',
                                     primary=False)
            minutes_count = 2
            for mr in root.xpath('//table[4]/tr/td[3]/a'):
                minutes = mr.xpath("string(@href)")
                minutes_url = "http://www.leg.state.nv.us" + minutes
                minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                minutes_date = mr.xpath(minutes_date_path).split()
                minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Minutes"
                bill.add_document_link(note=minutes_date, url=minutes_url)
                minutes_count += 1
            self.scrape_actions(root, bill, "lower")
            yield from self.scrape_votes(page, page_path, bill, insert, year)
            bill.add_source(page_path)
            yield bill
Example 15: scrape_bills
# Required import: from pupa.scrape import Bill [as alias]
# Or: from pupa.scrape.Bill import add_document_link [as alias]
# ... (part of the code omitted here) ...
        document = document.split('\\')
        document = document[-2] + "/" + document[-1]
        # doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
        htm_url = 'http://www.njleg.state.nj.us/{}/Bills/{}'.format(
            year_abr,
            document.replace('.DOC', '.HTM'),
        )
        # name document based _doctype
        try:
            doc_name = self._doctypes[rec['DocType']]
        except KeyError:
            raise Exception('unknown doctype %s on %s' %
                            (rec['DocType'], bill_id))
        if rec['Comment']:
            doc_name += ' ' + rec['Comment']
        # Clean HTMX links.
        if htm_url.endswith('HTMX'):
            htm_url = re.sub('X$', '', htm_url)
        if rec['DocType'] in self._version_types:
            if htm_url.endswith('HTM'):
                mimetype = 'text/html'
            elif htm_url.endswith('wpd'):
                mimetype = 'application/vnd.wordperfect'
            try:
                bill.add_version_link(doc_name, htm_url, media_type=mimetype)
            except ValueError:
                self.warning("Couldn't find a document for bill {}".format(bill_id))
                pass
        else:
            bill.add_document_link(doc_name, htm_url)
    # Votes
    next_year = int(year_abr) + 1
    vote_info_list = [
        'A%s' % year_abr,
        'A%s' % next_year,
        'S%s' % year_abr,
        'S%s' % next_year,
        'CA%s-%s' % (year_abr, next_year),
        'CS%s-%s' % (year_abr, next_year),
    ]
    for filename in vote_info_list:
        s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
        try:
            s_vote_zip, resp = self.urlretrieve(s_vote_url)
        except scrapelib.FTPError:
            self.warning('could not find %s' % s_vote_url)
            continue
        zippedfile = zipfile.ZipFile(s_vote_zip)
        for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]:
            try:
                vote_file = io.TextIOWrapper(zippedfile.open(vfile, 'rU'))
            except KeyError:
                #
                # Right, so, 2011 we have an "End" file with more
                # vote data than was in the original dump.
                #
                self.warning("No such file: %s" % (vfile))
                continue
            vdict_file = csv.DictReader(vote_file)