本文整理汇总了Python中pupa.scrape.Bill.add_subject方法的典型用法代码示例。如果您正苦于以下问题:Python Bill.add_subject方法的具体用法?Python Bill.add_subject怎么用?Python Bill.add_subject使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pupa.scrape.Bill
的用法示例。
在下文中一共展示了Bill.add_subject方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: scrape
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_subject [as 别名]
def scrape(self):
    """Scrape legislation summaries into pupa Bill objects.

    Walks every page of search results, builds a Bill for each summary
    row that has a non-empty title, attaches related bills, sponsors,
    subjects and PDF attachments, and yields each Bill.
    """
    self.session = '2011'

    # NOTE: the outer loop previously used `for i, page in enumerate(...)`
    # but never used `i`, and the inner sponsor loop shadowed it.
    for page in self.searchLegislation():
        for legislation_summary in self.parseSearchResults(page):
            title = legislation_summary['Title'].strip()
            if not title:
                # Rows without a title are placeholder/navigation rows.
                continue

            bill = Bill(name=legislation_summary['Record #'],
                        session=self.session,
                        title=title,
                        type=[legislation_summary['Type'].lower()],
                        organization=self.jurisdiction.name)
            bill.add_source(legislation_summary['URL'])

            legislation_details = self.expandLegislationSummary(legislation_summary)

            for related_bill in legislation_details.get('Related files', []):
                bill.add_related_bill(name=related_bill,
                                      session=self.session,
                                      relation='other-session',
                                      chamber=None)

            # The first listed sponsor is treated as the primary sponsor.
            for i, sponsor in enumerate(legislation_details.get('Sponsors', [])):
                primary = (i == 0)
                sponsorship_type = "Primary" if primary else "Regular"
                bill.add_sponsor(sponsor, sponsorship_type,
                                 'person', primary)

            for subject in legislation_details.get(u'Topics', []):
                bill.add_subject(subject)

            for attachment in legislation_details.get(u'Attachments', []):
                bill.add_version_link('PDF',
                                      attachment['url'],
                                      mimetype="application/pdf")

            yield bill
示例2: scrape_bill
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_subject [as 别名]
def scrape_bill(self, session, chamber, bill_type, url):
    """Scrape one Hawaii bill detail page into a pupa Bill.

    Yields any vote events produced while parsing the action table,
    then yields the Bill itself.
    """
    bill_html = self.get(url).text
    bill_page = lxml.html.fromstring(bill_html)

    # Bill id is encoded in the query string, e.g. billtype=HB&billnumber=1.
    qs = dict(urlparse.parse_qsl(urlparse.urlparse(url).query))
    bill_id = '{}{}'.format(qs['billtype'], qs['billnumber'])

    versions = bill_page.xpath("//table[contains(@id, 'GridViewVersions')]")[0]
    metainf_table = bill_page.xpath('//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
    action_table = bill_page.xpath('//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

    meta = self.parse_bill_metainf_table(metainf_table)

    # "Report Title" is a semicolon-delimited subject list.  Filter out
    # ALL empty entries; the previous `subs.remove("")` only removed the
    # first one, so a title like "a;;b;;" leaked an empty subject.
    subs = [s.strip() for s in meta['Report Title'].split(";") if s.strip()]

    b = Bill(bill_id, session, meta['Measure Title'],
             chamber=chamber,
             classification=bill_type)

    if meta['Description']:
        b.add_abstract(meta['Description'], 'description')

    for subject in subs:
        b.add_subject(subject)

    if url:
        b.add_source(url)

    # Companion / carried-over bills reference the prior year's session.
    prior_session = '{} Regular Session'.format(str(int(session[:4]) - 1))
    companion = meta['Companion'].strip()
    if companion:
        b.add_related_bill(identifier=companion.replace(u'\xa0', ' '),
                           legislative_session=prior_session,
                           relation_type="companion")

    prior = bill_page.xpath(
        "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()")[-1]
    if 'carried over' in prior.lower():
        b.add_related_bill(identifier=bill_id.replace(u'\xa0', ' '),
                           legislative_session=prior_session,
                           relation_type="companion")

    for sponsor in meta['Introducer(s)']:
        b.add_sponsorship(sponsor, 'primary', 'person', True)

    # Populates version links on the bill; the return value was never
    # used, so the previous reassignment of `versions` was dead code.
    self.parse_bill_versions_table(b, versions)

    yield from self.parse_bill_actions_table(b, action_table, bill_id, session, url, chamber)
    yield b
示例3: scrape_bill
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_subject [as 别名]
#.........这里部分代码省略.........
title=title,
classification=btype
)
bill.add_source(url)
regex_ns = "http://exslt.org/regular-expressions"
version_links = page.xpath(
"//a[re:test(@href, 'Bill.aspx\?File=.*\.htm', 'i')]",
namespaces={'re': regex_ns})
for link in version_links:
bill.add_version_link(
link.xpath('string()').strip(),
link.attrib['href'],
media_type='text/html',
on_duplicate='ignore'
)
sponsor_links = page.xpath(
"//td[contains(@id, 'tdSponsors')]/a")
for link in sponsor_links:
bill.add_sponsorship(
link.text,
classification='primary',
primary=True,
entity_type='person'
)
actor = chamber
use_row = False
self.debug(bill_id)
for row in page.xpath("//table[contains(@id, 'BillActions')]/tr"):
if 'Date' in row.text_content() and 'Action' in row.text_content():
use_row = True
continue
elif not use_row:
continue
action = row.xpath("string(td[2])").strip()
atypes = []
if action.startswith('First read'):
atypes.append('introduction')
atypes.append('reading-1')
elif action.startswith('Signed by Governor'):
atypes.append('executive-signature')
actor = 'executive'
match = re.match(r'(.*) Do Pass( Amended)?, (Passed|Failed)',
action)
if match:
if match.group(1) in ['Senate',
'House of Representatives']:
first = ''
else:
first = 'committee-'
if match.group(3).lower() == 'passed':
second = 'passage'
elif match.group(3).lower() == 'failed':
second = 'failure'
atypes.append("%s%s" % (first, second))
if 'referred to' in action.lower():
atypes.append('referral-committee')
if 'Motion to amend, Passed Amendment' in action:
atypes.append('amendment-introduction')
atypes.append('amendment-passage')
if 'Veto override, Passed' in action:
atypes.append('veto-override-passage')
elif 'Veto override, Failed' in action:
atypes.append('veto-override-failure')
if 'Delivered to the Governor' in action:
atypes.append('executive-receipt')
match = re.match("First read in (Senate|House)", action)
if match:
if match.group(1) == 'Senate':
actor = 'upper'
else:
actor = 'lower'
date = row.xpath("string(td[1])").strip()
match = re.match('\d{2}/\d{2}/\d{4}', date)
if not match:
self.warning("Bad date: %s" % date)
continue
date = datetime.datetime.strptime(date, "%m/%d/%Y").date()
for link in row.xpath("td[2]/a[contains(@href, 'RollCall')]"):
yield from self.scrape_vote(bill, date, link.attrib['href'])
bill.add_action(action, date, chamber=actor, classification=atypes)
for link in page.xpath("//a[contains(@href, 'Keyword')]"):
bill.add_subject(link.text.strip())
yield bill
示例4: scrape
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_subject [as 别名]
#.........这里部分代码省略.........
classification = 'bill' if 'B' in bill_id else 'resolution'
bill = Bill(bill_id, legislative_session=session, chamber=chamber,
title=title, classification=classification)
bill.add_source(number_link.xpath('a/@href')[0])
# get bill from API
bill_api_url = ('http://search-prod.lis.state.oh.us/solarapi/v1/'
'general_assembly_{}/{}/{}/'.format(
session,
'bills' if 'B' in bill_id else 'resolutions',
bill_id.lower().replace(' ', '')
))
data = self.get(bill_api_url).json()
# add title if no short title
if not bill.title:
bill.title = data['items'][0]['longtitle']
bill.add_title(data['items'][0]['longtitle'], 'long title')
# this stuff is version-specific
for version in data['items']:
version_name = version["version"]
version_link = base_url+version["pdfDownloadLink"]
bill.add_version_link(version_name, version_link, media_type='application/pdf')
# we'll use latest bill_version for everything else
bill_version = data['items'][0]
bill.add_source(bill_api_url)
# subjects
for subj in bill_version["subjectindexes"]:
try:
bill.add_subject(subj["primary"])
except KeyError:
pass
try:
secondary_subj = subj["secondary"]
except KeyError:
secondary_subj = ""
if secondary_subj:
bill.add_subject(secondary_subj)
# sponsors
sponsors = bill_version["sponsors"]
for sponsor in sponsors:
sponsor_name = self.get_sponsor_name(sponsor)
bill.add_sponsorship(
sponsor_name,
classification='primary',
entity_type='person',
primary=True
)
cosponsors = bill_version["cosponsors"]
for sponsor in cosponsors:
sponsor_name = self.get_sponsor_name(sponsor)
bill.add_sponsorship(
sponsor_name,
classification='cosponsor',
entity_type='person',
primary=False,
)
try:
action_doc = self.get(base_url+bill_version["action"][0]["link"])
示例5: get_bill_info
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_subject [as 别名]
def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
    """
    Extract all the requested info for a given bill and yield the
    resulting pupa Bill object.
    """
    # Normalize the human-readable chamber names to OCD values.
    chamber = {'house': 'lower', 'senate': 'upper'}.get(chamber.lower(), chamber)

    page = self.lxmlize(bill_detail_url)

    # Bail out early when the bill has not yet crossed to the other chamber.
    transmit_check = self.get_node(
        page,
        '//h1[text()[contains(.,"Bills")]]/following-sibling::ul/li/text()'
    )
    if (transmit_check is not None and
            'has not been transmitted' in transmit_check.strip()):
        self.logger.debug('Bill has not been transmitted to other chamber '
                          '... skipping {0}'.format(bill_detail_url))
        return

    # Basic identifiers.
    bill_id = self.get_node(page, '//h1[contains(@class,"card-title float-left mr-4")]/text()')
    self.logger.debug(bill_id)

    title_text = self.get_node(
        page,
        '//h2[text()[contains(.,"Description")]]/following-sibling::p/text()'
    )
    if title_text is not None:
        bill_title = title_text.strip()
    else:
        # No inline description: fall back to the "Long Description" page.
        long_desc_url = self.get_node(
            page,
            '//a[text()[contains(.,"Long Description")]]/@href'
        )
        long_desc_page = self.lxmlize(long_desc_url)
        long_desc_text = self.get_node(long_desc_page, '//h1/'
                                       'following-sibling::p/text()')
        if long_desc_text is None:
            bill_title = 'No title found.'
            self.logger.warning('No title found for {}.'.format(bill_id))
        else:
            bill_title = long_desc_text.strip()
    self.logger.debug(bill_title)

    # Second character of the id encodes the bill type (e.g. HF -> bill).
    bill_type = {'F': 'bill', 'R': 'resolution',
                 'C': 'concurrent resolution'}[bill_id[1].upper()]

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=bill_title, classification=bill_type)
    bill.add_source(bill_detail_url)

    for subject in self._subject_mapping[bill_id]:
        bill.add_subject(subject)

    # Companion bill, when one is linked from the status table.
    companion_links = page.xpath('//table[@class="status_info"]//tr[1]/td[2]'
                                 '/a[starts-with(@href, "?")]/text()')
    companion = self.make_bill_id(companion_links[0]) if companion_links else None
    companion_chamber = self.chamber_from_bill(companion)
    if companion is not None:
        bill.add_companion(companion, chamber=companion_chamber)

    # Sponsors, actions, and versions are handled by dedicated helpers.
    bill = self.extract_sponsors(bill, page, chamber)
    bill = self.extract_actions(bill, page, chamber)
    bill = self.extract_versions(bill, page, chamber, version_list_url)

    yield bill
示例6: scrape_matter
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_subject [as 别名]
def scrape_matter(self, matter_link, sess):
    """Scrape one Legistar "matter" detail page and yield it as a Bill.

    :param matter_link: URL of the matter detail page.
    :param sess: session dict; its 'identifier' is used as the
        legislative session.
    """
    # Map Legistar "File Type" labels onto OCD bill classifications.
    matter_types = {
        "Additions": "other",
        "Administrative Order": "order",
        "Annual Evaluation": "other",
        "Bid Advertisement": "other",
        "Bid Awards": "other",
        "Bid Contract": "contract",
        "Bid Protest": "other",
        "Bid Rejection": "other",
        "Birthday Scroll": "commemoration",
        "Certificate of Appreciation": "commemoration",
        "Change Order": "order",
        "Citizen's Presentation": "other",
        "Commendation": "commemoration",
        "Conflict Waiver": "other",
        "Congratulatory Certificate": "commemoration",
        "Deferrals": "other",
        "Discussion Item": "other",
        "Distinguished Visitor": "other",
        "Joint Meeting/Workshop": "other",
        "Mayoral Veto": "other",
        "Miscellaneous": "other",
        "Nomination": "nomination",
        "Oath of Office": "other",
        "Omnibus Reserve": "bill",
        "Ordinance": "ordinance",
        "Plaque": "commemoration",
        "Presentation": "other",
        "Proclamation": "proclamation",
        "Professional Service Agreement": "contract",
        "Public Hearing": "other",
        "Report": "other",
        "Request for Proposals": "other",
        "Request for Qualifications": "other",
        "Request to Advertise": "other",
        "Resolution": "resolution",
        "Resolution of Sympathy": "resolution",
        "Service Awards": "commemoration",
        "Special Item": "other",
        "Special Presentation": "other",
        "Supplement": "other",
        "Swearing-In": "other",
        "Time Sensitive Items": "other",
        "Withdrawals": "other",
        "Workshop Item": "other",
        "Zoning": "other",
        "Zoning Resolution": "resolution"
    }

    matter_doc = self.lxmlize(matter_link)
    info_dict = self.matter_table_to_dict(matter_doc)

    # We're going to use the year of the intro date as the session
    # until/unless we come up with something better; for now the parse
    # only validates the date format (the value itself is unused).
    intro_date = datetime.strptime(info_dict["Introduced"], "%m/%d/%Y")
    session = sess["identifier"]
    category = matter_types[info_dict["File Type"]]

    if 'File Name' in info_dict:
        title = info_dict["File Name"]
    elif "Title" in info_dict and info_dict["Title"].strip():
        title = info_dict["Title"].strip()
    else:
        self.warning("bill has no title")
        return

    # 'other' is not passed as a classification; omit the kwarg instead.
    if category == 'other':
        bill = Bill(identifier=info_dict["File Number"],
                    legislative_session=session,
                    title=title
                    )
    else:
        bill = Bill(identifier=info_dict["File Number"],
                    legislative_session=session,
                    title=title,
                    classification=category
                    )

    for spons in info_dict["Sponsors"]:
        if spons == "NONE":
            continue
        try:
            name, spons_type = spons.rsplit(",", 1)
        except ValueError:
            # No trailing ", <type>" suffix; treat the whole string as
            # the sponsor name with a generic type.
            name = spons
            spons_type = "Sponsor"
        primary = "Prime Sponsor" in spons_type
        entity = "person"
        if "committee" in name:
            # BUG FIX: this previously read `entity = committee`, a
            # NameError (undefined name); the string was intended.
            entity = "committee"
        bill.add_sponsorship(name, spons_type, entity, primary)

    if "Indexes" in info_dict:
        for subj in info_dict["Indexes"]:
            if subj.strip() and subj.strip() != "NONE":
                bill.add_subject(subj.strip())

    if "Title" in info_dict and info_dict["Title"].strip():
        # Fixed a stray trailing quote in the default note text.
        note = "bill's long title"
        if ("Note" in info_dict and info_dict["Note"].strip()):
            note = info_dict["Note"]
        bill.add_abstract(abstract=info_dict["Title"], note=note)

    self.process_action_table(matter_doc, bill)
    bill.add_source(matter_link, note='web')

    yield bill
示例7: scrape
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_subject [as 别名]
def scrape(self):
unreachable_urls = []
for leg_summary in self.legislation(created_after=datetime.datetime(2015, 5, 17)) :
title = leg_summary['Title'].strip()
if not title or not leg_summary['Intro\xa0Date'] :
continue
# https://chicago.legistar.com/LegislationDetail.aspx?ID=1800754&GUID=29575A7A-5489-4D8B-8347-4FC91808B201&Options=Advanced&Search=
# doesn't have an intro date
bill_type = BILL_TYPES[leg_summary['Type']]
bill_session = self.session(self.toTime(leg_summary['Intro\xa0Date']))
bill = Bill(identifier=leg_summary['Record #'],
legislative_session=bill_session,
title=title,
classification=bill_type,
from_organization={"name":"Chicago City Council"})
bill.add_source(leg_summary['url'])
try :
leg_details = self.legDetails(leg_summary['url'])
except IndexError :
unreachable_urls.append(leg_summary['url'])
yield bill
continue
for related_bill in leg_details.get('Related files', []) :
lower_title = title.lower()
if "sundry" in title or "miscellaneous" in title: #these are ominbus
bill.add_related_bill(identifier = related_bill['label'],
legislative_session = bill.legislative_session,
relation_type='replaces')
#for now we're skipping related bills if they
#don't contain words that make us think they're
#in a ominbus relationship with each other
for i, sponsor in enumerate(leg_details.get('Sponsors', [])) :
if i == 0 :
primary = True
sponsorship_type = "Primary"
else :
primary = False
sponsorship_type = "Regular"
sponsor_name = sponsor['label']
# Does the Mayor/Clerk introduce legisislation as
# individuals role holders or as the OFfice of City
# Clerk and the Office of the Mayor?
entity_type = 'person'
if sponsor_name.startswith(('City Clerk',
'Mendoza, Susana')) :
sponsor_name = 'Office of the City Clerk'
entity_type = 'organization'
elif sponsor_name.startswith(('Emanuel, Rahm',)) :
sponsor_name = 'Office of the Mayor'
entity_type = 'organization'
if not sponsor_name.startswith(('Misc. Transmittal',
'No Sponsor',
'Dept./Agency')) :
bill.add_sponsorship(sponsor_name,
sponsorship_type,
entity_type,
primary,
entity_id = _make_pseudo_id(name=sponsor_name))
if 'Topic' in leg_details :
for subject in leg_details[u'Topic'].split(',') :
bill.add_subject(subject)
for attachment in leg_details.get('Attachments', []) :
if attachment['label'] :
bill.add_version_link(attachment['label'],
attachment['url'],
media_type="application/pdf")
for action in self.history(leg_summary['url']) :
action_description = action['Action']
try :
action_date = self.toTime(action['Date']).date().isoformat()
except AttributeError : # https://chicago.legistar.com/LegislationDetail.aspx?ID=1424866&GUID=CEC53337-B991-4268-AE8A-D4D174F8D492
continue
if action_description :
try :
responsible_org = action['Action\xa0By']['label']
except TypeError :
responsible_org = action['Action\xa0By']
if responsible_org == 'City Council' :
responsible_org = 'Chicago City Council'
act = bill.add_action(action_description,
action_date,
organization={'name': responsible_org},
classification=ACTION_CLASSIFICATION[action_description])
if action_description == 'Referred' :
#.........这里部分代码省略.........
示例8: scrape_details
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_subject [as 别名]
def scrape_details(self, bill_detail_url, session, chamber, bill_id):
"""
Create the Bill and add the information obtained from the provided bill_detail_url.
and then yield the bill object.
:param bill_detail_url:
:param session:
:param chamber:
:param bill_id:
:return:
"""
page = self.get(bill_detail_url).text
if 'INVALID BILL NUMBER' in page:
self.warning('INVALID BILL %s' % bill_detail_url)
return
doc = lxml.html.fromstring(page)
doc.make_links_absolute(bill_detail_url)
bill_div = doc.xpath('//div[@style="margin:0 0 40px 0;"]')[0]
bill_type = bill_div.xpath('span/text()')[0]
if 'General Bill' in bill_type:
bill_type = 'bill'
elif 'Concurrent Resolution' in bill_type:
bill_type = 'concurrent resolution'
elif 'Joint Resolution' in bill_type:
bill_type = 'joint resolution'
elif 'Resolution' in bill_type:
bill_type = 'resolution'
else:
raise ValueError('unknown bill type: %s' % bill_type)
# this is fragile, but less fragile than it was
b = bill_div.xpath('./b[text()="Summary:"]')[0]
bill_summary = b.getnext().tail.strip()
bill = Bill(
bill_id,
legislative_session=session, # session name metadata's `legislative_sessions`
chamber=chamber, # 'upper' or 'lower'
title=bill_summary,
classification=bill_type
)
subjects = list(self._subjects[bill_id])
for subject in subjects:
bill.add_subject(subject)
# sponsors
for sponsor in doc.xpath('//a[contains(@href, "member.php")]/text()'):
bill.add_sponsorship(
name=sponsor,
classification='primary',
primary=True,
entity_type='person'
)
for sponsor in doc.xpath('//a[contains(@href, "committee.php")]/text()'):
sponsor = sponsor.replace(u'\xa0', ' ').strip()
bill.add_sponsorship(
name=sponsor,
classification='primary',
primary=True,
entity_type='organization'
)
# find versions
version_url = doc.xpath('//a[text()="View full text"]/@href')[0]
version_html = self.get(version_url).text
version_doc = lxml.html.fromstring(version_html)
version_doc.make_links_absolute(version_url)
for version in version_doc.xpath('//a[contains(@href, "/prever/")]'):
# duplicate versions with same date, use first appearance
bill.add_version_link(
note=version.text, # Description of the version from the state;
# eg, 'As introduced', 'Amended', etc.
url=version.get('href'),
on_duplicate='ignore',
media_type='text/html' # Still a MIME type
)
# actions
for row in bill_div.xpath('table/tr'):
date_td, chamber_td, action_td = row.xpath('td')
date = datetime.datetime.strptime(date_td.text, "%m/%d/%y")
action_chamber = {'Senate': 'upper',
'House': 'lower',
None: 'legislature'}[chamber_td.text]
action = action_td.text_content()
action = action.split('(House Journal')[0]
action = action.split('(Senate Journal')[0].strip()
atype = action_type(action)
bill.add_action(
#.........这里部分代码省略.........
示例9: _scrape_bills
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_subject [as 别名]
def _scrape_bills(self):
"""
Does the following
1) Scrapes bill data from unitedstates project and saves the data to path specified in UnitedStates module
2) Iterates over bill data and converts each one to an OCD-compliant bill model.
3) Yields the OCD-compliant bill model instance
@return: generator for federal US bills in OCD-compliant format
@rtype: generator
"""
# run scraper first to pull in all the bill data
self._run_unitedstates_bill_scraper()
# iterate over all the files and build and yield Bill objects
for filename in find_files(settings.SCRAPED_DATA_DIR, '.*/data/[0-9]+/bills/[^\/]+/[^\/]+/data.json'):
try:
with open(filename) as json_file:
json_data = json.load(json_file)
# Initialize Object
bill = Bill(constants.TYPE_MAP[json_data['bill_type']]['canonical'] + ' ' + json_data['number'],
json_data['congress'],
json_data['official_title'],
chamber=constants.TYPE_MAP[json_data['bill_type']]['chamber']
)
# add source of data
bill.add_source(json_data['url'], note='all')
# add subjects
for subject in json_data['subjects']:
bill.add_subject(subject)
# add summary
if 'summary' in json_data and json_data['summary'] is not None:
bill.add_abstract(json_data['summary']['text'],
json_data['summary']['as'],
json_data['summary']['date'])
# add titles
for item in json_data['titles']:
bill.add_title(item['title'], item['type'])
# add other/related Bills
for b in json_data['related_bills']:
if 'type' in b and b['type'] == 'bill':
split = b['bill_id'].split('-')
m = UnitedStatesBillScraper.BILL_SPLIT.match(split[0])
bill.add_related_bill(constants.TYPE_MAP[m.group(1)]['canonical'] + ' ' + m.group(2),
legislative_session=split[1],
relation_type='companion')
# add sponsor
bill.add_sponsorship_by_identifier(json_data['sponsor']['name'], 'person', 'person', True,
scheme='thomas_id', identifier=json_data['sponsor']['thomas_id'],
chamber=constants.TYPE_MAP[json_data['bill_type']]['chamber'])
# add cosponsors
for cs in json_data['cosponsors']:
bill.add_sponsorship_by_identifier(cs['name'], 'person', 'person', False,
scheme='thomas_id', identifier=cs['thomas_id'],
chamber=constants.TYPE_MAP[json_data['bill_type']]['chamber'])
# add introduced_at and actions
bill.add_action('date of introduction', datetime_to_date(json_data['introduced_at']),
chamber=constants.TYPE_MAP[json_data['bill_type']]['chamber'],
related_entities=[])
# add other actions
for action in json_data['actions']:
bill.actions.append({'date': datetime_to_date(action['acted_at']),
'type': [action['type']],
'description': action['text'],
'actor': constants.TYPE_MAP[json_data['bill_type']]['chamber'],
'related_entities': []
})
# add bill versions
for version_path in find_files(os.path.join(settings.SCRAPED_DATA_DIR,
'data', bill.legislative_session, 'bills', json_data['bill_type'],
json_data['bill_type'] + json_data['number'],
'text-versions'), '/.*/*\.json'):
try:
with open(version_path) as version_file:
version_json_data = json.load(version_file)
for k, v in version_json_data['urls'].items():
bill.versions.append({'date': datetime_to_date(version_json_data['issued_on']),
'type': version_json_data['version_code'],
'name': constants.VERSION_MAP[version_json_data['version_code']],
'links': [{'mimetype': k, 'url': v}]})
except IOError:
print("Unable to open or parse file with path " + version_path)
continue
# finally yield bill object
yield bill
except IOError:
#.........这里部分代码省略.........
示例10: scrape_bills
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_subject [as 别名]
#.........这里部分代码省略.........
media_type='text/html')
# amendments
# ex: http://billstatus.ls.state.ms.us/2018/pdf/history/HB/HB1040.xml
for amd in details_root.xpath('//AMENDMENTS/*'):
if amd.tag == 'HAM':
name = amd.xpath('HAM_DESC[1]/text()')[0]
name = append_parens(amd, 'HAM_DISP', name)
name = append_parens(amd, 'HAM_VDESC', name)
pdf_url = amd.xpath('string(HAM_PDF'
')').replace("../", "")
html_url = amd.xpath('string(HAM_OTHER'
')').replace("../", "")
elif amd.tag == 'SAM':
name = amd.xpath('SAM_DESC[1]/text()')[0]
name = append_parens(amd, 'SAM_DISP', name)
name = append_parens(amd, 'SAM_VDESC', name)
pdf_url = amd.xpath('string(SAM_PDF'
')').replace("../", "")
html_url = amd.xpath('string(SAM_OTHER'
')').replace("../", "")
elif amd.tag == 'AMRPT':
name = amd.xpath('AMRPT_DESC[1]/text()')[0]
pdf_url = amd.xpath('string(AMRPT_PDF'
')').replace("../", "")
html_url = amd.xpath('string(AMRPT_OTHER'
')').replace("../", "")
pdf_url = 'http://billstatus.ls.state.ms.us/' + pdf_url
html_url = 'http://billstatus.ls.state.ms.us/' + html_url
if 'adopted' in name.lower() or 'amendment report' in name.lower():
bill.add_version_link(name, pdf_url,
on_duplicate='ignore',
media_type='application/pdf')
bill.add_version_link(name, html_url,
on_duplicate='ignore',
media_type='text/html')
# avoid duplicate votes
seen_votes = set()
# Actions
for action in details_root.xpath('//HISTORY/ACTION'):
# action_num = action.xpath('string(ACT_NUMBER)').strip()
# action_num = int(action_num)
act_vote = action.xpath('string(ACT_VOTE)').replace("../../../..", "")
action_desc = action.xpath('string(ACT_DESC)')
date, action_desc = action_desc.split(" ", 1)
date = date + "/" + session[0:4]
date = datetime.strptime(date, "%m/%d/%Y")
if action_desc.startswith("(H)"):
actor = "lower"
action = action_desc[4:]
elif action_desc.startswith("(S)"):
actor = "upper"
action = action_desc[4:]
else:
actor = "executive"
action = action_desc
if "Veto" in action and actor == 'executive':
version_path = details_root.xpath("string(//VETO_OTHER)")
version_path = version_path.replace("../../../../", "")
version_url = "http://billstatus.ls.state.ms.us/" + version_path
bill.add_document_link("Veto", version_url)
atype = 'other'
for prefix, prefix_type in self._action_types:
if action.startswith(prefix):
atype = prefix_type
break
bill.add_action(action, self._tz.localize(date),
chamber=actor,
classification=atype if atype != 'other' else None)
# use committee names as scraped subjects
subjects = details_root.xpath('//H_NAME/text()')
subjects += details_root.xpath('//S_NAME/text()')
for subject in subjects:
if subject not in bill.subject:
bill.add_subject(subject)
if act_vote:
vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote
if vote_url not in seen_votes:
seen_votes.add(vote_url)
yield from self.scrape_votes(vote_url, action,
date, actor, bill)
bill.add_source(bill_details_url)
yield bill
示例11: get_bill
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_subject [as 别名]
def get_bill(self, matter):
    '''Make Bill object from given matter.

    Returns the Bill, or None whenever the matter cannot (or should
    not) be converted: committees without a conventional Type,
    duplicated actions, missing core fields, or Legistar API errors.

    Currently, NYC Legistar does not have conventional "Types" for
    three newly added committees: https://legistar.council.nyc.gov/Departments.aspx
    We communicated the issue to NYC, and until we learn more, we will
    skip the bills attached to those committees.
    '''
    orgs_without_type = ['Charter Revision Commission 2019',
                         'New York City Advisory Commission on Property Tax Reform',
                         'Democratic Conference of the Council of the City of New York']
    if matter['MatterBodyName'].strip() in orgs_without_type:
        return None

    matter_id = matter['MatterId']
    if matter_id in DUPLICATED_ACTIONS:
        return None

    # A bill needs an intro date, a name, and a file number; skip
    # anything missing one of them.
    date = matter['MatterIntroDate']
    title = matter['MatterName']
    identifier = matter['MatterFile']
    if not all((date, title, identifier)):
        return None

    leg_type = BILL_TYPES[matter['MatterTypeName']]
    # NOTE(review): `self.sessions(...)` here vs `self.session(...)`
    # further down for related bills — confirm both helpers exist.
    bill_session = self.sessions(self.toTime(date))
    bill = Bill(identifier=identifier,
                title=title,
                classification=leg_type,
                legislative_session=bill_session,
                from_organization={"name": "New York City Council"})

    # Record both the human-facing page and the API endpoint as sources.
    legistar_web = matter['legistar_url']
    legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)
    bill.add_source(legistar_web, note='web')
    bill.add_source(legistar_api, note='api')

    if matter['MatterTitle']:
        bill.add_title(matter['MatterTitle'])
    if matter['MatterEXText5']:
        bill.add_abstract(matter['MatterEXText5'], note='')

    # Sponsorship lookups can fail on malformed API data; record the
    # URL for later inspection and drop the bill.
    try:
        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)
    except KeyError:
        self.version_errors.append(legistar_web)
        return None

    for attachment in self.attachments(matter_id):
        # Known duplicated attachment; skips the whole bill, not just
        # the attachment.
        if attachment['MatterAttachmentId'] == 103315:  # Duplicate
            return None
        if attachment['MatterAttachmentName']:
            bill.add_document_link(attachment['MatterAttachmentName'],
                                   attachment['MatterAttachmentHyperlink'],
                                   media_type='application/pdf')

    for topic in self.topics(matter_id):
        bill.add_subject(topic['MatterIndexName'].strip())

    for relation in self.relations(matter_id):
        try:
            related_bill = self.endpoint('/matters/{0}', relation['MatterRelationMatterId'])
        except scrapelib.HTTPError:
            # Related matter is unreachable; drop the whole bill.
            return None
        else:
            date = related_bill['MatterIntroDate']
            related_bill_session = self.session(self.toTime(date))
            identifier = related_bill['MatterFile']
            bill.add_related_bill(identifier=identifier,
                                  legislative_session=related_bill_session,
                                  relation_type='companion')

    # Full text; a KeyError is treated like the sponsorship failure above.
    try:
        text = self.text(matter_id)
    except KeyError:
        self.version_errors.append(legistar_web)
        return None

    bill.extras['local_classification'] = matter['MatterTypeName']

    if text:
        # Strip NUL bytes, which are invalid in the stored text fields.
        if text['MatterTextPlain']:
            bill.extras['plain_text'] = text['MatterTextPlain'].replace(u'\u0000', '')
        if text['MatterTextRtf']:
            bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '')

    return bill
示例12: parse_bill_status_page
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_subject [as 别名]
def parse_bill_status_page(self, status_url, bill_url, session, chamber):
    """Parse a Montana bill status page into a pupa Bill.

    Fetches ``status_url`` and returns a ``(bill, votes)`` tuple, where
    ``votes`` is the list produced by ``self.add_votes``.
    """
    status_page = lxml.html.fromstring(self.get(status_url).text)

    # Bill id comes from the linked PDF file name.
    # see 2007 HB 2... weird.
    bill_re = r'.*?/([A-Z]+)0*(\d+)\.pdf'
    bill_xpath = '//a[contains(@href, ".pdf") and contains(@href, "billpdf")]/@href'
    bill_id = re.search(bill_re, status_page.xpath(bill_xpath)[0],
                        re.IGNORECASE).groups()
    bill_id = "{0} {1}".format(bill_id[0], int(bill_id[1]))

    try:
        xp = '//b[text()="Short Title:"]/../following-sibling::td/text()'
        title = status_page.xpath(xp).pop()
    except IndexError:
        # No "Short Title:" row; fall back to the first table cell.
        title = status_page.xpath('//tr[1]/td[2]')[0].text_content()

    # Add bill type, inferred from the letters in the bill id.
    _bill_id = bill_id.lower()
    if 'b' in _bill_id:
        classification = 'bill'
    elif 'j' in _bill_id or 'jr' in _bill_id:
        classification = 'joint resolution'
    elif 'cr' in _bill_id:
        classification = 'concurrent resolution'
    elif 'r' in _bill_id:
        classification = 'resolution'

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=classification)

    self.add_actions(bill, status_page)
    votes = self.add_votes(bill, status_page, status_url)

    tabledata = self._get_tabledata(status_page)

    # Add sponsor info.
    bill.add_sponsorship(tabledata['primary sponsor:'][0], classification='primary',
                         entity_type='person', primary=True)

    # A various plus fields MT provides; tuples map a table label to a
    # shorter extras key.
    plus_fields = [
        'requester',
        ('chapter number:', 'chapter'),
        'transmittal date:',
        'drafter',
        'fiscal note probable:',
        'bill draft number:',
        'preintroduction required:',
        'by request of',
        'category:']
    for x in plus_fields:
        if isinstance(x, tuple):
            _key, key = x
        else:
            _key = key = x
        key = key.replace(' ', '_')
        try:
            val = tabledata[_key]
        except KeyError:
            continue
        if len(val) == 1:
            val = val[0]
        bill.extras[key] = val

    # Add bill subjects.
    xp = '//th[contains(., "Revenue/Approp.")]/ancestor::table/tr'
    subjects = []
    for tr in status_page.xpath(xp):
        try:
            subj = tr.xpath('td')[0].text_content()
        except IndexError:
            # Header rows have no <td>.  BUG FIX: this was a bare
            # `except:` (it even swallowed KeyboardInterrupt); the
            # sibling implementation of this method already catches
            # IndexError here.
            continue
        subjects.append(subj)

    for s in subjects:
        bill.add_subject(s)

    self.add_fiscal_notes(status_page, bill)

    return bill, list(votes)
示例13: parse_bill_status_page
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_subject [as 别名]
def parse_bill_status_page(self, url, page, session, chamber):
    """Parse a Montana LAWS bill-status page into a ``pupa.scrape.Bill``.

    :param url: status-page URL; the bill type and number are read from its
        query string (``P_BLTP_BILL_TYP_CD`` / ``P_BILL_NO1``).
    :param page: parsed lxml document for ``url``.
    :param session: legislative session identifier passed through to ``Bill``.
    :param chamber: originating chamber passed through to ``Bill``.
    :returns: ``(bill, votes)`` where ``votes`` is the list produced by
        ``self.add_votes``.
    """
    # see 2007 HB 2... weird.
    parsed_url = urllib.parse.urlparse(url)
    parsed_query = dict(urllib.parse.parse_qsl(parsed_url.query))
    bill_id = "{0} {1}".format(
        parsed_query['P_BLTP_BILL_TYP_CD'],
        parsed_query['P_BILL_NO1'])

    try:
        xp = '//b[text()="Short Title:"]/../following-sibling::td/text()'
        title = page.xpath(xp).pop()
    except IndexError:
        # Some pages lack the "Short Title:" row; fall back to the first
        # table row's second cell.
        title = page.xpath('//tr[1]/td[2]')[0].text_content()

    # Classify from the letters in the bill id.  Order matters: 'b' and
    # 'cr' must be tested before the bare 'r'.  Default to 'bill' so an
    # unexpected prefix cannot leave `classification` unbound (previously
    # a NameError at the Bill(...) call).
    _bill_id = bill_id.lower()
    if 'b' in _bill_id:
        classification = 'bill'
    elif 'j' in _bill_id:
        classification = 'joint resolution'
    elif 'cr' in _bill_id:
        classification = 'concurrent resolution'
    elif 'r' in _bill_id:
        classification = 'resolution'
    else:
        classification = 'bill'

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=classification)

    self.add_actions(bill, page)
    votes = self.add_votes(bill, page, url)

    tabledata = self._get_tabledata(page)

    # Add sponsor info.
    bill.add_sponsorship(tabledata['primary sponsor:'][0], classification='primary',
                         entity_type='person', primary=True)

    # Extra metadata fields MT provides; tuples map a table key to a
    # differently-named extras key.
    plus_fields = [
        'requester',
        ('chapter number:', 'chapter'),
        'transmittal date:',
        'drafter',
        'fiscal note probable:',
        'bill draft number:',
        'preintroduction required:',
        'by request of',
        'category:']
    for x in plus_fields:
        if isinstance(x, tuple):
            _key, key = x
        else:
            _key = key = x
        key = key.replace(' ', '_')
        try:
            val = tabledata[_key]
        except KeyError:
            continue
        # Collapse single-element lists to a scalar for readability.
        if len(val) == 1:
            val = val[0]
        bill.extras[key] = val

    # Add bill subjects: one per row of the "Revenue/Approp." table.
    xp = '//th[contains(., "Revenue/Approp.")]/ancestor::table/tr'
    subjects = []
    for tr in page.xpath(xp):
        try:
            subj = tr.xpath('td')[0].text_content()
        except IndexError:
            # Header rows have no <td>; skip them.
            continue
        subjects.append(subj)
    for s in subjects:
        bill.add_subject(s)

    self.add_fiscal_notes(page, bill)

    return bill, list(votes)
示例14: scrape
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_subject [as 别名]
def scrape(self) :
    """Scrape Chicago City Council legislation from the Legistar API.

    Generator: yields a ``VoteEvent`` for every action that carries a vote
    result, and then the ``Bill`` itself, for each matter introduced in the
    last three days.
    """
    # Only look at matters introduced within the last three days.
    three_days_ago = datetime.datetime.now() - datetime.timedelta(3)
    for matter in self.matters(three_days_ago) :
        matter_id = matter['MatterId']

        date = matter['MatterIntroDate']
        title = matter['MatterTitle']
        identifier = matter['MatterFile']

        # Skip matters missing any of date, title, or file identifier.
        if not all((date, title, identifier)) :
            continue

        bill_session = self.session(self.toTime(date))
        bill_type = BILL_TYPES[matter['MatterTypeName']]

        # 'S'-prefixed files: keep the original as an alternate identifier
        # and use the stripped form as the primary one.
        if identifier.startswith('S'):
            alternate_identifiers = [identifier]
            identifier = identifier[1:]
        else:
            alternate_identifiers = []

        bill = Bill(identifier=identifier,
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name":"Chicago City Council"})

        # Record both the human-facing page and the API endpoint as sources.
        legistar_web = self.legislation_detail_url(matter_id)
        legistar_api = 'http://webapi.legistar.com/v1/chicago/matters/{0}'.format(matter_id)

        bill.add_source(legistar_web, note='web')
        bill.add_source(legistar_api, note='api')

        for identifier in alternate_identifiers:
            bill.add_identifier(identifier)

        for action, vote in self.actions(matter_id) :
            act = bill.add_action(**action)

            # A 'Referred' action names the committee it went to (unless
            # it is the full Council itself).
            if action['description'] == 'Referred' :
                body_name = matter['MatterBodyName']
                if body_name != 'City Council' :
                    act.add_related_entity(body_name,
                                           'organization',
                                           entity_id = _make_pseudo_id(name=body_name))

            result, votes = vote
            if result :
                # Emit a VoteEvent for any action that carried a vote.
                vote_event = VoteEvent(legislative_session=bill.legislative_session,
                                       motion_text=action['description'],
                                       organization=action['organization'],
                                       classification=None,
                                       start_date=action['date'],
                                       result=result,
                                       bill=bill)

                vote_event.add_source(legistar_web)
                vote_event.add_source(legistar_api + '/histories')

                # NOTE: this inner `vote` shadows the (result, votes) tuple
                # unpacked above; the tuple is fully consumed before the loop.
                for vote in votes :
                    # Normalize the ballot option via VOTE_OPTIONS, falling
                    # back to the raw lowercase value.
                    raw_option = vote['VoteValueName'].lower()
                    clean_option = self.VOTE_OPTIONS.get(raw_option,
                                                         raw_option)
                    vote_event.vote(clean_option,
                                    vote['VotePersonName'].strip())

                yield vote_event

        for sponsorship in self.sponsorships(matter_id) :
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id) :
            bill.add_subject(topic['MatterIndexName'].strip())

        # Attachments without a name are skipped.
        for attachment in self.attachments(matter_id) :
            if attachment['MatterAttachmentName'] :
                bill.add_version_link(attachment['MatterAttachmentName'],
                                      attachment['MatterAttachmentHyperlink'],
                                      media_type="application/pdf")

        # Preserve Legistar's own type name alongside the OCD classification.
        bill.extras = {'local_classification' : matter['MatterTypeName']}

        text = self.text(matter_id)

        if text :
            if text['MatterTextPlain'] :
                bill.extras['plain_text'] = text['MatterTextPlain']

            if text['MatterTextRtf'] :
                # Strip NUL characters the RTF payload sometimes contains.
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '')

        yield bill
示例15: scrape_bill
# 需要导入模块: from pupa.scrape import Bill [as 别名]
# 或者: from pupa.scrape.Bill import add_subject [as 别名]
def scrape_bill(self, bill_id):
old = self.api('bills/' + bill_id + '?')
# not needed
old.pop('id')
old.pop('state')
old.pop('level', None)
old.pop('country', None)
old.pop('created_at')
old.pop('updated_at')
old.pop('action_dates')
old.pop('+bill_type',None)
old.pop('+subject', None)
old.pop('+scraped_subjects', None)
old.pop('subjects', [])
classification = old.pop('type')
# ca weirdness
if 'fiscal committee' in classification:
classification.remove('fiscal committee')
if 'urgency' in classification:
classification.remove('urgency')
if 'local program' in classification:
classification.remove('local program')
if 'tax levy' in classification:
classification.remove('tax levy')
if classification[0] in ['miscellaneous', 'jres', 'cres']:
return
if classification == ['memorial resolution'] and self.state == 'ar':
classification = ['memorial']
if classification == ['concurrent memorial resolution'] and self.state == 'ar':
classification = ['concurrent memorial']
if classification == ['joint session resolution'] and self.state == 'il':
classification = ['joint resolution']
if classification == ['legislative resolution'] and self.state == 'ny':
classification = ['resolution']
if classification == ['address'] and self.state == 'nh':
classification = ['resolution']
if not old['title'] and self.state == 'me':
old['title'] = '(unknown)'
chamber = old.pop('chamber')
if self.state in ('ne', 'dc'):
chamber = 'legislature'
elif chamber in ('joint', 'conference'):
chamber = 'legislature'
new = Bill(old.pop('bill_id'), old.pop('session'), old.pop('title'),
chamber=chamber, classification=classification)
abstract = old.pop('summary', None)
if abstract:
new.add_abstract(abstract, note='')
for title in old.pop('alternate_titles'):
new.add_title(title)
for doc in old.pop('documents'):
new.add_document_link(doc['name'], doc['url'], on_duplicate='ignore')
for doc in old.pop('versions'):
new.add_version_link(doc['name'], doc['url'], media_type=doc.pop('mimetype', ''))
for subj in old.pop('scraped_subjects', []):
if subj:
new.add_subject(subj)
for spon in old.pop('sponsors'):
if spon.get('committee_id') is not None:
entity_type = 'organization'
elif spon.get('leg_id') is not None:
entity_type = 'person'
else:
entity_type = ''
new.add_sponsorship(spon['name'], spon['type'], entity_type,
spon['type'] == 'primary')
for act in old.pop('actions'):
actor = act['actor']
if actor.lower() in ('governor', 'mayor', 'secretary of state'):
actor = 'executive'
elif actor.lower() == 'house' or (actor.lower().startswith('lower (') and self.state == 'ca'):
actor = 'lower'
elif actor.lower() in ('senate', 'upper`') or (actor.lower().startswith('upper (') and self.state == 'ca'):
actor = 'upper'
elif actor in ('joint', 'other', 'Data Systems', 'Speaker', 'clerk',
'Office of the Legislative Fiscal Analyst', 'Became Law w',
'conference') or (actor.lower().startswith('legislature (') and self.state == 'ca'):
actor = 'legislature'
if actor in ('committee', 'sponsor') and self.state == 'pr':
actor = 'legislature'
# nebraska & DC
if actor in ('upper','council') and self.state in ('ne', 'dc'):
actor = 'legislature'
#.........这里部分代码省略.........