本文整理匯總了Python中pyutils.legislation.Bill.add_source方法的典型用法代碼示例。如果您正苦於以下問題:Python Bill.add_source方法的具體用法?Python Bill.add_source怎麽用?Python Bill.add_source使用的例子?那麽,這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類 pyutils.legislation.Bill 的用法示例。
在下文中一共展示了Bill.add_source方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。
示例1: parse_senate_billpage
# 需要導入模塊: from pyutils.legislation import Bill [as 別名]
# 或者: from pyutils.legislation.Bill import add_source [as 別名]
def parse_senate_billpage(self, bill_url, year):
    """Scrape one MO Senate bill page and save the resulting Bill.

    bill_url: URL of the Senate bill detail page.
    year: session identifier passed straight through to Bill().
    """
    with self.soup_context(bill_url) as bill_page:
        # Core identifiers pulled from labelled spans on the page.
        bill_id = bill_page.find(id="lblBillNum").b.font.contents[0]
        bill_title = bill_page.find(id="lblBillTitle").font.string
        bill_desc = bill_page.find(id="lblBriefDesc").font.contents[0]
        bill_lr = bill_page.find(id="lblLRNum").font.string

        bill = Bill(year, 'upper', bill_id, bill_desc, bill_url=bill_url,
                    bill_lr=bill_lr, official_title=bill_title)
        bill.add_source(bill_url)

        # Primary sponsor: name plus a link to the sponsor's page.
        bill_sponsor = bill_page.find(id="hlSponsor").i.font.contents[0]
        bill_sponsor_link = bill_page.find(id="hlSponsor").href
        bill.add_sponsor('primary', bill_sponsor,
                         sponsor_link=bill_sponsor_link)

        # Cosponsors live on their own page, if any. BUGFIX: `'href' in tag`
        # iterates a BeautifulSoup tag's *children*, not its attributes, so
        # the original check never tested the attribute; use .get() instead.
        cosponsor_tag = bill_page.find(id="hlCoSponsors")
        if cosponsor_tag and cosponsor_tag.get('href'):
            self.parse_senate_cosponsors(bill, cosponsor_tag['href'])

        # Actions are listed on a separate page.
        action_url = bill_page.find(id="hlAllActions")['href']
        self.parse_senate_actions(bill, action_url)

        # Full bill text versions are stored on a separate page as well.
        versions_url = bill_page.find(id="hlFullBillText")
        if versions_url:
            self.parse_senate_bill_versions(bill, versions_url['href'])

        self.save_bill(bill)
示例2: scrape_bills
# 需要導入模塊: from pyutils.legislation import Bill [as 別名]
# 或者: from pyutils.legislation.Bill import add_source [as 別名]
def scrape_bills(self, chamber, year):
    """Scrape WA bills by walking every bill-summary id in a chamber's range.

    The id bounds (self.upper_min_id/upper_max_id or lower_min_id/
    lower_max_id) must already be set on the scraper instance.
    """
    self.log("Getting bill list for %s %s" % (chamber, year))

    if chamber == 'upper':
        min_id = self.upper_min_id
        max_id = self.upper_max_id
    elif chamber == 'lower':
        min_id = self.lower_min_id
        max_id = self.lower_max_id

    # Renamed loop variable from `id`, which shadowed the builtin.
    for bill_num in range(min_id, max_id):
        bill_info_url = ('http://dlr.leg.wa.gov/billsummary/default.aspx'
                         '?year=%s&bill=%s' % (year, bill_num))
        with self.soup_context(bill_info_url) as soup:
            # BUGFIX: the original `print('opened %s', id)` passed the value
            # as a second argument (printing a tuple in Python 2) instead of
            # interpolating it; use % formatting.
            print('opened %s' % bill_num)

            bill_id = soup.find(
                'span', id='ctl00_contentRegion_lblShortBillID').string
            bill_title = soup.find(
                'span', id='ctl00_contentRegion_lblBriefDescription').string
            print('bill_id %s' % bill_id)
            print('bill_title %s' % bill_title)

            session_name = self._session_dict[year]
            bill = Bill(session_name, chamber, bill_id, bill_title)
            bill.add_source(bill_info_url)

            self._scrape_bill_docs(soup, bill)
            self._scrape_bill_sponsors(soup, bill)
            self._scrape_bill_votes(soup, bill, chamber)
            self.add_bill(bill)
示例3: parse_bill
# 需要導入模塊: from pyutils.legislation import Bill [as 別名]
# 或者: from pyutils.legislation.Bill import add_source [as 別名]
def parse_bill(self, chamber, session, bill_id, bill_info_url):
    """Scrape one KY bill page: version, sponsors, actions, vote history.

    Returns early (recording nothing) when the bill was withdrawn, which
    is detected by the absence of a link to the bill.doc text.
    """
    with self.urlopen_context(bill_info_url) as bill_info_data:
        bill_info = self.soup_parser(bill_info_data)

        version_url = '%s/bill.doc' % bill_id
        version_link = bill_info.find(href=version_url)
        if not version_link:
            # This bill was withdrawn.
            return

        bill_title = version_link.findNext('p').contents[0].strip()

        bill = Bill(session, chamber, bill_id, bill_title)
        bill.add_version("Most Recent Version",
                         session_url(session) + version_url)
        bill.add_source(bill_info_url)

        # Raw string for the regex: bare '\d' in a plain literal is an
        # invalid escape (warning now, error in newer Pythons).
        sponsor_links = bill_info.findAll(href=re.compile(
            r'legislator/[SH]\d+\.htm'))
        for sponsor_link in sponsor_links:
            bill.add_sponsor('primary', sponsor_link.contents[0].strip())

        action_p = version_link.findAllNext('p')[-1]
        for action in action_p.findAll(text=True):
            action = action.strip()
            if (not action or action == 'last action' or
                    'Prefiled' in action):
                continue

            action_date = action.split('-')[0]
            action_date = dt.datetime.strptime(action_date, '%b %d')
            # Action dates omit the year; take it from the session name
            # (assumes chars 2:4 of the session hold the 2-digit year —
            # TODO confirm against the session format used by callers).
            action_date = action_date.replace(
                year=int('20' + session[2:4]))

            action = '-'.join(action.split('-')[1:])

            # Trailing chamber markers tell us who acted.
            if action.endswith('House') or action.endswith('(H)'):
                actor = 'lower'
            elif action.endswith('Senate') or action.endswith('(S)'):
                actor = 'upper'
            else:
                actor = chamber

            bill.add_action(actor, action, action_date)

        vote_link = bill_info.find(href=re.compile(r'.*/vote_history.pdf'))
        if vote_link:
            bill.add_document(
                'vote_history.pdf',
                bill_info_url.replace('.htm', '') + "/vote_history.pdf")

        self.add_bill(bill)
示例4: scrape_session
# 需要導入模塊: from pyutils.legislation import Bill [as 別名]
# 或者: from pyutils.legislation.Bill import add_source [as 別名]
def scrape_session(self, chamber, session):
    """Scrape all UT bills for one chamber of a session.

    Pulls bill ids, titles, primary sponsors, status, and text versions
    from the per-session bill index at le.state.ut.us.
    """
    if chamber == "lower":
        bill_abbr = "HB"
    else:
        bill_abbr = "SB"

    bill_list_url = "http://www.le.state.ut.us/~%s/bills.htm" % (
        session.replace(' ', ''))
    self.log("Getting bill list for %s, %s" % (session, chamber))

    try:
        base_bill_list = self.soup_parser(self.urlopen(bill_list_url))
    except Exception:
        # This session doesn't exist for this year. (Was a bare `except:`,
        # which would also swallow KeyboardInterrupt/SystemExit.)
        return

    # Raw strings for the regexes: bare '\d'/'\.' in plain literals are
    # invalid escapes in newer Pythons.
    bill_list_link_re = re.compile(r'.*%s\d+ht.htm$' % bill_abbr)
    for link in base_bill_list.findAll('a', href=bill_list_link_re):
        bill_list = self.soup_parser(self.urlopen(link['href']))
        bill_link_re = re.compile(r'.*billhtm/%s.*.htm' % bill_abbr)

        for bill_link in bill_list.findAll('a', href=bill_link_re):
            bill_id = bill_link.find(text=True).strip()

            bill_info_url = bill_link['href']
            bill_info = self.soup_parser(self.urlopen(bill_info_url))

            # The h3 header holds "<title> -- <sponsor>".
            # NOTE(review): the replace below looks like a garbled
            # '&nbsp;' from the original page — confirm against source.
            bill_title, primary_sponsor = bill_info.h3.contents[2].replace(
                ' ', ' ').strip().split(' -- ')

            bill = Bill(session, chamber, bill_id, bill_title)
            bill.add_source(bill_info_url)
            bill.add_sponsor('primary', primary_sponsor)

            status_re = re.compile(r'.*billsta/%s.*.htm' %
                                   bill_abbr.lower())
            status_link = bill_info.find('a', href=status_re)
            if status_link:
                self.parse_status(bill, status_link['href'])

            text_find = bill_info.find(
                text="Bill Text (If you are having trouble viewing")
            if text_find:
                text_link_re = re.compile(r'.*\.htm')
                # Skip the first link ([1:]) — it is not a bill version.
                for text_link in text_find.parent.parent.findAll(
                        'a', href=text_link_re)[1:]:
                    version_name = text_link.previous.strip()
                    bill.add_version(version_name, text_link['href'])

            self.add_bill(bill)
示例5: parse_bill
# 需要導入模塊: from pyutils.legislation import Bill [as 別名]
# 或者: from pyutils.legislation.Bill import add_source [as 別名]
def parse_bill(self, chamber, session, bill_id, bill_info_url):
    """Scrape one KY bill page and save it: version, sponsors, actions,
    and the vote-history PDF when present.

    Returns early without saving when the bill was withdrawn (no link to
    the bill.doc text).
    """
    with self.urlopen_context(bill_info_url) as bill_info_data:
        bill_info = self.soup_parser(bill_info_data)

        version_url = "%s/bill.doc" % bill_id
        version_link = bill_info.find(href=version_url)
        if not version_link:
            # This bill was withdrawn.
            return

        bill_title = version_link.findNext("p").contents[0].strip()

        bill = Bill(session, chamber, bill_id, bill_title)
        bill.add_version("Most Recent Version", session_url(session) + version_url)
        bill.add_source(bill_info_url)

        # Raw string for the regex: bare '\d' in a plain literal is an
        # invalid escape in newer Pythons.
        sponsor_links = bill_info.findAll(href=re.compile(r"legislator/[SH]\d+\.htm"))
        for sponsor_link in sponsor_links:
            bill.add_sponsor("primary", sponsor_link.contents[0].strip())

        action_p = version_link.findAllNext("p")[-1]
        for action in action_p.findAll(text=True):
            action = action.strip()
            if not action or action == "last action" or "Prefiled" in action:
                continue

            action_date = action.split("-")[0]
            action_date = dt.datetime.strptime(action_date, "%b %d")
            # Action dates omit the year; take it from the session name
            # (assumes chars 2:4 hold the 2-digit year — TODO confirm).
            action_date = action_date.replace(year=int("20" + session[2:4]))

            action = "-".join(action.split("-")[1:])

            # Trailing chamber markers identify the actor.
            if action.endswith("House") or action.endswith("(H)"):
                actor = "lower"
            elif action.endswith("Senate") or action.endswith("(S)"):
                actor = "upper"
            else:
                actor = chamber

            bill.add_action(actor, action, action_date)

        vote_link = bill_info.find(href=re.compile(r".*/vote_history.pdf"))
        if vote_link:
            bill.add_document("vote_history.pdf", bill_info_url.replace(".htm", "") + "/vote_history.pdf")

        self.save_bill(bill)
示例6: scrape_bills
# 需要導入模塊: from pyutils.legislation import Bill [as 別名]
# 或者: from pyutils.legislation.Bill import add_source [as 別名]
def scrape_bills(self, chamber, year):
    """Scrape MI bills for one chamber by walking bill numbers upward.

    Sessions start on odd years; an even year raises NoDataForYear.
    Stops when scrape_bill can no longer find a page.
    """
    if int(year) % 2 == 0:
        raise NoDataForYear(year)

    year = int(year)
    oyear = year  # keep the original year for the session name

    # Senate bills start at 1, House bills at 4001.
    if chamber == 'upper':
        bill_no = 1
        abbr = 'SB'
    else:
        bill_no = 4001
        abbr = 'HB'

    while True:
        (bill_page, year) = self.scrape_bill(year, abbr, bill_no)
        # If we can't find a page, we must be done. This is a healthy thing.
        # (BUGFIX: identity check `is None` instead of `== None`.)
        if bill_page is None:
            return

        title = ''.join(self.flatten(
            bill_page.findAll(id='frg_billstatus_ObjectSubject')[0]))
        title = title.replace('\n', '').replace('\r', '')
        bill_id = "%s %d" % (abbr, bill_no)

        the_bill = Bill("Regular Session %d" % oyear, chamber, bill_id, title)

        # Sponsors: the first listed link is the primary sponsor, all
        # subsequent links are cosponsors.
        first = 0
        for name in bill_page.findAll(
                id='frg_billstatus_SponsorList')[0].findAll('a'):
            the_bill.add_sponsor(['primary', 'cosponsor'][first], name.string)
            first = 1

        # Versions
        for doc in bill_page.findAll(
                id='frg_billstatus_DocumentGridTable')[0].findAll('tr'):
            r = self.parse_doc(the_bill, doc)
            if r:
                the_bill.add_version(*r)

        # Additional document tables, present only on some bills.
        if 'frg_billstatus_HlaTable' in str(bill_page):
            for doc in bill_page.findAll(
                    id='frg_billstatus_HlaTable')[0].findAll('tr'):
                r = self.parse_doc(the_bill, doc)
                if r:
                    the_bill.add_document(*r)
        if 'frg_billstatus_SfaSection' in str(bill_page):
            for doc in bill_page.findAll(
                    id='frg_billstatus_SfaSection')[0].findAll('tr'):
                r = self.parse_doc(the_bill, doc)
                if r:
                    the_bill.add_document(*r)

        the_bill.add_source(
            'http://legislature.mi.gov/doc.aspx?%d-%s-%04d' % (
                year, abbr, bill_no))
        self.parse_actions(the_bill, bill_page.findAll(
            id='frg_billstatus_HistoriesGridView')[0])
        self.add_bill(the_bill)
        bill_no = bill_no + 1
示例7: scrape_bills
# 需要導入模塊: from pyutils.legislation import Bill [as 別名]
# 或者: from pyutils.legislation.Bill import add_source [as 別名]
def scrape_bills(self, chamber, year):
    """Emit one fixed demonstration bill with sponsors, votes, and actions.

    Only the 2009 session is supported; any other year raises
    NoDataForYear.
    """
    if year != "2009":
        raise NoDataForYear

    # The demo bill is numbered 1 in whichever chamber was requested.
    if chamber == "upper":
        other_chamber, bill_id = "lower", "SB 1"
    else:
        other_chamber, bill_id = "upper", "HB 1"

    demo = Bill("2009-2010", chamber, bill_id, "A super bill")
    demo.add_source("http://example.com")
    demo.add_version("As Introduced", "http://example.com/SB1.html")
    demo.add_document("Google", "http://google.com")
    demo.add_sponsor("primary", "Bob Smith")
    demo.add_sponsor("secondary", "Johnson, Sally")

    # A passing upper-chamber vote...
    first_date = datetime.datetime.strptime("1/29/2010", "%m/%d/%Y")
    passage = Vote("upper", first_date, "Final passage", True, 2, 0, 0)
    passage.yes("Bob Smith")
    passage.yes("Sally Johnson")

    # ...and a failing lower-chamber vote the next day.
    second_date = datetime.datetime.strptime("1/30/2010", "%m/%d/%Y")
    failure = Vote("lower", second_date, "Final passage", False, 0, 1, 1)
    failure.no("B. Smith")
    failure.other("Sally Johnson")

    demo.add_vote(passage)
    demo.add_vote(failure)

    demo.add_action(chamber, "introduced", first_date)
    demo.add_action(chamber, "read first time", first_date)
    demo.add_action(other_chamber, "introduced", second_date)
    self.save_bill(demo)
示例8: scrape_session
# 需要導入模塊: from pyutils.legislation import Bill [as 別名]
# 或者: from pyutils.legislation.Bill import add_source [as 別名]
def scrape_session(self, chamber, year, prefix, session):
    """Scrape WI bills for one chamber by walking per-bill history pages.

    The history page is a fixed-width <pre> dump; lines are accumulated
    until a new "MM-DD.  A./S." action header starts, at which point the
    accumulated text is interpreted as (in order) the title, then the
    sponsor line, then individual actions.
    """
    def parse_sponsors(bill, line, chamber):
        # Primary sponsors sit in this chamber; cosponsors in the other.
        sponsor_type = None
        if chamber == "upper":
            leg_chamber = {"primary": "upper", "cosponsor": "lower"}
        else:
            leg_chamber = {"primary": "lower", "cosponsor": "upper"}
        for r in re.split(r"\sand\s|\,|;", line):
            r = r.strip()
            if r.find("Introduced by") != -1:
                sponsor_type = "primary"
                r = re.split(r"Introduced by \w+", r)[1]
            if r.find("cosponsored by") != -1:
                sponsor_type = "cosponsor"
                r = re.split(r"cosponsored by \w+", r)[1]
            bill.add_sponsor(sponsor_type, r.strip(),
                             chamber=leg_chamber[sponsor_type])

    def parse_action(bill, line, actor, date):
        line = lxml.html.fromstring(line)
        sane = line.text_content()
        # "06-18. S. Received from Assembly ................... 220 "
        #  ^^^^^^^^^^^ first 11 chars are the date and house marker
        sane = sane.strip()[11:]
        if sane.find("..") != -1:
            # Trim the dotted leader and page-number bookkeeping.
            sane = sane[0:sane.find(" ..")]
        bill.add_action(actor, sane, date)
        for doc in line.findall("a"):
            # TODO: treat amendments better; they show up as bare "1"/"3".
            bill.add_document(doc.text_content(), doc.get("href"))
        if sane.find("Ayes") != -1:
            # Vote tallies are embedded in the action text.
            self.add_vote(bill, actor, date, sane)

    house = "SB" if (chamber == "upper") else "AB"
    chambers = {"S": "upper", "A": "lower"}

    i = 1
    while True:
        try:
            url = "http://www.legis.state.wi.us/%s/data/%s%s%dhst.html" % (
                year, prefix, house, i)
            body = unicode(self.urlopen(url), "latin-1")
        except urllib2.HTTPError:  # 404 means we ran out of bills
            return

        page = lxml.html.fromstring(body).cssselect("pre")[0]
        # Split the history into lines, excluding blanks and the title line.
        history = filter(lambda x: len(x.strip()) > 0,
                         lxml.html.tostring(page).split("\n"))[2:-1]

        pending = ""  # text accumulated since the last action header
        bill_id = page.find("a").text_content()
        bill_title = None
        bill_sponsors = False
        current_year = None
        action_date = None
        current_chamber = None

        for line in history:
            flush = False

            # A bare 4-digit line announces a year change.
            if re.match(r"^(\d{4})[\s]{0,1}$", line):
                current_year = int(line.strip())
                continue

            # A "MM-DD.  A./S." header starts a new entry: everything
            # accumulated so far belongs to the previous one.
            if re.match(r"\s+(\d{2})-(\d{2}).\s\s([AS])\.\s", line):
                dm = re.findall(r"\s+(\d{2})-(\d{2}).\s\s([AS])\.\s", line)[0]
                workdata = pending
                pending = ""
                flush = True

            pending = pending + " " + line.strip()

            # The first flushed chunk is the title, the second the sponsors,
            # and every later one a chamber action.
            if flush and not bill_title:
                bill_title = workdata
                bill = Bill(session, chamber, bill_id, bill_title)
                continue
            if flush and not bill_sponsors:
                parse_sponsors(bill, workdata, chamber)
                bill_sponsors = True
                current_chamber = chambers[dm[2]]
                action_date = dt.datetime(current_year, int(dm[0]), int(dm[1]))
                continue
            if flush:
                parse_action(bill, workdata, current_chamber, action_date)
                # Now update the date/chamber for the entry just started.
                current_chamber = chambers[dm[2]]
                action_date = dt.datetime(current_year, int(dm[0]), int(dm[1]))

        # Flush the final accumulated entry.
        current_chamber = chambers[dm[2]]
        action_date = dt.datetime(current_year, int(dm[0]), int(dm[1]))
        parse_action(bill, pending, current_chamber, action_date)

        bill.add_source(url)
        self.save_bill(bill)
        i = i + 1
示例9: scrape_session_new
# 需要導入模塊: from pyutils.legislation import Bill [as 別名]
# 或者: from pyutils.legislation.Bill import add_source [as 別名]
def scrape_session_new(self, chamber, session):
    """Scrape VT bills for sessions served by the newer bills.cfm site.

    Collects versions, actions (including executive actions by the
    Governor), votes, and sponsors for every bill in the chamber.
    """
    if chamber == "lower":
        bill_abbr = "H."
    else:
        bill_abbr = "S."

    bill_list_path = "docs/bills.cfm?Session=%s&Body=%s" % (
        session.split('-')[1], bill_abbr[0])
    bill_list_url = "http://www.leg.state.vt.us/" + bill_list_path
    bill_list = BeautifulSoup(self.urlopen(bill_list_url))

    # Raw string for the regex: bare '\d'/'\.' are invalid escapes in
    # plain literals in newer Pythons.
    bill_link_re = re.compile(r'.*?Bill=%s\.\d+.*' % bill_abbr[0])
    for bill_link in bill_list.findAll('a', href=bill_link_re):
        bill_id = bill_link.string
        bill_title = bill_link.parent.findNext('b').string
        bill_info_url = "http://www.leg.state.vt.us" + bill_link['href']

        bill = Bill(session, chamber, bill_id, bill_title)
        bill.add_source(bill_info_url)

        info_page = BeautifulSoup(self.urlopen(bill_info_url))

        # Bill text versions sit in the second blockquote.
        text_links = info_page.findAll('blockquote')[1].findAll('a')
        for text_link in text_links:
            bill.add_version(text_link.string,
                             "http://www.leg.state.vt.us" +
                             text_link['href'])

        # Action history table sits in the third blockquote.
        act_table = info_page.findAll('blockquote')[2].table
        for row in act_table.findAll('tr')[1:]:
            action = ""
            for s in row.findAll('td')[1].findAll(text=True):
                action += s + " "
            action = action.strip()

            match = re.search('Governor on (.*)$', action)
            if match:
                # Executive action: date comes from the action text.
                act_date = parse_exec_date(match.group(1).strip())
                actor = 'Governor'
            else:
                # Row background color encodes the acting chamber.
                if row['bgcolor'] == 'Salmon':
                    actor = 'lower'
                else:
                    actor = 'upper'

                if row.td.a:
                    act_date = row.td.a.string
                else:
                    act_date = row.td.string

                act_date = re.search(
                    r'\d{1,2}/\d{1,2}/\d{4,4}', act_date).group(0)
                act_date = dt.datetime.strptime(act_date, '%m/%d/%Y')

            bill.add_action(actor, action, act_date)

            # A "Details" link marks a recorded vote.
            vote_link = row.find('a', text='Details')
            if vote_link:
                vote_url = vote_link.parent['href']
                self.parse_vote_new(bill, actor, vote_url)

        sponsors = info_page.find(
            text='Sponsor(s):').parent.parent.findAll('b')
        bill.add_sponsor('primary', sponsors[0].string)
        for sponsor in sponsors[1:]:
            bill.add_sponsor('cosponsor', sponsor.string)

        self.save_bill(bill)
示例10: scrape_session_old
# 需要導入模塊: from pyutils.legislation import Bill [as 別名]
# 或者: from pyutils.legislation.Bill import add_source [as 別名]
def scrape_session_old(self, chamber, session):
    """Scrape VT bills for sessions served by the older rintro/results.cfm
    database, posting a form to get the bill list for the chamber.
    """
    if chamber == "lower":
        bill_abbr = "H."
        chamber_name = "House"
        other_chamber = "Senate"
    else:
        bill_abbr = "S."
        chamber_name = "Senate"
        other_chamber = "House"

    start_date = '1/1/%s' % session.split('-')[0]
    data = urllib.urlencode({'Date': start_date,
                             'Body': bill_abbr[0],
                             'Session': session.split('-')[1]})
    bill_list_url = "http://www.leg.state.vt.us/database/"\
        "rintro/results.cfm"
    bill_list = BeautifulSoup(urllib2.urlopen(bill_list_url, data))

    # Raw string: bare '\d' is an invalid escape in a plain literal.
    bill_link_re = re.compile(r'.*?Bill=%s.\d+.*' % bill_abbr[0])
    for bill_link in bill_list.findAll('a', href=bill_link_re):
        bill_id = bill_link.string
        bill_title = bill_link.parent.parent.findAll('td')[1].string
        bill_info_url = "http://www.leg.state.vt.us" + bill_link['href']

        bill = Bill(session, chamber, bill_id, bill_title)
        bill.add_source(bill_info_url)

        info_page = BeautifulSoup(self.urlopen(bill_info_url))

        # Bill text versions live in the last blockquote.
        text_links = info_page.findAll('blockquote')[-1].findAll('a')
        for text_link in text_links:
            bill.add_version(text_link.string,
                             "http://www.leg.state.vt.us" +
                             text_link['href'])

        sponsors = info_page.find(
            text='Sponsor(s):').parent.findNext('td').findAll('b')
        bill.add_sponsor('primary', sponsors[0].string)
        for sponsor in sponsors[1:]:
            bill.add_sponsor('cosponsor', sponsor.string)

        # Grab actions from the originating chamber.
        # NOTE(review): the space-for-space replace calls below look like
        # garbled '&nbsp;' replacements from the original page — confirm.
        act_table = info_page.find(
            text='%s Status:' % chamber_name).findNext('table')
        for row in act_table.findAll('tr')[3:]:
            action = row.td.string.replace(' ', '').strip(':')
            act_date = row.findAll('td')[1].b.string.replace(' ', '')
            if act_date != "":
                detail = row.findAll('td')[2].b
                if detail and detail.string != "":
                    action += ": %s" % detail.string.replace(' ', '')
                bill.add_action(chamber, action, act_date)

        # Grab actions from the other chamber.
        act_table = info_page.find(
            text='%s Status:' % other_chamber).findNext('table')
        if act_table:
            if chamber == 'upper':
                act_chamber = 'lower'
            else:
                act_chamber = 'upper'
            for row in act_table.findAll('tr')[3:]:
                action = row.td.string.replace(' ', '').strip(':')
                act_date = row.findAll('td')[1].b.string.replace(
                    ' ', '')
                if act_date != "":
                    detail = row.findAll('td')[2].b
                    if detail and detail.string != "":
                        action += ": %s" % detail.string.replace(
                            ' ', '')
                    # BUGFIX: the original parsed the date into a local
                    # `date` that was never used and passed the raw string
                    # instead; pass the parsed datetime.
                    act_date = dt.datetime.strptime(act_date, '%m/%d/%Y')
                    bill.add_action(act_chamber, action, act_date)

        self.save_bill(bill)
示例11: parse_house_bill
# 需要導入模塊: from pyutils.legislation import Bill [as 別名]
# 或者: from pyutils.legislation.Bill import add_source [as 別名]
def parse_house_bill(self, url, session):
    """Scrape one MO House bill page and save the resulting Bill.

    The printable variant of the page ("print" instead of "content" in
    the URL) is used because it is simpler to parse.
    """
    url = re.sub("content", "print", url)
    with self.urlopen_context(url) as bill_page_data:
        bill_page = self.soup_parser(bill_page_data)
        header_table = bill_page.table

        # Core identifiers from the header table.
        bill_id = header_table.b.contents[0]
        bill_id = clean_text(bill_id)

        bill_desc = header_table.findAll('td')[1].contents[0]
        bill_desc = clean_text(bill_desc)

        lr_label_tag = bill_page.find(text=re.compile("LR Number:"))
        bill_lr = lr_label_tag.next.contents[0].strip()

        # Could substitute the description for the name,
        # but keeping it separate for now.
        bill = Bill(session, 'lower', bill_id, bill_desc,
                    bill_url=url, bill_lr=bill_lr)
        bill.add_source(url)

        # Sponsor name, with any trailing parenthesized annotation
        # stripped. Raw string: '\(' is an invalid escape otherwise.
        sponsor_dirty = bill_page.em.contents[0]
        m = re.search(r"(.*)\(.*\)", sponsor_dirty)
        if m:
            bill_sponsor = m.group(1)
        else:
            bill_sponsor = sponsor_dirty

        # The bill-details table — it'll be useful later.
        bill_details_tbl = bill_page.table.nextSibling.nextSibling

        bill_sponsor_link = None
        if bill_details_tbl.a:
            bill_sponsor_link = bill_details_tbl.a['href']

        bill.add_sponsor('primary', bill_sponsor,
                         sponsor_link=bill_sponsor_link)

        # Check for cosponsors.
        cosponsor_cell = bill_details_tbl.find(
            text=re.compile("CoSponsor")).next
        if cosponsor_cell.a:
            self.parse_house_cosponsors(bill, cosponsor_cell)

        # Parse out all the actions (again via the printable page).
        actions_link_tag = bill_page.find(
            'a', text='ACTIONS').previous.previous
        actions_link = actions_link_tag['href']
        actions_link = re.sub("content", "print", actions_link)
        self.parse_house_actions(bill, actions_link)

        # Bill versions: each has an HTML link and a preceding PDF link.
        version_tags = bill_page.findAll(href=re.compile("biltxt"))
        if version_tags:
            for version_tag in version_tags:
                if version_tag.b:
                    version = clean_text(version_tag.b.contents[0])
                    text_url = version_tag['href']
                    pdf_url = version_tag.previousSibling
                    pdf_url = pdf_url.previousSibling['href']
                    bill.add_version(version, text_url, pdf_url=pdf_url)

        self.save_bill(bill)
示例12: scrape_session
# 需要導入模塊: from pyutils.legislation import Bill [as 別名]
# 或者: from pyutils.legislation.Bill import add_source [as 別名]
def scrape_session(self, chamber, year):
    """Scrape all AK bills for one chamber across a two-year session.

    Collects sponsors, actions, votes, subjects, and text versions from
    the BASIS site.
    """
    if chamber == "upper":
        bill_abbr = "SB|SCR|SJR"
    elif chamber == "lower":
        bill_abbr = "HB|HCR|HJR"

    # Sessions last 2 years; 1993-1994 was the 18th. Explicit floor
    # division: `/` here relied on Python 2 int semantics.
    session = str(18 + ((int(year) - 1993) // 2))
    year2 = str(int(year) + 1)

    # Full calendar years for the date-range query.
    date1 = "0101" + year[2:]
    date2 = "1231" + year2[2:]

    # Get bill list.
    bill_list_url = "http://www.legis.state.ak.us/" "basis/range_multi.asp?session=%s&date1=%s&date2=%s" % (
        session,
        date1,
        date2,
    )
    self.log("Getting bill list for %s %s (this may take a long time)." % (chamber, session))
    bill_list = self.soup_parser(self.urlopen(bill_list_url))

    # Find bill links. Raw strings: bare '\d'/'\w' are invalid escapes
    # in plain literals in newer Pythons.
    re_str = r"bill=%s\d+" % bill_abbr
    links = bill_list.findAll(href=re.compile(re_str))

    for link in links:
        bill_id = link.contents[0].replace(" ", "")
        bill_name = link.parent.parent.findNext("td").find("font").contents[0].strip()
        bill = Bill(session, chamber, bill_id, bill_name.strip())

        # Get the bill info page.
        info_url = "http://www.legis.state.ak.us/basis/%s" % link["href"]
        info_page = self.soup_parser(self.urlopen(info_url))
        bill.add_source(info_url)

        # Sponsors: either a named-legislator list or a committee.
        spons_str = info_page.find(text="SPONSOR(s):").parent.parent.contents[1]
        sponsors_match = re.match(r" (SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})", spons_str)
        if sponsors_match:
            sponsors = sponsors_match.group(2).split(",")
            bill.add_sponsor("primary", sponsors[0].strip())
            for sponsor in sponsors[1:]:
                bill.add_sponsor("cosponsor", sponsor.strip())
        else:
            # Committee sponsorship.
            bill.add_sponsor("committee", spons_str.strip())

        # Actions: "(H)"/"(S)" markers give the acting chamber.
        act_rows = info_page.findAll("table", "myth")[1].findAll("tr")[1:]
        for row in act_rows:
            cols = row.findAll("td")
            act_date = cols[0].font.contents[0]
            act_date = dt.datetime.strptime(act_date, "%m/%d/%y")

            if cols[2].font.string == "(H)":
                act_chamber = "lower"
            elif cols[2].font.string == "(S)":
                act_chamber = "upper"
            else:
                act_chamber = chamber

            action = cols[3].font.contents[0].strip()
            if re.match(r"\w+ Y(\d+) N(\d+)", action):
                # Action text carries a roll-call tally: parse the vote.
                vote = self.parse_vote(bill, action, act_chamber, act_date, cols[1].a["href"])
                bill.add_vote(vote)

            bill.add_action(act_chamber, action, act_date)

        # Subjects.
        bill["subjects"] = []
        subject_link_re = re.compile(r".*subject=\w+$")
        for subject_link in info_page.findAll("a", href=subject_link_re):
            subject = subject_link.contents[0].strip()
            bill["subjects"].append(subject)

        # Versions.
        text_list_url = "http://www.legis.state.ak.us/" "basis/get_fulltext.asp?session=%s&bill=%s" % (
            session,
            bill_id,
        )
        text_list = self.soup_parser(self.urlopen(text_list_url))
        bill.add_source(text_list_url)

        text_link_re = re.compile("^get_bill_text?")
        for text_link in text_list.findAll("a", href=text_link_re):
            text_name = text_link.parent.previousSibling.contents[0]
            text_name = text_name.strip()
            text_url = "http://www.legis.state.ak.us/basis/%s" % (text_link["href"])
            bill.add_version(text_name, text_url)

        self.add_bill(bill)
示例13: get_bill_info
# 需要導入模塊: from pyutils.legislation import Bill [as 別名]
# 或者: from pyutils.legislation.Bill import add_source [as 別名]
def get_bill_info(self, session, sub, bill_id):
    """Scrape one NC bill's detail page: versions, sponsors, actions, votes.

    session/sub form the BillLookUp Session parameter; the chamber is
    inferred from the bill id prefix ('H' -> lower, otherwise upper).
    """
    bill_detail_url = (
        "http://www.ncga.state.nc.us/gascripts/"
        "BillLookUp/BillLookUp.pl?bPrintable=true"
        "&Session=%s&BillID=%s&votesToView=all" % (session[0:4] + sub, bill_id)
    )

    # Chamber comes from the bill id prefix.
    if bill_id[0] == "H":
        chamber = "lower"
    else:
        chamber = "upper"

    # Parse the bill data page, finding the latest html text.
    bill_data = self.urlopen(bill_detail_url)
    bill_soup = self.soup_parser(bill_data)

    bill_title = bill_soup.findAll(
        "div", style="text-align: center; font: bold" " 20px Arial; margin-top: 15px;" " margin-bottom: 8px;"
    )[0].contents[0]

    bill = Bill(session + sub, chamber, bill_id, bill_title)
    bill.add_source(bill_detail_url)

    # Get all versions. Raw string: bare '\w' is an invalid escape in a
    # plain literal in newer Pythons.
    links = bill_soup.findAll("a", href=re.compile(r"/Sessions/%s/Bills/\w+/HTML" % session[0:4]))
    for link in links:
        version_name = link.parent.previousSibling.previousSibling
        # NOTE(review): the space-for-space replace looks like a garbled
        # '&nbsp;' from the original page — confirm against source.
        version_name = version_name.contents[0].replace(" ", " ")
        version_name = version_name.replace(u"\u00a0", " ")

        version_url = "http://www.ncga.state.nc.us" + link["href"]
        bill.add_version(version_name, version_url)

    # Figure out which table has sponsor data: row 1 holds primary
    # sponsors, row 2 cosponsors.
    sponsor_table = bill_soup.findAll("th", text="Sponsors", limit=1)[0].findParents("table", limit=1)[0]
    sponsor_rows = sponsor_table.findAll("tr")
    for leg in sponsor_rows[1].td.findAll("a"):
        bill.add_sponsor("primary", leg.contents[0].replace(u"\u00a0", " "))
    for leg in sponsor_rows[2].td.findAll("a"):
        bill.add_sponsor("cosponsor", leg.contents[0].replace(u"\u00a0", " "))

    # Actions table: rows of (date, actor, action).
    action_table = bill_soup.findAll("th", text="Chamber", limit=1)[0].findParents("table", limit=1)[0]
    for row in action_table.findAll("tr"):
        cells = row.findAll("td")
        if len(cells) != 3:
            continue

        act_date, actor, action = map(lambda x: self.flatten(x), cells)
        act_date = dt.datetime.strptime(act_date, "%m/%d/%Y")

        if actor == "Senate":
            actor = "upper"
        elif actor == "House":
            actor = "lower"
        elif action.endswith("Gov."):
            actor = "Governor"

        bill.add_action(actor, action, act_date)

    for vote in bill_soup.findAll("a", href=re.compile("RollCallVoteTranscript")):
        self.get_vote(bill, vote["href"])

    self.add_bill(bill)
示例14: get_bill_info
# 需要導入模塊: from pyutils.legislation import Bill [as 別名]
# 或者: from pyutils.legislation.Bill import add_source [as 別名]
def get_bill_info(self, session, sub, bill_id):
    """Scrape one NC bill's detail page and save it: versions, sponsors,
    actions, and roll-call votes.

    session/sub form the BillLookUp Session parameter; the chamber is
    inferred from the bill id prefix ('H' -> lower, otherwise upper).
    """
    bill_detail_url = 'http://www.ncga.state.nc.us/gascripts/'\
        'BillLookUp/BillLookUp.pl?bPrintable=true'\
        '&Session=%s&BillID=%s&votesToView=all' % (
            session[0:4] + sub, bill_id)

    # Chamber comes from the bill id prefix.
    if bill_id[0] == 'H':
        chamber = 'lower'
    else:
        chamber = 'upper'

    # Parse the bill data page, finding the latest html text.
    bill_data = self.urlopen(bill_detail_url)
    bill_soup = self.soup_parser(bill_data)

    bill_title = bill_soup.findAll('div',
                                   style="text-align: center; font: bold"
                                   " 20px Arial; margin-top: 15px;"
                                   " margin-bottom: 8px;")[0].contents[0]

    bill = Bill(session + sub, chamber, bill_id, bill_title)
    bill.add_source(bill_detail_url)

    # Get all versions. Raw string: bare '\w' is an invalid escape in a
    # plain literal in newer Pythons.
    links = bill_soup.findAll('a', href=re.compile(
        r'/Sessions/%s/Bills/\w+/HTML' % session[0:4]))
    for link in links:
        version_name = link.parent.previousSibling.previousSibling
        # NOTE(review): the space-for-space replace looks like a garbled
        # '&nbsp;' from the original page — confirm against source.
        version_name = version_name.contents[0].replace(' ', ' ')
        version_name = version_name.replace(u'\u00a0', ' ')

        version_url = 'http://www.ncga.state.nc.us' + link['href']
        bill.add_version(version_name, version_url)

    # Figure out which table has sponsor data: row 1 holds primary
    # sponsors, row 2 cosponsors.
    sponsor_table = bill_soup.findAll('th', text='Sponsors',
                                      limit=1)[0].findParents(
                                          'table', limit=1)[0]
    sponsor_rows = sponsor_table.findAll('tr')
    for leg in sponsor_rows[1].td.findAll('a'):
        bill.add_sponsor('primary',
                         leg.contents[0].replace(u'\u00a0', ' '))
    for leg in sponsor_rows[2].td.findAll('a'):
        bill.add_sponsor('cosponsor',
                         leg.contents[0].replace(u'\u00a0', ' '))

    # Actions table: rows of (date, actor, action).
    action_table = bill_soup.findAll('th', text='Chamber',
                                     limit=1)[0].findParents(
                                         'table', limit=1)[0]
    for row in action_table.findAll('tr'):
        cells = row.findAll('td')
        if len(cells) != 3:
            continue

        act_date, actor, action = map(lambda x: self.flatten(x), cells)
        act_date = dt.datetime.strptime(act_date, '%m/%d/%Y')

        if actor == 'Senate':
            actor = 'upper'
        elif actor == 'House':
            actor = 'lower'
        elif action.endswith('Gov.'):
            actor = 'Governor'

        bill.add_action(actor, action, act_date)

    for vote in bill_soup.findAll('a', href=re.compile(
            'RollCallVoteTranscript')):
        self.get_vote(bill, vote['href'])

    self.save_bill(bill)
示例15: scrape_new_session
# 需要導入模塊: from pyutils.legislation import Bill [as 別名]
# 或者: from pyutils.legislation.Bill import add_source [as 別名]
def scrape_new_session(self, chamber, session):
    """
    Scrapes SD's bill data from 2009 on.

    Walks the session's BillList page, then each bill's history page,
    collecting versions, actions, and votes.
    """
    if chamber == 'upper':
        bill_abbr = 'SB'
    elif chamber == 'lower':
        bill_abbr = 'HB'

    # Get bill list page.
    session_url = 'http://legis.state.sd.us/sessions/%s/' % session
    bill_list_url = session_url + 'BillList.aspx'
    self.log('Getting bill list for %s %s' % (chamber, session))
    bill_list = self.soup_parser(self.urlopen(bill_list_url))

    # Bill links read "SB<nbsp><number>". Escaped '\\d' / raw string:
    # bare '\d' in a plain literal is an invalid escape in newer Pythons
    # (a raw string can't be used for the first pattern because of \xa0).
    bill_re = re.compile(u'%s\xa0(\\d+)' % bill_abbr)
    date_re = re.compile(r'\d{2}/\d{2}/\d{4}')

    for bill_link in bill_list.findAll('a'):
        if len(bill_link.contents) == 0:
            # Empty link
            continue

        bill_match = bill_re.search(bill_link.contents[0])
        if not bill_match:
            continue

        # Parse bill ID and name.
        bill_id = bill_link.contents[0].replace(u'\xa0', ' ')
        bill_name = bill_link.findNext().contents[0]

        # Download history page.
        hist_url = session_url + bill_link['href']
        history = self.soup_parser(self.urlopen(hist_url))

        bill = Bill(session, chamber, bill_id, bill_name)
        bill.add_source(hist_url)

        # Get all bill versions from the second table.
        text_table = history.findAll('table')[1]
        for row in text_table.findAll('tr')[2:]:
            version_path = row.findAll('td')[1].a['href']
            version_url = "http://legis.state.sd.us/sessions/%s/%s" % (
                session, version_path)
            version_name = row.findAll('td')[1].a.contents[0].strip()
            bill.add_version(version_name, version_url)

        # Get actions from the first table.
        act_table = history.find('table')
        for act_row in act_table.findAll('tr')[6:]:
            if act_row.find(text='Action'):
                continue

            # Get the date (if we can't find one then this isn't an action).
            date_match = date_re.match(act_row.td.a.contents[0])
            if not date_match:
                continue
            act_date = date_match.group(0)
            act_date = dt.datetime.strptime(act_date, "%m/%d/%Y")

            # Assemble the action string; a node starting with "YEAS"
            # marks an embedded roll-call vote link.
            action = ""
            for node in act_row.findAll('td')[1].contents:
                if hasattr(node, 'contents'):
                    action += node.contents[0]

                    if node.contents[0].startswith('YEAS'):
                        # This is a vote!
                        vote_url = "http://legis.state.sd.us/sessions/"\
                            "%s/%s" % (session, node['href'])
                        vote = self.scrape_new_vote(vote_url)
                        vote['date'] = act_date
                        bill.add_vote(vote)
                else:
                    action += node
            action = action.strip()

            # Add action.
            bill.add_action(chamber, action, act_date)

        self.save_bill(bill)