This page collects typical usage examples of the Python method pupa.scrape.Organization.add_source. If you are unsure what Organization.add_source does, how to call it, or what real-world usage looks like, the curated examples below should help. You can also explore the containing class, pupa.scrape.Organization, for more context.
The following 15 code examples of the Organization.add_source method are shown, sorted by popularity by default. You can upvote the examples you find helpful; your votes help the system recommend better Python code samples.
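Before the examples, here is a minimal sketch of the pattern they all share: construct an Organization, then record the page the data was scraped from with add_source. The committee name and URL below are placeholders for illustration only, not taken from any of the examples.

from pupa.scrape import Organization

# Placeholder name and URL, for illustration only.
org = Organization(name='Example Committee', classification='committee')
org.add_source('http://example.com/committees')  # record where the data was scraped from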
Example 1: scrape
# Required import: from pupa.scrape import Organization [as alias]
# Or: from pupa.scrape.Organization import add_source [as alias]
def scrape(self):
    sessions = reversed(self.jurisdiction.legislative_sessions)
    committee_term_instances = committees_from_sessions(self, sessions)
    committees_by_code = build_lookup_dict(self, data_list=committee_term_instances, index_key='code')

    for code, instances in committees_by_code.items():
        # TODO: Figure out how to edit city council org.
        if code == 'CC':
            continue

        extras = {'tmmis_decision_body_ids': []}
        for i, inst in enumerate(instances):
            # TODO: Ensure this survives addition of new term (2017)
            # so specific year always creates
            canonical_i = 0
            if i == canonical_i:
                o = Organization(name=inst['name'], classification='committee')
                extras.update({'description': inst['info']})
                o.add_identifier(inst['code'], scheme=TWO_LETTER_ORG_CODE_SCHEME)

            extras['tmmis_decision_body_ids'].append({inst['term']: inst['decision_body_id']})
            o.extras = extras
            o.add_source(inst['source_url'])

            if instances[canonical_i]['name'] != inst['name']:
                # TODO: Add start_date and end_date
                o.add_name(inst['name'])

        yield o
Example 2: scrape
# Required import: from pupa.scrape import Organization [as alias]
# Or: from pupa.scrape.Organization import add_source [as alias]
def scrape(self):
    urls = Urls(dict(list=legislators_url), self)

    council = Organization(
        'Temecula City Council',
        classification='legislature')
    council.add_source(urls.list.url)
    yield council

    for tr in urls.list.xpath('//table[2]//tr')[1:]:
        # Parse some attributes.
        name, role = tr.xpath('td/p[1]//font/text()')
        image = tr.xpath('td/img/@src').pop()

        # Create legislator.
        person = Person(name, image=image)

        # Add membership on council.
        memb = person.add_membership(council, role=role)

        # Add email address.
        email, detail_url = tr.xpath('td//a/@href')
        email = email[7:]  # strip the leading 'mailto:'
        memb.contact_details.append(
            dict(type='email', value=email, note='work'))

        # Add sources.
        person.add_source(urls.list.url)
        person.add_source(detail_url)

        yield person
Example 3: scrape_approp_subcommittees
# Required import: from pupa.scrape import Organization [as alias]
# Or: from pupa.scrape.Organization import add_source [as alias]
def scrape_approp_subcommittees(self, url):
    html = self.get(url).text
    doc = lxml.html.fromstring(html)

    for strong in doc.xpath('//strong'):
        com = Organization(
            name=strong.text.strip(),
            parent_id={
                'name': 'Appropriations',
                'classification': 'committee',
            },
            classification='committee',
        )
        com.add_source(url)

        legislators = strong.getnext().tail.replace('Senators', '').strip()
        for leg in re.split(', | and ', legislators):
            if leg.endswith('(C)'):
                role = 'chairman'
                leg = leg[:-4]
            elif leg.endswith('(VC)'):
                role = 'vice chairman'
                leg = leg[:-5]
            elif leg.endswith('(MVC)'):
                role = 'minority vice chairman'
                leg = leg[:-6]
            else:
                role = 'member'
            com.add_member(leg, role=role)

        yield com
Example 4: scrape_senate_committee
# Required import: from pupa.scrape import Organization [as alias]
# Or: from pupa.scrape.Organization import add_source [as alias]
def scrape_senate_committee(self, url):
    html = self.get(url).text
    doc = lxml.html.fromstring(html)

    headers = doc.xpath('(//div[@class="row"])[2]//h1')
    assert len(headers) == 1
    name = ' '.join(headers[0].xpath('./text()'))
    name = re.sub(r'\s+Committee.*$', '', name)

    com = Organization(chamber='upper', name=name, classification='committee')

    for member in doc.xpath('(//div[@class="row"])[3]/div[1]/ul[1]/li'):
        text = member.text_content()
        member_name = member.xpath('./a/text()')[0].replace('Representative ', '')
        if 'Committee Chair' in text:
            role = 'chair'
        elif 'Minority Vice' in text:
            role = 'minority vice chair'
        elif 'Vice' in text:
            role = 'majority vice chair'
        else:
            role = 'member'

        com.add_member(member_name, role=role)

    com.add_source(url)
    yield com
Example 5: scrape_committee
# Required import: from pupa.scrape import Organization [as alias]
# Or: from pupa.scrape.Organization import add_source [as alias]
def scrape_committee(self, term, href, name):
    page = self.get(href).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(href)
    members = page.xpath("//div[@class='view-content']"
                         "//a[contains(@href, 'members')]")

    if '/joint/' in href:
        chamber = 'legislature'
    elif '/senate/' in href:
        chamber = 'upper'
    elif '/house/' in href:
        chamber = 'lower'
    else:
        # Interim committees and others were causing duplicate committee issues; skipping.
        self.warning('Failed to identify chamber for {}; skipping'.format(href))
        return

    cttie = Organization(name, chamber=chamber, classification='committee')

    for a in members:
        member = a.text
        role = a.xpath("ancestor::div/h2[@class='pane-title']/text()")[0].strip()
        role = {"Legislative Members": "member",
                "Chairman": "chair",
                "Vice Chairman": "member"}[role]

        if member is None or member.startswith("District"):
            continue

        member = member.replace('Senator ', '').replace('Representative ', '')
        cttie.add_member(member, role=role)

    cttie.add_source(href)
    yield cttie
Example 6: scrape_approp_subcommittees
# Required import: from pupa.scrape import Organization [as alias]
# Or: from pupa.scrape.Organization import add_source [as alias]
def scrape_approp_subcommittees(self):
    URL = 'http://www.senate.michigan.gov/committee/appropssubcommittee.html'
    html = self.get(URL).text
    doc = lxml.html.fromstring(html)

    for strong in doc.xpath('//strong'):
        com = Organization(
            name=strong.text.strip(),
            parent_id=self._senate_appropriations,
            classification='committee',
        )
        com.add_source(URL)

        legislators = strong.getnext().tail.replace('Senators', '').strip()
        for leg in re.split(', | and ', legislators):
            if leg.endswith('(C)'):
                role = 'chairman'
                leg = leg[:-4]
            elif leg.endswith('(VC)'):
                role = 'vice chairman'
                leg = leg[:-5]
            elif leg.endswith('(MVC)'):
                role = 'minority vice chairman'
                leg = leg[:-6]
            else:
                role = 'member'
            com.add_member(leg, role=role)

        yield com
Example 7: scrape_committee
# Required import: from pupa.scrape import Organization [as alias]
# Or: from pupa.scrape.Organization import add_source [as alias]
def scrape_committee(self, chamber, name, url):
    page = self.get(url).text
    page = lxml.html.fromstring(page)

    if page.xpath("//h3[. = 'Joint Committee']"):
        chamber = 'joint'

    subcommittee = page.xpath("//h3[@align='center']/text()")[0]
    if "Subcommittee" not in subcommittee:
        comm = Organization(
            chamber=chamber, name=name, classification='committee')
    else:
        comm = Organization(
            name=subcommittee, classification='committee',
            parent_id={'classification': chamber, 'name': name})

    comm.add_source(url)

    for link in page.xpath("//a[contains(@href, 'member=')]"):
        member = link.text.strip()
        mtype = link.xpath("string(../preceding-sibling::td[1])")
        mtype = mtype.strip(": \r\n\t").lower()
        comm.add_member(member, mtype)

    if not comm._related:
        self.warning('not saving %s, appears to be empty' % name)
    else:
        yield comm
Example 8: scrape_lower_committee
# Required import: from pupa.scrape import Organization [as alias]
# Or: from pupa.scrape.Organization import add_source [as alias]
def scrape_lower_committee(self, name, url):
    page = self.lxmlize(url)

    committee = Organization(chamber='lower', name=name,
                             classification="committee")
    committee.add_source(url)

    seen = set()
    member_links = self.get_nodes(
        page,
        '//div[@class="mod-inner"]//a[contains(@href, "mem")]')

    for member_link in member_links:
        member_name = None
        member_role = None

        member_name = member_link.text
        if member_name is None:
            continue

        # Figure out if this person is the chair.
        if member_link == member_links[0]:
            member_role = 'chair'
        else:
            member_role = 'member'

        # Deduplicate on the member's name (the original checked the committee
        # name here, which never matched, so every link was added).
        if member_name not in seen:
            committee.add_member(member_name, member_role)
            seen.add(member_name)

    return committee
Example 9: scrape
# Required import: from pupa.scrape import Organization [as alias]
# Or: from pupa.scrape.Organization import add_source [as alias]
def scrape(self):
    page = self.lxmlize(COUNCIL_PAGE)

    councillors = page.xpath('//div[@class="entry-content"]//p/strong')
    for councillor in councillors:
        # String literals are already Unicode in Python 3, so no .decode() is needed.
        district = councillor.xpath('./ancestor::p/preceding-sibling::h2')[-1].text_content().split('–')[0]
        # The page contains the mojibake sequence '-Â'; strip it from the name.
        name = ' '.join(councillor.text_content().split()[-2:]).replace('-Â', '')
        role = councillor.text_content().replace(name, '').split('-')[0]
        if 'SAO' in role or not role:
            continue

        org = Organization(name=district + ' Municipal Council', classification='legislature',
                           jurisdiction_id=self.jurisdiction.jurisdiction_id)
        org.add_source(COUNCIL_PAGE)
        yield org

        p = Person(primary_org='legislature', name=name, district=district)
        p.add_source(COUNCIL_PAGE)
        membership = p.add_membership(org, role=role, district=district)

        info = councillor.xpath('./ancestor::p/text()')
        for contact in info:
            if 'NT' in contact:
                membership.add_contact_detail('address', contact.strip(), 'legislature')
            if 'Tel' in contact:
                contact = contact.replace('Tel. ', '').replace('(', '').replace(') ', '-').strip()
                membership.add_contact_detail('voice', contact, 'legislature')
            if 'Fax' in contact:
                contact = contact.replace('Fax ', '').replace('(', '').replace(') ', '-').strip()
                membership.add_contact_detail('fax', contact, 'legislature')
        email = self.get_email(councillor, './parent::p')
        membership.add_contact_detail('email', email)

        if 'Website' in councillor.xpath('./parent::p')[0].text_content():
            p.add_link(councillor.xpath('./parent::p//a')[1].attrib['href'])
        yield p
Example 10: test_full_organization
# Required import: from pupa.scrape import Organization [as alias]
# Or: from pupa.scrape.Organization import add_source [as alias]
def test_full_organization():
    org = ScrapeOrganization('United Nations', classification='international')
    org.add_identifier('un')
    org.add_name('UN', start_date='1945')
    org.add_contact_detail(type='phone', value='555-555-1234', note='this is fake')
    org.add_link('http://example.com/link')
    org.add_source('http://example.com/source')

    # import org
    od = org.as_dict()
    OrganizationImporter('jurisdiction-id').import_data([od])

    # get organization from db and assert it imported correctly
    o = Organization.objects.get()
    assert 'ocd-organization' in o.id
    assert o.name == org.name

    assert o.identifiers.all()[0].identifier == 'un'
    assert o.identifiers.all()[0].scheme == ''

    assert o.other_names.all()[0].name == 'UN'
    assert o.other_names.all()[0].start_date == '1945'

    assert o.contact_details.all()[0].type == 'phone'
    assert o.contact_details.all()[0].value == '555-555-1234'
    assert o.contact_details.all()[0].note == 'this is fake'

    assert o.links.all()[0].url == 'http://example.com/link'
    assert o.sources.all()[0].url == 'http://example.com/source'
Example 11: _scrape_lower_special_committees
# Required import: from pupa.scrape import Organization [as alias]
# Or: from pupa.scrape.Organization import add_source [as alias]
def _scrape_lower_special_committees(self):
    url = 'http://house.louisiana.gov/H_Cmtes/SpecialCommittees.aspx'
    page = self.lxmlize(url)

    committee_list = page.xpath('//div[@class="accordion"]')[0]

    headers = committee_list.xpath('./h3')
    for header in headers:
        committee_name_text = header.xpath('string()')
        committee_name = committee_name_text.strip()
        committee_name = self._normalize_committee_name(committee_name)

        chamber = 'legislature' if committee_name.startswith('Joint') else 'lower'

        committee = Organization(committee_name, chamber=chamber,
                                 classification='committee')
        committee.add_source(url)

        committee_memberlist = header.xpath('./following-sibling::div[@class="pane"]'
                                            '//tr[@class="linkStyle2"]')
        for row in committee_memberlist:
            member_name = row.xpath('normalize-space(string(./th[1]))')
            member_name = self._normalize_member_name(member_name)
            member_role = row.xpath('normalize-space(string(./th[2]))')
            member_role = self._normalize_member_role(member_role)
            committee.add_member(member_name, member_role)

        yield committee
Example 12: scrape_page
# Required import: from pupa.scrape import Organization [as alias]
# Or: from pupa.scrape.Organization import add_source [as alias]
def scrape_page(self, link, chamber=None):
    page = self.lxmlize(link.attrib['href'])
    comName = link.text
    roles = {
        "Chair": "chair",
        "Vice Chair": "vice-chair",
        "Vice-Chair": "vice-chair",
    }
    committee = Organization(comName,
                             chamber=chamber,
                             classification='committee')
    committee.add_source(link.attrib['href'])

    for member in page.xpath('//div[@class="members"]/' +
                             'div[@class="roster-item"]'):
        details = member.xpath('.//div[@class="member-details"]')[0]
        person = details.xpath('./h4')[0].text_content()
        # This page does random weird things with whitespace in names.
        person = ' '.join(person.strip().split())
        if not person:
            continue

        role = details.xpath('./span[@class="member-role"]')
        if role:
            role = roles[role[0].text]
        else:
            role = 'member'
        committee.add_member(person, role=role)

    yield committee
Example 13: scrape_chamber
# Required import: from pupa.scrape import Organization [as alias]
# Or: from pupa.scrape.Organization import add_source [as alias]
def scrape_chamber(self, chamber, session):
    url = "%s/GetActiveCommittees?biennium=%s" % (self._base_url, session)
    page = self.get(url)
    page = lxml.etree.fromstring(page.content)

    for comm in xpath(page, "//wa:Committee"):
        agency = xpath(comm, "string(wa:Agency)")
        comm_chamber = {'House': 'lower', 'Senate': 'upper'}[agency]
        if comm_chamber != chamber:
            continue

        name = xpath(comm, "string(wa:Name)")
        # comm_id = xpath(comm, "string(wa:Id)")
        # acronym = xpath(comm, "string(wa:Acronym)")
        phone = xpath(comm, "string(wa:Phone)")

        comm = Organization(name, chamber=chamber, classification='committee')
        comm.extras['phone'] = phone
        self.scrape_members(comm, agency)
        comm.add_source(url)

        if not comm._related:
            self.warning('empty committee: %s', name)
        else:
            yield comm
Example 14: _scrape_upper_committee
# Required import: from pupa.scrape import Organization [as alias]
# Or: from pupa.scrape.Organization import add_source [as alias]
def _scrape_upper_committee(self, name, url2):
    cat = "Assignments.asp"
    url3 = url2.replace("default.asp", cat)

    committee = Organization(name,
                             chamber="upper",
                             classification="committee")
    committee.add_source(url2)

    page = self.lxmlize(url3)
    members = page.xpath('//table[@id="table38"]//font/a/b')

    for link in members:
        role = "member"
        if link == members[0]:
            role = "Chairman"
        if link == members[1]:
            role = "Vice-Chairman"

        name = link.xpath('string()')
        name = name.replace('Senator ', '')
        name = re.sub(r'\s{2,}', ' ', name).strip()  # collapse runs of whitespace

        committee.add_member(name, role)

    yield committee
Example 15: scrape_comm
# Required import: from pupa.scrape import Organization [as alias]
# Or: from pupa.scrape.Organization import add_source [as alias]
def scrape_comm(self, chamber):
    url = 'http://billstatus.ls.state.ms.us/htms/%s_cmtememb.xml' % chamber
    comm_page = self.get(url)
    root = lxml.etree.fromstring(comm_page.content)

    if chamber == 'h':
        chamber = "lower"
    else:
        chamber = "upper"

    for mr in root.xpath('//COMMITTEE'):
        name = mr.xpath('string(NAME)')
        comm = Organization(name,
                            chamber=chamber,
                            classification='committee')

        chair = mr.xpath('string(CHAIR)')
        chair = chair.replace(", Chairman", "")
        role = "Chairman"
        if len(chair) > 0:
            comm.add_member(chair, role=role)

        vice_chair = mr.xpath('string(VICE_CHAIR)')
        vice_chair = vice_chair.replace(", Vice-Chairman", "")
        role = "Vice-Chairman"
        if len(vice_chair) > 0:
            comm.add_member(vice_chair, role=role)

        members = mr.xpath('string(MEMBERS)').split(";")
        if "" in members:
            members.remove("")
        for leg in members:
            leg = leg.strip()
            comm.add_member(leg)

        comm.add_source(url)
        yield comm