本文整理汇总了Python中pupa.scrape.Organization.add_link方法的典型用法代码示例。如果您正苦于以下问题:Python Organization.add_link方法的具体用法?Python Organization.add_link怎么用?Python Organization.add_link使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pupa.scrape.Organization的用法示例。
在下文中一共展示了Organization.add_link方法的7个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_full_organization
# 需要导入模块: from pupa.scrape import Organization [as 别名]
# 或者: from pupa.scrape.Organization import add_link [as 别名]
def test_full_organization():
    """Round-trip a fully-populated Organization through the importer and
    verify that every field survives the import."""
    org = ScrapeOrganization('United Nations', classification='international')
    org.add_identifier('un')
    org.add_name('UN', start_date='1945')
    org.add_contact_detail(type='phone', value='555-555-1234', note='this is fake')
    org.add_link('http://example.com/link')
    org.add_source('http://example.com/source')

    # import org
    OrganizationImporter('jurisdiction-id').import_data([org.as_dict()])

    # get person from db and assert it imported correctly
    imported = Organization.objects.get()
    assert 'ocd-organization' in imported.id
    assert imported.name == org.name

    identifier = imported.identifiers.all()[0]
    assert identifier.identifier == 'un'
    assert identifier.scheme == ''

    other_name = imported.other_names.all()[0]
    assert other_name.name == 'UN'
    assert other_name.start_date == '1945'

    contact = imported.contact_details.all()[0]
    assert contact.type == 'phone'
    assert contact.value == '555-555-1234'
    assert contact.note == 'this is fake'

    assert imported.links.all()[0].url == 'http://example.com/link'
    assert imported.sources.all()[0].url == 'http://example.com/source'
示例2: scrape_committees
# 需要导入模块: from pupa.scrape import Organization [as 别名]
# 或者: from pupa.scrape.Organization import add_link [as 别名]
def scrape_committees(self, repos):
    """Yield congressional committees (and their subcommittees) parsed from
    the unitedstates/congress-legislators YAML files.

    :param repos: iterable of YAML file names within the repository
    :yields: pupa ``Organization`` objects, subcommittees before their parent
    """
    for repo in repos:
        source = "https://raw.githubusercontent.com/unitedstates/congress-legislators/master/{0}".format(repo)
        committees = self.fetch_yaml(source)
        for committee in committees:
            org = Organization(committee["name"], classification="committee")
            org.add_source(source)
            for key in committee.keys() & {"url", "rss_url"}:
                org.add_link(committee[key])
            for key in committee.keys() & {"phone", "address"}:
                # Normalize "phone" to the OCD "voice" contact type.
                if key == "phone":
                    org.add_contact_detail(type="voice", value=committee[key])
                else:
                    org.add_contact_detail(type=key, value=committee[key])
            for key in committee.keys() & {"senate_committee_id", "house_committee_id", "thomas_id"}:
                org.add_identifier(committee[key], scheme=key)
            for subcommittee in committee.get("subcommittees", []):
                sub_org = Organization(subcommittee["name"], classification="committee", parent_id=org._id)
                sub_org.add_identifier(subcommittee["thomas_id"], scheme="thomas")
                sub_org.add_source(source)
                for key in subcommittee.keys() & {"phone", "address"}:
                    # BUG FIX: the original read committee[key] here, which
                    # attached the PARENT committee's phone/address to the
                    # subcommittee (and raised KeyError when only the
                    # subcommittee had the key).
                    if key == "phone":
                        sub_org.add_contact_detail(type="voice", value=subcommittee[key])
                    else:
                        sub_org.add_contact_detail(type=key, value=subcommittee[key])
                yield sub_org
            yield org
示例3: _scrape_committee
# 需要导入模块: from pupa.scrape import Organization [as 别名]
# 或者: from pupa.scrape.Organization import add_link [as 别名]
def _scrape_committee(self, committee_name, link, chamber):
    """Scrape individual committee page and add members"""
    page = self.get(link).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(link)

    # A page that links back to a "Committee" tab is a subcommittee page.
    is_subcommittee = bool(page.xpath('//li/a[text()="Committee"]'))

    if is_subcommittee:
        # All TN subcommittees are just the name of the parent committee with " Subcommittee"
        # at the end
        parent_committee_name = re.sub(r'\s*(Study )?Subcommittee\s*', '', committee_name)
        com = Organization(
            committee_name,
            classification='committee',
            parent_id=self.parents[parent_committee_name]
        )
    else:
        com = Organization(
            committee_name,
            chamber=chamber,
            classification='committee',
        )
        # Record this committee's id so later subcommittee scrapes can look
        # up their parent_id in self.parents (see branch above).
        self.parents[committee_name] = com._id

    OFFICER_SEARCH = '//h2[contains(text(), "Committee Officers")]/' \
                     'following-sibling::div/ul/li/a'
    MEMBER_SEARCH = '//h2[contains(text(), "Committee Members")]/' \
                    'following-sibling::div/ul/li/a'
    for a in (page.xpath(OFFICER_SEARCH) + page.xpath(MEMBER_SEARCH)):
        # Member names may be split across the anchor's text and a <span>.
        member_name = ' '.join([
            x.strip() for x in
            a.xpath('text()') + a.xpath('span/text()')
            if x.strip()
        ])
        # Officers carry their role (e.g. "Chair") in a <small> element;
        # everyone else is a plain member.
        role = a.xpath('small')
        if role:
            role = role[0].xpath('text()')[0].strip()
        else:
            role = 'member'
        if '(Vacant)' in role:
            continue
        com.add_member(member_name, role)

    com.add_link(link)
    com.add_source(link)

    return com
示例4: scrape
# 需要导入模块: from pupa.scrape import Organization [as 别名]
# 或者: from pupa.scrape.Organization import add_link [as 别名]
def scrape(self):
    """Yield DC Council committees scraped from dccouncil.us, each with its
    chairperson and councilmembers attached."""
    com_url = 'http://dccouncil.us/committees'
    listing = lxml.html.fromstring(self.get(com_url).text)
    listing.make_links_absolute(com_url)

    anchors = set(
        listing.xpath('//a[contains(@href, "dccouncil.us/committees/")]'))
    for anchor in anchors:
        url = anchor.attrib['href']
        name = anchor.text_content().strip()

        detail = lxml.html.fromstring(self.get(url).text)
        detail.make_links_absolute(url)

        # classify these as belonging to the legislature
        org = Organization(name=name, classification='committee',
                           chamber='legislature')

        summaries = detail.xpath('//p[@class="page-summary"]')
        if summaries:
            org.extras['summary'] = summaries[0].text_content().strip()

        chair = detail.xpath(
            "//h4[text()='Chairperson']/following-sibling::p")
        chair_name = self.remove_title(chair[0].text_content().strip())
        org.add_member(chair_name, role="chair")

        member_list = detail.xpath(
            "//h4[text()='Councilmembers']/following-sibling::ul")
        for item in member_list[0].xpath("./li"):
            member_name = self.remove_title(item.text_content().strip())
            # The chair already has a membership; don't add a duplicate.
            if member_name != chair_name:
                org.add_member(member_name)

        org.add_source(url)
        org.add_link(url, note='Official Website')

        if not org._related:
            self.warning('empty committee: %s;', name)
        else:
            yield org
示例5: get_organizations
# 需要导入模块: from pupa.scrape import Organization [as 别名]
# 或者: from pupa.scrape.Organization import add_link [as 别名]
def get_organizations(self):
    """Yield the organizations for this jurisdiction: the Office of the
    Secretary of the Commonwealth of Virginia, with its contact details
    and home page attached."""
    office = Organization(
        name="Office of the Secretary of the Commonwealth, Commonwealth of Virginia",
        classification="office"
    )
    office.add_contact_detail(type="voice", value="804-786-2441")
    office.add_contact_detail(
        type="address",
        value="1111 East Broad Street, 4th Floor, Richmond, Virginia 23219"
    )
    office.add_link(
        url="https://commonwealth.virginia.gov/",
        note="Home page"
    )
    # Keep a reference for other scrape methods on this jurisdiction.
    self._secretary_of_the_commonwealth = office
    yield office
示例6: scrape_joint_committee
# 需要导入模块: from pupa.scrape import Organization [as 别名]
# 或者: from pupa.scrape.Organization import add_link [as 别名]
def scrape_joint_committee(self, committee_name, url):
    """Scrape a TN joint committee from one of three page layouts, chosen
    by inspecting the URL, and return the populated Organization (or None
    if the page is unreachable)."""
    if 'state.tn.us' in url:
        # Legacy state.tn.us layout: members in the rows of a Blurb table.
        com = Organization(
            committee_name,
            chamber='joint',
            classification='committee',
        )
        try:
            page = self.get(url).text
        except requests.exceptions.ConnectionError:
            self.logger.warning("Committee link is broken, skipping")
            return
        page = lxml.html.fromstring(page)
        # Rows 2-9 of the table hold members; first cell is the name link.
        for el in page.xpath(
            "//div[@class='Blurb']/table//tr[2 <= position() and position() < 10]/td[1]"
        ):
            if el.xpath('text()') == ['Vacant']:
                continue
            (member_name, ) = el.xpath('a/text()')
            # Trailing bare text after the link (e.g. ", Chair") is the role.
            if el.xpath('text()'):
                role = el.xpath('text()')[0].strip(' ,')
            else:
                role = 'member'
            member_name = member_name.replace('Senator', '')
            member_name = member_name.replace('Representative', '')
            member_name = member_name.strip()
            com.add_member(member_name, role)
        com.add_link(url)
        com.add_source(url)
        return com
    elif 'gov-opps' in url:
        # Government Operations: one joint committee whose members are
        # listed on separate senate and house pages.
        com = Organization(
            committee_name,
            chamber='joint',
            classification='committee',
        )
        # NOTE(review): this fetch is parsed but `page` is never used below —
        # possibly dead code; confirm before removing.
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        links = ['senate', 'house']
        for link in links:
            chamber_link = self.base_href + '/' + link + '/committees/gov-opps.html'
            chamber_page = self.get(chamber_link).text
            chamber_page = lxml.html.fromstring(chamber_page)
            OFFICER_SEARCH = '//h2[contains(text(), "Committee Officers")]/' \
                             'following-sibling::div/ul/li/a'
            MEMBER_SEARCH = '//h2[contains(text(), "Committee Members")]/' \
                            'following-sibling::div/ul/li/a'
            for a in (
                chamber_page.xpath(OFFICER_SEARCH) +
                chamber_page.xpath(MEMBER_SEARCH)
            ):
                member_name = ' '.join([
                    x.strip() for x in
                    a.xpath('.//text()')
                    if x.strip()
                ])
                # Officers embed their role in a <small> element, which also
                # appears inside member_name and must be stripped back out.
                role = a.xpath('small')
                if role:
                    role = role[0].xpath('text()')[0].strip()
                    member_name = member_name.replace(role, '').strip()
                else:
                    role = 'member'
                com.add_member(member_name, role)
            com.add_source(chamber_link)
        com.add_link(url)
        com.add_source(url)
        return com
    else:
        # Modern capitol.tn.gov layout: shared committee-page scraper.
        return self._scrape_committee(committee_name, url, 'joint')
示例7: get_organizations
# 需要导入模块: from pupa.scrape import Organization [as 别名]
# 或者: from pupa.scrape.Organization import add_link [as 别名]
def get_organizations(self):
    """Yield the organizations that make up the United States Congress:
    the legislature, both chambers, and the two lobbying-disclosure
    record offices (one per chamber)."""
    congress = Organization("United States Congress",
                            classification='legislature')
    self._legislature = congress
    yield congress

    upper = Organization(
        name="United States Senate",
        classification='upper',
        parent_id=congress._id,
    )
    self._senate = upper
    yield upper

    lower = Organization(
        name="United States House",
        classification='lower',
        parent_id=congress._id,
    )
    self._house = lower
    yield lower

    sopr = Organization(
        name="Office of Public Record, US Senate",
        classification="office",
        parent_id=upper._id,
    )
    sopr.add_contact_detail(type="voice", value="202-224-0322")
    sopr.add_source(
        url="http://www.senate.gov/pagelayout/legislative/"
            "one_item_and_teasers/opr.htm",
        note="Profile page",
    )
    sopr.add_source(
        url="http://www.senate.gov/pagelayout/legislative/"
            "g_three_sections_with_teasers/lobbyingdisc.htm"
            "#lobbyingdisc=lda",
        note="Disclosure Home",
    )
    sopr.add_link(
        url="http://soprweb.senate.gov/index.cfm?event=selectfields",
        note="Disclosure Search Portal",
    )
    sopr.add_link(
        url="http://soprweb.senate.gov/",
        note="Disclosure Electronic Filing System",
    )
    self._sopr = sopr
    yield sopr

    house_clerk = Organization(
        name="Office of the Clerk, US House",
        classification="office",
        parent_id=lower._id,
    )
    house_clerk.add_contact_detail(type="voice", value="202-225-7000")
    house_clerk.add_source(url="http://clerk.house.gov/", note="Home page")
    self._house_clerk = house_clerk
    yield house_clerk

    # The legislature is yielded a second time here, preserving the
    # original generator's output sequence exactly.
    yield congress