本文整理汇总了Python中pupa.scrape.Person类的典型用法代码示例。如果您正苦于以下问题:Python Person类的具体用法?Python Person怎么用?Python Person使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Person类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: scrape_csv
def scrape_csv(self, reader):
    """Yield a Person (with source attached) for each contributor row in *reader*."""
    for record in reader:
        full_name = "{Contact First Name} {Contact Last Name}".format(**record)
        contributor = Person(name=full_name)
        contributor.add_source(SEARCH_URL)
        yield contributor
示例2: test_deduplication_no_name_overlap
def test_deduplication_no_name_overlap():
    """A scraped person whose name matches nothing existing imports as a new record."""
    create_person()
    # make sure we're not just being ridiculous and avoiding importing anything in the same org
    scraped = ScrapePerson('CM Punk')
    PersonImporter('jurisdiction-id').import_data([scraped.as_dict()])
    assert Person.objects.all().count() == 2
示例3: scrape_upper_chamber
def scrape_upper_chamber(self, term):
    """Scrape the Oklahoma Senate roster, yielding a Person per filled seat.

    Vacant districts are logged with a warning and skipped.
    """
    url = "http://oksenate.gov/Senators/Default.aspx"
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    # Member links live inside the first summarized table.
    for a in doc.xpath('//table[@summary]')[0]. \
            xpath('.//td//a[contains(@href, "biographies")]'):
        # District number usually trails the link's parent element; fall back
        # to a sibling <span> when the tail text is absent.
        tail = a.xpath('..')[0].tail
        if tail:
            district = tail.split()[1]
        else:
            district = a.xpath('../../span')[1].text.split()[1]
        if a.text is None or a.text.strip() == 'Vacant':
            self.warning("District {} appears to be empty".format(district))
            continue
        else:
            # Link text looks like "Name (R)"; map the letter via self._parties.
            match = re.match(r'(.+) \(([A-Z])\)', a.text.strip())
            name, party = match.group(1), self._parties[match.group(2)]
        url = a.get('href')
        person = Person(primary_org='upper',
                        district=district,
                        name=name.strip(),
                        party=party,
                        )
        person.add_link(url)
        person.add_source(url)
        self.scrape_upper_offices(person, url)
        yield person
示例4: get_council
def get_council(self):
    """Yield a Person for every member linked from the city directory table."""
    directory = self.lxmlize(self.COUNCIL_URL)
    profile_urls = directory.xpath(
        '//table[@summary="City Directory"]/tr//'
        'a[contains(@href, "/directory.aspx?EID=")]/@href')
    for profile_url in profile_urls:
        profile = self.lxmlize(profile_url)
        (name, ) = profile.xpath('//h1[@class="BioName"]/text()')
        # Strip an optional honorific prefix plus surrounding whitespace.
        (name, ) = re.findall(r'^(?:Mr\.|Mrs\.|Hon\.)?\s*(.*?)\s*$', name)
        # Returning everything into a list because the number of values returned varies
        # depending on if the person has an email or not
        text_list = profile.xpath('//a[@class="BioLink"]/parent::div/text()')
        raw_title = text_list[1].strip()
        (title, ) = re.findall(
            r'^Title: (Council Member,?(?: Ward \d)|Mayor)\s*$', raw_title)
        try:
            (image_url, ) = profile.xpath('//span[@class="BioText"]//img/@src')
        except ValueError:
            # No (or multiple) portrait images; fall back to no image.
            image_url = ''
        member = Person(name=name,
                        image=image_url,
                        primary_org='legislature',
                        role=title)
        member.add_source(profile_url)
        yield member
示例5: test_bill_sponsor_by_identifier
def test_bill_sponsor_by_identifier():
    """A sponsorship recorded by identifier resolves to the person holding that identifier."""
    create_jurisdiction()
    org = create_org()
    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', chamber='lower')
    bill.add_sponsorship_by_identifier(name="SNODGRASS",
                                       classification='sponsor',
                                       entity_type='person',
                                       primary=True,
                                       identifier="TOTALLY_REAL_ID",
                                       scheme="TOTALLY_REAL_SCHEME")
    org_importer = OrganizationImporter('jid')
    person_importer = PersonImporter('jid')
    snodgrass = ScrapePerson(name='Zadock Snodgrass')
    snodgrass.add_identifier(identifier='TOTALLY_REAL_ID',
                             scheme='TOTALLY_REAL_SCHEME')
    person_importer.import_data([snodgrass.as_dict()])
    saved_person = Person.objects.get()
    Membership.objects.create(person_id=saved_person.id,
                              organization_id=org.id)
    BillImporter('jid', org_importer, person_importer).import_data([bill.as_dict()])
    saved_bill = Bill.objects.get()
    (sponsorship,) = saved_bill.sponsorships.all()
    assert sponsorship.person.name == "Zadock Snodgrass"
示例6: scrape_counciler
def scrape_counciler(self, url):
    """Scrape one council member's profile page at *url* and yield a Person."""
    page = self.lxmlize(url)
    name, = page.xpath("//h3[@class='subtitle']/text()")
    district, = page.xpath("//div[@class='right-bar']//h2/text()")
    portrait, = page.xpath(
        "//div[@class='left-bar']//a[@class='image lightbox']//img"
    )
    member = Person(primary_org='legislature',
                    name=name,
                    district=district,
                    image=portrait.attrib['src'])
    member.add_source(url)
    # Map the page's label text to pupa contact-detail types.
    label_to_type = {
        "Home Phone": "voice",
        "Address": "address",
        "Email": "email",
        "Cell Phone": "voice",
    }
    for cell in page.xpath("//table[@align='center']//td"):
        text = cell.text_content().strip()
        if not text:
            continue
        label, value = text.split(":", 1)
        member.add_contact_detail(type=label_to_type[label],
                                  note=label,
                                  value=value)
    yield member
示例7: scrape_legislator
def scrape_legislator(self, chamber, name, url):
    """Scrape a legislator's bio page and yield a populated Person."""
    page = lxml.html.fromstring(self.get(url).text)
    page.make_links_absolute(url)
    # Heading reads e.g. "DISTRICT 07"; take the number and drop leading zeros.
    district = page.xpath('//h1[contains(., "DISTRICT")]/text()').pop() \
        .split()[1].strip().lstrip('0')
    raw_party = page.xpath('//h2').pop().text_content()
    party_code = re.search(r'\((R|D|I)[ \-\]]', raw_party).group(1)
    # The regex only yields D/R/I; pass the code through unchanged otherwise.
    party = {'D': 'Democratic',
             'R': 'Republican',
             'I': 'Independent'}.get(party_code, party_code)
    photo_url = page.xpath(
        "//img[contains(@src, 'images/members/')]")[0].attrib['src']
    legislator = Person(name, district=district, party=party,
                        image=photo_url, primary_org=chamber)
    legislator.add_link(url)
    legislator.add_source(url)
    self.scrape_offices(legislator, page)
    yield legislator
示例8: test_deduplication_same_name
def test_deduplication_same_name():
    """Importing a person with an identical name deduplicates to one record."""
    create_person()
    # simplest case- just the same name
    scraped = ScrapePerson('Dwayne Johnson')
    PersonImporter('jurisdiction-id').import_data([scraped.as_dict()])
    assert Person.objects.all().count() == 1
示例9: test_deduplication_other_name_exists
def test_deduplication_other_name_exists():
    """A name already stored in other_names deduplicates to the existing person."""
    create_person()
    # Rocky is already saved in other_names
    scraped = ScrapePerson('Rocky')
    PersonImporter('jurisdiction-id').import_data([scraped.as_dict()])
    assert Person.objects.all().count() == 1
示例10: test_multiple_orgs_of_same_class
def test_multiple_orgs_of_same_class():
    """
    We should be able to set memberships on organizations with the
    same classification within the same jurisdictions
    """
    for org_id, org_name in (("fnd", "Foundation"), ("fdr", "Federation")):
        Organization.objects.create(id=org_id, name=org_name,
                                    classification="foundation",
                                    jurisdiction_id="fnd-jid")
    hari = ScrapePerson('Hari Seldon',
                        primary_org='foundation',
                        role='founder',
                        primary_org_name='Foundation')
    picard = ScrapePerson('Jean Luc Picard',
                          primary_org='foundation',
                          role='founder',
                          primary_org_name='Federation')
    person_importer = PersonImporter('fnd-jid')
    person_importer.import_data([hari.as_dict()])
    person_importer.import_data([picard.as_dict()])
    # try to import a membership
    membership_importer = MembershipImporter('fnd-jid',
                                             person_importer,
                                             OrganizationImporter('fnd-jid'),
                                             DumbMockImporter())
    membership_importer.import_data([hari._related[0].as_dict(),
                                     picard._related[0].as_dict()])
    assert (Person.objects.get(name='Hari Seldon')
            .memberships.get().organization.name == 'Foundation')
    assert (Person.objects.get(name='Jean Luc Picard')
            .memberships.get().organization.name == 'Federation')
示例11: scrape_member
def scrape_member(self, chamber, link):
    """Build a Person from one roster-table link element and yield it."""
    name = link.text.strip()
    member_url = link.get('href')
    district = link.xpath("string(../../td[3])")
    party = link.xpath("string(../../td[4])")
    # we get email on the next page now
    # email = link.xpath("string(../../td[5])")
    # Normalize the site's party labels; anything else passes through as-is.
    party = {'Democrat': 'Democratic',
             'No Party Specified': 'Independent'}.get(party, party)
    pid = re.search(r"personID=(\d+)", link.attrib['href']).group(1)
    photo_url = ("https://www.legis.iowa.gov/photo"
                 "?action=getPhoto&ga=%s&pid=%s" % (self.latest_session(), pid))
    member = Person(
        name=name,
        primary_org=chamber,
        district=district,
        party=party,
        image=photo_url)
    member.add_link(member_url)
    member.add_source(member_url)
    detail_page = lxml.html.fromstring(self.get(link.attrib['href']).text)
    self.scrape_member_page(member, detail_page)
    yield member
示例12: test_deduplication_no_jurisdiction_overlap
def test_deduplication_no_jurisdiction_overlap():
    """Importing the same name under a different jurisdiction creates a new person."""
    create_person()
    # make sure we get a new person if we're in a different org
    scraped = ScrapePerson('Dwayne Johnson')
    PersonImporter('new-jurisdiction-id').import_data([scraped.as_dict()])
    assert Person.objects.all().count() == 2
示例13: get_council
def get_council(self):
    """Yield a Person for each member linked from the city directory page."""
    directory = self.lxmlize(self.COUNCIL_URL)
    profile_urls = directory.xpath(
        '//table[@summary="City Directory"]/tr//'
        'a[contains(@href, "/directory.aspx?EID=")]/@href'
    )
    for profile_url in profile_urls:
        profile = self.lxmlize(profile_url)
        (name,) = profile.xpath('//span[@class="BioName"]/span/text()')
        # Drop an optional honorific prefix plus surrounding whitespace.
        (name,) = re.findall(r"^(?:Mr\.|Mrs\.|Hon\.)?\s*(.*?)\s*$", name)
        (raw_title,) = profile.xpath('//a[@class="BioLink"]/following-sibling::text()')
        (title,) = re.findall(r"^Title: (Council Member(?: Ward \d)|Mayor)\s*$", raw_title)
        try:
            (image_url,) = profile.xpath('//span[@class="BioText"]//img/@src')
        except ValueError:
            # No (or multiple) portrait images on the bio page.
            image_url = ""
        member = Person(name=name, image=image_url,
                        primary_org="legislature", role=title)
        member.add_source(profile_url)
        yield member
示例14: table_row_to_legislator_and_profile_url
def table_row_to_legislator_and_profile_url(table_row_element, chamber):
    """Derive a Person from an HTML table row lxml Element, and a link to their profile.

    Args:
        table_row_element: lxml Element for one <tr>; its <td> cells are
            expected in the order role, name, district, party, phone, email.
        chamber: chamber identifier, passed through as the Person's primary_org.

    Returns:
        A ``(Person, profile_url)`` tuple.
    """
    td_elements = table_row_element.xpath('td')
    (role_element, name_element, district_element, party_element,
     phone_element, email_element) = td_elements
    full_name = name_element.text_content().strip()
    district = district_element.text_content().strip()
    party = party_element.text_content().strip()
    # Normalize the site's label to pupa's canonical party name.
    if party == 'Democrat':
        party = 'Democratic'
    role = role_element.text_content().strip()
    address = co_address_from_role(role)
    phone = phone_element.text_content().strip()
    email = email_element.text_content().strip()
    (profile_url, ) = name_element.xpath('a/@href')
    # NOTE: a leftover debug print(chamber, district, party) was removed here.
    legislator = Person(primary_org=chamber,
                        name=full_name,
                        district=district,
                        party=party)
    legislator.add_contact_detail(type='address', value=address, note='Capitol Office')
    legislator.add_contact_detail(type='voice', value=phone, note='Capitol Office')
    legislator.add_contact_detail(type='email', value=email, note='Capitol Office')
    return legislator, profile_url
示例15: scrape_chamber
def scrape_chamber(self, chamber):
    """Scrape Alaska legislators for one chamber ('upper' -> senate site,
    anything else -> house site), yielding a Person for each non-deceased member.
    """
    # Normalize the site's party labels to canonical party names.
    self._party_map = {
        'Democrat': 'Democratic',
        'Republican': 'Republican',
        'Non Affiliated': 'Independent',
        'Not Affiliated': 'Independent',
    }
    if chamber == 'upper':
        url = 'http://senate.legis.state.ak.us/'
    else:
        url = 'http://house.legis.state.ak.us/'
    page = self.lxmlize(url)
    # The second <ul class="item"> holds the member listing.
    items = page.xpath('//ul[@class="item"]')[1].getchildren()
    for item in items:
        photo_url = item.xpath('.//img/@src')[0]
        name = item.xpath('.//strong/text()')[0]
        leg_url = item.xpath('.//a/@href')[0]
        email = item.xpath('.//a[text()="Email Me"]/@href')
        if email:
            email = email[0].replace('mailto:', '')
        else:
            # No "Email Me" link; email stays an empty list here — the
            # downstream _scrape_offices call is expected to handle that.
            self.warning('no email for ' + name)
        party = district = None
        skip = False
        # Party/district are encoded as <dt> labels with <dd> values;
        # a label starting with "Deceased" marks a member to skip entirely.
        for dt in item.xpath('.//dt'):
            dd = dt.xpath('following-sibling::dd')[0].text_content()
            label = dt.text.strip()
            if label == 'Party:':
                party = dd
            elif label == 'District:':
                district = dd
            elif label.startswith('Deceased'):
                skip = True
                self.warning('skipping deceased ' + name)
                break
        if skip:
            continue
        person = Person(
            primary_org=chamber,
            district=district,
            name=name,
            party=self._party_map[party],
            image=photo_url,
        )
        person.add_source(leg_url)
        person.add_link(leg_url)
        # scrape offices
        self._scrape_offices(person, leg_url, email)
        yield person