This article collects typical code examples of the add_source method of Python's pupa.scrape.Person class. If you are wondering what exactly Person.add_source does, how to use it, or what it looks like in practice, the hand-picked examples below should help. You can also explore further usage examples of the pupa.scrape.Person class, to which this method belongs.
Below are 15 code examples of Person.add_source, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
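All of the examples follow the same basic pattern: build a Person, attach at least one source URL with add_source, and yield the object from a scraper. As a quick orientation, here is a minimal sketch of that pattern; the scraper class, name, district, and URL are hypothetical placeholders, not taken from any of the examples below.

from pupa.scrape import Person, Scraper

class ExampleScraper(Scraper):
    def scrape(self):
        # Hypothetical listing page; any URL the data actually came from works.
        url = 'http://example.com/councilors'
        person = Person(name='Jane Roe', primary_org='legislature',
                        district='Ward 1', role='Councilor')
        person.add_link(url)
        # pupa expects at least one source on every scraped object;
        # add_source(url, note='') records where the data came from.
        person.add_source(url, note='listing page')
        yield person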
Example 1: scrape
# Required imports: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_source [as alias]
def scrape(self):
    urls = Urls(dict(list=legislators_url), self)

    council = Organization(
        'Temecula City Council',
        classification='legislature')
    council.add_source(urls.list.url)
    yield council

    for tr in urls.list.xpath('//table[2]//tr')[1:]:
        # Parse some attributes.
        name, role = tr.xpath('td/p[1]//font/text()')
        image = tr.xpath('td/img/@src').pop()

        # Create legislator.
        person = Person(name, image=image)

        # Add membership on council.
        memb = person.add_membership(council, role=role)

        # Add email address.
        email, detail_url = tr.xpath('td//a/@href')
        email = email[7:]
        memb.contact_details.append(
            dict(type='email', value=email, note='work'))

        # Add sources.
        person.add_source(urls.list.url)
        person.add_source(detail_url)

        yield person
Example 2: bos_scrape_people
# Required imports: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_source [as alias]
def bos_scrape_people(self):
    page = self.lxmlize(MEMBER_LIST)
    people = page.xpath(
        "//table[@width='100%']//td[@style='TEXT-ALIGN: center']")

    for person in people:
        image, name = [self.get_one(person, x) for x in [
            ".//img",
            ".//a[contains(@href, 'councillors') and (text()!='')]"
        ]]
        role = person.xpath(".//br")[0].tail.strip()
        image = image.attrib['src']  # Fallback if we don't get one from the homepage.
        homepage = name.attrib['href']
        name = clean_name(name.text)
        info = self.scrape_homepage(homepage)
        if info.get('image', None):
            image = info['image']

        p = Person(name=name, district=role, image=image,
                   primary_org="legislature", biography=info['bio'])
        p.add_link(url=homepage, note='homepage')
        p.add_source(homepage)
        p.add_source(MEMBER_LIST)
        yield p
Example 3: scrape_alderman
# Required imports: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_source [as alias]
def scrape_alderman(self, ward_num):
    ward_url = "{}/ward-{}".format(Utils.ALDERMEN_HOME, ward_num)
    alderman_url = self.alderman_url(ward_url)
    alderman_page = self.lxmlize(alderman_url)

    # person's name is the only <h1> tag on the page
    name = alderman_page.xpath("//h1/text()")[0]

    # initialize person object with appropriate data so that pupa can
    # automatically create a membership object linking this person to
    # a post in the jurisdiction's "Board of Aldermen" organization
    district = "Ward {} Alderman".format(ward_num)
    person = Person(name=name, district=district, role="Alderman",
                    primary_org="legislature")

    # set additional fields
    person.image = alderman_page.xpath("//div/img/@src")[0]
    phone_number = alderman_page.xpath("//strong[text()='Phone:']/../text()")[1].strip()
    person.add_contact_detail(type="voice", value=phone_number)

    # add sources
    person.add_source(alderman_url, note="profile")
    person.add_source(ward_url, note="ward")

    return person
Example 4: get_council
# Required imports: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_source [as alias]
def get_council(self):
    council_doc = self.lxmlize(self.COUNCIL_URL)

    member_urls = council_doc.xpath(
        '//table[@summary="City Directory"]/tr//'
        'a[contains(@href, "/directory.aspx?EID=")]/@href')
    for member_url in member_urls:
        member_doc = self.lxmlize(member_url)

        (name, ) = member_doc.xpath('//h1[@class="BioName"]/text()')
        (name, ) = re.findall(r'^(?:Mr\.|Mrs\.|Hon\.)?\s*(.*?)\s*$', name)

        # Return everything in a list because the number of values returned
        # varies depending on whether the person has an email or not
        text_list = member_doc.xpath(
            '//a[@class="BioLink"]/parent::div/text()')
        title = text_list[1].strip()
        (title, ) = re.findall(
            r'^Title: (Council Member,?(?: Ward \d)|Mayor)\s*$', title)

        try:
            (image_url, ) = member_doc.xpath(
                '//span[@class="BioText"]//img/@src')
        except ValueError:
            image_url = ''

        member = Person(name=name,
                        image=image_url,
                        primary_org='legislature',
                        role=title)
        member.add_source(member_url)

        yield member
Example 5: scrape_counciler
# Required imports: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_source [as alias]
def scrape_counciler(self, url):
    page = self.lxmlize(url)
    who, = page.xpath("//h3[@class='subtitle']/text()")
    district, = page.xpath("//div[@class='right-bar']//h2/text()")
    image, = page.xpath(
        "//div[@class='left-bar']//a[@class='image lightbox']//img"
    )

    member = Person(
        primary_org='legislature',
        name=who, district=district,
        image=image.attrib['src']
    )
    member.add_source(url)

    details = page.xpath("//table[@align='center']//td")
    for detail in details:
        detail = detail.text_content().strip()
        if detail is None or detail == "":
            continue

        type_, value = detail.split(":", 1)
        cdtype = {
            "Home Phone": "voice",
            "Address": "address",
            "Email": "email",
            "Cell Phone": "voice",
        }[type_]
        member.add_contact_detail(type=cdtype,
                                  note=type_,
                                  value=value)

    yield member
Example 6: scrape_legislator
# Required imports: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_source [as alias]
def scrape_legislator(self, chamber, name, url):
    html = self.get(url).text
    page = lxml.html.fromstring(html)
    page.make_links_absolute(url)

    district = page.xpath('//h1[contains(., "DISTRICT")]/text()').pop() \
        .split()[1].strip().lstrip('0')

    party = page.xpath('//h2').pop().text_content()
    party = re.search(r'\((R|D|I)[ \-\]]', party).group(1)
    if party == 'D':
        party = 'Democratic'
    elif party == 'R':
        party = 'Republican'
    elif party == 'I':
        party = 'Independent'

    photo_url = page.xpath(
        "//img[contains(@src, 'images/members/')]")[0].attrib['src']

    leg = Person(name, district=district, party=party, image=photo_url, primary_org=chamber)
    leg.add_link(url)
    leg.add_source(url)

    self.scrape_offices(leg, page)

    yield leg
Example 7: scrape_chamber
# Required imports: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_source [as alias]
def scrape_chamber(self, session):
    session_key = SESSION_KEYS[session]
    legislators_response = self.api_client.get('legislators', session=session_key)

    for legislator in legislators_response:
        url_name = legislator['WebSiteUrl'].split('/')[-1]
        chamber_name = 'house' if legislator['Chamber'] == 'H' else 'senate'
        img = 'https://www.oregonlegislature.gov/{}/MemberPhotos/{}.jpg'.format(
            chamber_name, url_name
        )

        party = legislator['Party']
        if party == 'Democrat':
            party = 'Democratic'

        person = Person(name='{} {}'.format(legislator['FirstName'], legislator['LastName']),
                        primary_org={'S': 'upper', 'H': 'lower'}[legislator['Chamber']],
                        party=party,
                        district=legislator['DistrictNumber'],
                        image=img)
        person.add_link(legislator['WebSiteUrl'])
        person.add_source(legislator['WebSiteUrl'])

        if legislator['CapitolAddress']:
            person.add_contact_detail(type='address', value=legislator['CapitolAddress'],
                                      note='Capitol Office')
        if legislator['CapitolPhone']:
            person.add_contact_detail(type='voice', value=legislator['CapitolPhone'],
                                      note='Capitol Office')
        person.add_contact_detail(type='email', value=legislator['EmailAddress'],
                                  note='Capitol Office')

        yield person
Example 8: scrape_member
# Required imports: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_source [as alias]
def scrape_member(self, chamber, link):
    name = link.text.strip()
    leg_url = link.get('href')
    district = link.xpath("string(../../td[3])")
    party = link.xpath("string(../../td[4])")

    # we get email on the next page now
    # email = link.xpath("string(../../td[5])")

    if party == 'Democrat':
        party = 'Democratic'
    elif party == 'No Party Specified':
        party = 'Independent'

    pid = re.search(r"personID=(\d+)", link.attrib['href']).group(1)
    photo_url = ("https://www.legis.iowa.gov/photo"
                 "?action=getPhoto&ga=%s&pid=%s" % (self.latest_session(), pid))

    leg = Person(
        name=name,
        primary_org=chamber,
        district=district,
        party=party,
        image=photo_url)

    leg.add_link(leg_url)
    leg.add_source(leg_url)

    leg_page = lxml.html.fromstring(self.get(link.attrib['href']).text)
    self.scrape_member_page(leg, leg_page)
    yield leg
Example 9: scrape_upper_chamber
# Required imports: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_source [as alias]
def scrape_upper_chamber(self, term):
    url = "http://oksenate.gov/Senators/Default.aspx"
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    for a in doc.xpath('//table[@summary]')[0]. \
            xpath('.//td//a[contains(@href, "biographies")]'):
        tail = a.xpath('..')[0].tail
        if tail:
            district = tail.split()[1]
        else:
            district = a.xpath('../../span')[1].text.split()[1]

        if a.text is None or a.text.strip() == 'Vacant':
            self.warning("District {} appears to be empty".format(district))
            continue
        else:
            match = re.match(r'(.+) \(([A-Z])\)', a.text.strip())
            name, party = match.group(1), self._parties[match.group(2)]

        url = a.get('href')

        person = Person(primary_org='upper',
                        district=district,
                        name=name.strip(),
                        party=party,
                        )
        person.add_link(url)
        person.add_source(url)

        self.scrape_upper_offices(person, url)
        yield person
Example 10: test_full_person
# Required imports: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_source [as alias]
def test_full_person():
    person = ScrapePerson('Tom Sawyer')
    person.add_identifier('1')
    person.add_name('Tommy', start_date='1880')
    person.add_contact_detail(type='phone', value='555-555-1234', note='this is fake')
    person.add_link('http://example.com/link')
    person.add_source('http://example.com/source')

    # import person
    pd = person.as_dict()
    PersonImporter('jurisdiction-id').import_data([pd])

    # get person from db and assert it imported correctly
    p = Person.objects.get()
    assert 'ocd-person' in p.id
    assert p.name == person.name

    assert p.identifiers.all()[0].identifier == '1'
    assert p.identifiers.all()[0].scheme == ''

    assert p.other_names.all()[0].name == 'Tommy'
    assert p.other_names.all()[0].start_date == '1880'

    assert p.contact_details.all()[0].type == 'phone'
    assert p.contact_details.all()[0].value == '555-555-1234'
    assert p.contact_details.all()[0].note == 'this is fake'

    assert p.links.all()[0].url == 'http://example.com/link'
    assert p.sources.all()[0].url == 'http://example.com/source'
Example 11: handle_list_item
# Required imports: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_source [as alias]
def handle_list_item(self, item):
    photo_url = item.xpath('./img/@src')[0]
    url = item.xpath('.//h5/a/@href')[0]
    name_text = item.xpath('.//h5/a/b/text()')[0]

    name_match = re.match(r'^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$', name_text)
    name = name_match.group(1).strip()
    district = name_match.group(2).lstrip('0').upper()
    party_text = name_match.group(3)
    party = PARTIES[party_text]

    info_texts = [x.strip() for x in item.xpath(
        './div/text()[normalize-space()]'
    ) if x.strip()]
    address = '\n'.join((info_texts[0], info_texts[1]))

    phone_text = info_texts[2]
    if validate_phone_number(phone_text):
        phone = phone_text

    email_text = item.xpath('.//a/@href')[1].replace('mailto:', '').strip()
    if validate_email_address(email_text):
        email = email_text

    rep = Person(name=name, district=district, party=party,
                 primary_org='lower', role='Representative',
                 image=photo_url)
    rep.add_link(url)
    rep.add_contact_detail(type='address', value=address, note='capitol')
    rep.add_contact_detail(type='voice', value=phone, note='capitol')
    rep.add_contact_detail(type='email', value=email, note='capitol')
    rep.add_source(self.url)

    yield rep
Example 12: handle_list_item
# Required imports: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_source [as alias]
def handle_list_item(self, item):
    link = item.xpath('.//div[@class="rep_style"]/a')[0]
    name = link.text_content().strip()

    if 'Vacant' in name or 'Resigned' in name or 'Pending' in name:
        return

    party = item.xpath('.//div[@class="party_style"]/text()')[0].strip()
    party = {'D': 'Democratic', 'R': 'Republican'}[party]

    district = item.xpath('.//div[@class="district_style"]/text()')[0].strip()

    leg_url = link.get('href')
    split_url = parse.urlsplit(leg_url)
    member_id = parse.parse_qs(split_url.query)['MemberId'][0]
    image = "http://www.flhouse.gov/FileStores/Web/Imaging/Member/{}.jpg".format(member_id)

    rep = Person(name=name, district=district, party=party, primary_org='lower',
                 role='Representative', image=image)

    rep.add_link(leg_url)
    rep.add_source(leg_url)
    rep.add_source(self.url)

    self.scrape_page(RepDetail, leg_url, obj=rep)

    return rep
Example 13: scrape_member
# Required imports: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_source [as alias]
def scrape_member(self, chamber, member_url):
    member_page = self.get(member_url).text
    doc = lxml.html.fromstring(member_page)

    photo_url = doc.xpath('//div[@id="bioImage"]/img/@src')[0]
    name_pieces = doc.xpath('//span[@id="name"]/text()')[0].split()
    full_name = ' '.join(name_pieces[1:-1]).strip()

    party = name_pieces[-1]
    if party == '(R)':
        party = 'Republican'
    elif party == '(D)':
        party = 'Democratic'
    elif party == '(I)':
        party = 'Independent'

    district = doc.xpath('//span[@id="districtHeader"]/text()')[0].split()[-1]

    person = Person(name=full_name, district=district, party=party,
                    primary_org=chamber, image=photo_url)
    person.add_source(member_url)
    person.add_link(member_url)

    address = '\n'.join(doc.xpath('//div[@id="FrankfortAddresses"]//'
                                  'span[@class="bioText"]/text()'))

    phone = None
    fax = None
    phone_numbers = doc.xpath('//div[@id="PhoneNumbers"]//span[@class="bioText"]/text()')
    for num in phone_numbers:
        if num.startswith('Annex: '):
            num = num.replace('Annex: ', '')
            if num.endswith(' (fax)'):
                fax = num.replace(' (fax)', '')
            else:
                phone = num

    emails = doc.xpath(
        '//div[@id="EmailAddresses"]//span[@class="bioText"]//a/text()'
    )
    email = reduce(
        lambda match, address: address if '@lrc.ky.gov' in str(address) else match,
        [None] + emails
    )

    if phone:
        person.add_contact_detail(type='voice', value=phone, note='Capitol Office')
    if fax:
        person.add_contact_detail(type='fax', value=fax, note='Capitol Office')
    if email:
        person.add_contact_detail(type='email', value=email, note='Capitol Office')

    if address.strip() == "":
        self.warning("Missing Capitol Office!!")
    else:
        person.add_contact_detail(type='address', value=address, note='Capitol Office')

    yield person
Example 14: get_council
# Required imports: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_source [as alias]
def get_council(self):
    council_doc = self.lxmlize(self.COUNCIL_URL)

    member_urls = council_doc.xpath(
        '//table[@summary="City Directory"]/tr//'
        'a[contains(@href, "/directory.aspx?EID=")]/@href'
    )
    for member_url in member_urls:
        member_doc = self.lxmlize(member_url)

        (name,) = member_doc.xpath('//span[@class="BioName"]/span/text()')
        (name,) = re.findall(r"^(?:Mr\.|Mrs\.|Hon\.)?\s*(.*?)\s*$", name)

        (title,) = member_doc.xpath('//a[@class="BioLink"]/following-sibling::text()')
        (title,) = re.findall(r"^Title: (Council Member(?: Ward \d)|Mayor)\s*$", title)

        try:
            (image_url,) = member_doc.xpath('//span[@class="BioText"]//img/@src')
        except ValueError:
            image_url = ""

        member = Person(name=name, image=image_url, primary_org="legislature", role=title)
        member.add_source(member_url)

        yield member
Example 15: scrape_csv
# Required imports: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_source [as alias]
def scrape_csv(self, reader):
    for row in reader:
        contributor = Person(
            name="{Contact First Name} {Contact Last Name}".format(**row)
        )
        contributor.add_source(SEARCH_URL)
        yield contributor