This article collects representative usage examples of the Person.add_link method from Python's pupa.scrape module. If you have been wondering what Person.add_link does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore the broader usage of its containing class, pupa.scrape.Person.
Fifteen code examples of Person.add_link are shown below, sorted by popularity by default.
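Before the scraped examples, a minimal sketch of the method in isolation may help. It assumes nothing beyond what the examples themselves demonstrate; the name, district, party, and URLs are placeholders.

from pupa.scrape import Person

# Build a minimal Person; every value here is a placeholder.
person = Person(name='Jane Doe', district='1', party='Independent',
                primary_org='upper')

# add_link attaches a URL to the person's links; the optional note labels it.
person.add_link('http://example.com/members/jane-doe', note='homepage')

# Scrapers typically pair add_link with add_source so importers can record
# where the data came from.
person.add_source('http://example.com/members/jane-doe')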
Example 1: scrape_legislator
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_link [as alias]
def scrape_legislator(self, chamber, name, url):
    html = self.get(url).text
    page = lxml.html.fromstring(html)
    page.make_links_absolute(url)

    district = page.xpath('//h1[contains(., "DISTRICT")]/text()').pop() \
        .split()[1].strip().lstrip('0')

    party = page.xpath('//h2').pop().text_content()
    party = re.search(r'\((R|D|I)[ \-\]]', party).group(1)
    if party == 'D':
        party = 'Democratic'
    elif party == 'R':
        party = 'Republican'
    elif party == 'I':
        party = 'Independent'

    photo_url = page.xpath(
        "//img[contains(@src, 'images/members/')]")[0].attrib['src']

    leg = Person(name, district=district, party=party,
                 image=photo_url, primary_org=chamber)
    leg.add_link(url)
    leg.add_source(url)

    self.scrape_offices(leg, page)
    yield leg
Example 2: scrape_member
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_link [as alias]
def scrape_member(self, chamber, member_url):
    member_page = self.get(member_url).text
    doc = lxml.html.fromstring(member_page)

    photo_url = doc.xpath('//div[@id="bioImage"]/img/@src')[0]
    name_pieces = doc.xpath('//span[@id="name"]/text()')[0].split()
    full_name = ' '.join(name_pieces[1:-1]).strip()

    party = name_pieces[-1]
    if party == '(R)':
        party = 'Republican'
    elif party == '(D)':
        party = 'Democratic'
    elif party == '(I)':
        party = 'Independent'

    district = doc.xpath('//span[@id="districtHeader"]/text()')[0].split()[-1]

    person = Person(name=full_name, district=district, party=party,
                    primary_org=chamber, image=photo_url)
    person.add_source(member_url)
    person.add_link(member_url)

    address = '\n'.join(doc.xpath('//div[@id="FrankfortAddresses"]//'
                                  'span[@class="bioText"]/text()'))

    phone = None
    fax = None
    phone_numbers = doc.xpath('//div[@id="PhoneNumbers"]//span[@class="bioText"]/text()')
    for num in phone_numbers:
        if num.startswith('Annex: '):
            num = num.replace('Annex: ', '')
            if num.endswith(' (fax)'):
                fax = num.replace(' (fax)', '')
            else:
                phone = num

    emails = doc.xpath(
        '//div[@id="EmailAddresses"]//span[@class="bioText"]//a/text()'
    )
    # keep the last address on the @lrc.ky.gov domain, if any
    # (reduce lives in functools on Python 3)
    email = reduce(
        lambda match, address: address if '@lrc.ky.gov' in str(address) else match,
        [None] + emails
    )

    if phone:
        person.add_contact_detail(type='voice', value=phone, note='Capitol Office')
    if fax:
        person.add_contact_detail(type='fax', value=fax, note='Capitol Office')
    if email:
        person.add_contact_detail(type='email', value=email, note='Capitol Office')

    if address.strip() == "":
        self.warning("Missing Capitol Office!!")
    else:
        person.add_contact_detail(type='address', value=address, note='Capitol Office')

    yield person
Example 3: test_full_person
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_link [as alias]
def test_full_person():
    person = ScrapePerson('Tom Sawyer')
    person.add_identifier('1')
    person.add_name('Tommy', start_date='1880')
    person.add_contact_detail(type='phone', value='555-555-1234', note='this is fake')
    person.add_link('http://example.com/link')
    person.add_source('http://example.com/source')

    # import person
    pd = person.as_dict()
    PersonImporter('jurisdiction-id').import_data([pd])

    # get person from db and assert it imported correctly
    p = Person.objects.get()
    assert 'ocd-person' in p.id
    assert p.name == person.name
    assert p.identifiers.all()[0].identifier == '1'
    assert p.identifiers.all()[0].scheme == ''
    assert p.other_names.all()[0].name == 'Tommy'
    assert p.other_names.all()[0].start_date == '1880'
    assert p.contact_details.all()[0].type == 'phone'
    assert p.contact_details.all()[0].value == '555-555-1234'
    assert p.contact_details.all()[0].note == 'this is fake'
    assert p.links.all()[0].url == 'http://example.com/link'
    assert p.sources.all()[0].url == 'http://example.com/source'
Example 4: scrape_chamber
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_link [as alias]
def scrape_chamber(self, session):
    session_key = SESSION_KEYS[session]
    legislators_response = self.api_client.get('legislators', session=session_key)

    for legislator in legislators_response:
        url_name = legislator['WebSiteUrl'].split('/')[-1]
        chamber_name = 'house' if legislator['Chamber'] == 'H' else 'senate'
        img = 'https://www.oregonlegislature.gov/{}/MemberPhotos/{}.jpg'.format(
            chamber_name, url_name
        )

        party = legislator['Party']
        if party == 'Democrat':
            party = 'Democratic'

        person = Person(name='{} {}'.format(legislator['FirstName'], legislator['LastName']),
                        primary_org={'S': 'upper', 'H': 'lower'}[legislator['Chamber']],
                        party=party,
                        district=legislator['DistrictNumber'],
                        image=img)
        person.add_link(legislator['WebSiteUrl'])
        person.add_source(legislator['WebSiteUrl'])

        if legislator['CapitolAddress']:
            person.add_contact_detail(type='address', value=legislator['CapitolAddress'],
                                      note='Capitol Office')
        if legislator['CapitolPhone']:
            person.add_contact_detail(type='voice', value=legislator['CapitolPhone'],
                                      note='Capitol Office')
        person.add_contact_detail(type='email', value=legislator['EmailAddress'],
                                  note='Capitol Office')

        yield person
Example 5: bos_scrape_people
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_link [as alias]
def bos_scrape_people(self):
    page = self.lxmlize(MEMBER_LIST)
    people = page.xpath(
        "//table[@width='100%']//td[@style='TEXT-ALIGN: center']")

    for person in people:
        image, name = [self.get_one(person, x) for x in [
            ".//img",
            ".//a[contains(@href, 'councillors') and (text()!='')]"
        ]]
        role = person.xpath(".//br")[0].tail.strip()
        # fallback image in case the homepage doesn't provide one
        image = image.attrib['src']
        homepage = name.attrib['href']
        name = clean_name(name.text)

        info = self.scrape_homepage(homepage)
        if info.get('image', None):
            image = info['image']

        p = Person(name=name, district=role, image=image,
                   primary_org="legislature", biography=info['bio'])
        p.add_link(url=homepage, note='homepage')
        p.add_source(homepage)
        p.add_source(MEMBER_LIST)
        yield p
Example 6: handle_list_item
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_link [as alias]
def handle_list_item(self, item):
    photo_url = item.xpath('./img/@src')[0]
    url = item.xpath('.//h5/a/@href')[0]
    name_text = item.xpath('.//h5/a/b/text()')[0]

    name_match = re.match(r'^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$', name_text)
    name = name_match.group(1).strip()
    district = name_match.group(2).lstrip('0').upper()
    party_text = name_match.group(3)
    party = PARTIES[party_text]

    info_texts = [x.strip() for x in item.xpath(
        './div/text()[normalize-space()]'
    ) if x.strip()]
    address = '\n'.join((info_texts[0], info_texts[1]))

    phone_text = info_texts[2]
    if validate_phone_number(phone_text):
        phone = phone_text

    email_text = item.xpath('.//a/@href')[1].replace('mailto:', '').strip()
    if validate_email_address(email_text):
        email = email_text

    rep = Person(name=name, district=district, party=party,
                 primary_org='lower', role='Representative',
                 image=photo_url)
    rep.add_link(url)
    rep.add_contact_detail(type='address', value=address, note='capitol')
    rep.add_contact_detail(type='voice', value=phone, note='capitol')
    rep.add_contact_detail(type='email', value=email, note='capitol')
    rep.add_source(self.url)

    yield rep
Example 7: scrape_upper_chamber
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_link [as alias]
def scrape_upper_chamber(self, term):
    url = "http://oksenate.gov/Senators/Default.aspx"
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    for a in doc.xpath('//table[@summary]')[0] \
            .xpath('.//td//a[contains(@href, "biographies")]'):
        tail = a.xpath('..')[0].tail
        if tail:
            district = tail.split()[1]
        else:
            district = a.xpath('../../span')[1].text.split()[1]

        if a.text is None or a.text.strip() == 'Vacant':
            self.warning("District {} appears to be empty".format(district))
            continue
        else:
            match = re.match(r'(.+) \(([A-Z])\)', a.text.strip())
            name, party = match.group(1), self._parties[match.group(2)]

        url = a.get('href')

        person = Person(primary_org='upper',
                        district=district,
                        name=name.strip(),
                        party=party)
        person.add_link(url)
        person.add_source(url)
        self.scrape_upper_offices(person, url)
        yield person
Example 8: scrape_member
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_link [as alias]
def scrape_member(self, chamber, link):
    name = link.text.strip()
    leg_url = link.get('href')
    district = link.xpath("string(../../td[3])")
    party = link.xpath("string(../../td[4])")
    # we get email on the next page now
    # email = link.xpath("string(../../td[5])")

    if party == 'Democrat':
        party = 'Democratic'
    elif party == 'No Party Specified':
        party = 'Independent'

    pid = re.search(r"personID=(\d+)", link.attrib['href']).group(1)
    photo_url = ("https://www.legis.iowa.gov/photo"
                 "?action=getPhoto&ga=%s&pid=%s" % (self.latest_session(), pid))

    leg = Person(
        name=name,
        primary_org=chamber,
        district=district,
        party=party,
        image=photo_url)

    leg.add_link(leg_url)
    leg.add_source(leg_url)

    leg_page = lxml.html.fromstring(self.get(link.attrib['href']).text)
    self.scrape_member_page(leg, leg_page)
    yield leg
Example 9: handle_list_item
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_link [as alias]
def handle_list_item(self, item):
    link = item.xpath('.//div[@class="rep_style"]/a')[0]
    name = link.text_content().strip()

    if 'Vacant' in name or 'Resigned' in name or 'Pending' in name:
        return

    party = item.xpath('.//div[@class="party_style"]/text()')[0].strip()
    party = {'D': 'Democratic', 'R': 'Republican'}[party]

    district = item.xpath('.//div[@class="district_style"]/text()')[0].strip()

    leg_url = link.get('href')
    split_url = parse.urlsplit(leg_url)
    member_id = parse.parse_qs(split_url.query)['MemberId'][0]
    image = "http://www.flhouse.gov/FileStores/Web/Imaging/Member/{}.jpg".format(member_id)

    rep = Person(name=name, district=district, party=party, primary_org='lower',
                 role='Representative', image=image)
    rep.add_link(leg_url)
    rep.add_source(leg_url)
    rep.add_source(self.url)
    self.scrape_page(RepDetail, leg_url, obj=rep)

    return rep
Example 10: scrape_chamber
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_link [as alias]
def scrape_chamber(self, chamber, session):
    if chamber == 'upper':
        chamber_slug = 'Senate'
    elif chamber == 'lower':
        chamber_slug = 'Assembly'
    session_slug = self.jurisdiction.session_slugs[session]
    leg_base_url = 'http://www.leg.state.nv.us/App/Legislator/A/%s/%s/' % (
        chamber_slug, session_slug)
    leg_json_url = ('http://www.leg.state.nv.us/App/Legislator/A/api/%s/Legislator?house=%s' %
                    (session_slug, chamber_slug))
    resp = json.loads(self.get(leg_json_url).text)

    for item in resp:
        # empty district
        empty_names = ['District No', 'Vacant']
        if any(name in item['FullName'] for name in empty_names):
            continue
        last, first = item['FullName'].split(",", 1)
        item['FullName'] = "{first} {last}".format(last=last, first=first).strip()
        person = Person(name=item['FullName'], district=item['DistrictNbr'],
                        party=item['Party'], primary_org=chamber,
                        image=item['PhotoURL'])
        leg_url = leg_base_url + item['DistrictNbr']

        # hack to get the legislator ID
        html = self.get(leg_url).text
        for line in html.split('\n'):
            if 'GetLegislatorDetails' in line:
                leg_id = line.split(',')[1].split("'")[1]

        # fetch the json used by the page
        leg_details_url = ('https://www.leg.state.nv.us/App/Legislator/A/api/{}/Legislator?id='
                           .format(session_slug) + leg_id)
        leg_resp = json.loads(self.get(leg_details_url).text)
        details = leg_resp['legislatorDetails']

        address = details['Address1']
        address2 = details['Address2']
        if address2:
            address += ' ' + address2
        address += '\n%s, NV %s' % (details['City'], details['Zip'])
        phone = details['LCBPhone']
        email = details['LCBEmail']

        if address:
            person.add_contact_detail(type='address', value=address,
                                      note='District Office')
        if phone:
            person.add_contact_detail(type='voice', value=phone,
                                      note='District Office')
        if email:
            person.add_contact_detail(type='email', value=email,
                                      note='District Office')

        person.add_link(leg_details_url)
        person.add_source(leg_details_url)
        yield person
Example 11: test_invalid_fields_related_item
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_link [as alias]
def test_invalid_fields_related_item():
    p1 = ScrapePerson('Dwayne')
    p1.add_link('http://example.com')
    p1 = p1.as_dict()
    p1['links'][0]['test'] = 3

    with pytest.raises(DataImportError):
        PersonImporter('jid').import_data([p1])
Example 12: scrape_chamber
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_link [as alias]
def scrape_chamber(self, chamber):
    """
    Scrapes legislators for the current term only
    """
    # self.validate_term(term, latest_only=True)
    url = BASE_URL % CHAMBERS[chamber].lower()
    index = self.get(url).text
    html = lxml.html.fromstring(index)
    html.make_links_absolute(url)

    rows = html.xpath('//div[contains(@class, "row-equal-height")]')

    for row in rows:
        img_url = row.xpath('.//img/@src')[0]

        inner = row.xpath('.//div[@class="vc-column-innner-wrapper"]')[1]
        inner_text = inner.text_content()
        if 'Resigned' in inner_text or 'Substitute' in inner_text:
            continue

        name = inner.xpath('p/strong')[0].text.replace(u'\xa0', ' ').strip()
        name = re.sub(r'\s+', ' ', name)
        party = PARTY[inner.xpath('p/strong')[0].tail.strip()]
        email = inner.xpath('p/strong/a')[0].text
        district = inner.xpath('p/a')[0].text.replace('District ', '')
        person_url = inner.xpath('p/a/@href')[0]

        # skip roles for now
        role = ''
        # for com in inner.xpath('p/a[contains(@href, "committees")]'):
        #     role = com.tail.strip()

        person = Person(name=name, district=district,
                        party=party, primary_org=chamber,
                        image=img_url, role=role)

        phones = get_phones(inner)
        phone = phones.get('home') or phones.get('business')
        office_phone = phones.get('office')
        address = get_address(inner)
        fax = get_fax(inner)

        if address:
            person.add_contact_detail(type='address', value=address,
                                      note='District Office')
        if phone:
            person.add_contact_detail(type='voice', value=phone,
                                      note='District Office')
        if fax:
            person.add_contact_detail(type='fax', value=fax,
                                      note='District Office')
        if email:
            person.add_contact_detail(type='email', value=email,
                                      note='District Office')
        if office_phone:
            person.add_contact_detail(type='voice', value=office_phone,
                                      note='Capitol Office')

        person.add_source(url)
        person.add_link(person_url)
        yield person
Example 13: scrape_lower
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_link [as alias]
def scrape_lower(self, chamber):
    url = 'http://www.house.mi.gov/mhrpublic/frmRepList.aspx'
    table = [
        "website",
        "district",
        "name",
        "party",
        "location",
        "phone",
        "email"
    ]

    data = self.get(url).text
    doc = lxml.html.fromstring(data)

    # skip two rows at top
    for row in doc.xpath('//table[@id="grvRepInfo"]/*'):
        tds = row.xpath('.//td')
        if len(tds) == 0:
            continue

        metainf = {}
        for i in range(0, len(table)):
            metainf[table[i]] = tds[i]

        district = str(int(metainf['district'].text_content().strip()))
        party = metainf['party'].text_content().strip()
        phone = metainf['phone'].text_content().strip()
        email = metainf['email'].text_content().strip()
        leg_url = metainf['website'].xpath("./a")[0].attrib['href']
        name = metainf['name'].text_content().strip()
        if name == 'Vacant' or re.match(r'^District \d{1,3}$', name):
            self.warning('District {} appears vacant, and will be skipped'.format(district))
            continue

        office = metainf['location'].text_content().strip()
        office = re.sub(
            ' HOB',
            ' Anderson House Office Building\n124 North Capitol Avenue\nLansing, MI 48933',
            office
        )
        office = re.sub(
            ' CB',
            ' State Capitol Building\nLansing, MI 48909',
            office
        )

        photo_url = self.get_photo_url(leg_url)

        person = Person(name=name, district=district, party=abbr[party],
                        primary_org='lower', image=photo_url[0] if photo_url else None)

        person.add_link(leg_url)
        person.add_source(leg_url)
        person.add_contact_detail(type='address', value=office, note='Capitol Office')
        person.add_contact_detail(type='voice', value=phone, note='Capitol Office')
        person.add_contact_detail(type='email', value=email, note='Capitol Office')

        yield person
Example 14: scrape_member_page
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_link [as alias]
def scrape_member_page(self, chamber, url):
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    for legislator in page.xpath(
            "//div[contains(concat(' ', normalize-space(@class), ' '), "
            "' memberModule ')]"
    ):
        img = legislator.xpath(
            ".//div[@class='thumbnail']//img")[0].attrib['src']
        data = legislator.xpath(".//div[@class='data']")[0]
        homepage = data.xpath(".//a[@class='black']")[0]
        full_name = homepage.text_content()

        if "Vacant" in full_name:
            continue

        homepage = homepage.attrib['href']
        party = data.xpath(
            ".//span[@class='partyLetter']")[0].text_content()
        party = {"R": "Republican", "D": "Democratic"}[party]
        office_lines = data.xpath("child::text()")
        phone = office_lines.pop(-1)
        office = "\n".join(office_lines)

        h3 = data.xpath("./h3")
        if len(h3):
            h3 = h3[0]
            district = h3.xpath("./br")[0].tail.replace("District", "").strip()
        else:
            district = re.findall(
                r"\d+\.png",
                legislator.attrib['style']
            )[-1].split(".", 1)[0]

        full_name = re.sub(r"\s+", " ", full_name).strip()
        email = (
            'rep{0:0{width}}@ohiohouse.gov'
            if chamber == 'lower' else
            'sd{0:0{width}}@ohiosenate.gov'
        ).format(int(district), width=2)

        leg = Person(name=full_name, district=district,
                     party=party, primary_org=chamber,
                     image=img)

        leg.add_contact_detail(type='address', value=office, note='Capitol Office')
        leg.add_contact_detail(type='voice', value=phone, note='Capitol Office')
        leg.add_contact_detail(type='email', value=email, note='Capitol Office')

        self.scrape_homepage(leg, chamber, homepage)

        leg.add_source(url)
        leg.add_link(homepage)
        yield leg
Example 15: parse_senate
# Required import: from pupa.scrape import Person [as alias]
# Or: from pupa.scrape.Person import add_link [as alias]
def parse_senate(self, div, chamber):
    name = div.xpath('.//h3/text()')[0]
    if name.endswith(' (R)'):
        party = 'Republican'
    elif name.endswith(' (D)'):
        party = 'Democratic'
    else:
        self.warning('skipping ' + name)
        return None

    name = name.split(' (')[0]
    district = div.xpath(
        './/div[contains(@class, "senator-district")]/div/text()'
    )[0].strip().lstrip('0')
    photo_url = div.xpath('.//img/@src')[0]

    person = Person(
        name=name,
        party=party,
        district=district,
        primary_org=chamber,
        image=photo_url,
    )

    url = div.xpath('.//a/@href')[0]
    person.add_link(url)

    # CA senators have working emails, but they're not putting them on
    # their public pages anymore
    email = self._construct_email(chamber, name)
    person.add_contact_detail(type='email', value=email, note='Senate Office')

    office_path = './/div[contains(@class, "{}")]//p'

    for addr in div.xpath(office_path.format('views-field-field-senator-capitol-office')):
        note = 'Senate Office'
        addr, phone = addr.text_content().split('; ')
        person.add_contact_detail(type='address', value=addr.strip(), note=note)
        person.add_contact_detail(type='voice', value=phone.strip(), note=note)

    n = 1
    for addr in div.xpath(office_path.format('views-field-field-senator-district-office')):
        note = 'District Office #{}'.format(n)
        for line in addr.text_content().strip().splitlines():
            line = line.strip().replace(u'\xa0', ' ')
            try:
                address, phone = line.split('; ')
                person.add_contact_detail(type='address', value=address.strip(), note=note)
                person.add_contact_detail(type='voice', value=phone.strip(), note=note)
            except ValueError:
                person.add_contact_detail(type='address', value=line, note=note)
        n += 1

    return person