当前位置: 首页>>代码示例>>Python>>正文

Python Person.extras['name_first'] 用法代码示例

本文整理汇总了 Python 中 pupa.scrape.Person 的 extras['name_first'] 字段的典型用法代码示例。注意:从示例代码看,extras 是 Person 对象上一个类似字典的属性,'name_first' 是写入其中的一个键,而不是一个方法。如果您正苦于以下问题:Python Person.extras['name_first'] 的具体用法?Python Person.extras['name_first'] 怎么用?Python Person.extras['name_first'] 使用的例子?那么恭喜您,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该字段所在的 pupa.scrape.Person 的用法示例。


示例1: scrape_chamber

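# Required import: from pupa.scrape import Person  (optionally `as <alias>`)
# Note: extras['name_first'] is a key stored on a Person instance's `extras` mapping, so it
# cannot be imported on its own. The example below also relies on `re`, `xlrd`, and on the
# names `link_col_ix`, `excel_mapping` and `translate`, which are defined outside this snippet.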
    def scrape_chamber(self, chamber=None):
        """Yield a ``Person`` for every seated legislator of one chamber.

        Two remote resources are combined, keyed by district:

        * an HTML contact list (email, phone and a link to a detail page), and
        * an Excel roster that is downloaded to ``ri_leg.xls`` in the current
          working directory and read with ``xlrd``.

        Relies on names defined outside this method: ``link_col_ix`` (index of
        the contact-table column holding the detail link), ``excel_mapping``
        (field name -> spreadsheet column index) and ``translate`` (party
        value from the spreadsheet -> party name passed to ``Person``).

        Args:
            chamber: ``'upper'`` for senators or ``'lower'`` for
                representatives.

        Yields:
            One ``Person`` per spreadsheet row whose name is not "VACANT".

        Raises:
            ValueError: if ``chamber`` is neither ``'upper'`` nor ``'lower'``.
            KeyError: if a seated district in the spreadsheet has no entry on
                the contact page.
        """
        if chamber == 'upper':
            url = ('http://webserver.rilin.state.ri.us/Documents/Senators.xls')
            rep_type = 'Senator'
            contact_url = 'http://webserver.rilin.state.ri.us/Email/SenEmailListDistrict.asp'
        elif chamber == 'lower':
            url = ('http://webserver.rilin.state.ri.us/Documents/Representatives.xls')
            rep_type = 'Representative'
            contact_url = 'http://webserver.rilin.state.ri.us/Email/RepEmailListDistrict.asp'
        else:
            # Fail with a clear message instead of an UnboundLocalError below.
            raise ValueError(
                "chamber must be 'upper' or 'lower', got {!r}".format(chamber))

        contact_page = self.lxmlize(contact_url)
        contact_info_by_district = {}
        for row in contact_page.xpath('//tr[@valign="TOP"]'):
            tds = row.xpath('td')
            (detail_link, ) = tds[link_col_ix].xpath('.//a/@href')
            # Ignore name (2nd col). We have a regex built up below for the spreadsheet name
            # I don't want to touch
            district, _, email, phone = [td.text_content().strip() for td in tds[:link_col_ix]]
            contact_info_by_district[district] = {
                'email': email,
                'phone': phone,
                'detail_link': detail_link,
            }

        self.urlretrieve(url, 'ri_leg.xls')

        wb = xlrd.open_workbook('ri_leg.xls')
        sh = wb.sheet_by_index(0)

        # Both patterns are loop-invariant, so build them once.
        # The first strips a leading "Senator"/"Representative" title from the name.
        title_prefix_re = re.compile(r"^{}(?=\s?[A-Z].*$)".format(rep_type))
        first_m_last_re = re.compile(r'^\S+\s[A-Z]\.\s\S+$')

        # Row 0 is skipped (presumably the spreadsheet's header row).
        for rownum in range(1, sh.nrows):
            d = {
                field: sh.cell(rownum, col_num).value
                for field, col_num in excel_mapping.items()
            }

            # Convert float to an int, and then to string, the format required by pupa
            district = str(int(d['district']))
            if d['full_name'].upper() == "VACANT":
                self.warning(
                    "District {}'s seat is vacant".format(district))
                # Nobody to emit for an empty seat.
                continue

            contact_info = contact_info_by_district[district]

            # RI is very fond of First M. Last name formats and
            # they're being misparsed upstream, so fix here
            (first, middle, last) = ('', '', '')
            full_name = title_prefix_re.sub('', d['full_name']).strip()
            if first_m_last_re.match(full_name):
                (first, middle, last) = full_name.split()

            # Note - if we ever need to speed this up, it looks like photo_url can be mapped
            # from the detail_link a la /senators/Paolino/ -> /senators/pictures/Paolino.jpg
            detail_page = self.lxmlize(contact_info['detail_link'])
            (photo_url, ) = detail_page.xpath('//div[@class="ms-WPBody"]//img/@src')

            person = Person(
                primary_org=chamber, district=district, name=full_name,
                party=translate[d['party']], image=photo_url
            )
            person.extras['town_represented'] = d['town_represented']
            person.extras['name_first'] = first
            person.extras['name_middle'] = middle
            person.extras['name_last'] = last

            person.add_contact_detail(type='address', value=d['address'], note='District Office')
            person.add_contact_detail(
                type='voice', value=contact_info['phone'], note='District Office')
            person.add_contact_detail(
                type='email', value=contact_info['email'], note='District Office')

            # Cite the pages the data came from; pupa validation expects
            # scraped objects to carry at least one source.
            person.add_source(contact_url)
            person.add_source(contact_info['detail_link'])

            yield person
