

Python Event.add_document Method Code Examples

This article collects typical usage examples of the Python method pupa.scrape.Event.add_document. If you have been wondering what exactly Event.add_document does, how to call it, or what real uses of it look like, the curated examples below should help. You can also explore further usage examples of its containing class, pupa.scrape.Event.


The following presents 11 code examples of the Event.add_document method, drawn from open-source projects and sorted by popularity by default.
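Before turning to the examples, here is a minimal, self-contained sketch of the calling pattern they all share. Every name, date, and URL below is a placeholder invented for illustration; the add_document arguments (a note describing the document, its URL, an optional media_type, and an on_duplicate policy) follow the usage visible in the examples that follow.

from pupa.scrape import Event

# All values are placeholders -- an illustrative sketch, not real scraped data.
event = Event(
    name="Example Committee Hearing",
    start_date="2018-01-01T10:00:00+00:00",
    location_name="Room 100, Example State House",
)
event.add_source("http://example.com/hearings")

# Attach a document to the event: a human-readable note, the document URL,
# its media type, and on_duplicate="ignore" so a repeated URL is skipped
# instead of raising an error.
event.add_document(
    "Agenda",
    "http://example.com/agenda.pdf",
    media_type="application/pdf",
    on_duplicate="ignore",
)

In the examples, the same call appears with real scraped values: the note typically names the document ("Agenda", "notice"), and media_type is "application/pdf" or "text/html" depending on the link.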

Example 1: parse_div

# Required module: from pupa.scrape import Event [as alias]
# Alternatively: from pupa.scrape.Event import add_document [as alias]
    def parse_div(self, row, chamber, com):
        cal_link = row.xpath('.//a[.//span[@id="calendarmarker"]]/@href')[0]
        # event_date = row.xpath('string(.//div[contains(@class,"ItemDate")])').strip()
        title, location, start_date, end_date = self.parse_gcal(cal_link)

        event = Event(
            start_date=start_date,
            end_date=end_date,
            name=title,
            location_name=location,
        )

        event.add_source('http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx')

        for item in row.xpath('.//div[@class="col-xs-12a Item"]'):
            description = item.xpath('string(.)').strip()
            agenda = event.add_agenda_item(description=description)

        for item in row.xpath('.//div[contains(@class,"ItemContainer")]/a'):
            description = item.xpath('string(.)').strip()
            agenda = event.add_agenda_item(description=description)

            event.add_document(
                description,
                item.xpath('@href')[0],
                media_type="application/pdf",
                on_duplicate="ignore"
            )

        for item in row.xpath('.//div[contains(@class,"ItemContainer")]'
                              '[./div[@class="col-xs-1 Item"]]'):
            description = item.xpath('string(.)').strip()
            agenda = event.add_agenda_item(description=description)

            bill = item.xpath('.//div[@class="col-xs-1 Item"]/a/text()')[0].strip()
            agenda.add_bill(bill)

        video = row.xpath('.//a[./span[@class="OnDemand"]]')
        if video:
            event.add_media_link(
                'Video of Hearing',
                video[0].xpath('@href')[0],
                'text/html'
            )

        if 'subcommittee' in title.lower():
            subcom = title.split('-')[0].strip()
            event.add_participant(
                subcom,
                type='committee',
                note='host',
            )
        else:
            event.add_participant(
                com,
                type='committee',
                note='host',
            )
        yield event
Author: sunlightlabs, Project: openstates, Lines: 61, Source: events.py

Example 2: scrape_house_weekly_schedule

# Required module: from pupa.scrape import Event [as alias]
# Alternatively: from pupa.scrape.Event import add_document [as alias]
    def scrape_house_weekly_schedule(self):
        url = "http://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx"
        page = self.lxmlize(url)

        meeting_rows = page.xpath('//table[@id = "table229"]/tr')

        valid_meetings = [row for row in meeting_rows if row.xpath(
            './td[1]')[0].text_content().replace(u'\xa0', '') and row.xpath(
            './td/a/img[contains(@src, "PDF-AGENDA.png")]') and 'Not Meeting' not in row.xpath(
            './td[2]')[0].text_content()]

        for meeting in valid_meetings:
            try:
                guid = meeting.xpath('./td/a[descendant::img[contains(@src,'
                                     '"PDF-AGENDA.png")]]/@href')[0]
                # self.logger.debug(guid)
                self.warning("logger.debug" + guid)
            except IndexError:
                # An empty xpath result means a dead link; indexing [0] raises
                # IndexError for these dead entries, so skip them.
                continue

            committee_name = meeting.xpath('./td[1]/text()')[0].strip()
            meeting_string = meeting.xpath('./td[2]')[0].text_content()

            if "@" in meeting_string:
                continue  # Contains no time data.
            date, time, location = ([s.strip() for s in meeting_string.split(
                ',') if s] + [None]*3)[:3]

            # check for time in date because of missing comma
            time_srch = re.search(r'\d{2}:\d{2} (AM|PM)', date)
            if time_srch:
                location = time
                time = time_srch.group()
                date = date.replace(time, '')

            # self.logger.debug(location)
            self.warning("logger.debug" + location)

            year = datetime.datetime.now().year
            datetime_string = ' '.join((date, str(year), time))
            when = datetime.datetime.strptime(datetime_string, '%b %d %Y %I:%M %p')
            when = self._tz.localize(when)

            description = 'Committee Meeting: {}'.format(committee_name)
            # self.logger.debug(description)
            self.warning("logger.debug" + description)

            event = Event(name=description,
                          start_date=when,  # `when` was already localized above
                          location_name=location)
            event.add_source(url)
            event.add_participant(committee_name, type='committee', note='host')
            event.add_document(note='Agenda', url=guid, text='agenda',
                               media_type='application/pdf')

            yield event
Author: sunlightlabs, Project: openstates, Lines: 59, Source: events.py

Example 3: scrape

# Required module: from pupa.scrape import Event [as alias]
# Alternatively: from pupa.scrape.Event import add_document [as alias]
    def scrape(self):
        EVENTS_URL = 'http://www.akleg.gov/basis/Meeting/Find'
        events = self.lxmlize(EVENTS_URL).xpath('//ul[@id="meetingResults"]/li')
        for info in events:
            event_url = info.xpath('span[@class="col04"]/a/@href')[0]
            doc = self.lxmlize(event_url)

            # Skip events that are placeholders or tentative
            # Also skip whole-chamber events
            if any(x.strip().startswith("No Meeting") for x in
                    doc.xpath('//div[@class="schedule"]//text()')) \
                    or "session" in \
                    info.xpath('span[@class="col01"]/text()')[0].lower():
                continue

            name = " ".join(
                x.strip()
                for x in doc.xpath('//div[@class="schedule"]//text()')
                if x.strip()
            )

            # Skip events with no name
            if not name:
                continue

            event = Event(
                start_date=self._TZ.localize(
                    datetime.datetime.strptime(
                        info.xpath('span[@class="col02"]/text()')[0],
                        self._DATETIME_FORMAT,
                    )
                ),
                name=name,
                location_name=doc.xpath(
                    '//div[@class="heading-container"]/span/text()'
                )[0].title()
            )

            event.add_participant(
                info.xpath('span[@class="col01"]/text()')[0].title(),
                type='committee',
                note='host',
            )

            for document in doc.xpath('//td[@data-label="Document"]/a'):
                event.add_document(
                    document.xpath('text()')[0],
                    url=document.xpath('@href')[0]
                )

            event.add_source(EVENTS_URL)
            event.add_source(event_url.replace(" ", "%20"))

            yield event
Author: neelneelpurk, Project: openstates, Lines: 56, Source: events.py

Example 4: scrape

# Required module: from pupa.scrape import Event [as alias]
# Alternatively: from pupa.scrape.Event import add_document [as alias]
    def scrape(self):
        page = self.lxmlize(calurl)
        events = page.xpath("//table[@class='agenda-body']//tr")[1:]

        for event in events:
            comit_url = event.xpath(
                ".//a[contains(@href, '/Pages/comm-info.aspx?c=')]")

            if len(comit_url) != 1:
                raise Exception

            comit_url = comit_url[0]
            who = self.scrape_participants(comit_url.attrib['href'])

            tds = event.xpath("./*")
            date = tds[0].text_content().strip()
            cttie = tds[1].text_content().strip()
            _chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
            info = tds[2]
            name = info.xpath("./a[contains(@href, 'raw')]")[0]
            notice = name.attrib['href']
            name = name.text
            time, where = info.xpath("./i/text()")
            what = tds[3].text_content()
            what = what.replace("Items: ", "")
            if "(None)" in what:
                continue
            what = [x.strip() for x in what.split(";")]

            when = ", ".join([date, str(dt.datetime.now().year), time])
            when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")

            event = Event(
                name=name,
                location_name=where,
                start_date=self._tz.localize(when),
            )

            event.add_source(calurl)

            event.add_committee(cttie, note='host')

            event.add_document("notice", notice, media_type='application/pdf')

            for entry in what:
                item = event.add_agenda_item(entry)
                if entry.startswith('AB') or entry.startswith('SB'):
                    item.add_bill(entry)

            for thing in who:
                event.add_person(thing['name'])

            yield event
Author: sunlightlabs, Project: openstates, Lines: 55, Source: events.py

Example 5: scrape

# Required module: from pupa.scrape import Event [as alias]
# Alternatively: from pupa.scrape.Event import add_document [as alias]
    def scrape(self):
        tz = pytz.timezone("US/Eastern")
        get_short_codes(self)
        page = self.lxmlize(URL)
        table = page.xpath(
            "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

        for event in table.xpath(".//tr")[1:]:
            tds = event.xpath("./td")
            committee = tds[0].text_content().strip()
            descr = [x.text_content() for x in tds[1].xpath(".//span")]
            if len(descr) != 1:
                raise Exception
            descr = descr[0].replace('.', '').strip()
            when = tds[2].text_content().strip()
            where = tds[3].text_content().strip()
            notice = tds[4].xpath(".//a")[0]
            notice_href = notice.attrib['href']
            notice_name = notice.text
            when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
            when = pytz.utc.localize(when)
            event = Event(name=descr, start_time=when, classification='committee-meeting',
                          description=descr, location_name=where, timezone=tz.zone)

            if "/" in committee:
                committees = committee.split("/")
            else:
                committees = [committee]

            for committee in committees:
                if "INFO" not in committee:
                    committee = self.short_ids.get(committee, {"chamber": "unknown",
                                                               "name": committee})

                else:
                    committee = {
                        "chamber": "joint",
                        "name": committee,
                    }
                event.add_committee(committee['name'], note='host')

            event.add_source(URL)
            event.add_document(notice_name,
                               notice_href,
                               media_type='text/html')
            for bill in self.get_related_bills(notice_href):
                a = event.add_agenda_item(description=bill['descr'])
                a.add_bill(
                    bill['bill_id'],
                    note=bill['type']
                )
            yield event
Author: cliftonmcintosh, Project: openstates, Lines: 54, Source: events.py

Example 6: scrape_event

# Required module: from pupa.scrape import Event [as alias]
# Alternatively: from pupa.scrape.Event import add_document [as alias]
    def scrape_event(self, row):
        date_td = row.xpath('td[1]')[0]
        info_td = row.xpath('td[2]')[0]

        date = date_td.xpath('b')[0].text.strip()
        time = date_td.xpath('b/following-sibling::text()')[0].strip()

        date_and_time = "{} {}".format(date, time)
        start_date = datetime.datetime.strptime(
            date_and_time, '%m/%d/%y %I:%M %p')

        title = info_td.xpath('font[1]/strong')[0].text.strip()

        all_text = info_td.xpath('descendant-or-self::*/text()')
        notes = (line.strip() for line in all_text if line.strip())
        notes = list(notes)
        # Skip the first line, which is the title
        notes = notes[1:]
        # Split out the address
        address = notes[0]
        notes = notes[1:]
        # The rest just becomes the description
        notes = "\n".join(notes)

        event = Event(
            start_date=self._TZ.localize(start_date),
            name=title,
            location_name=address,
            description=notes
        )

        event.add_source(self.URL)

        if info_td.xpath('a[contains(font/text(),"agenda")]'):
            agenda_url = info_td.xpath('a/@href')[0]
            event.add_document(
                "Agenda",
                url=agenda_url
            )

        yield event
Author: sunlightlabs, Project: openstates, Lines: 43, Source: events.py

Example 7: scrape

# Required module: from pupa.scrape import Event [as alias]
# Alternatively: from pupa.scrape.Event import add_document [as alias]
    def scrape(self):
        method = 'events/?state={}&dtstart=1776-07-04'.format(self.state)
        self.events = self.api(method)
        seen = set()
        for event in self.events:
            begin = self._date_parse(event.pop('when'))
            end = self._date_parse(event.pop('end'))
            all_day = event.pop('all_day',False)

            e = Event(name=event.pop('description'),
                      classification=event.pop('type'),
                      location_name=event.pop('location'),
                      timezone=event.pop('timezone'),
                      start_time=begin,
                      end_time=end,
                      all_day=all_day,)
            if len(e.name) >= 300:
                e.name = e.name[:290]

            if len(e.location['name']) >= 100:
                e.location['name'] = e.location['name'][:90]

            composite_key = (e.name, e.description, e.start_time)
            if composite_key in seen:
                print("Duplicate found: %s/%s/%s" % (composite_key))
                continue

            seen.add(composite_key)

            for source in event.pop('sources'):
                if 'retrieved' in source:
                    source.pop('retrieved')
                e.add_source(**source)

            if e.sources == []:
                continue

            ignore = ['country', 'level', 'state', 'created_at', 'updated_at',
                      'notes', '+location_url', 'session', 'id', '+chamber',
                      '+agenda', '+cancelled', '+media_contact', '+contact',
                      '+details']
            # +agenda:
            #   Agenda on old (very old) OpenStates data is actually a string
            #   and not any sort of structured data we can use in the items
            #   schema, and is only present for a handful of events.

            for i in ignore:
                if i in event:
                    event.pop(i)

            for link in ['+link', 'link']:
                if link in event:
                    e.add_source(url=event.pop(link))

            for p in event.pop('participants', []):
                type_ = {
                    "committee": "organization",
                    "legislator": "person",
                    None: None,
                }[p.get('participant_type')]

                if type_ is None:
                    # Garbage data.
                    continue

                e.add_participant(name=p['participant'],
                                  note=p['type'],
                                  type=type_,)

            for b in event.pop('related_bills', []):
                item = e.add_agenda_item(
                    b.pop('description', b.pop('+description', None)))

                item.add_bill(bill=b['bill_id'],
                              note=b.pop('type', b.pop('+type', None)))

            seen_documents = set([])
            for document in event.pop('documents', []):
                if document['url'] in seen_documents:
                    print("XXX: Buggy data in: Duped Document URL: %s (%s)" % (
                        document['url'], document['name']
                    ))
                    continue

                seen_documents.add(document['url'])
                e.add_document(url=document['url'],
                               note=document['name'])

            assert event == {}, "Unknown fields: %s" % (
                ", ".join(event.keys())
            )

            yield e
Author: opencivicdata, Project: scrapers-us-state, Lines: 95, Source: events.py

Example 8: scrape

# Required module: from pupa.scrape import Event [as alias]
# Alternatively: from pupa.scrape.Event import add_document [as alias]
    def scrape(self, window=None) :
        if window:
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        else:
            n_days_ago = None

        events = self.events(n_days_ago)

        for event, web_event in self._merge_events(events):
            body_name = event["EventBodyName"]

            if 'Board of Directors -' in body_name:
                body_name, event_name = [part.strip()
                                         for part
                                         in body_name.split('-')]
            else:
                event_name = body_name

            # Events can have an EventAgendaStatusName of "Final", "Final Revised", 
            # and "Final 2nd Revised."
            # We classify these events as "passed."
            status_name = event['EventAgendaStatusName']
            if status_name.startswith('Final'):
                status = 'passed'
            elif status_name == 'Draft':
                status = 'confirmed'
            elif status_name == 'Canceled':
                status = 'cancelled'
            else:
                status = 'tentative'

            location = event["EventLocation"]

            if not location:
                # We expect some events to have no location. LA Metro would
                # like these displayed in the Councilmatic interface. However,
                # OCD requires a value for this field. Add a sane default.
                location = 'Not available'

            e = Event(event_name,
                      start_date=event["start"],
                      description='',
                      location_name=location,
                      status=status)

            e.pupa_id = str(event['EventId'])

            # Metro requires the EventGuid to build out MediaPlayer links.
            # Add both the English event GUID, and the Spanish event GUID if
            # it exists, to the extras dict.
            e.extras = {'guid': event['EventGuid']}

            legistar_api_url = self.BASE_URL + '/events/{0}'.format(event['EventId'])
            e.add_source(legistar_api_url, note='api')

            if event.get('SAPEventGuid'):
                e.extras['sap_guid'] = event['SAPEventGuid']

            if 'event_details' in event:
                # if there is not a meeting detail page on legistar
                # don't capture the agenda data from the API
                for item in self.agenda(event):
                    agenda_item = e.add_agenda_item(item["EventItemTitle"])
                    if item["EventItemMatterFile"]:
                        identifier = item["EventItemMatterFile"]
                        agenda_item.add_bill(identifier)

                    if item["EventItemAgendaNumber"]:
                        # To the notes field, add the item number as given in the agenda minutes
                        note = "Agenda number, {}".format(item["EventItemAgendaNumber"])
                        agenda_item['notes'].append(note)

                    # The EventItemAgendaSequence provides 
                    # the line number of the Legistar agenda grid.
                    agenda_item['extras']['item_agenda_sequence'] = item['EventItemAgendaSequence']

                # Historically, the Legistar system has duplicated the EventItemAgendaSequence,
                # resulting in data inaccuracies. The scrape should fail in such cases, until Metro
                # cleans the data.
                item_agenda_sequences = [item['extras']['item_agenda_sequence'] for item in e.agenda]
                if len(item_agenda_sequences) != len(set(item_agenda_sequences)):
                    error_msg = ('An agenda has duplicate agenda items on the Legistar grid: '
                                 '{event_name} on {event_date} ({legistar_api_url}). '
                                 'Contact Metro, and ask them to remove the duplicate '
                                 'EventItemAgendaSequence.')

                    raise ValueError(error_msg.format(event_name=e.name, 
                                                      event_date=e.start_date.strftime("%B %d, %Y"),
                                                      legistar_api_url=legistar_api_url))

            e.add_participant(name=body_name,
                              type="organization")

            if event.get('SAPEventId'):
                e.add_source(self.BASE_URL + '/events/{0}'.format(event['SAPEventId']),
                             note='api (sap)')

            if event['EventAgendaFile']:
                e.add_document(note='Agenda',
                               url=event['EventAgendaFile'],
                               media_type="application/pdf")
#......... the remainder of this method is omitted here .........
Author: datamade, Project: scrapers-us-municipal, Lines: 103, Source: events.py

Example 9: _parse_house_floor_xml_legislative_activity

# Required module: from pupa.scrape import Event [as alias]
# Alternatively: from pupa.scrape.Event import add_document [as alias]
    def _parse_house_floor_xml_legislative_activity(self, xml):
        """
        Parses XML string of House floor updates and yields them in loop.

        @param xml: XML of field update
        @type xml: string
        @return: complete Event object
        @rtype: Event
        """
        tree = self._xml_parser(xml)

        congress = tree.xpath('.//legislative_congress')[0].get('congress')

        house_committees = self._get_current_house_committee_names()
        for fa in tree.xpath('.//floor_action'):
            fa_text = fa.xpath('.//action_description')[0].xpath('string()')

            eastern = pytz.timezone('US/Eastern')
            dt = datetime.datetime.strptime(fa.xpath('action_time')[0].get('for-search'), '%Y%m%dT%H:%M:%S')
            event = Event('House Floor Update on {0} at {1}.'.format(dt.strftime('%Y-%m-%d'), dt.strftime('%H:%M:%S')),
                          eastern.localize(dt).astimezone(pytz.utc),
                          'US/Eastern',
                          '',
                          description=fa_text,
                          classification='floor_update')

            event.set_location("East Capitol Street Northeast & First St SE, Washington, DC 20004",
                               note='House Floor', url='http://www.house.gov',
                               coordinates={'latitude': '38.889931', 'longitude': '-77.009003'})

            event.add_source(self._house_floor_src_url(date_str=tree.xpath('.//legislative_day')[0].get('date')),
                             note="Scraped from the Office of the Clerk, U.S. House of Representatives website.")

            event.extras['act-id'] = fa.get('act-id')
            event.extras['unique-id'] = fa.get('unique-id')

            # bills
            ai_b = event.add_agenda_item(description='Bills referenced by this update.')
            for bill in fa.xpath(".//a[@rel='bill']"):
                bill_name = bill.xpath('string()')
                ai_b.add_bill(bill_name, id=make_pseudo_id(identifier=bill_code_to_id(bill_name), congress=congress),
                              note="Bill was referenced on the House floor.")

            # publaws
            ai_p = event.add_agenda_item(description='Public laws referenced by this update.')
            for law in fa.xpath(".//a[@rel='publaw']"):
                detail_url = '/'.join(law.get('href').split('/')[0:-2]) + '/content-detail.html'
                ai_p.add_bill(law.xpath('string()'),
                              id=make_pseudo_id(**self._public_law_detail_scraper(url=detail_url)),
                              note='Law was referenced on the House floor.')

            # votes
            ai_v = event.add_agenda_item(description='Votes referenced by this update.')
            for vote in fa.xpath(".//a[@rel='vote']"):
                vote_name = vote.xpath('string()')
                ai_v.add_vote(vote_name,
                              id=make_pseudo_id(identifier=vote_code_to_id(vote_name), congress=congress),
                              note='Vote was referenced on the House floor.')

            # reports
            for report in fa.xpath(".//a[@rel='report']"):
                event.add_document('Document referenced by this update.', report.get('href'), media_type='text/html')

            for name in house_committees:
                if name.replace('House ', '') in fa_text:
                    event.add_committee(name, id=make_pseudo_id(name=name))

            # TODO identify legislators and add them as participants?


            yield event
Author: crdunwel, Project: scrapers-us-federal, Lines: 73, Source: floor_update.py

Example 10: scrape_agenda

# Required module: from pupa.scrape import Event [as alias]
# Alternatively: from pupa.scrape.Event import add_document [as alias]
    def scrape_agenda(self, url):
        page = self.lxmlize(url)
        # Get the date/time info:
        date_time = page.xpath("//table[@class='time_place']")
        if date_time == []:
            return

        date_time = date_time[0]
        lines = date_time.xpath("./tr")
        metainf = {}
        for line in lines:
            tds = line.xpath("./td")
            metainf[tds[0].text_content()] = tds[1].text_content()
        date = metainf['DATE:']
        time = metainf['TIME:']
        where = metainf['PLACE:']

        # check for duration in time
        if ' - ' in time:
            start, end = time.split(' - ')
            am_pm_srch = re.search('(?i)(am|pm)', end)
            if am_pm_srch:
                time = ' '.join([start, am_pm_srch.group().upper()])
            else:
                time = start

        fmts = [
            "%A, %B %d, %Y",
            "%A, %B %d, %Y %I:%M %p",
            "%A, %B %d, %Y %I:%M",
        ]

        event_desc = "Meeting Notice"
        if 'Rise' in time:
            datetime = date
            event_desc = "Meeting Notice: Starting at {}".format(time)
        else:
            datetime = "%s %s" % (date, time)
        if "CANCELLED" in datetime.upper():
            return

        transtable = {
            "P.M": "PM",
            "PM.": "PM",
            "P.M.": "PM",
            "A.M.": "AM",
            "POSTPONED": "",
            "RESCHEDULED": "",
            "and Rise of the Senate": "",
        }
        for trans in transtable:
            datetime = datetime.replace(trans, transtable[trans])

        datetime = datetime.strip()

        for fmt in fmts:
            try:
                datetime = dt.datetime.strptime(datetime, fmt)
                break
            except ValueError:
                continue

        event = Event(
            name=event_desc,
            start_date=self._tz.localize(datetime),
            location_name=where,
        )
        event.add_source(url)
        # aight. Let's get us some bills!
        bills = page.xpath("//b/a")
        for bill in bills:
            bill_ft = bill.attrib['href']
            event.add_document(
                bill.text_content(), bill_ft,
                media_type="application/pdf")
            root = bill.xpath('../../*')
            root = [x.text_content() for x in root]
            bill_id = "".join(root)

            if "SCHEDULED FOR" in bill_id:
                continue

            descr = bill.getparent().getparent().getparent().getnext().getnext().text_content()

            for thing in replace:
                bill_id = bill_id.replace(thing, replace[thing])

            item = event.add_agenda_item(descr)
            item.add_bill(bill.text_content())

        committee = page.xpath("//span[@id='lblSession']")[0].text_content()

        event.add_participant(committee, 'committee', note='host')

        yield event
Author: neelneelpurk, Project: openstates, Lines: 97, Source: events.py

Example 11: scrape

# Required module: from pupa.scrape import Event [as alias]
# Alternatively: from pupa.scrape.Event import add_document [as alias]
    def scrape(self):
        meetings_html = self.urlopen(self.ARLINGTON_MEETING_PAGE)
        meetings_lxml = lxml.html.fromstring(meetings_html)
        
        for meeting_type in ('archive', 'upcoming'):
            for meeting in meetings_lxml.cssselect('#%s tbody tr' % meeting_type):
                
                # attempt to map the cells across table types. 
                # if the sizes mismatch, ignore this one (it's an "empty" message)
                try:
                    cell_mapping = self._organize_cells(meeting_type, meeting.cssselect('td'))
                except:
                    continue

                meeting_title = cell_mapping['title'].text
                meeting_date = datetime.datetime.fromtimestamp(int(cell_mapping['date'].cssselect('span')[0].text))

                e = Event(name=meeting_title, when=meeting_date, location='unknown')
                e.add_source(self.ARLINGTON_MEETING_PAGE)                

                # detect agenda url, if present
                meeting_agenda_url = None
                if len(cell_mapping['agenda'].cssselect('a'))>0:
                    meeting_agenda_url = cell_mapping['agenda'].cssselect('a')[0].attrib.get('href')

                # follow the agenda URL and attempt to extract associated documents
                if meeting_agenda_url is not None:
                    e.add_link(meeting_agenda_url)
                    e.add_document(name='Agenda', url=meeting_agenda_url, mimetype='text/html')                    

                    meeting_agenda_html = self.urlopen(meeting_agenda_url)
                    meeting_agenda_lxml = lxml.html.fromstring(meeting_agenda_html)
                    for link in meeting_agenda_lxml.cssselect('a'):
                        link_url = link.attrib.get('href','')
                        if not len(link_url):
                            continue
                        if 'metaviewer.php' in link_url.lower():
                            # NOTE: application/pdf is a guess, may not always be correct
                            if link.text is not None:
                                e.add_document(name=link.text, url=link_url, mimetype='application/pdf') 

                # skip everything below here for the 'upcoming' table
                if meeting_type=='upcoming':
                    continue

                # detect video
                # TODO: extract actual mp4 files
                video_cell = cell_mapping['video'].cssselect('a')
                if len(video_cell)>0:
                    # Match up to (but not including) the closing quote, so the
                    # captured URL does not carry a trailing apostrophe.
                    video_url_match = re.search(r"http://[^']*", video_cell[0].attrib.get('onclick', ''))
                    if video_url_match is not None:
                        e.add_media_link(name="Video", url=video_url_match.group(0), mimetype='text/html')

                # detect audio
                audio_cell = cell_mapping['audio'].cssselect('a')
                if len(audio_cell)>0:
                    e.add_media_link(name="Audio", url=audio_cell[0].attrib.get('href', ''), mimetype='audio/mpeg')

                # detect minutes
                minutes_cell = cell_mapping['minutes'].cssselect('a')
                if len(minutes_cell)>0:
                    e.add_media_link(name="Minutes", url=minutes_cell[0].attrib.get('href', ''), mimetype='text/html')

                yield e
Author: ChaelCodes, Project: scrapers-us-municipal, Lines: 66, Source: events.py


Note: The pupa.scrape.Event.add_document examples in this article were collected by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their respective authors, and copyright in the source code remains with the original authors. Consult the corresponding project's License before redistributing or using the code; do not reproduce without permission.