本文整理汇总了Python中pupa.scrape.Event.add_media_link方法的典型用法代码示例。如果您正苦于以下问题：Python Event.add_media_link方法的具体用法？Python Event.add_media_link怎么用？Python Event.add_media_link使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pupa.scrape.Event的用法示例。
在下文中一共展示了Event.add_media_link方法的7个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: parse_div
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import add_media_link [as 别名]
def parse_div(self, row, chamber, com):
    """Build and yield a single hearing ``Event`` from one schedule row.

    The title, location and start/end dates are obtained by passing the
    row's calendar link to ``self.parse_gcal``. Agenda items, documents,
    bills, an optional video link and the hosting committee are then
    attached to the event.

    Args:
        row: Element exposing ``xpath``; must contain the calendar-marker
            link (an ``IndexError`` is raised otherwise).
        chamber: Accepted for the caller's benefit; not used here.
        com: Committee name used as host unless the title mentions a
            subcommittee.

    Yields:
        The populated ``Event``.
    """
    gcal_href = row.xpath('.//a[.//span[@id="calendarmarker"]]/@href')[0]
    title, location, start_date, end_date = self.parse_gcal(gcal_href)

    event = Event(
        name=title,
        location_name=location,
        start_date=start_date,
        end_date=end_date,
    )
    event.add_source('http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx')

    # Full-width text items: agenda entries only.
    for node in row.xpath('.//div[@class="col-xs-12a Item"]'):
        event.add_agenda_item(description=node.xpath('string(.)').strip())

    # Anchors directly under an item container: agenda entry plus a
    # document pointing at the anchor's href.
    for anchor in row.xpath('.//div[contains(@class,"ItemContainer")]/a'):
        label = anchor.xpath('string(.)').strip()
        event.add_agenda_item(description=label)
        event.add_document(
            label,
            anchor.xpath('@href')[0],
            media_type="application/pdf",
            on_duplicate="ignore",
        )

    # Item containers with a narrow first column: the link text in that
    # column is attached to the agenda entry as a bill.
    bill_containers = row.xpath(
        './/div[contains(@class,"ItemContainer")]'
        '[./div[@class="col-xs-1 Item"]]'
    )
    for container in bill_containers:
        entry = event.add_agenda_item(
            description=container.xpath('string(.)').strip()
        )
        bill_id = container.xpath('.//div[@class="col-xs-1 Item"]/a/text()')[0].strip()
        entry.add_bill(bill_id)

    on_demand = row.xpath('.//a[./span[@class="OnDemand"]]')
    if on_demand:
        event.add_media_link(
            'Video of Hearing',
            on_demand[0].xpath('@href')[0],
            'text/html',
        )

    # A subcommittee title carries its own name before the first dash;
    # otherwise the committee passed in by the caller is the host.
    if 'subcommittee' in title.lower():
        host = title.split('-')[0].strip()
    else:
        host = com
    event.add_participant(host, type='committee', note='host')

    yield event
示例2: test_full_event
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import add_media_link [as 别名]
def test_full_event():
    """Smoke-test importing a scraped event with a person and a media link.

    The test passes as long as the import raises no exception; it makes no
    assertions about the imported data.
    """
    # The importer below is created for jurisdiction 'jid', so that
    # jurisdiction is created first.
    Jurisdiction.objects.create(id='jid', division_id='did')

    birthday = ScrapeEvent(
        name="America's Birthday",
        start_time="2014-07-04",
        location="America",
        all_day=True,
    )
    birthday.add_person("George Washington")
    birthday.add_media_link("fireworks", "http://example.com/fireworks.mov")

    importer = EventImporter('jid')
    importer.import_data([birthday.as_dict()])
示例3: scrape
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import add_media_link [as 别名]
#.........这里部分代码省略.........
if not location:
# We expect some events to have no location. LA Metro would
# like these displayed in the Councilmatic interface. However,
# OCD requires a value for this field. Add a sane default.
location = 'Not available'
e = Event(event_name,
start_date=event["start"],
description='',
location_name=location,
status=status)
e.pupa_id = str(event['EventId'])
# Metro requires the EventGuid to build out MediaPlayer links.
# Add both the English event GUID, and the Spanish event GUID if
# it exists, to the extras dict.
e.extras = {'guid': event['EventGuid']}
legistar_api_url = self.BASE_URL + '/events/{0}'.format(event['EventId'])
e.add_source(legistar_api_url, note='api')
if event.get('SAPEventGuid'):
e.extras['sap_guid'] = event['SAPEventGuid']
if 'event_details' in event:
# if there is not a meeting detail page on legistar
# don't capture the agenda data from the API
for item in self.agenda(event):
agenda_item = e.add_agenda_item(item["EventItemTitle"])
if item["EventItemMatterFile"]:
identifier = item["EventItemMatterFile"]
agenda_item.add_bill(identifier)
if item["EventItemAgendaNumber"]:
# To the notes field, add the item number as given in the agenda minutes
note = "Agenda number, {}".format(item["EventItemAgendaNumber"])
agenda_item['notes'].append(note)
# The EventItemAgendaSequence provides
# the line number of the Legistar agenda grid.
agenda_item['extras']['item_agenda_sequence'] = item['EventItemAgendaSequence']
# Historically, the Legistar system has duplicated the EventItemAgendaSequence,
# resulting in data inaccuracies. The scrape should fail in such cases, until Metro
# cleans the data.
item_agenda_sequences = [item['extras']['item_agenda_sequence'] for item in e.agenda]
if len(item_agenda_sequences) != len(set(item_agenda_sequences)):
error_msg = 'An agenda has duplicate agenda items on the Legistar grid: \
{event_name} on {event_date} ({legistar_api_url}). \
Contact Metro, and ask them to remove the duplicate EventItemAgendaSequence.'
raise ValueError(error_msg.format(event_name=e.name,
event_date=e.start_date.strftime("%B %d, %Y"),
legistar_api_url=legistar_api_url))
e.add_participant(name=body_name,
type="organization")
if event.get('SAPEventId'):
e.add_source(self.BASE_URL + '/events/{0}'.format(event['SAPEventId']),
note='api (sap)')
if event['EventAgendaFile']:
e.add_document(note= 'Agenda',
url = event['EventAgendaFile'],
media_type="application/pdf")
if event['EventMinutesFile']:
e.add_document(note= 'Minutes',
url = event['EventMinutesFile'],
media_type="application/pdf")
for audio in event['audio']:
try:
redirect_url = self.head(audio['url']).headers['Location']
except KeyError:
# In some cases, the redirect URL does not yet
# contain the location of the audio file. Skip
# these events, and retry on next scrape.
continue
e.add_media_link(note=audio['label'],
url=redirect_url,
media_type='text/html')
if web_event['Recap/Minutes'] != 'Not\xa0available':
e.add_document(note=web_event['Recap/Minutes']['label'],
url=web_event['Recap/Minutes']['url'],
media_type="application/pdf")
if event['event_details']:
for link in event['event_details']:
e.add_source(**link)
else:
e.add_source('https://metro.legistar.com/Calendar.aspx', note='web')
yield e
示例4: scrape
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import add_media_link [as 别名]
def scrape(self):
for event, agenda in self.events() :
description = None
location_string = event[u'Meeting Location']
location_list = location_string.split('--', 2)
location = ', '.join(location_list[0:2])
if not location :
continue
when = self.toTime(event[u'Meeting Date'])
event_time = event['iCalendar'].subcomponents[0]['DTSTART'].dt
when = when.replace(hour=event_time.hour,
minute=event_time.minute)
status_string = location_list[-1].split('Chicago, Illinois')
if len(status_string) > 1 and status_string[1] :
status_text = status_string[1].lower()
if any(phrase in status_text
for phrase in ('rescheduled to',
'postponed to',
'reconvened to',
'rescheduled to',
'meeting recessed',
'recessed meeting',
'postponed to',
'recessed until',
'deferred',
'time change',
'date change',
'recessed meeting - reconvene',
'cancelled',
'new date and time',
'rescheduled indefinitely',
'rescheduled for',)) :
status = 'cancelled'
elif status_text in ('rescheduled', 'recessed') :
status = 'cancelled'
elif status_text in ('meeting reconvened',
'reconvened meeting',
'recessed meeting',
'reconvene meeting',
'rescheduled hearing',
'rescheduled meeting',) :
status = confirmedOrPassed(when)
elif status_text in ('amended notice of meeting',
'room change',
'amended notice',
'change of location',
'revised - meeting date and time') :
status = confirmedOrPassed(when)
elif 'room' in status_text :
location = status_string[1] + ', ' + location
elif status_text in ('wrong meeting date',) :
continue
else :
print(status_text)
description = status_string[1].replace('--em--', '').strip()
status = confirmedOrPassed(when)
else :
status = confirmedOrPassed(when)
if description :
e = Event(name=event["Name"]["label"],
start_time=when,
description=description,
timezone='US/Central',
location_name=location,
status=status)
else :
e = Event(name=event["Name"]["label"],
start_time=when,
timezone='US/Central',
location_name=location,
status=status)
if event['Video'] != 'Not\xa0available' :
e.add_media_link(note='Recording',
url = event['Video']['url'],
type="recording",
media_type = 'text/html')
self.addDocs(e, event, 'Agenda')
self.addDocs(e, event, 'Notice')
self.addDocs(e, event, 'Transcript')
self.addDocs(e, event, 'Summary')
participant = event["Name"]["label"]
if participant == 'City Council' :
participant = 'Chicago City Council'
elif participant == 'Committee on Energy, Environmental Protection and Public Utilities (inactive)' :
participant = 'Committee on Energy, Environmental Protection and Public Utilities'
e.add_participant(name=participant,
type="organization")
#.........这里部分代码省略.........
示例5: scrape
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import add_media_link [as 别名]
def scrape(self):
last_events = deque(maxlen=10)
for event, agenda in self.events(since=2011) :
other_orgs = ''
extras = []
if '--em--' in event[u'Meeting Location'] :
location_string, note = event[u'Meeting Location'].split('--em--')[:2]
for each in note.split(' - ') :
if each.startswith('Join') :
other_orgs = each
else :
extras.append(each)
else :
location_string = event[u'Meeting Location']
location_list = location_string.split('-', 2)
location = ', '.join([each.strip() for each in location_list[0:2]])
if not location :
continue
when = self.toTime(event[u'Meeting Date'])
event_time = event['iCalendar'].subcomponents[0]['DTSTART'].dt
when = when.replace(hour=event_time.hour,
minute=event_time.minute)
time_string = event['Meeting Time']
if time_string in ('Deferred',) :
status = 'cancelled'
elif self.now() < when :
status = 'confirmed'
else :
status = 'passed'
description = event['Meeting\xa0Topic']
if any(each in description
for each
in ('Multiple meeting items',
'AGENDA TO BE ANNOUNCED')) :
description = ''
event_name = event['Name']
event_id = (event_name, when)
if event_id in last_events :
continue
else :
last_events.append(event_id)
e = Event(name=event_name,
start_time=when,
timezone=self.TIMEZONE,
description=description,
location_name=location,
status=status)
if extras :
e.extras = {'location note' : ' '.join(extras)}
if event['Multimedia'] != 'Not\xa0available' :
e.add_media_link(note='Recording',
url = event['Multimedia']['url'],
type="recording",
media_type = 'text/html')
self.addDocs(e, event, 'Agenda')
self.addDocs(e, event, 'Minutes')
if event['Name'] == 'City Council Stated Meeting' :
participating_orgs = ['New York City Council']
elif 'committee' in event['Name'].lower() :
participating_orgs = [event["Name"]]
else :
participating_orgs = []
if other_orgs :
other_orgs = re.sub('Jointl*y with the ', '', other_orgs)
participating_orgs += re.split(' and the |, the ', other_orgs)
for org in participating_orgs :
e.add_committee(name=org)
if agenda :
e.add_source(event["Meeting Details"]['url'])
for item, _, _ in agenda :
if item["Name"] :
agenda_item = e.add_agenda_item(item["Name"])
if item["File\xa0#"] :
if item['Action'] :
note = item['Action']
else :
note = 'consideration'
agenda_item.add_bill(item["File\xa0#"]['label'],
note=note)
else :
e.add_source(self.EVENTSPAGE)
#.........这里部分代码省略.........
示例6: scrape
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import add_media_link [as 别名]
def scrape(self):
    """Yield an ``Event`` for every usable row of the meetings page.

    The page at ``self.ARLINGTON_MEETING_PAGE`` is fetched once and the
    rows of its ``#archive`` and ``#upcoming`` tables are visited in that
    order. For each row the agenda link (if any) is followed to collect
    attached documents. Video, audio and minutes links are read for
    archive rows only.

    Yields:
        One ``Event`` per row whose cells ``self._organize_cells`` could
        map, for both the archive and the upcoming table.
    """
    meetings_html = self.urlopen(self.ARLINGTON_MEETING_PAGE)
    meetings_lxml = lxml.html.fromstring(meetings_html)

    for meeting_type in ('archive', 'upcoming'):
        for meeting in meetings_lxml.cssselect('#%s tbody tr' % meeting_type):
            # Attempt to map the cells across table types. If the sizes
            # mismatch, ignore this row (it's an "empty" message).
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit still propagate.
            try:
                cell_mapping = self._organize_cells(meeting_type, meeting.cssselect('td'))
            except Exception:
                continue

            meeting_title = cell_mapping['title'].text
            # The first <span> of the date cell holds an integer timestamp.
            timestamp = int(cell_mapping['date'].cssselect('span')[0].text)
            meeting_date = datetime.datetime.fromtimestamp(timestamp)

            e = Event(name=meeting_title, when=meeting_date, location='unknown')
            e.add_source(self.ARLINGTON_MEETING_PAGE)

            # Detect the agenda URL, if present.
            agenda_links = cell_mapping['agenda'].cssselect('a')
            meeting_agenda_url = None
            if agenda_links:
                meeting_agenda_url = agenda_links[0].attrib.get('href')

            # Follow the agenda URL and attempt to extract associated documents.
            if meeting_agenda_url is not None:
                e.add_link(meeting_agenda_url)
                e.add_document(name='Agenda', url=meeting_agenda_url, mimetype='text/html')

                meeting_agenda_html = self.urlopen(meeting_agenda_url)
                meeting_agenda_lxml = lxml.html.fromstring(meeting_agenda_html)
                for link in meeting_agenda_lxml.cssselect('a'):
                    link_url = link.attrib.get('href', '')
                    if 'metaviewer.php' not in link_url.lower():
                        continue
                    # NOTE: application/pdf is a guess, may not always be correct
                    if link.text is not None:
                        e.add_document(name=link.text, url=link_url, mimetype='application/pdf')

            # Media cells are only read for the archive table. Upcoming
            # meetings skip this section but must still be yielded below;
            # previously a `continue` here dropped them entirely.
            if meeting_type != 'upcoming':
                # Detect video.
                # TODO: extract actual mp4 files
                video_cell = cell_mapping['video'].cssselect('a')
                if video_cell:
                    # The URL sits inside a quoted onclick argument; capture
                    # only the URL so the closing quote is not part of it.
                    video_url_match = re.search(r"(http://.*?)'", video_cell[0].attrib.get('onclick', ''))
                    if video_url_match is not None:
                        e.add_media_link(name="Video", url=video_url_match.group(1), mimetype='text/html')

                # Detect audio.
                audio_cell = cell_mapping['audio'].cssselect('a')
                if audio_cell:
                    e.add_media_link(name="Audio", url=audio_cell[0].attrib.get('href', ''), mimetype='audio/mpeg')

                # Detect minutes.
                minutes_cell = cell_mapping['minutes'].cssselect('a')
                if minutes_cell:
                    e.add_media_link(name="Minutes", url=minutes_cell[0].attrib.get('href', ''), mimetype='text/html')

            yield e
示例7: scrape
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import add_media_link [as 别名]
def scrape(self, follow_links=True):
    """Yield an ``Event`` for every row of every events page.

    Each page from ``self.eventPages(EVENTSPAGE)`` contributes the rows of
    its ``rgMasterTable``. When ``follow_links`` is true and the row links
    to a meeting-details page, that page is fetched and its agenda table
    is parsed into agenda items and bills.

    Args:
        follow_links: If false, meeting-detail pages are never fetched and
            events carry no agenda items.

    Yields:
        One ``Event`` per table row, with status ``'cancelled'``,
        ``'confirmed'`` (start time still ahead) or ``'passed'``.
    """
    # Phrases after the city name in the location cell that mean the
    # meeting will not take place as listed.
    cancel_phrases = ('rescheduled to',
                      'postponed to',
                      'reconvened to',
                      'recessed',
                      'cancelled',
                      'new date and time',
                      'rescheduled indefinitely',
                      'rescheduled for')

    for page in self.eventPages(EVENTSPAGE):
        events_table = page.xpath("//table[@class='rgMasterTable']")[0]
        for events, headers, rows in self.parseDataTable(events_table):
            # Reset per row so a row without a details page neither raises
            # NameError nor inherits the previous row's agenda and URL.
            detail_url = None
            agenda = []
            if follow_links and isinstance(events['Meeting\xa0Details'], dict):
                detail_url = events['Meeting\xa0Details']['url']
                meeting_details = self.lxmlize(detail_url)
                agenda_table = meeting_details.xpath(
                    "//table[@id='ctl00_ContentPlaceHolder1_gridMain_ctl00']")[0]
                agenda = self.parseDataTable(agenda_table)

            location_string = events['Meeting\xa0Location']
            location_list = location_string.split('--')
            location = ', '.join(location_list[0:2])

            when = events['Meeting\xa0Date']
            time_string = events['Meeting\xa0Time']
            event_time = datetime.datetime.strptime(time_string,
                                                    "%I:%M %p")
            # Carry over minutes as well as the hour; replacing only the
            # hour silently moved e.g. 9:30 meetings to 9:00.
            when = when.replace(hour=event_time.hour,
                                minute=event_time.minute)

            status = None
            status_string = location_list[-1].split('Chicago, Illinois')
            if len(status_string) > 1 and status_string[1]:
                status_text = status_string[1].lower()
                # Exact match for the bare word; the original
                # `in ('rescheduled')` was a substring test against a str.
                if (status_text == 'rescheduled'
                        or any(phrase in status_text
                               for phrase in cancel_phrases)):
                    status = 'cancelled'
                else:
                    # Unrecognised note: surface it, then fall back to the
                    # time-based status below instead of leaving `status`
                    # unset.
                    print(status_text)

            if status is None:
                # A start time already behind us means the meeting has
                # passed; otherwise it is still expected to happen.
                if datetime.datetime.now(pytz.utc) > when:
                    status = 'passed'
                else:
                    status = 'confirmed'

            e = Event(name=events["Name"]["label"],
                      start_time=when,
                      timezone='US/Central',
                      location=location,
                      status=status)
            # Cite the details page when one was followed, otherwise the
            # listing page the row came from.
            e.add_source(detail_url if detail_url else EVENTSPAGE)

            if events['Video'] != 'Not\xa0available':
                e.add_media_link(note='Recording',
                                 url=events['Video']['url'],
                                 type="recording",
                                 media_type='text/html')

            addDocs(e, events, 'Agenda')
            addDocs(e, events, 'Notice')
            addDocs(e, events, 'Transcript')
            addDocs(e, events, 'Summary')

            for item, _, _ in agenda:
                agenda_item = e.add_agenda_item(item["Title"])
                agenda_item.add_bill(item["Record #"]['label'])

            e.add_participant(name=events["Name"]["label"],
                              type="organization")

            yield e