本文整理汇总了Python中pupa.scrape.Event.extras方法的典型用法代码示例。如果您正苦于以下问题：Python Event.extras方法的具体用法？Python Event.extras怎么用？Python Event.extras使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pupa.scrape.Event的用法示例。
在下文中一共展示了Event.extras方法的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: scrape_events_range
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import extras [as 别名]
def scrape_events_range(self, start_date, end_date):
    """Yield one ``Event`` per meeting found between two dates.

    Every calendar day from ``start_date`` up to, but not including,
    ``end_date`` is fetched via ``self.extract_events_by_url``.  Each meeting
    becomes an ``Event``; when its agenda or minutes are published, the
    agenda items are attached and each item is linked to its full bill
    identifier.

    Args:
        start_date: First day to scrape.  It must support
            ``replace(hour=..., minute=..., second=..., microsecond=...)``.
        end_date: Day at which scraping stops (exclusive).

    Yields:
        Event: the populated event for each scraped meeting.

    Raises:
        ValueError: if an agenda item does not correspond to exactly one of
            the meeting's full identifiers.
    """
    # Loop invariants: build the timezone and identifier pattern only once.
    tz = pytz.timezone("America/Toronto")
    identifier_regex = re.compile(r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$')

    def daterange(first_day, last_day):
        """Yield each day from ``first_day`` up to, not including, ``last_day``."""
        for offset in range(int((last_day - first_day).days)):
            yield first_day + datetime.timedelta(offset)

    def is_agenda_available(event):
        return event['publishing_status'] in ('Agenda Published', 'Minutes Published')

    def is_council(event):
        return event['meeting'] == self.jurisdiction.name

    def names_item(full_id, item):
        """Tell whether ``full_id``, minus its four-digit prefix, is the item's identifier."""
        found = identifier_regex.match(full_id)
        # An identifier that does not fit the pattern simply is not a match;
        # it must not abort the scrape with an AttributeError on ``None``.
        return found is not None and found.group(1) == item['identifier']

    for date in daterange(start_date, end_date):
        # NOTE(review): the month is passed as ``month - 1``; presumably the
        # calendar endpoint counts months from zero -- confirm.
        calendar_day_url = CALENDAR_DAY_TEMPLATE.format(date.year, date.month - 1, date.day)
        for event in self.extract_events_by_url(calendar_day_url):
            clock = datetime.datetime.strptime(event['time'], '%I:%M %p')
            start = tz.localize(
                date.replace(hour=clock.hour, minute=clock.minute, second=0, microsecond=0)
            )
            org_name = event['meeting']
            e = Event(
                name=org_name,
                start_time=start,
                timezone=tz.zone,
                location_name=event['location'],
                status=STATUS_DICT.get(event['meeting_status'])
            )
            e.extras = {
                'meeting_number': event['no'],
                'tmmis_meeting_id': event['meeting_id'],
            }
            e.add_source(calendar_day_url)
            e.add_participant(
                name=org_name,
                type='organization',
            )

            if is_agenda_available(event):
                council = is_council(event)
                agenda_url_template = (
                    AGENDA_FULL_COUNCIL_TEMPLATE if council else AGENDA_FULL_STANDARD_TEMPLATE
                )
                agenda_url = agenda_url_template.format(event['meeting_id'])
                full_identifiers = list(self.full_identifiers(event['meeting_id'], council))
                e.add_source(agenda_url)
                for i, item in enumerate(self.agenda_from_url(agenda_url)):
                    a = e.add_agenda_item(item['title'])
                    a.add_classification(item['type'].lower())
                    a['order'] = str(i)
                    # Exactly one full identifier must correspond to the item;
                    # the single-target unpacking enforces that.
                    [full_identifier] = [
                        full_id for full_id in full_identifiers if names_item(full_id, item)
                    ]
                    a.add_bill(full_identifier)

            yield e
示例2: scrape
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import extras [as 别名]
# Build a pupa Event for every (event, web_event) pair produced by
# self._merge_events(), copying status, location, GUIDs, agenda items,
# participants, sources and the agenda document from the API record.
#
# window -- optional number of days (anything float() accepts). When truthy,
#           self.events() is given the UTC timestamp that many days ago;
#           otherwise it is given None.
#
# NOTE: this listing is cut off after the agenda document is attached, so the
#       remainder of the method is not visible here.
def scrape(self, window=None) :
if window:
n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
else:
n_days_ago = None
events = self.events(n_days_ago)
for event, web_event in self._merge_events(events):
body_name = event["EventBodyName"]
if 'Board of Directors -' in body_name:
# Split "<body> - <event>" into its two halves. The two-name unpacking
# raises ValueError if the body name contains more than one '-'.
body_name, event_name = [part.strip()
for part
in body_name.split('-')]
else:
event_name = body_name
# Events can have an EventAgendaStatusName of "Final", "Final Revised",
# and "Final 2nd Revised."
# We classify these events as "passed."
status_name = event['EventAgendaStatusName']
if status_name.startswith('Final'):
status = 'passed'
elif status_name == 'Draft':
status = 'confirmed'
elif status_name == 'Canceled':
status = 'cancelled'
else:
# Any other agenda status falls back to 'tentative'.
status = 'tentative'
location = event["EventLocation"]
if not location:
# We expect some events to have no location. LA Metro would
# like these displayed in the Councilmatic interface. However,
# OCD requires a value for this field. Add a sane default.
location = 'Not available'
e = Event(event_name,
start_date=event["start"],
description='',
location_name=location,
status=status)
# Use the API's EventId (as a string) as the event's pupa_id.
e.pupa_id = str(event['EventId'])
# Metro requires the EventGuid to build out MediaPlayer links.
# Add both the English event GUID, and the Spanish event GUID if
# it exists, to the extras dict.
e.extras = {'guid': event['EventGuid']}
legistar_api_url = self.BASE_URL + '/events/{0}'.format(event['EventId'])
e.add_source(legistar_api_url, note='api')
if event.get('SAPEventGuid'):
e.extras['sap_guid'] = event['SAPEventGuid']
if 'event_details' in event:
# if there is not a meeting detail page on legistar
# don't capture the agenda data from the API
for item in self.agenda(event):
agenda_item = e.add_agenda_item(item["EventItemTitle"])
if item["EventItemMatterFile"]:
# Link the agenda item to the matter (bill) it refers to.
identifier = item["EventItemMatterFile"]
agenda_item.add_bill(identifier)
if item["EventItemAgendaNumber"]:
# To the notes field, add the item number as given in the agenda minutes
note = "Agenda number, {}".format(item["EventItemAgendaNumber"])
agenda_item['notes'].append(note)
# The EventItemAgendaSequence provides
# the line number of the Legistar agenda grid.
agenda_item['extras']['item_agenda_sequence'] = item['EventItemAgendaSequence']
# Historically, the Legistar system has duplicated the EventItemAgendaSequence,
# resulting in data inaccuracies. The scrape should fail in such cases, until Metro
# cleans the data.
# Collect the sequence number stored on every agenda item of this event;
# a length mismatch with the de-duplicated set means a repeated number.
item_agenda_sequences = [item['extras']['item_agenda_sequence'] for item in e.agenda]
if len(item_agenda_sequences) != len(set(item_agenda_sequences)):
# The trailing backslashes are inside the string literal, so the message
# continues across the next two lines as one string.
error_msg = 'An agenda has duplicate agenda items on the Legistar grid: \
{event_name} on {event_date} ({legistar_api_url}). \
Contact Metro, and ask them to remove the duplicate EventItemAgendaSequence.'
raise ValueError(error_msg.format(event_name=e.name,
event_date=e.start_date.strftime("%B %d, %Y"),
legistar_api_url=legistar_api_url))
e.add_participant(name=body_name,
type="organization")
# When a second (SAP) event id is present, record its API URL as a source too.
if event.get('SAPEventId'):
e.add_source(self.BASE_URL + '/events/{0}'.format(event['SAPEventId']),
note='api (sap)')
if event['EventAgendaFile']:
e.add_document(note= 'Agenda',
url = event['EventAgendaFile'],
media_type="application/pdf")
#.........这里部分代码省略.........
示例3: scrape_events_range
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import extras [as 别名]
def scrape_events_range(self, start_date, end_date):
    """Yield the events, and newly seen bills, for meetings between two dates.

    Every calendar day from ``start_date`` up to, but not including,
    ``end_date`` is fetched via ``self.extract_events_by_day``.  Each meeting
    becomes an ``Event``.  When the agenda or minutes are published, every
    agenda item is attached to the event, and a ``Bill`` is yielded the first
    time its full identifier is encountered (tracked in
    ``self.seen_agenda_items``).

    Args:
        start_date: First day to scrape.  It must support
            ``replace(hour=..., minute=..., second=..., microsecond=...)``.
        end_date: Day at which scraping stops (exclusive).

    Yields:
        Bill: once per previously unseen agenda item, before its event.
        Event: once per scraped meeting.

    Raises:
        ValueError: if an agenda item does not correspond to exactly one of
            the meeting's full identifiers.
    """
    # Loop invariants: build the timezone and identifier pattern only once.
    tz = pytz.timezone("America/Toronto")
    identifier_regex = re.compile(r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$')

    def daterange(first_day, last_day):
        """Yield each day from ``first_day`` up to, not including, ``last_day``."""
        for offset in range(int((last_day - first_day).days)):
            yield first_day + dt.timedelta(offset)

    def is_agenda_available(event):
        return event['publishing_status'] in ('Agenda Published', 'Minutes Published')

    def is_council(event):
        return event['meeting'] == self.jurisdiction.name

    def normalize_wards(raw):
        """Return ``'all'`` for an empty or ``'All'`` value, else the list of wards."""
        if not raw or raw == 'All':
            return 'all'
        return raw.split(', ')

    def names_item(full_id, item):
        """Tell whether ``full_id``, minus its four-digit prefix, is the item's identifier."""
        found = identifier_regex.match(full_id)
        # An identifier that does not fit the pattern simply is not a match;
        # it must not abort the scrape with an AttributeError on ``None``.
        return found is not None and found.group(1) == item['identifier']

    for date in daterange(start_date, end_date):
        for event in self.extract_events_by_day(date):
            clock = dt.datetime.strptime(event['time'], '%I:%M %p')
            start = tz.localize(
                date.replace(hour=clock.hour, minute=clock.minute, second=0, microsecond=0)
            )
            source_url = CALENDAR_DAY_TEMPLATE.format(start.year, start.month, start.day)
            org_name = event['meeting']
            e = Event(
                name=org_name,
                start_time=start,
                timezone=tz.zone,
                location_name=event['location'],
                status=STATUS_DICT.get(event['meeting_status'])
            )
            e.add_source(source_url)
            e.extras = {
                'meeting_number': event['no'],
                'tmmis_meeting_id': event['meeting_id'],
            }
            e.add_participant(
                name=org_name,
                type='organization',
            )

            if is_agenda_available(event):
                council = is_council(event)
                template = AGENDA_FULL_COUNCIL_TEMPLATE if council else AGENDA_FULL_STANDARD_TEMPLATE
                agenda_url = template.format(event['meeting_id'])
                full_identifiers = list(self.full_identifiers(event['meeting_id'], council))
                e.add_source(agenda_url)
                for i, item in enumerate(self.agenda_from_url(agenda_url)):
                    a = e.add_agenda_item(item['title'])
                    a.add_classification(item['type'].lower())
                    a['order'] = str(i)
                    wards = normalize_wards(item['wards'])
                    # Exactly one full identifier must correspond to the item;
                    # the single-target unpacking enforces that.
                    [full_identifier] = [
                        full_id for full_id in full_identifiers if names_item(full_id, item)
                    ]
                    a.add_bill(full_identifier)
                    if full_identifier not in self.seen_agenda_items:
                        b = Bill(
                            # TODO: Fix this hardcode
                            legislative_session='2014-2018',
                            identifier=full_identifier,
                            title=item['title'],
                            from_organization={'name': self.jurisdiction.name},
                        )
                        b.add_source(agenda_url)
                        b.add_document_link(
                            note='canonical',
                            media_type='text/html',
                            url=AGENDA_ITEM_TEMPLATE.format(full_identifier),
                        )
                        b.extras = {
                            'wards': wards,
                        }
                        # Remember the identifier so the bill is emitted only once.
                        self.seen_agenda_items.append(full_identifier)
                        yield b

            yield e
示例4: scrape
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import extras [as 别名]
# Build a pupa Event for each (event, agenda) pair returned by
# self.events(since=2011): parse the location and its notes, work out the
# start time and status, then attach media, documents, participating
# committees, sources and agenda items.
#
# NOTE: this listing is cut off after the sources are added, so the remainder
#       of the method is not visible here.
def scrape(self):
# Keys of the 10 most recently built events, used below to skip repeats.
last_events = deque(maxlen=10)
for event, agenda in self.events(since=2011) :
other_orgs = ''
extras = []
# '--em--' separates the location text from a trailing note.
if '--em--' in event[u'Meeting Location'] :
location_string, note = event[u'Meeting Location'].split('--em--')[:2]
# Note pieces starting with 'Join' name the other participating bodies;
# every other piece is kept as a location note.
for each in note.split(' - ') :
if each.startswith('Join') :
other_orgs = each
else :
extras.append(each)
else :
location_string = event[u'Meeting Location']
# Keep only the first two '-'-separated pieces of the location.
location_list = location_string.split('-', 2)
location = ', '.join([each.strip() for each in location_list[0:2]])
# Events without a location are skipped entirely.
if not location :
continue
when = self.toTime(event[u'Meeting Date'])
# The time of day comes from the DTSTART of the first iCalendar subcomponent.
event_time = event['iCalendar'].subcomponents[0]['DTSTART'].dt
when = when.replace(hour=event_time.hour,
minute=event_time.minute)
time_string = event['Meeting Time']
# 'Deferred' in the time column marks a cancelled meeting; otherwise the
# status depends on whether the start time is still in the future.
if time_string in ('Deferred',) :
status = 'cancelled'
elif self.now() < when :
status = 'confirmed'
else :
status = 'passed'
description = event['Meeting\xa0Topic']
# Placeholder topics are replaced by an empty description.
if any(each in description
for each
in ('Multiple meeting items',
'AGENDA TO BE ANNOUNCED')) :
description = ''
event_name = event['Name']
# Skip an event whose (name, start) key is among the last 10 seen.
event_id = (event_name, when)
if event_id in last_events :
continue
else :
last_events.append(event_id)
e = Event(name=event_name,
start_time=when,
timezone=self.TIMEZONE,
description=description,
location_name=location,
status=status)
if extras :
e.extras = {'location note' : ' '.join(extras)}
if event['Multimedia'] != 'Not\xa0available' :
e.add_media_link(note='Recording',
url = event['Multimedia']['url'],
type="recording",
media_type = 'text/html')
self.addDocs(e, event, 'Agenda')
self.addDocs(e, event, 'Minutes')
# Decide which organizations take part, based on the event name.
if event['Name'] == 'City Council Stated Meeting' :
participating_orgs = ['New York City Council']
elif 'committee' in event['Name'].lower() :
participating_orgs = [event["Name"]]
else :
participating_orgs = []
if other_orgs :
# NOTE(review): 'Jointl*y' also matches "Jointy" and "Jointlly";
# presumably this tolerates typos in the source data -- confirm.
other_orgs = re.sub('Jointl*y with the ', '', other_orgs)
participating_orgs += re.split(' and the |, the ', other_orgs)
for org in participating_orgs :
e.add_committee(name=org)
if agenda :
e.add_source(event["Meeting Details"]['url'])
# Each agenda entry is a 3-tuple; only its first element is used.
for item, _, _ in agenda :
if item["Name"] :
agenda_item = e.add_agenda_item(item["Name"])
if item["File\xa0#"] :
# The recorded action, when present, is used as the bill note.
if item['Action'] :
note = item['Action']
else :
note = 'consideration'
agenda_item.add_bill(item["File\xa0#"]['label'],
note=note)
else :
# Without an agenda, fall back to the events page as the source.
e.add_source(self.EVENTSPAGE)
#.........这里部分代码省略.........
示例5: lower_parse_page
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import extras [as 别名]
#.........这里部分代码省略.........
date = re.sub(r"\s+", " ", date)
date = re.sub(".*POSTPONED NEW DATE", "", date).strip()
# Due to the html structure this shouldn't be an elif
# It needs to fire twice in the same loop iteration
if value.tag == 'th' and value.get("class") == 'commtitle':
coms = value.xpath('.//div[contains(@class,"comm-txt")]/text()')
elif key.tag == 'td':
key = key.text_content().strip()
value = value.text_content().strip()
value = value.replace(u'\x96', '-')
value = re.sub(r"\s+", " ", value)
metainf[key] = value
time = metainf['Time:']
repl = {
"A.M.": "AM",
"P.M.": "PM",
}
drepl = {
"Sept": "Sep"
}
for r in repl:
time = time.replace(r, repl[r])
for r in drepl:
date = date.replace(r, drepl[r])
time = re.sub("-.*", "", time)
time = time.strip()
year = dt.datetime.now().year
date = "%s %s %s" % (
date,
year,
time
)
if "tbd" in date.lower():
continue
date = date.replace(' PLEASE NOTE NEW TIME', '')
# Check if the event has been postponed.
postponed = 'POSTPONED' in date
if postponed:
date = date.replace(' POSTPONED', '')
date_formats = ["%B %d %Y %I:%M %p", "%b. %d %Y %I:%M %p"]
datetime = None
for fmt in date_formats:
try:
datetime = dt.datetime.strptime(date, fmt)
except ValueError:
pass
# If the datetime can't be parsed, bail.
if datetime is None:
return
title_key = set(metainf) & set([
'Public Hearing:', 'Summit:', 'Roundtable:',
'Public Roundtable:', 'Public Meeting:', 'Public Forum:',
'Meeting:'])
assert len(title_key) == 1, "Couldn't determine event title."
title_key = list(title_key).pop()
title = metainf[title_key]
title = re.sub(
r"\*\*Click here to view public hearing notice\*\*",
"",
title
)
# If event was postponed, add a warning to the title.
if postponed:
title = 'POSTPONED: %s' % title
event = Event(
name=title,
start_date=self._tz.localize(datetime),
location_name=metainf['Place:'],
)
event.extras = {'contact': metainf['Contact:']}
if 'Media Contact:' in metainf:
event.extras.update(media_contact=metainf['Media Contact:'])
event.add_source(url)
for com in coms:
event.add_participant(
com.strip(),
type='committee',
note='host',
)
participant = event.participants[-1]
participant['extras'] = {'chamber': self.classify_committee(com)},
yield event
示例6: scrape_events_range
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import extras [as 别名]
def scrape_events_range(self, start_date, end_date):
    """Yield the events, and newly seen bills, for meetings between two dates.

    Every calendar day from ``start_date`` up to, but not including,
    ``end_date`` is fetched via ``self.extract_events_by_url``.  Each meeting
    becomes an ``Event``.  When the agenda or minutes are published, every
    agenda item is attached to the event, and a ``Bill`` is yielded the first
    time its full identifier is encountered (tracked in
    ``self.seen_agenda_items``).  Addresses returned by
    ``self.addressesByAgendaId`` for an item are stored in the bill's
    ``extras["locations"]``.

    Args:
        start_date: First day to scrape.  It must support
            ``replace(hour=..., minute=..., second=..., microsecond=...)``.
        end_date: Day at which scraping stops (exclusive).

    Yields:
        Bill: once per previously unseen agenda item, before its event.
        Event: once per scraped meeting.

    Raises:
        ValueError: if an agenda item does not correspond to exactly one of
            the meeting's full identifiers.
    """
    # Loop invariants: build the timezone, pattern and URL template only once.
    tz = pytz.timezone("America/Toronto")
    identifier_regex = re.compile(r"^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$")
    event_map_url_template = (
        "http://app.toronto.ca/tmmis/getAddressList.do?function=getMeetingAddressList&meetingId={}"
    )

    def daterange(first_day, last_day):
        """Yield each day from ``first_day`` up to, not including, ``last_day``."""
        for offset in range(int((last_day - first_day).days)):
            yield first_day + dt.timedelta(offset)

    def is_agenda_available(event):
        return event["publishing_status"] in ("Agenda Published", "Minutes Published")

    def is_council(event):
        return event["meeting"] == self.jurisdiction.name

    def normalize_wards(raw):
        """Return ``"all"`` for an empty or ``"All"`` value, else the list of wards."""
        if not raw or raw == "All":
            return "all"
        return raw.split(", ")

    def names_item(full_id, item):
        """Tell whether ``full_id``, minus its four-digit prefix, is the item's identifier."""
        found = identifier_regex.match(full_id)
        # An identifier that does not fit the pattern simply is not a match;
        # it must not abort the scrape with an AttributeError on ``None``.
        return found is not None and found.group(1) == item["identifier"]

    for date in daterange(start_date, end_date):
        # NOTE(review): the month is passed as ``month - 1``; presumably the
        # calendar endpoint counts months from zero -- confirm.
        calendar_day_url = CALENDAR_DAY_TEMPLATE.format(date.year, date.month - 1, date.day)
        for event in self.extract_events_by_url(calendar_day_url):
            clock = dt.datetime.strptime(event["time"], "%I:%M %p")
            start = tz.localize(
                date.replace(hour=clock.hour, minute=clock.minute, second=0, microsecond=0)
            )
            org_name = event["meeting"]
            e = Event(
                name=org_name,
                start_time=start,
                timezone=tz.zone,
                location_name=event["location"],
                status=STATUS_DICT.get(event["meeting_status"]),
            )
            e.extras = {"meeting_number": event["no"], "tmmis_meeting_id": event["meeting_id"]}
            e.add_source(calendar_day_url)
            e.add_participant(name=org_name, type="organization")

            if is_agenda_available(event):
                council = is_council(event)
                agenda_url_template = (
                    AGENDA_FULL_COUNCIL_TEMPLATE if council else AGENDA_FULL_STANDARD_TEMPLATE
                )
                agenda_url = agenda_url_template.format(event["meeting_id"])
                full_identifiers = list(self.full_identifiers(event["meeting_id"], council))
                event_map_url = event_map_url_template.format(event["meeting_id"])
                addresses_d = self.addressesByAgendaId(event_map_url)
                e.add_source(agenda_url)
                for i, item in enumerate(self.agenda_from_url(agenda_url)):
                    a = e.add_agenda_item(item["title"])
                    a.add_classification(item["type"].lower())
                    a["order"] = str(i)
                    wards = normalize_wards(item["wards"])
                    # Exactly one full identifier must correspond to the item;
                    # the single-target unpacking enforces that.
                    [full_identifier] = [
                        full_id for full_id in full_identifiers if names_item(full_id, item)
                    ]
                    a.add_bill(full_identifier)
                    if full_identifier not in self.seen_agenda_items:
                        b = Bill(
                            # TODO: Fix this hardcode
                            legislative_session="2014-2018",
                            identifier=full_identifier,
                            title=item["title"],
                            from_organization={"name": self.jurisdiction.name},
                        )
                        b.add_source(agenda_url)
                        b.add_document_link(
                            note="canonical",
                            media_type="text/html",
                            url=AGENDA_ITEM_TEMPLATE.format(full_identifier),
                        )
                        b.extras["wards"] = wards
                        addresses = addresses_d.get(full_identifier)
                        if addresses:
                            b.extras["locations"] = [
                                {"address": {"full_address": address}} for address in addresses
                            ]
                        # Remember the identifier so the bill is emitted only once.
                        self.seen_agenda_items.append(full_identifier)
                        yield b

            yield e