本文整理汇总了Python中pupa.scrape.Event.add_source方法的典型用法代码示例。如果您正苦于以下问题:Python Event.add_source方法的具体用法?Python Event.add_source怎么用?Python Event.add_source使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pupa.scrape.Event
的用法示例。
在下文中一共展示了Event.add_source方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: scrape_upper
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import add_source [as 别名]
def scrape_upper(self):
    """Scrape Oklahoma Senate committee meeting notices.

    Yields:
        Event: one event per meeting notice found on the page.
    """
    url = "http://www.oksenate.gov/Committees/meetingnotices.htm"
    page = lxml.html.fromstring(self.get(url).text)
    page.make_links_absolute(url)
    text = page.text_content()
    _, text = text.split('MEETING NOTICES')

    # Dates look like "Monday, January 1, 2018"; pair each date match
    # with the chunk of text that follows it.
    re_date = r'[A-Z][a-z]+,\s+[A-Z][a-z]+ \d+, \d{4}'
    chunks = zip(re.finditer(re_date, text), re.split(re_date, text)[1:])

    for match, data in chunks:
        when = match.group()
        when = datetime.datetime.strptime(when, "%A, %B %d, %Y")
        # Materialize to a list: the original used filter(), which in
        # Python 3 returns an iterator, so lines[0] below raised TypeError.
        lines = [x.strip() for x in data.splitlines() if x.strip()]
        time_ = re.search(r'^\s*TIME:\s+(.+?)\s+\x96', data, re.M).group(1)
        time_ = time_.replace('a.m.', 'AM').replace('p.m.', 'PM')
        time_ = time.strptime(time_, '%I:%M %p')
        when += datetime.timedelta(hours=time_.tm_hour, minutes=time_.tm_min)
        title = lines[0]
        where = re.search(r'^\s*PLACE:\s+(.+)', data, re.M).group(1)
        where = where.strip()
        event = Event(name=title,
                      start_date=self._tz.localize(when),
                      location_name=where)
        event.add_source(url)
        yield event
示例2: scrape_meetings
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import add_source [as 别名]
def scrape_meetings(self, meetings, group):
    """
    Scrape and save event data from a list of meetings.

    Arguments:
    meetings -- A list of lxml elements containing event information
    group -- The type of meeting. The legislature site applies
             different formatting to events based on which group
             they correspond to. `group` should be one of the
             following strings: 'house', 'senate', or 'commission'.
    """
    for meeting in meetings:
        when = self.get_date(meeting)
        description = self.get_description(meeting)
        location = self.get_location(meeting)
        # Only emit an event when all three core fields were found.
        if when and description and location:
            event = Event(name=description, start_date=when.replace(tzinfo=self.tz),
                          description=description,
                          location_name=location)
            agenda = self.get_agenda(meeting)
            if agenda:
                event.add_agenda_item(agenda)
            # NOTE(review): `url` is not defined anywhere in this function,
            # so this line raises NameError at runtime — the listing-page
            # URL presumably needs to be passed in or stored on self; confirm.
            event.add_source(url)
            yield event
示例3: scrape_meeting_notice
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import add_source [as 别名]
def scrape_meeting_notice(self, chamber, item, url):
    """Build an Event from a Delaware committee meeting notice.

    Arguments:
        chamber -- legislative chamber (unused here; kept for interface).
        item -- JSON dict describing one committee meeting.
        url -- source URL the notice listing was scraped from.

    Yields:
        Event
    """
    # Event name is not provided for all meetings; use the committee name.
    event_name = str(item['CommitteeName'])
    # Timestamps look like "04/25/2012 03:00:00 PM": a four-digit year and
    # a seconds field, so the format must use %Y and include %S (the
    # original "%m/%d/%y %I:%M %p" could not parse these values).
    fmt = "%m/%d/%Y %I:%M:%S %p"
    start_time = dt.datetime.strptime(str(item['MeetingDateTime']), fmt)
    location_name = str(item['AddressAliasNickname'])
    event = Event(location_name=location_name,
                  start_date=self._tz.localize(start_time),
                  name=event_name,
                  description='Committee Meeting Status: {}'
                  .format(item['CommitteeMeetingStatusName'])
                  )
    event.add_source(url)
    event.add_committee(name=str(item['CommitteeName']), id=item['CommitteeId'])
    page_url = ("http://legis.delaware.gov/json/MeetingNotice/"
                "GetCommitteeMeetingItems?committeeMeetingId={}".format(
                    item['CommitteeMeetingId'])
                )
    event.add_source(page_url)
    page_data = self.post(page_url).json()['Data']
    # Distinct loop variable: the original reused `item`, clobbering the
    # meeting dict parameter.
    for agenda_row in page_data:
        event.add_agenda_item(description=str(agenda_row['ItemDescription']))
        event.add_person(name=str(agenda_row['PrimarySponsorShortName']),
                         id=str(agenda_row['PrimarySponsorPersonId']),
                         note='Sponsor')
    yield event
示例4: scrape
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import add_source [as 别名]
def scrape(self):
    """Walk the paginated events table and yield an Event per row."""
    for page in self.eventPages(EVENTSPAGE):
        events_table = page.xpath("//table[@class='rgMasterTable']")[0]
        for events, headers, rows in self.parseDataTable(events_table):
            print(events)
            # Location cell packs "address -- Chicago, Illinois -- status".
            location_parts = events[u'Meeting\xa0Location'].split('--')
            location = ', '.join(location_parts[0:2])
            status_parts = location_parts[-1].split('Chicago, Illinois')
            status = 'confirmed'
            if len(status_parts) > 1 and status_parts[1]:
                candidate = status_parts[1].lower()
                if candidate in ['cancelled', 'tentative', 'confirmed', 'passed']:
                    status = candidate
                else:
                    # Unrecognized status: log it, keep the default.
                    print(candidate)
            when = events[u'Meeting\xa0Date']
            event_time = datetime.datetime.strptime(
                events[u'Meeting\xa0Time'], "%I:%M %p")
            when = when.replace(hour=event_time.hour)
            e = Event(name=events["Name"]["label"],
                      when=when,
                      location=location,
                      status=status)
            e.add_source(EVENTSPAGE)
            if events['Video'] != u'Not\xa0available':
                print(events['Video'])
            yield e
示例5: scrape_committee_events
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import add_source [as 别名]
def scrape_committee_events(self, code, name):
    """Yield Events from one Connecticut committee's calendar JSON feed."""
    events_url = (
        'http://www.cga.ct.gov/basin/fullcalendar/commevents.php?'
        'comm_code={}'.format(code)
    )
    DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
    for info in json.loads(self.get(events_url).text):
        title = info['title']
        # Skip untitled and cancelled entries.
        if title is None:
            self.warning("Event found with no title; it will be skipped")
            continue
        if title.startswith('CANCELLED:'):
            self.info("Cancelled event found; it will be skipped: {}".
                      format(title))
            continue
        when = datetime.datetime.strptime(info['start'], DATETIME_FORMAT)
        # end = datetime.datetime.strptime(info['end'], DATETIME_FORMAT)
        where = "{0} {1}".format(info['building'].strip(), info['location'].strip())
        # end_time=self._tz.localize(end),
        event = Event(start_time=self._tz.localize(when),
                      timezone=self._tz.zone,
                      location_name=where,
                      name=title,
                      description=title,)
        event.add_source(events_url)
        yield event
示例6: parse_div
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import add_source [as 别名]
def parse_div(self, row, chamber, com):
    """Parse one Maryland hearing-schedule row into an Event.

    Arguments:
        row -- lxml element for the hearing row.
        chamber -- chamber string (unused in this method; kept for interface).
        com -- committee name used as host unless the title names a
               subcommittee.

    Yields:
        Event
    """
    # The Google Calendar link encodes title/location/start/end times.
    cal_link = row.xpath('.//a[.//span[@id="calendarmarker"]]/@href')[0]
    # event_date = row.xpath('string(.//div[contains(@class,"ItemDate")])').strip()
    title, location, start_date, end_date = self.parse_gcal(cal_link)
    event = Event(
        start_date=start_date,
        end_date=end_date,
        name=title,
        location_name=location,
    )
    event.add_source('http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx')
    # Plain agenda items (no linked document or bill).
    for item in row.xpath('.//div[@class="col-xs-12a Item"]'):
        description = item.xpath('string(.)').strip()
        agenda = event.add_agenda_item(description=description)
    # Agenda items whose container links a PDF document.
    for item in row.xpath('.//div[contains(@class,"ItemContainer")]/a'):
        description = item.xpath('string(.)').strip()
        agenda = event.add_agenda_item(description=description)
        event.add_document(
            description,
            item.xpath('@href')[0],
            media_type="application/pdf",
            on_duplicate="ignore"
        )
    # Agenda items that carry a bill number in a nested column div.
    for item in row.xpath('.//div[contains(@class,"ItemContainer")]'
                          '[./div[@class="col-xs-1 Item"]]'):
        description = item.xpath('string(.)').strip()
        agenda = event.add_agenda_item(description=description)
        bill = item.xpath('.//div[@class="col-xs-1 Item"]/a/text()')[0].strip()
        agenda.add_bill(bill)
    # Attach an on-demand video link when one is present.
    video = row.xpath('.//a[./span[@class="OnDemand"]]')
    if video:
        event.add_media_link(
            'Video of Hearing',
            video[0].xpath('@href')[0],
            'text/html'
        )
    # Subcommittee hearings are hosted under the subcommittee's own name
    # (text before the first '-'); otherwise `com` hosts.
    if 'subcommittee' in title.lower():
        subcom = title.split('-')[0].strip()
        event.add_participant(
            subcom,
            type='committee',
            note='host',
        )
    else:
        event.add_participant(
            com,
            type='committee',
            note='host',
        )
    yield event
示例7: event_obj
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import add_source [as 别名]
def event_obj():
    """Return a minimal Event fixture with an ISO-8601 'Z' start date."""
    # ISO timestamp truncated to whole seconds, with a literal 'Z' suffix.
    start = datetime.datetime.utcnow().isoformat().split('.')[0] + 'Z'
    event = Event(
        name="get-together",
        start_date=start,
        location_name="Joe's Place",
    )
    event.add_source(url='http://example.com/foobar')
    return event
示例8: scrape_house_weekly_schedule
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import add_source [as 别名]
def scrape_house_weekly_schedule(self):
    """Scrape the Louisiana House weekly committee meeting schedule.

    Yields:
        Event: one per meeting row that has an agenda PDF and a
        parseable date/time.
    """
    url = "http://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx"
    page = self.lxmlize(url)
    meeting_rows = page.xpath('//table[@id = "table229"]/tr')
    # Keep rows that have a committee name, an agenda PDF link, and are
    # not marked "Not Meeting".
    valid_meetings = [row for row in meeting_rows if row.xpath(
        './td[1]')[0].text_content().replace(u'\xa0', '') and row.xpath(
        './td/a/img[contains(@src, "PDF-AGENDA.png")]') and 'Not Meeting' not in row.xpath(
        './td[2]')[0].text_content()]
    for meeting in valid_meetings:
        try:
            guid = meeting.xpath('./td/a[descendant::img[contains(@src,'
                                 '"PDF-AGENDA.png")]]/@href')[0]
            self.warning("logger.debug" + guid)
        except IndexError:
            # Dead link: xpath() returned an empty list, so [0] raises
            # IndexError (the original caught KeyError, which an empty
            # list index never raises, so dead entries crashed).
            continue
        committee_name = meeting.xpath('./td[1]/text()')[0].strip()
        meeting_string = meeting.xpath('./td[2]')[0].text_content()
        if "@" in meeting_string:
            continue  # Contains no time data.
        date, time, location = ([s.strip() for s in meeting_string.split(
            ',') if s] + [None] * 3)[:3]
        # Check for a time inside the date field (missing comma on site).
        time_srch = re.search(r'\d{2}:\d{2} (AM|PM)', date)
        if time_srch:
            location = time
            time = time_srch.group()
            date = date.replace(time, '')
        self.warning("logger.debug" + location)
        year = datetime.datetime.now().year
        datetime_string = ' '.join((date, str(year), time))
        when = datetime.datetime.strptime(datetime_string, '%b %d %Y %I:%M %p')
        # Localize exactly once: the original localized here AND again in
        # the Event() call below, and pytz raises ValueError when
        # localize() receives an already-aware datetime.
        when = self._tz.localize(when)
        description = 'Committee Meeting: {}'.format(committee_name)
        self.warning("logger.debug" + description)
        event = Event(name=description,
                      start_date=when,
                      location_name=location)
        event.add_source(url)
        event.add_participant(committee_name, type='committee', note='host')
        event.add_document(note='Agenda', url=guid, text='agenda',
                           media_type='application/pdf')
        yield event
示例9: scrape
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import add_source [as 别名]
def scrape(self):
    """Scrape Miami-Dade commission calendar events for three months.

    Yields:
        Event: one per calendar table row, plus one hard-coded special
        budget meeting missing from the feed.
    """
    local_timezone = pytz.timezone("US/Eastern")
    base_calendar_url = "http://www.miamidade.gov/cob/county-commission-calendar.asp"
    # Things get messy more than a few months out, so we're just pulling
    # 3 months. If we want three more, they are called "nxx", "nxy"
    # and "nxz".
    months = ["cur", "nex", "nxw"]
    for m in months:
        doc = self.lxmlize(base_calendar_url + "?next={}".format(m))
        events = doc.xpath("//table[contains(@style,'dotted #ccc')]")
        for event in events:
            rows = event.xpath(".//tr")
            # NOTE(review): title/link/where/description persist across
            # loop iterations; an event table missing a row inherits the
            # previous event's value — confirm that is acceptable.
            for row in rows:
                heading, data = row.xpath(".//td")
                h = heading.text_content().lower().replace(":", "").strip()
                if h == "event":
                    title = data.text_content()
                    link = data.xpath(".//a")[0].attrib["href"]
                elif h == "event date":
                    # Times like "06:00PM" need %I (12-hour clock): %p
                    # has no effect on %H, so the original '%H:%M%p'
                    # silently parsed every PM time as AM.
                    when = datetime.strptime(data.text, '%m/%d/%y %I:%M%p')
                    when = local_timezone.localize(when)
                elif h == "location":
                    where = data.text
                elif h == "description":
                    description = data.text
            if link in DUPLICATE_EVENT_URLS:
                continue
            if title == "Mayor's FY 2016-17 Proposed Budget Public Meeting":
                continue
            if not description:
                description = ""
            status = "confirmed"
            if "cancelled" in title.lower():
                status = "cancelled"
            e = Event(name=title,
                      start_time=when,
                      timezone="US/Eastern",
                      location_name=where,
                      description=description,
                      status=status)
            e.add_source(link)
            yield e
    # Hard-coded special meeting that is absent from the calendar feed
    # (it is explicitly skipped in the loop above to avoid duplication).
    e = Event(name="Mayor's FY 2016-17 Proposed Budget Public Meeting",
              start_time=local_timezone.localize(
                  datetime.strptime('08/08/16 06:00PM', '%m/%d/%y %I:%M%p')),
              timezone="US/Eastern",
              location_name='111 NW 1st Street',
              description='Pursuant to Section 2-1800A of the County Code, a Public Meeting has been scheduled by the Honorable Carlos A. Gimenez, Mayor, Miami-Dade County, to discuss the FY 2016-17 budget, tax rates, and fee changes.',
              status='confirmed')
    e.add_source('http://miamidade.gov/wps/Events/EventDetail.jsp?eventID=447192')
    yield e
示例10: event_obj
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import add_source [as 别名]
def event_obj():
    """Return a minimal timezone-annotated Event fixture."""
    event = Event(
        name="get-together",
        location_name="Joe's Place",
        timezone="America/New_York",
        start_time=datetime.datetime.utcnow(),
    )
    event.add_source(url='foobar')
    return event
示例11: scrape_event_page
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import add_source [as 别名]
def scrape_event_page(self, url, event_type):
    """Scrape one malegislature.gov event detail page into an Event.

    Arguments:
        url -- the event detail page URL (also recorded as the source).
        event_type -- e.g. 'Hearing'; hearings additionally get the page
                      title added as a host committee participant.

    Yields:
        Event
    """
    page = self.lxmlize(url)
    page.make_links_absolute('https://malegislature.gov/')
    title = page.xpath('string(//div[contains(@class,"followable")]/h1)')
    title = title.replace('Hearing Details', '').strip()
    title = title.replace('Special Event Details', '')
    # Event metadata lives at fixed positions in the eventInformation list.
    start_day = page.xpath('string(//dl[contains(@class,"eventInformation")]/dd[2])').strip()
    start_time = page.xpath('string(//dl[contains(@class,"eventInformation")]/dd[3])').strip()
    location = page.xpath('string(//dl[contains(@class,"eventInformation")]/dd[4]//a)').strip()
    description = page.xpath('string(//dl[contains(@class,"eventInformation")]/dd[5])').strip()
    start_date = self._TZ.localize(
        dateutil.parser.parse(
            '{} {}'.format(start_day, start_time),
        )
    )
    event = Event(
        start_date=start_date,
        name=title,
        location_name=location,
        description=description
    )
    event.add_source(url)
    agenda_rows = page.xpath(
        '//div[contains(@class,"col-sm-8") and .//h2[contains(@class,"agendaHeader")]]'
        '/div/div/div[contains(@class,"panel-default")]')
    for row in agenda_rows:
        # only select the text node, not the spans
        agenda_title = row.xpath('string(.//h4/a/text()[normalize-space()])').strip()
        if agenda_title == '':
            agenda_title = row.xpath('string(.//h4/text()[normalize-space()])').strip()
        agenda = event.add_agenda_item(description=agenda_title)
        bills = row.xpath('.//tbody/tr/td[1]/a/text()')
        for bill in bills:
            # Dots in bill identifiers are converted to spaces before
            # linking (presumably to match the canonical bill id format).
            bill = bill.strip().replace('.', ' ')
            agenda.add_bill(bill)
    if event_type == 'Hearing':
        event.add_participant(
            title,
            type='committee',
            note='host',
        )
    yield event
示例12: scrape
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import add_source [as 别名]
def scrape(self):
    """Scrape DC Council calendar events.

    Yields:
        Event, with agenda items linked to any bills listed in the
        event description and the hosting committee as participant.
    """
    calendar_url = "http://dccouncil.us/calendar"
    data = self.get(calendar_url).text
    doc = lxml.html.fromstring(data)
    # Captures the committee name from phrases like "Committee X will ...".
    committee_regex = re.compile("(Committee .*?)will")
    event_list = doc.xpath("//div[@class='event-description-dev']")
    for event in event_list:
        place_and_time = event.xpath(".//div[@class='event-description-dev-metabox']/p/text()")
        when = " ".join([place_and_time[0].strip(), place_and_time[1].strip()])
        # A third metadata line, when present, is the location.
        if len(place_and_time) > 2:
            location = place_and_time[2]
        else:
            location = "unknown"
        # when is now of the following format:
        # Wednesday, 2/25/2015 9:30am
        when = datetime.datetime.strptime(when, "%A, %m/%d/%Y %I:%M%p")
        description_content = event.xpath(".//div[@class='event-description-content-dev']")[0]
        description_lines = description_content.xpath("./*")
        name = description_lines[0].text_content()
        # Drop the title line so the committee regex matches body text only.
        desc_without_title = " ".join(d.text_content() for d in description_lines[1:])
        description = re.sub(r'\s+', " ", description_content.text_content()).strip()
        potential_bills = description_content.xpath(".//li")
        committee = committee_regex.search(desc_without_title)
        event_type = 'other'
        if committee is not None:
            committee = committee.group(1).strip()
            event_type = 'committee:meeting'
        e = Event(name=name,
                  description=description,
                  start_date=self._tz.localize(when),
                  location_name=location,
                  classification=event_type,
                  )
        for b in potential_bills:
            bill = b.xpath("./a/text()")
            if len(bill) == 0:
                continue
            bill = bill[0]
            bill_desc = b.text_content().replace(bill, "").strip(", ").strip()
            # Normalize "<session> - <num>" ids: strip spaces from the
            # session part and zero-pad the number to four digits.
            ses, num = bill.split("-")
            bill = ses.replace(" ", "") + "-" + num.zfill(4)
            item = e.add_agenda_item(bill_desc)
            item.add_bill(bill)
        e.add_source(calendar_url)
        if committee:
            e.add_participant(committee, type='organization', note='host')
        yield e
示例13: scrape_upper
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import add_source [as 别名]
def scrape_upper(self):
    """Scrape Missouri Senate hearing notices into Events."""
    listing_url = 'https://www.senate.mo.gov/hearingsschedule/hrings.htm'
    html = self.get(listing_url).text
    # The HTML here isn't wrapped in a container per-event, which makes
    # xpath a pain — so string-split on <hr> and parse each fragment.
    for fragment in html.split('<hr />')[1:]:
        page = lxml.html.fromstring(fragment)
        when_date = self.row_content(page, 'Date:')
        when_time = self.row_content(page, 'Time:')
        room = self.row_content(page, 'Room:')
        location = '{}, {}'.format(
            room,
            '201 W Capitol Ave, Jefferson City, MO 65101'
        )
        # com = self.row_content(page, 'Committee:')
        com = page.xpath('//td[descendant::b[contains(text(),"Committee")]]/a/text()')[0]
        com = com.split(', Senator')[0].strip()
        start_date = self._TZ.localize(
            dateutil.parser.parse('{} {}'.format(when_date, when_time))
        )
        event = Event(
            start_date=start_date,
            name=com,
            location_name=location
        )
        event.add_source(listing_url)
        event.add_participant(
            com,
            type='committee',
            note='host',
        )
        for bill_table in page.xpath('//table[@width="85%" and @border="0"]'):
            bill_links = bill_table.xpath(self.bill_link_xpath)
            if bill_links:
                # Row 2 holds the agenda text when a bill link is present.
                agenda_line = bill_table.xpath('string(tr[2])').strip()
                agenda_item = event.add_agenda_item(description=agenda_line)
                agenda_item.add_bill(bill_links[0].strip())
            else:
                agenda_line = bill_table.xpath('string(tr[1])').strip()
                event.add_agenda_item(description=agenda_line)
        yield event
示例14: scrape
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import add_source [as 别名]
def scrape(self):
    """Scrape Alaska legislature committee meetings.

    Yields:
        Event, skipping "No Meeting" placeholders, whole-chamber session
        entries, and events with no name.
    """
    EVENTS_URL = 'http://www.akleg.gov/basis/Meeting/Find'
    events = self.lxmlize(EVENTS_URL).xpath('//ul[@id="meetingResults"]/li')
    for info in events:
        event_url = info.xpath('span[@class="col04"]/a/@href')[0]
        doc = self.lxmlize(event_url)
        # Skip events that are placeholders or tentative
        # Also skip whole-chamber events
        if any(x.strip().startswith("No Meeting") for x in
               doc.xpath('//div[@class="schedule"]//text()')) \
                or "session" in \
                info.xpath('span[@class="col01"]/text()')[0].lower():
            continue
        # Event name is assembled from all non-blank schedule text nodes.
        name = " ".join(
            x.strip()
            for x in doc.xpath('//div[@class="schedule"]//text()')
            if x.strip()
        )
        # Skip events with no name
        if not name:
            continue
        event = Event(
            start_date=self._TZ.localize(
                datetime.datetime.strptime(
                    info.xpath('span[@class="col02"]/text()')[0],
                    self._DATETIME_FORMAT,
                )
            ),
            name=name,
            location_name=doc.xpath(
                '//div[@class="heading-container"]/span/text()'
            )[0].title()
        )
        event.add_participant(
            info.xpath('span[@class="col01"]/text()')[0].title(),
            type='committee',
            note='host',
        )
        for document in doc.xpath('//td[@data-label="Document"]/a'):
            event.add_document(
                document.xpath('text()')[0],
                url=document.xpath('@href')[0]
            )
        event.add_source(EVENTS_URL)
        # Detail URLs may contain spaces; escape them for a valid source.
        event.add_source(event_url.replace(" ", "%20"))
        yield event
示例15: scrape
# 需要导入模块: from pupa.scrape import Event [as 别名]
# 或者: from pupa.scrape.Event import add_source [as 别名]
def scrape(self):
    """Scrape the committee hearing-notice grid at URL into Events.

    Yields:
        Event, with each hosting committee attached (joint chambers for
        informational "INFO" committees) and related bills as agenda items.
    """
    tz = pytz.timezone("US/Eastern")
    get_short_codes(self)
    page = self.lxmlize(URL)
    table = page.xpath(
        "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]
    for event in table.xpath(".//tr")[1:]:
        tds = event.xpath("./td")
        committee = tds[0].text_content().strip()
        # Each row is expected to carry exactly one description span.
        descr = [x.text_content() for x in tds[1].xpath(".//span")]
        if len(descr) != 1:
            raise Exception
        descr = descr[0].replace('.', '').strip()
        when = tds[2].text_content().strip()
        where = tds[3].text_content().strip()
        notice = tds[4].xpath(".//a")[0]
        notice_href = notice.attrib['href']
        notice_name = notice.text
        when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
        # NOTE(review): times are localized as UTC although the Event is
        # tagged with the US/Eastern zone below — confirm intended zone.
        when = pytz.utc.localize(when)
        event = Event(name=descr, start_time=when, classification='committee-meeting',
                      description=descr, location_name=where, timezone=tz.zone)
        # Joint hearings list committees separated by "/".
        if "/" in committee:
            committees = committee.split("/")
        else:
            committees = [committee]
        for committee in committees:
            if "INFO" not in committee:
                # Look up the short code for THIS committee. The original
                # passed the literal string "committee" as the key, so the
                # lookup never matched and the fallback dict was always
                # used (the fallback itself uses the variable, confirming
                # the intended key).
                committee = self.short_ids.get(committee, {"chamber": "unknown",
                                                           "name": committee})
            else:
                committee = {
                    "chamber": "joint",
                    "name": committee,
                }
            event.add_committee(committee['name'], note='host')
        event.add_source(URL)
        event.add_document(notice_name,
                           notice_href,
                           media_type='text/html')
        for bill in self.get_related_bills(notice_href):
            a = event.add_agenda_item(description=bill['descr'])
            a.add_bill(
                bill['bill_id'],
                note=bill['type']
            )
        yield event