本文整理汇总了Python中utils.inspector.year_range函数的典型用法代码示例。如果您正苦于以下问题:Python year_range函数的具体用法?Python year_range怎么用?Python year_range使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了year_range函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: run
def run(options):
    """Scrape audit, other, and semiannual reports for this IG.

    ``options`` is the standard CLI options dict; ``archive`` (module-level)
    is the oldest year this IG publishes reports for.
    """
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    for year in year_range:
        if year < 2006:  # This is the oldest year for these reports
            # NOTE(review): every pre-2006 year is clamped to 2006, so the
            # 2006 page is re-fetched once per clamped year; save_report is
            # presumably idempotent per report_id — confirm.
            year = 2006
        url = AUDIT_REPORTS_BASE_URL.format(year)
        doc = beautifulsoup_from_url(url)
        # Report rows alternate between odd/even SharePoint row classes.
        results = []
        results.extend(doc.select("tr.ms-rteTableOddRow-default"))
        results.extend(doc.select("tr.ms-rteTableEvenRow-default"))
        for result in results:
            report = audit_report_from(result, url, year_range)
            if report:
                inspector.save_report(report)

    # Pull the other reports (one listing page per report type)
    for report_type, url in OTHER_URLS.items():
        doc = beautifulsoup_from_url(url)
        results = doc.select("#ctl00_PlaceHolderMain_ctl05_ctl01__ControlWrapper_RichHtmlField > p > a")
        for result in results:
            report = report_from(result, url, report_type, year_range)
            if report:
                inspector.save_report(report)

    # Pull the semiannual reports
    doc = beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
    results = doc.select("#ctl00_PlaceHolderMain_ctl05_ctl01__ControlWrapper_RichHtmlField > p > a")
    for result in results:
        report = semiannual_report_from(result, SEMIANNUAL_REPORTS_URL, year_range)
        if report:
            inspector.save_report(report)
示例2: run
def run(options):
    """Scrape DHS OIG audit reports, optionally limited to one component.

    Supported options: ``component`` (restrict to a single DHS component),
    ``report_id`` (fetch only one report), ``limit`` (max reports per
    component, 0 = unlimited).
    """
    year_range = inspector.year_range(options, archive)

    component = options.get('component')
    if component:
        components = [component]
    else:
        components = list(COMPONENTS.keys())

    report_id = options.get('report_id')
    limit = int(options.get('limit', 0))

    # Keyed by (report_id, title) so the same report listed under several
    # components is saved once, with the agencies concatenated.
    all_audit_reports = {}

    for component in components:
        logging.info("## Fetching reports for component %s" % component)
        url = url_for(options, component)
        body = utils.download(url)
        doc = BeautifulSoup(body)

        results = doc.select("table.contentpaneopen table[border=1] tr")
        # accept only trs that look like body tr's (no 'align' attribute)
        # note: HTML is very inconsistent. cannot rely on thead or tbody
        results = [x for x in results if x.get('align') is None]
        if not results:
            raise inspector.NoReportsFoundError("DHS (%s)" % component)

        count = 0
        for result in results:
            report = report_from(result, component, url)
            if not report:
                continue

            if report_id and (report_id != report['report_id']):
                continue

            if inspector.year_from(report) not in year_range:
                # logging.info("[%s] Skipping, not in requested range." % report['report_id'])
                continue

            key = (report["report_id"], report["title"])
            if key in all_audit_reports:
                # Duplicate report: merge the agency fields instead of
                # overwriting the earlier entry.
                all_audit_reports[key]["agency"] = all_audit_reports[key]["agency"] + \
                    ", " + report["agency"]
                all_audit_reports[key]["agency_name"] = \
                    all_audit_reports[key]["agency_name"] + ", " + \
                    report["agency_name"]
            else:
                all_audit_reports[key] = report

            count += 1
            if limit and (count >= limit):
                break

        logging.info("## Fetched %i reports for component %s\n\n" % (count, component))

    for report in all_audit_reports.values():
        inspector.save_report(report)
示例3: run
def run(options):
    """Scrape semiannual, special, and per-year audit reports for this IG."""
    year_range = inspector.year_range(options, archive)

    doc = BeautifulSoup(utils.download(REPORTS_URL))

    # Pull the semiannual reports
    semiannul_results = doc.select("#AnnualManagementReports select")[0]
    for result in semiannul_results.select("option"):
        report = semiannual_report_from(result, year_range)
        if report:
            inspector.save_report(report)

    # Pull the special reports
    special_report_table = doc.find("table", attrs={"bordercolor": "#808080"})
    for index, result in enumerate(special_report_table.select("tr")):
        if not index:
            # Skip the header row
            continue
        report = report_from(result, REPORTS_URL, report_type='other', year_range=year_range)
        if report:
            inspector.save_report(report)

    # Pull the audit reports (one page per fiscal year)
    for year in year_range:
        if year < 2001:  # The oldest fiscal year page available
            continue
        year_url = AUDIT_REPORTS_URL.format(year=year)
        doc = BeautifulSoup(utils.download(year_url))
        for index, result in enumerate(doc.select("#main table tr")):
            if not index:
                # Skip the header row
                continue
            report = report_from(result, year_url, report_type='audit', year_range=year_range)
            if report:
                inspector.save_report(report)
示例4: run
def run(options):
    """Scrape audit/inspection reports (per fiscal year, via JS listing URLs),
    congressional testimony, and semiannual reports for this IG.
    """
    year_range = inspector.year_range(options, archive)

    if datetime.datetime.now().month >= 10:
        # October, November, and December fall into the next fiscal year
        # Add next year to year_range to compensate
        year_range.append(max(year_range) + 1)

    # Pull the audit reports
    for year in year_range:
        url = audit_report_url(year)
        if url:
            parse_result_from_js_url(url, "auditreports", year, year_range, report_type='audit')
        url = inspection_report_url(year)
        if url:
            parse_result_from_js_url(url, "iereports", year, year_range, report_type='inspection')

    # Pull the congressional testimony
    doc = utils.beautifulsoup_from_url(CONGRESSIONAL_TESTIMONY_REPORTS_URL)
    results = doc.findAll("ul", type='disc')[0].select("li")
    for result in results:
        report = congressional_testimony_report_from(result, year_range)
        if report:
            inspector.save_report(report)

    # Pull the semiannual reports
    doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
    results = doc.findAll("ul", type='disc')[0].select("li")
    for result in results:
        report = semiannual_report_from(result, year_range)
        if report:
            inspector.save_report(report)
示例5: run
def run(options):
    """Scrape per-year audit reports, FOIA reports, and semiannual reports."""
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    for year in year_range:
        if year < 2002:  # The oldest page for audit reports
            continue
        doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL.format(year=year)))
        results = doc.select("div.content table tr")
        for index, result in enumerate(results):
            if not index:
                # Skip the header row
                continue
            report = report_from(result, report_type="audit", year_range=year_range)
            if report:
                inspector.save_report(report)

    # Pull the FOIA reports
    doc = BeautifulSoup(utils.download(FOIA_REPORTS_URL))
    results = doc.select("div.content table tr")
    for index, result in enumerate(results):
        if not index:
            # Skip the header row
            continue
        report = report_from(result, report_type="other", year_range=year_range)
        if report:
            inspector.save_report(report)

    # Pull the semiannual reports
    doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
    results = doc.select("div.content a")
    for result in results:
        report = semiannual_report_from(result, year_range)
        if report:
            inspector.save_report(report)
示例6: run
def run(options):
    """Scrape Tennessee Valley Authority OIG audit and semiannual reports.

    Raises inspector.NoReportsFoundError when a listing page yields nothing,
    which usually signals a layout change.
    """
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    for year in year_range:
        if year < 2005:  # This is the earliest audits go back
            continue
        url = AUDIT_REPORTS_URL.format(year=year)
        doc = BeautifulSoup(utils.download(url))
        results = doc.select("div.content")
        if not results:
            raise inspector.NoReportsFoundError("Tennessee Valley Authority (%d)" % year)
        for result in results:
            report = audit_report_from(result, url, year_range)
            if report:
                inspector.save_report(report)

    # Pull the semiannual reports
    doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
    # The semiannual listing is an XML-ish page with <report> elements.
    results = doc.select("report")
    if not results:
        raise inspector.NoReportsFoundError("Tennessee Valley Authority (semiannual reports)")
    for result in results:
        report = semiannual_report_from(result, year_range)
        if report:
            inspector.save_report(report)
示例7: run
def run(options):
    """Scrape paginated report listings, stopping at the server's last page.

    ``pages`` option caps how many listing pages to walk (default ALL_PAGES).
    """
    year_range = inspector.year_range(options)
    pages = options.get('pages', ALL_PAGES)

    # `max_page` is learned from the first fetched page's pager.
    max_page = None
    for page in range(1, (int(pages) + 1)):
        if max_page and (page > max_page):
            print("End of pages!")
            break

        print("## Downloading page %i" % page)
        url = url_for(options, page)
        body = utils.download(url)
        doc = BeautifulSoup(body)
        max_page = last_page_for(doc)

        results = doc.select(".views-row")
        for result in results:
            report = report_from(result)

            # inefficient enforcement of --year arg, USPS doesn't support it server-side
            # TODO: change to published_on.year once it's a datetime
            if inspector.year_from(report) not in year_range:
                print("[%s] Skipping report, not in requested range." % report['report_id'])
                continue

            inspector.save_report(report)
示例8: run
def run(options):
    """Scrape reports from an ASP.NET GridView, paging via __EVENTTARGET posts."""
    year_range = inspector.year_range(options, archive)

    # Find the number of pages to iterate
    doc = BeautifulSoup(utils.download(REPORTS_URL))
    page_count_text = doc.select("div.AspNet-GridView-Pagination")[0].text
    # Raw string so "\d" is a regex class, not a (deprecated) string escape.
    page_count = int(re.search(r"Page 1 of (\d+)", page_count_text).groups()[0])

    # Iterate over those pages
    for page in range(1, page_count + 1):
        # ASP.NET postback: the page number is passed as the event argument.
        response = utils.scraper.post(
            REPORTS_URL,
            data={
                "__EVENTTARGET": "ctl00$ctl00$MainContent$NavTreeSubContent$sv$GridViewSummary",
                "__EVENTARGUMENT": "Page${page_number}".format(page_number=page),
            },
            cookies=COOKIES,
        )
        doc = BeautifulSoup(response.content)
        results = doc.select("div.AspNet-GridView table tr")
        if not results:
            break
        for index, result in enumerate(results):
            if not index:
                # Skip the header row
                continue
            report = report_from(result, year_range)
            if report:
                inspector.save_report(report)
示例9: run
def run(options):
    """Scrape audit/inspection reports (via JS listing URLs), congressional
    testimony, and semiannual reports for this IG.
    """
    year_range = inspector.year_range(options)

    # Pull the audit reports
    for year in year_range:
        url = audit_report_url(year)
        if url:
            parse_result_from_js_url(url, "auditreports", year, year_range)
        url = inspection_report_url(year)
        if url:
            parse_result_from_js_url(url, "iereports", year, year_range)

    # Pull the congressional testimony
    doc = BeautifulSoup(utils.download(CONGRESSIONAL_TESTIMONY_REPORTS_URL))
    results = doc.findAll("ul", type='disc')[0].select("li")
    for result in results:
        report = congressional_testimony_report_from(result, year_range)
        if report:
            inspector.save_report(report)

    # Pull the semiannual reports
    doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
    results = doc.findAll("ul", type='disc')[0].select("li")
    for result in results:
        report = semiannual_report_from(result, year_range)
        if report:
            inspector.save_report(report)
示例10: run
def run(options):
    """Scrape NCUA OIG audit, other, semiannual, and plan reports.

    Raises inspector.NoReportsFoundError when a listing yields nothing,
    which usually signals a site layout change.
    """
    year_range = inspector.year_range(options, archive)

    # True once any audit-report year page produced rows.
    results_flag = False

    # Pull the audit reports
    for year in year_range:
        if year < 2002:  # The oldest page for audit reports
            continue
        if year == 2018:
            # NOTE(review): 2018 reports live on a separate "latest" page —
            # confirm this special case is still needed.
            doc = utils.beautifulsoup_from_url(LATEST_AUDIT_REPORTS_URL)
        else:
            doc = utils.beautifulsoup_from_url(AUDIT_REPORTS_URL.format(year=year))
        if doc is None:
            # Next year's audit page may not be published yet
            continue
        results = doc.select("div.mainCenter table tr")
        if results:
            results_flag = True
        for index, result in enumerate(results):
            if not index:
                # Skip the header row
                continue
            report = report_from(result, report_type='audit', year_range=year_range)
            if report:
                inspector.save_report(report)
    if not results_flag:
        raise inspector.NoReportsFoundError("NCUA (audit reports)")

    # Pull the other reports
    doc = utils.beautifulsoup_from_url(OTHER_REPORTS_URL)
    results = doc.select("div.mainCenter p")
    if not results:
        raise inspector.NoReportsFoundError("NCUA (other)")
    for result in results:
        report = other_report_from(result, year_range=year_range)
        if report:
            inspector.save_report(report)

    # Pull the semiannual reports
    doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
    results = doc.select("div#mainColumns div.mainCenter a")
    if not results:
        raise inspector.NoReportsFoundError("NCUA (semiannual reports)")
    for result in results:
        report = semiannual_report_from(result, year_range)
        if report:
            inspector.save_report(report)

    # Pull the performance and strategic plans
    doc = utils.beautifulsoup_from_url(PLANS_URL)
    results = doc.select("div.mainCenter p")
    if not results:
        raise inspector.NoReportsFoundError("NCUA (performance/strategic plans)")
    for result in results:
        report = plan_from(result, year_range)
        if report:
            inspector.save_report(report)
示例11: run
def run(options):
    """Scrape reports grouped into per-year <section> elements on one page.

    ``report_id`` option limits output to a single report (debugging aid).
    """
    year_range = inspector.year_range(options)
    only_id = options.get('report_id')

    print("## Downloading reports from %i to %i" % (year_range[0], year_range[-1]))

    url = url_for()
    body = utils.download(url)

    doc = BeautifulSoup(body)
    results = doc.select("section")

    for result in results:
        try:
            # Each section's title attribute is the year it covers.
            year = int(result.get("title"))
            # check that the fetched year is in the range
            if year not in year_range:
                continue
            print("## Downloading year %i " % year)
        except ValueError:
            # Sections without a numeric title are not year listings.
            continue

        # gets each table entry and sends generates a report from it
        listings = result.div.table.tbody.contents
        for item in listings:
            # Skip whitespace/text nodes between table rows.
            if type(item) is not bs4.element.Tag:
                continue
            report = report_from(item)

            # can limit it to just one report, for debugging convenience
            if only_id and only_id != report['report_id']:
                continue

            inspector.save_report(report)
示例12: scrape_restricted_reports
def scrape_restricted_reports(options):
    """Restricted Products.

    A single HTML page lists unreleased reports since 2014, with no links."""

    # These reports are unreleased -- we could make this the text?
    # (The string below is intentionally unused; kept as candidate copy.)
    """The following products have been determined to contain either
    classified information or controlled unclassified information by the audited
    agencies and cannot be publicly released.

    Members of Congress or congressional staff who wish to obtain one or more of
    these products should call or e-mail the Congressional Relations Office.
    All others who wish to obtain one or more of these products should follow the
    instructions found on Requesting Restricted Products."""

    REPORTS_URL = 'http://www.gao.gov/restricted/restricted_reports'
    archive = 2014  # restricted products are only listed back to 2014

    year_range = inspector.year_range(options, archive)
    doc = utils.beautifulsoup_from_url(REPORTS_URL)

    results = doc.select("div.listing")
    for result in results:
        report = process_restricted_report(result, year_range, REPORTS_URL)
        if report:
            inspector.save_report(report)
示例13: run
def run(options):
    """Scrape the RSS feed plus recent, archived, and other report listings."""
    year_range = inspector.year_range(options, archive)

    # # Pull the RSS feed
    doc = BeautifulSoup(utils.download(RSS_URL))
    results = doc.select("item")
    for result in results:
        report = rss_report_from(result, year_range)
        if report:
            inspector.save_report(report)

    # # Pull the recent audit reports.
    doc = BeautifulSoup(utils.download(RECENT_AUDITS_URL))
    results = doc.select("div.block > a")
    for result in results:
        report = report_from(result, year_range)
        if report:
            inspector.save_report(report)

    # Pull the archive audit reports
    doc = BeautifulSoup(utils.download(AUDIT_ARCHIVE_URL))
    results = doc.select("div.block a")
    for result in results:
        report = report_from(result, year_range)
        if report:
            inspector.save_report(report)

    # Pull the other reports
    # NOTE(review): constant is spelled OTHER_REPORTS_URl (lowercase l) at its
    # definition site — keep in sync if it is ever renamed.
    doc = BeautifulSoup(utils.download(OTHER_REPORTS_URl))
    results = doc.select("div.block > a")
    for result in results:
        report = report_from(result, year_range)
        if report:
            inspector.save_report(report)
示例14: scrape_reports
def scrape_reports(options):
    """Pull reports from "Reports and Testimonies - Browse by date" web page."""

    REPORTS_URL = 'http://www.gao.gov/browse/date/custom?adv_begin_date=01/01/' +\
        '%s&adv_end_date=12/31/%s&rows=50&o=%s'  # % (year, year, offset)
    archive = 1970
    # Amazingly, reports go back to 1940, though those are unlikely to be
    # legible enough to OCR. Also very cool, even 1950s-era reports seem to have
    # a highlightable embedded text layer in them. Of course, it was the
    # General Accounting Office back then and less oversighty.

    year_range = inspector.year_range(options, archive)
    for year in year_range:
        # Walk 50-result pages for this year until there is no "Next" link.
        is_next_page = True
        offset = 0
        while is_next_page:
            doc = utils.beautifulsoup_from_url(
                REPORTS_URL % (year, year, offset))
            results = doc.select("div.listing")
            for result in results:
                report = process_report(result, year_range)
                if report:
                    inspector.save_report(report)
            page_links = doc.select("a.non-current_page")
            if len(page_links) and page_links[-1].text.startswith('Next'):
                offset += 50
            else:
                is_next_page = False
示例15: run
def run(options):
    """Scrape per-year paginated report tables.

    ``pages`` option caps pages walked per year (default 1). A page is the
    last one when the pager has no link for the next page number.
    """
    year_range = inspector.year_range(options)
    max_pages = int(options.get('pages', 1))

    for year in year_range:
        page = 1
        done = False
        while not done:
            url = url_for(options, page, year)
            body = utils.download(url)

            doc = BeautifulSoup(body)

            next_page = page + 1
            found_next_page = False
            page_links = doc.select("li.pager-item a.active")
            for page_link in page_links:
                if page_link.text == str(next_page):
                    found_next_page = True
                    break
            if not found_next_page:
                done = True
            if next_page > max_pages:
                done = True

            results = doc.select("table.views-table > tbody > tr")
            for result in results:
                report = report_from(result)
                inspector.save_report(report)

            page = next_page
            if not done:
                print('Moving to next page (%d)' % page)