This article collects typical usage examples of the save_report function from the Python module utils.inspector: what save_report does, how it is called, and what real call sites look like.
Fifteen code examples of save_report are shown below, sorted by popularity by default.
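Every example passes save_report a plain dict describing one report. Only the report_id key is directly visible on this page (it is used for deduplication in Example 7 and logging in Example 10); the other fields in this sketch are illustrative assumptions, not the module's confirmed schema:

from utils import inspector

# Hypothetical minimal report dict; only 'report_id' appears in the
# examples on this page, the remaining fields are assumptions.
report = {
  'report_id': 'OIG-15-001',     # unique id, used for deduplication/logging
  'title': 'Audit of Example Program',
  'url': 'https://example.gov/oig/reports/oig-15-001.pdf',
  'published_on': '2015-01-15',  # date string; see the year_from sketch after Example 10
}
inspector.save_report(report)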
Example 1: run
def run(options):
  year_range = inspector.year_range(options)

  topics = options.get('topics')
  if topics:
    topics = topics.split(",")
  else:
    topics = TOPIC_TO_URL.keys()

  for topic in topics:
    topic_url = TOPIC_TO_URL[topic]
    body = utils.download(topic_url)
    doc = BeautifulSoup(body)

    try:
      year_results = doc.select("#Listing")[0]
      results = [x for x in year_results.select("ul li ul li")]
    except IndexError:
      try:
        all_results = doc.select("#bodyholder")[0]
        results = [x for x in all_results.select("ul li")]
      except IndexError:
        results = doc.select("table ul li")

    # Sometimes multiple reports are listed under the same datetime element.
    # We store which published datetime we saw last so that the next report
    # can use it if we are unable to find another published time.
    last_published_on = None
    for result in results:
      report, last_published_on = report_from(result, topic_url, topic, year_range, last_published_on)
      if report:
        inspector.save_report(report)
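The report_from helper is defined per scraper and not shown on this page. Here is a minimal sketch of the shape it takes in Example 1, assuming each result wraps a single link and an MM/DD/YYYY date in its text, with the fallback to the previous item's date described in the comment above. Everything in it, including DATE_RE and the report_id choice, is hypothetical:

import re
from datetime import datetime
from urllib.parse import urljoin

DATE_RE = re.compile(r"\d{2}/\d{2}/\d{4}")  # assumed MM/DD/YYYY in the item text

def report_from(result, topic_url, topic, year_range, last_published_on):
  """Hypothetical sketch: pull title/url/date out of one list item."""
  link = result.find("a")
  if link is None:
    return None, last_published_on
  match = DATE_RE.search(result.text)
  if match:
    published_on = datetime.strptime(match.group(0), "%m/%d/%Y")
  else:
    # fall back to the previous item's date (see the comment in Example 1)
    published_on = last_published_on
  if published_on is None or published_on.year not in year_range:
    return None, published_on
  report = {
    'report_id': link.get("href").split("/")[-1],  # assumed id scheme
    'title': link.text.strip(),
    'url': urljoin(topic_url, link.get("href")),
    'published_on': published_on.strftime("%Y-%m-%d"),
  }
  return report, published_on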
Example 2: run
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for page in range(1, 1000):
    doc = beautifulsoup_from_url("{}?RS={}".format(REPORTS_URL, page))
    results = doc.select("div.leadin")
    if not results:
      if page == 1:
        raise inspector.NoReportsFoundError("VA (audit reports)")
      else:
        break
    for result in results:
      report = report_from(result, year_range)
      if report:
        inspector.save_report(report)

  # Pull the semiannual reports
  doc = beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  results = doc.select("div.leadin")
  if not results:
    raise inspector.NoReportsFoundError("VA (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
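beautifulsoup_from_url (called here and in Examples 6 and 14) is not shown on this page either. Example 3 inlines the same two steps, so it is presumably a thin wrapper over utils.download; a sketch under that assumption (the "lxml" parser choice is a guess, borrowed from Example 7, and the None-on-failure behavior is inferred from the "if not doc" check in Example 14):

from bs4 import BeautifulSoup
import utils  # the scrapers' shared download module

def beautifulsoup_from_url(url):
  """Presumed wrapper: download a page and parse it; None if the fetch failed."""
  body = utils.download(url)
  if body is None:
    return None
  return BeautifulSoup(body, "lxml")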
Example 3: run
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for year in year_range:
    url = AUDITS_REPORTS_URL.format(str(year)[2:4])
    doc = BeautifulSoup(utils.download(url))
    results = doc.select("tr")
    if not results:
      raise inspector.NoReportsFoundError("NASA (%d)" % year)
    for index, result in enumerate(results):
      if not index or not result.text.strip():
        # Skip the header row and any empty rows
        continue
      report = audit_report_from(result, url, year_range)
      if report:
        inspector.save_report(report)

  # Pull the other reports
  doc = BeautifulSoup(utils.download(OTHER_REPORT_URL))
  results = doc.select("#subContainer ul li")
  if not results:
    raise inspector.NoReportsFoundError("NASA (other)")
  for result in results:
    report = other_report_from(result, year_range)
    if report:
      inspector.save_report(report)
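Nearly every example raises inspector.NoReportsFoundError when a selector matches nothing, so that a site redesign fails loudly instead of silently saving zero reports. Nothing in these examples requires special behavior from the exception, so a minimal definition consistent with how it is raised here would be:

class NoReportsFoundError(Exception):
  """Raised when a scrape completes without finding any reports.

  The argument is a human-readable label for the scraper or section
  that came up empty, e.g. NoReportsFoundError("NASA (other)").
  """
  pass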
Example 4: scrape_reports
def scrape_reports(options):
  """Pull reports from "Reports and Testimonies - Browse by date" web page."""

  REPORTS_URL = 'http://www.gao.gov/browse/date/custom?adv_begin_date=01/01/' +\
                '%s&adv_end_date=12/31/%s&rows=50&o=%s'  # % (year, year, offset)
  archive = 1970
  # Amazingly, reports go back to 1940, though those are unlikely to be
  # legible enough to OCR. Also very cool, even 1950s-era reports seem to have
  # a highlightable embedded text layer in them. Of course, it was the
  # General Accounting Office back then and less oversighty.

  year_range = inspector.year_range(options, archive)
  for year in year_range:
    is_next_page = True
    offset = 0
    while is_next_page:
      doc = utils.beautifulsoup_from_url(
        REPORTS_URL % (year, year, offset))
      results = doc.select("div.listing")
      for result in results:
        report = process_report(result, year_range)
        if report:
          inspector.save_report(report)
      page_links = doc.select("a.non-current_page")
      if len(page_links) and page_links[-1].text.startswith('Next'):
        offset += 50
      else:
        is_next_page = False
Example 5: run
def run(options):
  year_range = inspector.year_range(options, archive)

  # Suggested flow, for an IG which paginates results.
  pages = options.get('pages', ALL_PAGES)
  for page in range(1, (int(pages) + 1)):
    data = {
      'view_name': 'oig_nodes',
      'view_display_id': 'block_search_oig_reports',
    }
    if page:
      # Only add page= if page > 0
      data['page'] = page
    response = utils.scraper.post(
      REPORTS_AJAX_URL,
      data=data,
      headers={"Content-Type": "application/x-www-form-urlencoded"},
    )
    page_html = response.json()[1]['data']
    doc = BeautifulSoup(page_html)
    results = doc.select("tr")
    if not results:
      break
    for index, result in enumerate(results):
      if not index:
        # Skip the header row
        continue
      report = report_from(result, year_range)
      if report:
        inspector.save_report(report)
Example 6: run
def run(options):
  year_range = inspector.year_range(options, archive)
  pre_1998_done = False

  # Pull the audit reports
  for year in year_range:
    if year < 1998:
      if pre_1998_done:
        continue
      else:
        pre_1998_done = True
    for page_number in range(0, 10000):
      year_url = url_for(year, page_number)
      doc = beautifulsoup_from_url(year_url)
      results = doc.select("ol li")
      if not results:
        if page_number == 0:
          raise inspector.NoReportsFoundError("Department of Labor (%s)" % year_url)
        else:
          break
      for result in results:
        report = report_from(result, year_url)
        if report:
          inspector.save_report(report)

  # Pull the semiannual reports
  doc = beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  results = doc.select("p > a:nth-of-type(1)")
  if not results:
    raise inspector.NoReportsFoundError("Department of Labor (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Example 7: run
def run(options):
  year_range = inspector.year_range(options, archive)
  report_flag = False

  # Pull the table of reports for each year
  for year in year_range:
    url = url_for_year(year)
    html = utils.download(url, scraper_slug="osc")

    if html is None:
      if year == max(year_range):
        continue
      else:
        raise Exception("Couldn't fetch reports page {}".format(url))

    # spaces appear as &nbsp; and \u200b .... fix that now
    html = html.replace('&nbsp;', ' ').replace('\u200b', ' ').replace('\u00a0', ' ').replace('\r', '').replace('\n', '')
    doc = BeautifulSoup(html, "lxml")

    OUTCOME_CODES = generate_outcome_codes(doc)

    keys_used = []  # a few reports appear multiple times... ignore them after the first occurrence
    results = doc.findAll("table")[1].tbody.findAll('tr')  # no ids on the tables, but it's the second one
    for result in results:
      reports = report_from(result, year, year_range, url, OUTCOME_CODES)
      for report in reports:
        if report['report_id'] not in keys_used:
          inspector.save_report(report)
          keys_used.append(report['report_id'])
          report_flag = True

  if not report_flag:
    raise inspector.NoReportsFoundError("OSC")
Example 8: run
def run(options):
  year_range = inspector.year_range(options, archive)
  min_year = min(year_range)
  page = 0
  last_page = 0

  while page <= last_page:
    doc = utils.beautifulsoup_from_url(REPORT_SEARCH_URL.format(min_year, page))
    last_page_link = doc.find("a", title="Go to last page")
    if last_page_link:
      href = last_page_link["href"]
      page_match = re.search("[?&]page=([0-9]+)(?:&|$)", href)
      if page_match:
        last_page = int(page_match.group(1))

    results = doc.select(".view-reports-advanced-search .views-row")
    if not results:
      raise inspector.NoReportsFoundError("Department of the Interior")
    for result in results:
      report = report_from(result, year_range)
      if report:
        inspector.save_report(report)
    page += 1

  if last_page == 0:
    raise Exception("Did not find last page link")
Example 9: run
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the reports with pagination
  for report_type, report_url_format in PAGINATED_REPORT_FORMATS.items():
    for page in range(0, 999):
      url = report_url_format.format(page=page)
      doc = BeautifulSoup(utils.download(url))
      results = doc.select("li.views-row")
      if not results:
        if page == 0:
          raise inspector.NoReportsFoundError("USAID (%s)" % report_type)
        else:
          break
      for result in results:
        report = report_from(result, url, report_type, year_range)
        if report:
          inspector.save_report(report)

  # Pull the semiannual reports (no pagination)
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("li.views-row")
  if not results:
    raise inspector.NoReportsFoundError("USAID (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Example 10: run
def run(options):
  year_range = inspector.year_range(options, archive)
  pages = options.get('pages', ALL_PAGES)

  # default to starting at page 1
  begin = int(options.get('begin', 1))

  max_page = None
  for page in range(begin, (int(pages) + 1)):
    if max_page and (page > max_page):
      logging.debug("End of pages!")
      break

    logging.debug("## Downloading page %i" % page)
    url = url_for(options, page)
    body = utils.download(url)
    doc = BeautifulSoup(body)

    # When the USPS restores their page controls, we can use this again,
    # which saves one network call each time.
    max_page = last_page_for(doc)

    results = doc.select(".views-row")
    for result in results:
      report = report_from(result)

      # inefficient enforcement of --year arg, USPS doesn't support it server-side
      # TODO: change to published_on.year once it's a datetime
      if inspector.year_from(report) not in year_range:
        logging.warn("[%s] Skipping report, not in requested range." % report['report_id'])
        continue
      inspector.save_report(report)
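Example 10 filters client-side with inspector.year_from(report) because, per the comment, USPS does not support year filtering server-side. The TODO implies published_on is currently a date string rather than a datetime, so year_from is presumably a simple slice; a sketch under that assumption:

def year_from(report):
  # Assumes 'published_on' is an ISO-format date string, e.g. "2015-01-15";
  # the TODO in Example 10 notes it may become a datetime later.
  return int(report['published_on'][:4])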
Example 11: run
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the reports
  for report_type, report_url in REPORT_URLS:
    doc = utils.beautifulsoup_from_url(report_url)
    results = doc.select("td.mainInner div.ms-WPBody > div > ul > li")
    if not results:
      raise inspector.NoReportsFoundError("SIGTARP ({})".format(report_type))
    for result in results:
      report = report_from(result, report_type, year_range)
      if report:
        inspector.save_report(report)

  doc = utils.beautifulsoup_from_url(QUARTERLY_REPORTS_URL)
  results = doc.select("#MSOZoneCell_WebPartWPQ3 .s4-wpTopTable a")
  if not results:
    raise inspector.NoReportsFoundError("SIGTARP (quarterly reports)")
  for result in results:
    report = quarterly_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Example 12: run
def run(options):
  year_range = inspector.year_range(options)

  for page_url in URLS:
    done = False
    body = utils.download(page_url)
    doc = BeautifulSoup(body)

    maincontent = doc.select("div#CS_Element_eximpagemaincontent")[0]
    all_p = maincontent.find_all("p")

    for p in all_p:
      for all_text, link_text, link_url in recurse_tree(p, False):
        if link_url is None:
          continue
        if link_url.startswith("mailto:"):
          continue
        if page_url == WHATS_NEW_URL and link_url == "/oig/whats-new-archive.cfm":
          # end of page
          done = True
          break
        if link_url.startswith("https://public.govdelivery.com/"):
          continue
        for index_url in URLS:
          if index_url.find(link_url) != -1:
            continue
        year = DATE_RE.search(all_text).group(3)
        if int(year) not in year_range:
          continue
        report = report_from(all_text, link_text, link_url, page_url)
        inspector.save_report(report)
      if done: break
Example 13: run
def run(options):
  year_range = inspector.year_range(options)
  max_pages = int(options.get('pages', 1))

  for year in year_range:
    page = 1
    done = False
    while not done:
      url = url_for(options, page, year)
      body = utils.download(url)
      doc = BeautifulSoup(body)

      next_page = page + 1
      found_next_page = False
      page_links = doc.select("li.pager-item a.active")
      for page_link in page_links:
        if page_link.text == str(next_page):
          found_next_page = True
          break
      if not found_next_page:
        done = True
      if next_page > max_pages:
        done = True

      results = doc.select("table.views-table > tbody > tr")
      for result in results:
        report = report_from(result)
        inspector.save_report(report)

      page = next_page
      if not done:
        print('Moving to next page (%d)' % page)
Example 14: extract_reports_for_subtopic
def extract_reports_for_subtopic(subtopic_url, year_range, topic_name, subtopic_name):
  doc = beautifulsoup_from_url(subtopic_url)
  if not doc:
    raise Exception("Failure fetching subtopic URL: %s" % subtopic_url)

  results = None

  # This URL is different from the rest and needs to find the "p > a"s first.
  if subtopic_url == TOPIC_TO_URL['TMPC']:
    results = doc.select("#leftContentInterior > p > a")
  if not results:
    results = doc.select("#leftContentInterior dl dd")
  if not results:
    results = doc.select("#leftContentInterior ul li")
  if not results:
    results = doc.select("#leftContentInterior > p > a")
  if not results:
    raise inspector.NoReportsFoundError("HHS (%s)" % subtopic_name)

  for result in results:
    if 'crossref' in result.parent.parent.attrs.get('class', []):
      continue
    if result.parent.parent.attrs.get('id') == 'related':
      continue
    report = report_from(result, year_range, topic_name, subtopic_url, subtopic_name)
    if report:
      inspector.save_report(report)
Example 15: run
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the reports with pagination
  for report_type, report_url_format in PAGINATED_REPORT_FORMATS:
    for page in range(0, 999):
      url = report_url_format.format(page=page)
      doc = utils.beautifulsoup_from_url(url)
      if report_type == "audit" and page == 0 and not doc.select("div.views-field-field-auditreport-doc-1"):
        raise Exception("Report number CSS class has changed")
      results = doc.select("li.views-row")
      if not results:
        if page == 0:
          raise inspector.NoReportsFoundError("USAID (%s)" % report_type)
        else:
          break
      for result in results:
        report = report_from(result, url, report_type, year_range)
        if report:
          inspector.save_report(report)

  # Pull the semiannual reports (no pagination)
  doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  results = doc.select("li.views-row")
  if not results:
    raise inspector.NoReportsFoundError("USAID (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
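Finally, every run function above starts from inspector.year_range(options, archive). The examples iterate over it and call min(), max(), and membership tests on it, so it must yield integer years. A plausible sketch; the 'year'/'since' option names and the default window are assumptions:

import datetime

def year_range(options, archive=None):
  """Hypothetical sketch: turn command-line options into a list of years.

  'options' is the parsed option dict the examples pass around; 'archive'
  is the earliest year a scraper's site covers (see archive = 1970 in
  Example 4). The key names below are guesses, not the confirmed API.
  """
  this_year = datetime.datetime.now().year
  year = options.get('year')
  if year:
    return [int(year)]
  since = options.get('since', archive or this_year)
  return list(range(int(since), this_year + 1))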