This article collects typical usage examples of the utils.utils.download function in Python. If you have been wondering exactly how the download function is used, how to call it, or what real-world calls look like, the curated examples below should help.
The following presents 15 code examples of the download function, sorted by popularity by default.
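Before the examples, here is a minimal sketch of the two call patterns that appear below. The exact signature of utils.download is defined in each project's own utils module and is not shown on this page, so treat this as an assumption inferred from the examples; the URLs and destination path are hypothetical placeholders.

# A minimal sketch, assuming utils.download behaves as the examples below suggest.
from bs4 import BeautifulSoup
from utils import utils

# Pattern 1 (Examples 1-14): fetch a page and parse the returned body with BeautifulSoup.
doc = BeautifulSoup(utils.download("https://example.gov/oig/reports"))  # hypothetical URL
links = doc.select("a")

# Pattern 2 (Example 15): pass a destination path as the second argument to save the
# resource to disk instead of returning its body.
utils.download("https://example.gov/data/analytics.csv", "/tmp/analytics.csv")  # hypothetical URL/path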
Example 1: run
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the RSS feed
  doc = BeautifulSoup(utils.download(RSS_URL))
  results = doc.select("item")
  for result in results:
    report = rss_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the recent audit reports.
  doc = BeautifulSoup(utils.download(RECENT_AUDITS_URL))
  results = doc.select("div.block > a")
  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the archive audit reports
  doc = BeautifulSoup(utils.download(AUDIT_ARCHIVE_URL))
  results = doc.select("div.block a")
  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the other reports
  doc = BeautifulSoup(utils.download(OTHER_REPORTS_URl))
  results = doc.select("div.block > a")
  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)
Example 2: run
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for year in year_range:
    if year < 2002:  # The oldest page for audit reports
      continue
    doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL.format(year=year)))
    results = doc.select("div.content table tr")
    for index, result in enumerate(results):
      if not index:
        # Skip the header row
        continue
      report = report_from(result, report_type="audit", year_range=year_range)
      if report:
        inspector.save_report(report)

  # Pull the FOIA reports
  doc = BeautifulSoup(utils.download(FOIA_REPORTS_URL))
  results = doc.select("div.content table tr")
  for index, result in enumerate(results):
    if not index:
      # Skip the header row
      continue
    report = report_from(result, report_type="other", year_range=year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("div.content a")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Example 3: run
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for year in year_range:
    if year < 2005:  # This is the earliest audits go back
      continue
    url = AUDIT_REPORTS_URL.format(year=year)
    doc = BeautifulSoup(utils.download(url))
    results = doc.select("div.content")
    if not results:
      raise inspector.NoReportsFoundError("Tennessee Valley Authority (%d)" % year)
    for result in results:
      report = audit_report_from(result, url, year_range)
      if report:
        inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("report")
  if not results:
    raise inspector.NoReportsFoundError("Tennessee Valley Authority (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Example 4: run
def run(options):
  year_range = inspector.year_range(options)

  # Pull the audit reports
  for year in year_range:
    url = audit_report_url(year)
    if url:
      parse_result_from_js_url(url, "auditreports", year, year_range)
    url = inspection_report_url(year)
    if url:
      parse_result_from_js_url(url, "iereports", year, year_range)

  # Pull the congressional testimony
  doc = BeautifulSoup(utils.download(CONGRESSIONAL_TESTIMONY_REPORTS_URL))
  results = doc.findAll("ul", type='disc')[0].select("li")
  for result in results:
    report = congressional_testimony_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.findAll("ul", type='disc')[0].select("li")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Example 5: run
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the general reports
  doc = BeautifulSoup(utils.download(REPORTS_URL))
  results = doc.select("div#mainContent li.mainContenttext a")
  for result in results:
    report = report_from(result, REPORTS_URL, year_range)
    if report:
      inspector.save_report(report)

  # Pull the archive reports
  doc = BeautifulSoup(utils.download(REPORT_ARCHIVE_URL))
  results = doc.select("div#mainContent li.mainContenttext a") + doc.select("div#mainContent span.mainContenttext a")
  for result in results:
    if not result.text:
      continue
    report = report_from(result, REPORT_ARCHIVE_URL, year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("div#mainContent li.mainContenttext a")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Example 6: urls_for_topics
def urls_for_topics(self, topics):
  for topic in topics:
    # Topic might be a tuple for ADDITIONAL_TOPICS (not ones from command
    # line).
    self.report_type = None
    if isinstance(topic, tuple):
      topic, report_type = topic
      self.report_type = report_type

    last_page = False

    url = TOPIC_TO_URL[topic]
    page = BeautifulSoup(utils.download(url))
    page_started = self.is_first_page(page)
    if page_started:
      yield url

    for link in page.select('li.pager-item a'):
      next_url = urljoin(url, link['href'])
      next_page = BeautifulSoup(utils.download(next_url))
      if not page_started:
        page_started = self.is_first_page(next_page)
      if page_started:
        yield next_url
      last_page = self.is_last_page(next_page)
      if last_page:
        break
    if last_page:
      continue

  self.report_type = None  # Clear this out afterwards
Example 7: urls_for
def urls_for(self):
  only = self.options.get('topics')
  if only:  # if only...
    only = set(only.split(','))
    only = [(o, TOPIC_TO_REPORT_TYPE[o]) if o in TOPIC_TO_REPORT_TYPE else o
            for o in only]
    yield from self.urls_for_topics(only)
    # If there are topics selected, ONLY yield URLs for those.
    return

  # First yield the URLs for the topics that are tangential to the main
  # Calendar Year reports.
  yield from self.urls_for_topics(ADDITIONAL_TOPICS)

  # Not getting reports from specific topics, iterate over all Calendar Year
  # reports.
  page = BeautifulSoup(utils.download(BASE_URL))

  # Iterate over each "Calendar Year XXXX" link
  for li in page.select('.field-items li'):
    md = RE_CALENDAR_YEAR.search(li.text)
    if md:
      cur_year = int(md.group(1))
      if cur_year >= self.year_range[0] and cur_year <= self.year_range[-1]:
        href = li.select('a')[0]['href']
        next_url = urljoin(BASE_URL, href)

        # The first page of reports is yielded.
        yield next_url

        # Next, read all the pagination links for the page and yield those. So
        # far, I haven't seen a page that doesn't have all of the following
        # pages enumerated.
        next_page = BeautifulSoup(utils.download(next_url))
        for link in next_page.select('li.pager-item a'):
          yield urljoin(BASE_URL, link['href'])
Example 8: run
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for year in year_range:
    url = AUDITS_REPORTS_URL.format(str(year)[2:4])
    doc = BeautifulSoup(utils.download(url))
    results = doc.select("tr")
    if not results:
      raise inspector.NoReportsFoundError("NASA (%d)" % year)
    for index, result in enumerate(results):
      if not index or not result.text.strip():
        # Skip the header row and any empty rows
        continue
      report = audit_report_from(result, url, year_range)
      if report:
        inspector.save_report(report)

  # Pull the other reports
  doc = BeautifulSoup(utils.download(OTHER_REPORT_URL))
  results = doc.select("#subContainer ul li")
  if not results:
    raise inspector.NoReportsFoundError("NASA (other)")
  for result in results:
    report = other_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Example 9: run
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the reports with pagination
  for report_type, report_url_format in PAGINATED_REPORT_FORMATS.items():
    for page in range(0, 999):
      url = report_url_format.format(page=page)
      doc = BeautifulSoup(utils.download(url))
      results = doc.select("li.views-row")
      if not results:
        if page == 0:
          raise inspector.NoReportsFoundError("USAID (%s)" % report_type)
        else:
          break
      for result in results:
        report = report_from(result, url, report_type, year_range)
        if report:
          inspector.save_report(report)

  # Pull the semiannual reports (no pagination)
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("li.views-row")
  if not results:
    raise inspector.NoReportsFoundError("USAID (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Example 10: run
def run(options):
  year_range = inspector.year_range(options, archive)

  doc = BeautifulSoup(utils.download(REPORTS_URL))

  # Pull the semiannual reports
  semiannul_results = doc.select("#AnnualManagementReports select")[0]
  for result in semiannul_results.select("option"):
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the special reports
  special_report_table = doc.find("table", attrs={"bordercolor": "#808080"})
  for index, result in enumerate(special_report_table.select("tr")):
    if not index:
      # Skip the header row
      continue
    report = report_from(result, REPORTS_URL, report_type='other', year_range=year_range)
    if report:
      inspector.save_report(report)

  # Pull the audit reports
  for year in year_range:
    if year < 2001:  # The oldest fiscal year page available
      continue
    year_url = AUDIT_REPORTS_URL.format(year=year)
    doc = BeautifulSoup(utils.download(year_url))
    for index, result in enumerate(doc.select("#main table tr")):
      if not index:
        # Skip the header row
        continue
      report = report_from(result, year_url, report_type='audit', year_range=year_range)
      if report:
        inspector.save_report(report)
Example 11: run
def run(options):
  year_range = inspector.year_range(options, archive)
  pages = options.get('pages', ALL_PAGES)

  # Pull the audit reports. Pages are 0-indexed.
  for page in range(0, int(pages) - 1):
    doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL.format(page=page)))
    results = doc.select("span.field-content")
    if not results:
      # No more results, we must have hit the last page
      break
    for result in results:
      report = report_from(result, year_range, report_type='audit')
      if report:
        inspector.save_report(report)

  # Grab the other reports
  for report_type, url in OTHER_REPORT_URLS.items():
    doc = BeautifulSoup(utils.download(url))
    results = doc.select(".views-field")
    if not results:
      results = doc.select(".views-row")
    for result in results:
      report = report_from(result, year_range, report_type)
      if report:
        inspector.save_report(report)
Example 12: run
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for year in year_range:
    if year < 2006:  # The oldest year for audit reports
      continue
    url = AUDIT_REPORTS_URL.format(year=year)
    doc = BeautifulSoup(utils.download(url))
    results = doc.select("div#content li")
    for result in results:
      report = audit_report_from(result, url, year, year_range)
      if report:
        inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("div#content li")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the Peer Review
  doc = BeautifulSoup(utils.download(PEER_REVIEWS_URL))
  result = doc.find("div", id='content').find("a", text=True)
  report = peer_review_from(result, year_range)
  inspector.save_report(report)
Example 13: extract_reports_for_oei
def extract_reports_for_oei(year_range):
  topic_name = TOPIC_NAMES["OE"]
  topic_url = TOPIC_TO_URL["OE"]
  root_body = utils.download(topic_url)
  root_doc = BeautifulSoup(root_body)

  letter_urls = set()
  for link in root_doc.select("#leftContentInterior li a"):
    absolute_url = urljoin(topic_url, link['href'])
    absolute_url = strip_url_fragment(absolute_url)
    letter_urls.add(absolute_url)

  if not letter_urls:
    raise inspector.NoReportsFoundError("HHS (OEI first pass)")

  all_results_links = {}
  all_results_unreleased = []

  for letter_url in letter_urls:
    letter_body = utils.download(letter_url)
    letter_doc = BeautifulSoup(letter_body)

    results = letter_doc.select("#leftContentInterior ul li")
    if not results:
      raise inspector.NoReportsFoundError("HHS (OEI %s)" % letter_url)
    for result in results:
      if 'crossref' in result.parent.parent.attrs.get('class', []):
        continue
      if result.parent.parent.attrs.get('id') == 'related':
        continue

      node = result
      while node and node.name != "h2":
        node = node.previous
      if node and node.name == "h2":
        subtopic_name = str(node.text)
      else:
        subtopic_name = "(unknown)"

      links = result.findAll("a")
      if len(links) == 0:
        result.extract()
        all_results_unreleased.append([result, subtopic_name])
      else:
        url = links[0].get("href")
        if url not in all_results_links:
          result.extract()
          all_results_links[url] = [result, subtopic_name]
        else:
          existing_result = all_results_links[url][0]
          for temp in result.contents:
            temp.extract()
            existing_result.append(temp)
          all_results_links[url][1] = "%s, %s" % (all_results_links[url][1], subtopic_name)

  subtopic_url = TOPIC_TO_URL["OE"]
  for result, subtopic_name in itertools.chain(all_results_links.values(), all_results_unreleased):
    report = report_from(result, year_range, topic_name, subtopic_url, subtopic_name)
    if report:
      inspector.save_report(report)
Example 14: run
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL))
  results = doc.select("td.text table tr")
  if not results:
    raise inspector.NoReportsFoundError("National Science Foundation (audit reports)")
  for result in results:
    # ignore divider lines
    if result.select("img"): continue
    report = report_from(result, report_type='audit', year_range=year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("td.text table tr")
  if not results:
    raise inspector.NoReportsFoundError("National Science Foundation (semiannual reports)")
  for result in results:
    if not result.text.strip():
      continue
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the case reports
  response = utils.scraper.post(
    url=CASE_REPORTS_URL,
    data=CASE_REPORTS_DATA,
  )
  doc = BeautifulSoup(response.content)
  results = doc.select("td.text table tr")
  if not results:
    raise inspector.NoReportsFoundError("National Science Foundation (case reports)")
  for index, result in enumerate(results):
    if not index or not result.text.strip():  # Skip the header row and empty rows
      continue
    report = case_report_from(result, CASE_REPORTS_URL, year_range)
    if report:
      inspector.save_report(report)

  # Pull the testimony
  doc = BeautifulSoup(utils.download(TESTIMONY_REPORTS_URL))
  results = doc.select("td.text table tr")
  if not results:
    raise inspector.NoReportsFoundError("National Science Foundation (testimony)")
  for result in results:
    if not result.text.strip():
      continue
    report = report_from(result, report_type='testimony', year_range=year_range)
    if report:
      inspector.save_report(report)
Example 15: handle_scanner_args
def handle_scanner_args(args, opts) -> Tuple[dict, list]:
    """
    --analytics: file path or URL to a CSV of participating domains.

    This function also handles checking for the existence of the file,
    downloading it successfully, and reading the file in order to populate the
    list of analytics domains.
    """
    parser = scan_utils.ArgumentParser(prefix_chars="--")
    parser.add_argument("--analytics", nargs=1, required=True)
    parsed, unknown = parser.parse_known_args(args)
    dicted = vars(parsed)
    should_be_single = ["analytics"]
    dicted = scan_utils.make_values_single(dicted, should_be_single)
    resource = dicted.get("analytics")

    if not resource.endswith(".csv"):
        no_csv = "".join([
            "--analytics should be the file path or URL to a CSV of participating",
            " domains and end with .csv, which '%s' does not" % resource
        ])
        logging.error(no_csv)
        raise argparse.ArgumentTypeError(no_csv)

    try:
        parsed_url = urlparse(resource)
    except:
        raise

    if parsed_url.scheme and parsed_url.scheme in ("http", "https"):
        analytics_path = Path(opts["_"]["cache_dir"], "analytics.csv").resolve()
        try:
            utils.download(resource, str(analytics_path))
        except:
            logging.error(utils.format_last_exception())
            no_csv = "--analytics URL %s not downloaded successfully." % resource
            logging.error(no_csv)
            raise argparse.ArgumentTypeError(no_csv)
    else:
        if not os.path.exists(resource):
            no_csv = "--analytics file %s not found." % resource
            logging.error(no_csv)
            raise FileNotFoundError(no_csv)
        else:
            analytics_path = resource

    analytics_domains = utils.load_domains(analytics_path)
    dicted["analytics_domains"] = analytics_domains
    del dicted["analytics"]

    return (dicted, unknown)