当前位置: 首页>>代码示例>>Python>>正文


Python inspector.save_report函数代码示例

本文整理汇总了 Python 中 utils.inspector.save_report 函数的典型用法代码示例。如果您正苦于以下问题:Python save_report 函数的具体用法?Python save_report 怎么用?Python save_report 使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。


下文一共展示了 save_report 函数的 15 个代码示例,这些示例默认按受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Python 代码示例。

示例1: run

def run(options):
  """Scrape every requested topic page and save the reports found there.

  ``options['topics']`` may hold a comma-separated list of ``TOPIC_TO_URL``
  keys; when it is missing or empty, every key of ``TOPIC_TO_URL`` is used.
  """
  year_range = inspector.year_range(options)

  requested = options.get('topics')
  topics = requested.split(",") if requested else TOPIC_TO_URL.keys()

  for topic in topics:
    topic_url = TOPIC_TO_URL[topic]
    doc = BeautifulSoup(utils.download(topic_url))

    # Try the known page layouts in turn. When a container element is absent,
    # the [0] lookup raises IndexError and the next layout is attempted.
    try:
      listing = doc.select("#Listing")[0]
      results = list(listing.select("ul li ul li"))
    except IndexError:
      try:
        body_holder = doc.select("#bodyholder")[0]
        results = list(body_holder.select("ul li"))
      except IndexError:
        results = doc.select("table ul li")

    # Sometimes multiple reports are listed under the same datetime element.
    # Carry the most recently seen published datetime forward so the next
    # report can use it if no published time of its own can be found.
    previous_published_on = None
    for result in results:
      report, previous_published_on = report_from(
        result, topic_url, topic, year_range, previous_published_on)
      if report:
        inspector.save_report(report)
开发者ID:stvnrlly,项目名称:inspectors-general,代码行数:31,代码来源:sec.py

示例2: run

def run(options):
  """Save the paginated VA audit reports, then the semiannual reports.

  Raises inspector.NoReportsFoundError when the first audit page or the
  semiannual page yields no "div.leadin" elements.
  """
  year_range = inspector.year_range(options, archive)

  # Audit reports: walk pages 1..999, stopping at the first empty page.
  page = 1
  while page < 1000:
    page_doc = beautifulsoup_from_url("{}?RS={}".format(REPORTS_URL, page))
    entries = page_doc.select("div.leadin")
    if not entries:
      if page != 1:
        break
      # An empty first page means nothing was found at all.
      raise inspector.NoReportsFoundError("VA (audit reports)")
    for entry in entries:
      audit_report = report_from(entry, year_range)
      if audit_report:
        inspector.save_report(audit_report)
    page += 1

  # Semiannual reports: a single, unpaginated page.
  semiannual_doc = beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  entries = semiannual_doc.select("div.leadin")
  if not entries:
    raise inspector.NoReportsFoundError("VA (semiannual reports)")
  for entry in entries:
    semiannual_report = semiannual_report_from(entry, year_range)
    if semiannual_report:
      inspector.save_report(semiannual_report)
开发者ID:Cloudxtreme,项目名称:inspectors-general,代码行数:26,代码来源:va.py

示例3: run

def run(options):
  """Save NASA audit reports for each year in range, then the other reports.

  Raises inspector.NoReportsFoundError when a yearly audit page has no table
  rows or the "other reports" page has no list items.
  """
  year_range = inspector.year_range(options, archive)

  # Audit reports: one page per year. The URL template receives the third and
  # fourth characters of the year (the last two digits of a four-digit year).
  for year in year_range:
    year_suffix = str(year)[2:4]
    url = AUDITS_REPORTS_URL.format(year_suffix)
    rows = BeautifulSoup(utils.download(url)).select("tr")
    if not rows:
      raise inspector.NoReportsFoundError("NASA (%d)" % year)
    for position, row in enumerate(rows):
      is_header = position == 0
      if is_header or not row.text.strip():
        # Header row or an empty row: nothing to extract.
        continue
      audit_report = audit_report_from(row, url, year_range)
      if audit_report:
        inspector.save_report(audit_report)

  # Other reports: a single list on one page.
  other_doc = BeautifulSoup(utils.download(OTHER_REPORT_URL))
  items = other_doc.select("#subContainer ul li")
  if not items:
    raise inspector.NoReportsFoundError("NASA (other)")
  for item in items:
    other_report = other_report_from(item, year_range)
    if other_report:
      inspector.save_report(other_report)
开发者ID:Cloudxtreme,项目名称:inspectors-general,代码行数:27,代码来源:nasa.py

示例4: scrape_reports

def scrape_reports(options):
  """Pull reports from "Reports and Testimonies - Browse by date" web page."""

  REPORTS_URL = ('http://www.gao.gov/browse/date/custom?adv_begin_date=01/01/'
    '%s&adv_end_date=12/31/%s&rows=50&o=%s')  # % (year, year, offset)
  archive = 1970
  # Amazingly, reports go back to 1940, though those are unlikely to be
  # legible enough to OCR. Also very cool, even 1950s-era reports seem to have
  # a highlightable embedded text layer in them. Of course, it was the
  # General Accounting Office back then and less oversighty.

  year_range = inspector.year_range(options, archive)
  for year in year_range:
    offset = 0
    while True:
      doc = utils.beautifulsoup_from_url(REPORTS_URL % (year, year, offset))
      for listing in doc.select("div.listing"):
        report = process_report(listing, year_range)
        if report:
          inspector.save_report(report)

      # Keep paging only while the last pager link is a "Next" link; each
      # page holds 50 rows, so the offset advances by 50.
      pager_links = doc.select("a.non-current_page")
      if not (pager_links and pager_links[-1].text.startswith('Next')):
        break
      offset += 50
开发者ID:unitedstates,项目名称:inspectors-general,代码行数:28,代码来源:gaoreports.py

示例5: run

def run(options):
  """Fetch the report listing through AJAX POST requests and save each report.

  The last page number to request comes from ``options['pages']`` (default
  ``ALL_PAGES``). Paging stops early when a response contains no table rows.
  """
  year_range = inspector.year_range(options, archive)

  # Suggested flow, for an IG which paginates results.
  last_page = int(options.get('pages', ALL_PAGES))
  for page in range(1, last_page + 1):
    # NOTE(review): page starts at 1, so the page parameter is always sent and
    # page 0 is never requested -- confirm that is intended.
    form = {
      'view_name': 'oig_nodes',
      'view_display_id': 'block_search_oig_reports',
      'page': page,
    }

    response = utils.scraper.post(
      REPORTS_AJAX_URL,
      data=form,
      headers={
        "Content-Type": "application/x-www-form-urlencoded",
      },
    )
    # The HTML fragment is taken from the second element of the JSON payload.
    fragment = response.json()[1]['data']
    rows = BeautifulSoup(fragment).select("tr")
    if not rows:
      break

    # The first row is the table header; only the remaining rows are reports.
    for row in rows[1:]:
      report = report_from(row, year_range)
      if report:
        inspector.save_report(report)
开发者ID:slobdell,项目名称:inspectors-general,代码行数:33,代码来源:sba.py

示例6: run

def run(options):
  """Save Department of Labor audit reports, then the semiannual reports.

  Raises inspector.NoReportsFoundError when the first page for a year, or the
  semiannual page, yields no results.
  """
  year_range = inspector.year_range(options, archive)

  # Only the first year before 1998 that is encountered gets fetched; later
  # pre-1998 years are skipped.
  # NOTE(review): presumably all pre-1998 years share one listing -- confirm
  # against url_for.
  handled_pre_1998 = False

  # Pull the audit reports
  for year in year_range:
    if year < 1998:
      if handled_pre_1998:
        continue
      handled_pre_1998 = True

    page_number = 0
    while page_number < 10000:
      year_url = url_for(year, page_number)
      items = beautifulsoup_from_url(year_url).select("ol li")
      if not items:
        if page_number:
          # Ran past the last page for this year.
          break
        raise inspector.NoReportsFoundError("Department of Labor (%s)" % year_url)
      for item in items:
        audit_report = report_from(item, year_url)
        if audit_report:
          inspector.save_report(audit_report)
      page_number += 1

  # Pull the semiannual reports
  semiannual_doc = beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  links = semiannual_doc.select("p > a:nth-of-type(1)")
  if not links:
    raise inspector.NoReportsFoundError("Department of Labor (semiannal reports)")
  for link in links:
    semiannual_report = semiannual_report_from(link, year_range)
    if semiannual_report:
      inspector.save_report(semiannual_report)
开发者ID:Cloudxtreme,项目名称:inspectors-general,代码行数:35,代码来源:labor.py

示例7: run

def run(options):
  """Save the reports from each year's table, skipping duplicates within a year.

  Raises Exception when a year's page cannot be downloaded (unless it is the
  latest year in range) and inspector.NoReportsFoundError when no report was
  saved at all.
  """
  year_range = inspector.year_range(options, archive)
  saved_any = False

  # Pull the table of reports for each year
  for year in year_range:
    url = url_for_year(year)
    html = utils.download(url, scraper_slug="osc")

    if html is None:
      # A missing page is tolerated only for the latest year in range.
      if year != max(year_range):
        raise Exception("Couldn't fetch reports page {}".format(url))
      continue

    # Spaces appear as &#160; and \u200b; normalise them and drop line breaks.
    substitutions = (
      ('&#160;', ' '),
      ('\u200b', ' '),
      ('\u00a0', ' '),
      ('\r', ''),
      ('\n', ''),
    )
    for needle, replacement in substitutions:
      html = html.replace(needle, replacement)
    doc = BeautifulSoup(html, "lxml")

    OUTCOME_CODES = generate_outcome_codes(doc)

    # A few reports appear more than once on a page; only the first is saved.
    seen_ids = []

    # No ids on the tables, but the reports are in the second one.
    rows = doc.findAll("table")[1].tbody.findAll('tr')
    for row in rows:
      for report in report_from(row, year, year_range, url, OUTCOME_CODES):
        if report['report_id'] in seen_ids:
          continue
        inspector.save_report(report)
        seen_ids.append(report['report_id'])
        saved_any = True

  if not saved_any:
    raise inspector.NoReportsFoundError("OSC")
开发者ID:unitedstates,项目名称:inspectors-general,代码行数:34,代码来源:osc.py

示例8: run

def run(options):
  """Walk the paginated report search results and save every report found.

  Raises inspector.NoReportsFoundError when a page has no result rows, and
  Exception when no non-zero last page number was ever discovered.
  """
  year_range = inspector.year_range(options, archive)
  earliest_year = min(year_range)
  current_page = 0
  final_page = 0

  while current_page <= final_page:
    doc = utils.beautifulsoup_from_url(
      REPORT_SEARCH_URL.format(earliest_year, current_page))

    # The "last page" pager link carries the highest page index in its href.
    pager_link = doc.find("a", title="Go to last page")
    page_match = pager_link and re.search(
      "[?&]page=([0-9]+)(?:&|$)", pager_link["href"])
    if page_match:
      final_page = int(page_match.group(1))

    rows = doc.select(".view-reports-advanced-search .views-row")
    if not rows:
      raise inspector.NoReportsFoundError("Department of the Interior")
    for row in rows:
      report = report_from(row, year_range)
      if report:
        inspector.save_report(report)
    current_page += 1

  if final_page == 0:
    raise Exception("Did not find last page link")
开发者ID:unitedstates,项目名称:inspectors-general,代码行数:25,代码来源:interior.py

示例9: run

def run(options):
  """Save the paginated USAID reports of every type, then the semiannual ones.

  Raises inspector.NoReportsFoundError when the first page of a report type,
  or the semiannual page, has no "li.views-row" elements.
  """
  year_range = inspector.year_range(options, archive)

  # Pull the reports with pagination
  for report_type, report_url_format in PAGINATED_REPORT_FORMATS.items():
    page = 0
    while page < 999:
      url = report_url_format.format(page=page)
      rows = BeautifulSoup(utils.download(url)).select("li.views-row")
      if not rows:
        if page:
          # Ran past the last page for this report type.
          break
        raise inspector.NoReportsFoundError("USAID (%s)" % report_type)

      for row in rows:
        paged_report = report_from(row, url, report_type, year_range)
        if paged_report:
          inspector.save_report(paged_report)
      page += 1

  # Pull the semiannual reports (no pagination)
  semiannual_doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  rows = semiannual_doc.select("li.views-row")
  if not rows:
    raise inspector.NoReportsFoundError("USAID (semiannual reports)")
  for row in rows:
    semiannual_report = semiannual_report_from(row, year_range)
    if semiannual_report:
      inspector.save_report(semiannual_report)
开发者ID:Cloudxtreme,项目名称:inspectors-general,代码行数:29,代码来源:usaid.py

示例10: run

def run(options):
  """Download USPS report listing pages and save reports in the requested years.

  Options read here:
    pages: last page number to fetch (default ``ALL_PAGES``).
    begin: first page number to fetch (default 1).

  Reports whose year is outside the requested range are logged and skipped.
  """
  year_range = inspector.year_range(options, archive)
  pages = options.get('pages', ALL_PAGES)

  # default to starting at page 1
  begin = int(options.get('begin', 1))

  max_page = None
  for page in range(begin, (int(pages) + 1)):
    # max_page is learned from the previously fetched page, so this check can
    # only stop the loop from the second iteration onwards.
    if max_page and (page > max_page):
      logging.debug("End of pages!")
      break

    logging.debug("## Downloading page %i", page)
    url = url_for(options, page)
    body = utils.download(url)
    doc = BeautifulSoup(body)

    # When the USPS restores their page controls, we can use this again,
    # which saves one network call each time.
    max_page = last_page_for(doc)

    results = doc.select(".views-row")

    for result in results:
      report = report_from(result)

      # inefficient enforcement of --year arg, USPS doesn't support it server-side
      # TODO: change to published_on.year once it's a datetime
      if inspector.year_from(report) not in year_range:
        # logging.warn is a deprecated alias (removed in Python 3.13), so
        # logging.warning is used instead.
        logging.warning("[%s] Skipping report, not in requested range.", report['report_id'])
        continue

      inspector.save_report(report)
开发者ID:slobdell,项目名称:inspectors-general,代码行数:34,代码来源:usps.py

示例11: run

def run(options):
  """Save the SIGTARP reports of every listed type, then the quarterly reports.

  Raises inspector.NoReportsFoundError when a listing page or the quarterly
  page yields no matching elements.
  """
  year_range = inspector.year_range(options, archive)

  # Pull the reports
  for report_type, report_url in REPORT_URLS:
    listing_doc = utils.beautifulsoup_from_url(report_url)
    items = listing_doc.select("td.mainInner div.ms-WPBody > div > ul > li")
    if not items:
      raise inspector.NoReportsFoundError("SIGTARP ({})".format(report_type))

    for item in items:
      typed_report = report_from(item, report_type, year_range)
      if typed_report:
        inspector.save_report(typed_report)

  # Quarterly reports are links on a separate page.
  quarterly_doc = utils.beautifulsoup_from_url(QUARTERLY_REPORTS_URL)
  links = quarterly_doc.select("#MSOZoneCell_WebPartWPQ3 .s4-wpTopTable a")
  if not links:
    raise inspector.NoReportsFoundError("SIGTARP (quarterly reports)")

  for link in links:
    quarterly_report = quarterly_report_from(link, year_range)
    if quarterly_report:
      inspector.save_report(quarterly_report)
开发者ID:unitedstates,项目名称:inspectors-general,代码行数:26,代码来源:sigtarp.py

示例12: run

def run(options):
  """Scrape every page in ``URLS`` and save the reports linked from it.

  Each <p> inside the main content element is walked with ``recurse_tree``.
  Links with no URL, mailto links, govdelivery links and links that point back
  at one of the index pages in ``URLS`` are skipped. Reports whose year is
  outside the requested range are skipped as well.
  """
  year_range = inspector.year_range(options)

  for page_url in URLS:
    done = False
    body = utils.download(page_url)
    doc = BeautifulSoup(body)

    maincontent = doc.select("div#CS_Element_eximpagemaincontent")[0]
    all_p = maincontent.find_all("p")

    for p in all_p:
      for all_text, link_text, link_url in recurse_tree(p, False):
        if link_url is None:
          continue
        if link_url.startswith("mailto:"):
          continue
        if page_url == WHATS_NEW_URL and link_url == "/oig/whats-new-archive.cfm":
          # end of page
          done = True
          break
        if link_url.startswith("https://public.govdelivery.com/"):
          continue
        # Skip links that point back at one of the index pages. The original
        # nested loop's `continue` only advanced that inner loop, so such
        # links were never actually skipped.
        if any(link_url in index_url for index_url in URLS):
          continue

        # NOTE(review): assumes the text of every remaining link matches
        # DATE_RE; otherwise .group() raises AttributeError on None.
        year = DATE_RE.search(all_text).group(3)
        if int(year) not in year_range:
          continue

        report = report_from(all_text, link_text, link_url, page_url)
        inspector.save_report(report)
      if done:
        # The end-of-page marker was reached; stop scanning this page.
        break
开发者ID:spulec,项目名称:inspectors-general,代码行数:34,代码来源:exim.py

示例13: run

def run(options):
  """Save every report row from each year's paginated listing.

  For each year, paging continues while the pager contains a link labelled
  with the next page number and that number does not exceed
  ``options['pages']`` (default 1).
  """
  year_range = inspector.year_range(options)
  max_pages = int(options.get('pages', 1))
  for year in year_range:
    page = 1
    done = False
    while not done:
      url = url_for(options, page, year)
      doc = BeautifulSoup(utils.download(url))

      # Look for a pager link whose text is the next page number.
      next_page = page + 1
      next_label = str(next_page)
      pager_links = doc.select("li.pager-item a.active")
      has_next = any(link.text == next_label for link in pager_links)
      done = (not has_next) or next_page > max_pages

      for row in doc.select("table.views-table > tbody > tr"):
        inspector.save_report(report_from(row))

      page = next_page
      if not done:
        print('Moving to next page (%d)' % page)
开发者ID:spulec,项目名称:inspectors-general,代码行数:32,代码来源:amtrak.py

示例14: extract_reports_for_subtopic

def extract_reports_for_subtopic(subtopic_url, year_range, topic_name, subtopic_name):
  """Save every report listed on one subtopic page.

  Several CSS selectors are tried in order and the first one that matches
  anything supplies the results. Results nested under a "crossref"-classed or
  "related"-id grandparent element are skipped.

  Raises Exception when the page cannot be fetched and
  inspector.NoReportsFoundError when no selector matches.
  """
  doc = beautifulsoup_from_url(subtopic_url)
  if not doc:
    raise Exception("Failure fetching subtopic URL: %s" % subtopic_url)

  selectors = []
  # This URL is different than the rest and needs to find the "p > a"s first.
  if subtopic_url == TOPIC_TO_URL['TMPC']:
    selectors.append("#leftContentInterior > p > a")
  selectors.extend([
    "#leftContentInterior dl dd",
    "#leftContentInterior ul li",
    "#leftContentInterior > p > a",
  ])

  results = None
  for selector in selectors:
    results = doc.select(selector)
    if results:
      break
  if not results:
    raise inspector.NoReportsFoundError("HHS (%s)" % subtopic_name)

  for result in results:
    grandparent = result.parent.parent
    in_crossref = 'crossref' in grandparent.attrs.get('class', [])
    if in_crossref or grandparent.attrs.get('id') == 'related':
      continue
    report = report_from(result, year_range, topic_name, subtopic_url, subtopic_name)
    if report:
      inspector.save_report(report)
开发者ID:Cloudxtreme,项目名称:inspectors-general,代码行数:26,代码来源:hhs.py

示例15: run

def run(options):
  """Save the paginated USAID reports of every type, then the semiannual ones.

  Raises Exception when the first audit page lacks the expected report-number
  element, and inspector.NoReportsFoundError when the first page of a report
  type, or the semiannual page, has no "li.views-row" elements.
  """
  year_range = inspector.year_range(options, archive)

  # Pull the reports with pagination
  for report_type, report_url_format in PAGINATED_REPORT_FORMATS:
    page = 0
    while page < 999:
      url = report_url_format.format(page=page)
      doc = utils.beautifulsoup_from_url(url)

      # Sanity check on the first audit page: the element the report number
      # is expected in must still exist.
      is_first_audit_page = report_type == "audit" and page == 0
      if is_first_audit_page and not doc.select("div.views-field-field-auditreport-doc-1"):
        raise Exception("Report number CSS class has changed")

      rows = doc.select("li.views-row")
      if not rows:
        if page:
          # Ran past the last page for this report type.
          break
        raise inspector.NoReportsFoundError("USAID (%s)" % report_type)

      for row in rows:
        paged_report = report_from(row, url, report_type, year_range)
        if paged_report:
          inspector.save_report(paged_report)
      page += 1

  # Pull the semiannual reports (no pagination)
  semiannual_doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  rows = semiannual_doc.select("li.views-row")
  if not rows:
    raise inspector.NoReportsFoundError("USAID (semiannual reports)")
  for row in rows:
    semiannual_report = semiannual_report_from(row, year_range)
    if semiannual_report:
      inspector.save_report(semiannual_report)
开发者ID:unitedstates,项目名称:inspectors-general,代码行数:31,代码来源:usaid.py


注:本文中的utils.inspector.save_report函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。