

Python utils.beautifulsoup_from_url Function Examples

This article collects typical usage examples of the Python function utils.utils.beautifulsoup_from_url. If you have been wondering exactly what beautifulsoup_from_url does and how to use it, the curated examples below should help.


The sections below present 15 code examples of the beautifulsoup_from_url function, sorted by popularity by default.
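
All 15 examples come from scrapers in the unitedstates/inspectors-general project and rely on the same helper. Its real implementation is not reproduced on this page, but from the way the examples call it, beautifulsoup_from_url evidently fetches a URL and parses the response into a BeautifulSoup document, returning None when the page cannot be retrieved (Example 5 checks for this). A minimal sketch under those assumptions, not the project's actual code:

import requests
from bs4 import BeautifulSoup

def beautifulsoup_from_url(url):
  # Hypothetical stand-in for utils.utils.beautifulsoup_from_url; the real
  # helper may add caching, retries, or different error handling.
  response = requests.get(url)
  if response.status_code != 200:
    return None  # some callers (see Example 5) check for a None document
  return BeautifulSoup(response.text, "html.parser")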

Example 1: get_reports_by_year

  def get_reports_by_year(self):
    # This page contains semiannual reports as well as a few Audit reports for
    # Fiscal Year 2014 and links to sub-pages that contain links for other
    # fiscal years.
    doc = utils.beautifulsoup_from_url(REPORTS_BY_YEAR_URL)

    # Get the semiannual reports to Congress.
    self.get_semiannual_reports_to_congress(doc)

    # Reports that are 'bare' on the page, listed explicitly
    (bare_report_ul_1,
     bare_report_ul_2) = self.get_uls_past_audit_header(doc)
    self.get_bare_reports(bare_report_ul_1)
    self.get_bare_reports(bare_report_ul_2)

    # Links on the page to audit reports from past fiscal years
    link_subpage_ul = doc.select(".submenu-submenu")[0]
    for li in link_subpage_ul.find_all('li'):
      link = li.find('a')
      if link:
        next_url = urljoin(REPORTS_BY_YEAR_URL, link['href'])
        doc = utils.beautifulsoup_from_url(next_url)
        uls = self.get_uls_past_audit_header(doc)
        assert len(uls) == 1, ('Mysterious additional ul data on page: %s' %
                               next_url)
        self.get_bare_reports(uls[0])
Developer: unitedstates, Project: inspectors-general, Lines: 26, Source: loc.py

Example 2: run

def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the reports
  for report_type, report_url in REPORT_URLS:
    doc = utils.beautifulsoup_from_url(report_url)
    results = doc.select("td.mainInner div.ms-WPBody > div > ul > li")

    if not results:
      raise inspector.NoReportsFoundError("SIGTARP ({})".format(report_type))

    for result in results:
      report = report_from(result, report_type, year_range)
      if report:
        inspector.save_report(report)

  doc = utils.beautifulsoup_from_url(QUARTERLY_REPORTS_URL)
  results = doc.select("#MSOZoneCell_WebPartWPQ3 .s4-wpTopTable a")

  if not results:
    raise inspector.NoReportsFoundError("SIGTARP (quarterly reports)")

  for result in results:
    report = quarterly_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Developer: unitedstates, Project: inspectors-general, Lines: 26, Source: sigtarp.py

Example 3: run

def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the reports with pagination
  for report_type, report_url_format in PAGINATED_REPORT_FORMATS:
    for page in range(0, 999):
      url = report_url_format.format(page=page)
      doc = utils.beautifulsoup_from_url(url)
      if report_type == "audit" and page == 0 and not doc.select("div.views-field-field-auditreport-doc-1"):
        raise Exception("Report number CSS class has changed")
      results = doc.select("li.views-row")
      if not results:
        if page == 0:
          raise inspector.NoReportsFoundError("USAID (%s)" % report_type)
        else:
          break

      for result in results:
        report = report_from(result, url, report_type, year_range)
        if report:
          inspector.save_report(report)

  # Pull the semiannual reports (no pagination)
  doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  results = doc.select("li.views-row")
  if not results:
    raise inspector.NoReportsFoundError("USAID (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Developer: unitedstates, Project: inspectors-general, Lines: 31, Source: usaid.py

Example 4: run

def run(options):
  year_range = inspector.year_range(options, archive)
  if datetime.datetime.now().month >= 10:
    # October, November, and December fall into the next fiscal year
    # Add next year to year_range to compensate
    year_range.append(max(year_range) + 1)

  # Pull the audit reports
  for year in year_range:
    url = audit_report_url(year)
    if url:
      parse_result_from_js_url(url, "auditreports", year, year_range, report_type='audit')
    url = inspection_report_url(year)
    if url:
      parse_result_from_js_url(url, "iereports", year, year_range, report_type='inspection')

  # Pull the congressional testimony
  doc = utils.beautifulsoup_from_url(CONGRESSIONAL_TESTIMONY_REPORTS_URL)
  results = doc.findAll("ul", type='disc')[0].select("li")
  for result in results:
    report = congressional_testimony_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  results = doc.findAll("ul", type='disc')[0].select("li")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Developer: unitedstates, Project: inspectors-general, Lines: 31, Source: tigta.py

Example 5: run

def run(options):
  year_range = inspector.year_range(options, archive)
  results_flag = False

  # Pull the audit reports
  for year in year_range:
    if year < 2002:  # The oldest page for audit reports
      continue
    if year == 2018:
      doc = utils.beautifulsoup_from_url(LATEST_AUDIT_REPORTS_URL)
    else:
      doc = utils.beautifulsoup_from_url(AUDIT_REPORTS_URL.format(year=year))

    if doc is None:
      # Next year's audit page may not be published yet
      continue

    results = doc.select("div.mainCenter table tr")
    if results:
      results_flag = True
    for index, result in enumerate(results):
      if not index:
        # Skip the header row
        continue
      report = report_from(result, report_type='audit', year_range=year_range)
      if report:
        inspector.save_report(report)

  if not results_flag:
    raise inspector.NoReportsFoundError("NCUA (audit reports)")

  # Pull the other reports
  doc = utils.beautifulsoup_from_url(OTHER_REPORTS_URL)
  results = doc.select("div.mainCenter p")
  if not results:
    raise inspector.NoReportsFoundError("NCUA (other)")
  for result in results:
    report = other_report_from(result, year_range=year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  results = doc.select("div#mainColumns div.mainCenter a")
  if not results:
    raise inspector.NoReportsFoundError("NCUA (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the performance and strategic plans
  doc = utils.beautifulsoup_from_url(PLANS_URL)
  results = doc.select("div.mainCenter p")
  if not results:
    raise inspector.NoReportsFoundError("NCUA (performance/strategic plans)")
  for result in results:
    report = plan_from(result, year_range)
    if report:
      inspector.save_report(report)
Developer: unitedstates, Project: inspectors-general, Lines: 60, Source: ncua.py

Example 6: run

def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the reports
  doc = utils.beautifulsoup_from_url(AUDIT_REPORTS_URL)
  rows = doc.select("div.content > div > div > div > div > div.row")
  row_audits = rows[0]

  # Audit reports
  results = row_audits.select("ul li.pdf")
  if not results:
    raise inspector.NoReportsFoundError("CPB (audits)")
  for result in results:
    report = report_from(result, AUDIT_REPORTS_URL, "audit", year_range)
    if report:
      inspector.save_report(report)

  doc = utils.beautifulsoup_from_url(OTHER_REPORTS_URL)
  rows = doc.select("div.content > div > div > div > div.row")
  row_peer_review = rows[0]
  col_plans = rows[1].select("div.col-md-6")[0]
  col_congress = rows[1].select("div.col-md-6")[1]

  # Peer review
  results = row_peer_review.select("ul li.pdf")
  if not results:
    raise inspector.NoReportsFoundError("CPB (peer reviews)")
  for result in results:
    report = report_from(result, OTHER_REPORTS_URL, "other", year_range)
    if report:
      inspector.save_report(report)

  # Plans
  results = col_plans.select("ul li.pdf")
  if not results:
    raise inspector.NoReportsFoundError("CPB (plans)")
  for result in results:
    report = report_from(result, OTHER_REPORTS_URL, "other", year_range)
    if report:
      inspector.save_report(report)

  # Semiannual reports to congress
  results = col_congress.select("ul li.pdf")
  if not results:
    raise inspector.NoReportsFoundError("CPB (semiannual reports)")
  for result in results:
    report = report_from(result, OTHER_REPORTS_URL, "semiannual_report", year_range)
    if report:
      inspector.save_report(report)
Developer: unitedstates, Project: inspectors-general, Lines: 49, Source: cpb.py

Example 7: run

def run(options):
  year_range = inspector.year_range(options, archive)

  urls = [ARCHIVED_REPORTS_URL, PRIOR_PENDING_REPORTS_URL]
  for year in year_range:
    if year >= 2005:
      urls.append(AUDIT_REPORTS_URL.format(year))

  # Pull the audit reports
  for url in urls:
    doc = utils.beautifulsoup_from_url(url)
    results = doc.find("table", border="1").select("tr")
    if not results:
      raise inspector.NoReportsFoundError("Nuclear Regulatory Commission (%s)" % url)
    for index, result in enumerate(results):
      if not index:
        # Skip the header row
        continue
      report = audit_report_from(result, url, year_range)
      if report:
        inspector.save_report(report)

  # Pull the semiannual reports
  doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  semiannual_reports_table = doc.find("table", border="1")
  results = semiannual_reports_table.select("tr")
  if not results:
    raise inspector.NoReportsFoundError("Nuclear Regulatory Commission (semiannual reports)")
  for index, result in enumerate(results):
    if index < 2:
      # Skip the first two header rows
      continue
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the other reports
  for reports_url, id_prefix in OTHER_REPORT_URLS:
    doc = utils.beautifulsoup_from_url(reports_url)
    results = doc.find("table", border="1").select("tr")
    if not results:
      raise inspector.NoReportsFoundError("Nuclear Regulatory Commission (other)")
    for index, result in enumerate(results):
      if not index:
        # Skip the header row
        continue
      report = other_report_from(result, year_range, id_prefix, reports_url)
      if report:
        inspector.save_report(report)
Developer: unitedstates, Project: inspectors-general, Lines: 49, Source: nrc.py

Example 8: semiannual_report_from

def semiannual_report_from(result, year_range):
  link = result.find("a")

  title = link.text

  # Parse the report title. Ex:
  # 'OIG Semiannual Report to the Congress: October 1, 2013 - March 31, 2014 (incl. MCC)'
  published_on_text = title.split("-")[-1].split("–")[-1].split("(")[0].strip()
  published_on_text = published_on_text.replace("September 31", "September 30")  # See note to IG Web team
  published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % title)
    return

  landing_url = urljoin(SEMIANNUAL_REPORTS_URL, link.get('href'))
  landing_page = utils.beautifulsoup_from_url(landing_url)

  report_url = landing_page.select("div.field-type-file a")[0].get('href')
  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)

  report = {
    'inspector': "usaid",
    'inspector_url': "https://oig.usaid.gov",
    'agency': "usaid",
    'agency_name': "Agency For International Development",
    'type': 'semiannual_report',
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report
Developer: unitedstates, Project: inspectors-general, Lines: 34, Source: usaid.py

Example 9: run

def run(options):
  year_range = inspector.year_range(options, archive)
  min_year = min(year_range)
  page = 0
  last_page = 0

  while page <= last_page:
    doc = utils.beautifulsoup_from_url(REPORT_SEARCH_URL.format(min_year, page))
    last_page_link = doc.find("a", title="Go to last page")
    if last_page_link:
      href = last_page_link["href"]
      page_match = re.search("[?&]page=([0-9]+)(?:&|$)", href)
      if page_match:
        last_page = int(page_match.group(1))

    results = doc.select(".view-reports-advanced-search .views-row")
    if not results:
      raise inspector.NoReportsFoundError("Department of the Interior")
    for result in results:
      report = report_from(result, year_range)
      if report:
        inspector.save_report(report)
    page += 1
  if last_page == 0:
    raise Exception("Did not find last page link")
Developer: unitedstates, Project: inspectors-general, Lines: 25, Source: interior.py

Example 10: extract_reports_for_subtopic

def extract_reports_for_subtopic(subtopic_url, year_range, topic_name, subtopic_name):
  doc = utils.beautifulsoup_from_url(subtopic_url)
  if not doc:
    raise Exception("Failure fetching subtopic URL: %s" % subtopic_url)

  results = None

  # This URL is different from the rest and needs to find the "p > a"s first.
  if subtopic_url == TOPIC_TO_URL['TMPC']:
    results = doc.select("#leftContentInterior > p > a")
  if not results:
    results = doc.select("#leftContentInterior dl dd")
  if not results:
    results = doc.select("#leftContentInterior ul li")
  if not results:
    results = doc.select("#leftContentInterior > p > a")
  if not results:
    raise inspector.NoReportsFoundError("HHS (%s)" % subtopic_name)
  for result in results:
    if 'crossref' in result.parent.parent.attrs.get('class', []):
      continue
    if result.parent.parent.attrs.get('id') == 'related':
      continue
    report = report_from(result, year_range, topic_name, subtopic_url, subtopic_name)
    if report:
      deduplicate_save_report(report)
Developer: unitedstates, Project: inspectors-general, Lines: 26, Source: hhs.py

Example 11: scrape_reports

def scrape_reports(options):
  """Pull reports from "Reports and Testimonies - Browse by date" web page."""

  REPORTS_URL = 'http://www.gao.gov/browse/date/custom?adv_begin_date=01/01/' +\
    '%s&adv_end_date=12/31/%s&rows=50&o=%s' # % (year, year, offset)
  archive = 1970
  # Amazingly, reports go back to 1940, though those are unlikely to be
  # legible enough to OCR. Also very cool, even 1950s-era reports seem to have
  # a highlightable embedded text layer in them. Of course, it was the
  # General Accounting Office back then and less oversighty.

  year_range = inspector.year_range(options, archive)
  for year in year_range:
    is_next_page = True
    offset = 0
    while is_next_page:
      doc = utils.beautifulsoup_from_url(
        REPORTS_URL % (year, year, offset))
      results = doc.select("div.listing")
      for result in results:
        report = process_report(result, year_range)
        if report:
          inspector.save_report(report)
      page_links = doc.select("a.non-current_page")
      if len(page_links) and page_links[-1].text.startswith('Next'):
        offset += 50
      else:
        is_next_page = False
Developer: unitedstates, Project: inspectors-general, Lines: 28, Source: gaoreports.py

Example 12: scrape_restricted_reports

def scrape_restricted_reports(options):
  """Restricted Products.

  A single HTML page lists unreleased reports since 2014, with no links."""

  # These reports are unreleased -- we could make this the text?
  """The following products have been determined to contain either
classified information or controlled unclassified information by the audited
agencies and cannot be publicly released.

Members of Congress or congressional staff who wish to obtain one or more of
these products should call or e-mail the Congressional Relations Office.
All others who wish to obtain one or more of these products should follow the
instructions found on Requesting Restricted Products."""

  REPORTS_URL = 'http://www.gao.gov/restricted/restricted_reports'
  archive = 2014

  year_range = inspector.year_range(options, archive)
  doc = utils.beautifulsoup_from_url(REPORTS_URL)
  results = doc.select("div.listing")
  for result in results:
    report = process_restricted_report(result, year_range, REPORTS_URL)
    if report:
      inspector.save_report(report)
Developer: unitedstates, Project: inspectors-general, Lines: 25, Source: gaoreports.py

Example 13: run

def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  doc = utils.beautifulsoup_from_url(AUDIT_REPORTS_URL)

  headers = set([a.parent for a in
                 doc.find_all("a", id=re.compile("^[0-9]{4}$"))])
  headers.update(doc.find_all("p", class_="Ptitle1"))
  headers = sorted(headers, key=lambda p: int(p.text.strip()), reverse=True)
  if not headers:
    raise inspector.NoReportsFoundError("ITC")

  for header in headers:
    year = int(header.text.strip())
    results = header.findNextSibling("ul").select("li")

    for result in results:
      if not inspector.sanitize(result.text):
        logging.debug("Skipping empty list item.")
        continue

      report = audit_report_from(year, result, AUDIT_REPORTS_URL, year_range)
      if report:
        inspector.save_report(report)
Developer: unitedstates, Project: inspectors-general, Lines: 25, Source: itc.py

Example 14: get_listed_reports

  def get_listed_reports(self, url):
    doc = utils.beautifulsoup_from_url(url)
    article = doc.select('.article')[0]
    results = article.find_all('ul')
    if not results:
      raise inspector.NoReportsFoundError("Library of Congress (%s)" % url)
    for ul in results:
      self.get_bare_reports(ul)
Developer: unitedstates, Project: inspectors-general, Lines: 8, Source: loc.py

Example 15: run

def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for url, report_type, parse_func in REPORT_PAGES_INFO:
    doc = utils.beautifulsoup_from_url(url)

    content = doc.select("section.article-content")[0]
    parse_func(content, url, report_type, year_range)
Developer: unitedstates, Project: inspectors-general, Lines: 9, Source: lsc.py

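Taken together, the examples follow one pattern: fetch a listing page with beautifulsoup_from_url, pick out result elements with CSS selectors, raise inspector.NoReportsFoundError when a selector comes back empty (so that silent layout changes break the scraper loudly), build a report dict, and hand it to inspector.save_report. A hypothetical skeleton of that pattern, assuming the inspector API behaves as in the examples above, with REPORTS_URL and report_from as placeholders:

def run(options):
  year_range = inspector.year_range(options, archive)

  doc = utils.beautifulsoup_from_url(REPORTS_URL)
  results = doc.select("li.report")  # placeholder selector
  if not results:
    # Fail loudly if the page layout changes and the selector stops matching.
    raise inspector.NoReportsFoundError("Example agency")

  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)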

Note: The utils.utils.beautifulsoup_from_url examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their respective developers; copyright remains with the original authors, and any use or redistribution should follow the corresponding project's license. Do not republish without permission.