

Python soupselect.select Function Code Examples

This article collects typical usage examples of the soupselect.select function in Python. If you are wondering what select does, how to call it, or what real-world code using it looks like, the curated examples below should help.


Fifteen code examples of the select function are shown below, ordered by popularity by default.
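Before diving into the examples, here is a minimal, self-contained sketch of the calling pattern they all share: parse a page with BeautifulSoup, then pass the resulting soup and a CSS selector string to soupselect.select, which returns a plain list of matching tags. The HTML snippet and the selector are invented purely for illustration; since soupselect predates bs4's built-in .select(), the sketch uses the classic BeautifulSoup 3 import.

# Minimal usage sketch (Python 2, matching the examples below).
# The HTML and the selector are made up for illustration only.
from BeautifulSoup import BeautifulSoup  # classic BeautifulSoup 3 import
from soupselect import select

html = '''
<div id="menu">
  <a class="nav" href="/home">Home</a>
  <a class="nav" href="/about">About</a>
</div>
'''
soup = BeautifulSoup(html)

# select() takes a parsed soup (or any tag) plus a CSS selector string
# and returns a list of matching elements.
for link in select(soup, 'div#menu a.nav'):
    print link['href']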

Example 1: extractPage

def extractPage(url, pagination=True):
    print 'Extracting : %s' % url
    result = []
    page = request(url)
    soup = BeautifulSoup(page)
    info = select(soup, '.courseInfo')
    for record in info:
        courseNumber = record.find('span', {'class': 'courseNumber'}).text
        courseTitle = record.find('span', {'class': 'courseTitle'}).text
        courseAttrs = record.find('div', {'class': 'courseAttributes'}).text
        terms = [x for x in courseAttrs.split('|') if 'terms' in x.lower()] 
        if terms:
            courseTime = str(terms[0].split(':')[1]).strip()
        else:
            courseTime = "not given this year"

        obj = {
                'title': courseTitle,
                'number': courseNumber,
                'time': courseTime
                }
        result.append(obj)

    subresults = []
    if pagination:
        pages = select(soup, '#pagination a')
        pagesLinks = href(pages)
        for l in set(pagesLinks):
            subresults.extend(extractPage(BASE + l, False))
    if subresults:
        result.extend(subresults) 
    return result
Author: dahabit, Project: scrap, Lines of code: 32, Source file: app.py

Example 2: parse_obituary

def parse_obituary(url,category):
	"""
	Extracts the necessary information from a single obituary page
	"""
	page = requests.get(url)
	soup = Soup(page.text)
	try:
		date = select(soup, 'p strong')[0].contents[0]
		date = date[date.rfind('died ')+5:].strip()
		cal = pdt.Calendar()
		print >> sys.stderr, 'parsing',date
		date = cal.parseDateText(date)
	except:
		print >> sys.stderr, 'failed to parse'
		return
	date = str('%s/%s/%s' % (date[2],date[1],date[0]))
	publisher = 'Telegraph'
	type = 'obituaries'
	name = select(soup, '.storyHead h1')[0].contents[0]
	content = ''
	for para in select(soup, '#mainBodyArea p'):
		if len(para.contents) > 0:
			content = content + para.contents[0]

	content = content.strip().replace('"','\'')		
	content = content.strip().replace('\n','')
	
	print >> sys.stdout, '%s,%s,%s,%s,"%s","%s"' % (date.encode("UTF-8"),
													publisher.encode("UTF-8"),
													type.encode("UTF-8"),
													name.encode("UTF-8"),
													content.encode("UTF-8"),
													category.encode("UTF-8"))
Author: MRdNk, Project: swbd, Lines of code: 33, Source file: scrape_obituaries.py

Example 3: get_raw_boxscore_data

  def get_raw_boxscore_data(self, boxscore_soup):
    # Load boxscore data. No logic here, just splitting from HTML into more
    # processable data.
    boxscore_data = []
    boxscore_rows = select(boxscore_soup, '#my-players-table tbody tr')
    for player_data in boxscore_rows:
      cells = select(player_data, 'td')
      if len(cells) == 13:
        # This order should match the boxscore table on espn
        (player_name, minutes, fgma, tpma, ftma, oreb, reb, ast, stl, blk,
            to, pf, pts) = [
          cell.text for cell in cells
        ]

        if not player_name:
          continue

        fgm, fga = fgma.split('-')
        tpm, tpa = tpma.split('-')
        ftm, fta = ftma.split('-')

        (minutes, fgm, fga, tpm, tpa, ftm, fta, oreb, reb, ast, stl, blk, to,
            pf, pts) = map(int, [
          minutes, fgm, fga, tpm, tpa, ftm, fta, oreb, reb, ast, stl, blk, to,
              pf, pts
        ])

        boxscore_data.append({
          'name': player_name, 'minutes': minutes, 'fgm': fgm, 'fga': fga,
          'tpm': tpm, 'tpa': tpa, 'ftm': ftm, 'fta': fta,
          'oreb': oreb, 'reb': reb,
          'ast': ast, 'stl': stl, 'blk': blk, 'to': to, 'pf': pf, 'pts': pts,
        })

    return boxscore_data
Author: gunsch, Project: ncaa-stats, Lines of code: 35, Source file: scraper.py

Example 4: _extract_predictions

	def _extract_predictions(self, html):
		if '<p class="predictHead"><nobr><span id=\'i18n_en\'>No current prediction' in html:
			return None
		else:
			predictions = []
			soup = BeautifulSoup(html)	

			# get the primary/imminent prediction		
			try:
				minutes = self._clean_prediction_html(select(soup, '.predictionNumberForFirstPred')[0])
			except:
				return None
			if ('departing' in minutes.lower()) or ('arriving' in minutes.lower()):
				predictions.append(0)
			else:
				predictions.append(int(minutes))

			# get the other predictions
			for m in select(soup, '.predictionNumberForOtherPreds'):
				m = self._clean_prediction_html(m)
				try:
					predictions.append(int(m))
				except:
					pass

			return predictions
Author: sbma44, Project: markmograph, Lines of code: 26, Source file: nextbus.py

Example 5: expandDocument

	def expandDocument(self,header,content,config=None):
		raise "obsolete"
		part = self.partDocument(header["document"],config)
		soup = part.expandSoup(content)
		header = part.get_collapsed_header(header=header)
		stateful_doc = "stateful" in header and header["stateful"] is True

		if stateful_doc:
			script = part.statefulConfigScript()
			if script:
				script_tag = soup.new_tag("script")
				script_tag["type"] = "application/config"
				script_tag.string = script
				soup.body.append(script_tag)

		# fill in meta tags
		self._applyMetaAndTitle(soup,header,config)

		if config["appcache"] == False:
			for h in select(soup,"html"):
				del h["manifest"]
		elif "manifest" in header:
			for h in select(soup,"html"):
				h["manifest"] = header["manifest"]

		if "Content-Language" in header:
			for h in select(soup,"html"):
				h["lang"] = header["Content-Language"]

		# offline markers
		lists = {
			"offline": self._getOfflineList(soup,header),
		}

		return soup.prettify(), lists
Author: thepian, Project: thepian-pages, Lines of code: 35, Source file: browsers.py

Example 6: fetch_review_counts

def fetch_review_counts(appid):
    class FetchError(StandardError):
        pass

    url = 'http://store.steampowered.com/app/%i/' % appid
    request = urllib.urlopen(url)
    if request.code < 200 or request.code > 299:
        raise FetchError('Unable to fetch %s' % url, { 'appid': appid, 'status': request.code})

    soup = BeautifulSoup(request)

    positive_count = ''
    positive_count_elements = select(soup, '#ReviewsTab_positive .user_reviews_count')
    if len(positive_count_elements) > 0:
        positive_count = get_count(positive_count_elements[0])

    if not positive_count:
        print >>sys.stderr, "Warning: Unable to find positive user review count on page %s" % url

    negative_count = ''
    negative_count_elements = select(soup, '#ReviewsTab_negative .user_reviews_count')
    if len(negative_count_elements) > 0:
        negative_count = get_count(negative_count_elements[0])

    if not negative_count:
        print >>sys.stderr, "Warning: Unable to find negative user review count on page %s" % url

    return positive_count, negative_count
Author: jorgenpt, Project: steam-tools, Lines of code: 28, Source file: steam_fetch_review_counts.py

Example 7: scrapeBlog

def scrapeBlog(blog):
	global completed
	blogurl = blog['postUrl']
	blogData = {}
	try:
		soup = Soup(urllib2.urlopen(blogurl))
		post = select(soup, 'div.post-body')

		title = select(soup, 'h1.title')
		titleNoTags = Soup(str(title))
		rawTitle = ''.join(filter(visible, titleNoTags.findAll(text=True))).strip()
		#print rawTitle

		noScript = Soup(str(post))
		rawText = ''.join(filter(visible, noScript.findAll(text=True))).strip()
		#print raw_text

		blogData['source'] = str(rawTitle)
		blogData['title'] = blog['titleNoFormatting']
		blogData['content'] = str(rawText)
		blogData['date'] = blog['publishedDate']
		blogData['url'] = str(blogurl)

	except Exception:  # was "except e:", which would itself raise a NameError
		pass
	with dataLock:
		data.append(blogData)
		completed += 1
Author: Jbalkind, Project: Amazon-Hackathon, Lines of code: 28, Source file: blogger.py

Example 8: fetch_data

def fetch_data():
    def bvbreplace(s):
        return "BVB" if "Dortmund" in s else s

    doc = None
    try:
        doc, errs = tidy_document(urllib2.urlopen('http://www.bvb.de/').read(), tidyoptions)
        soup = Soup(doc)
    except Exception as e:
        raise Exception(u"Error fetching/parsing website: %s" % e)

    out = ''
    matchtime = datetime.datetime.now() + datetime.timedelta(hours=25)
    timestr = ''
    try:
        home = bvbreplace(select(soup, "div.next-match p span")[0].contents[0].strip())
        guest = bvbreplace(select(soup, "div.next-match p span")[1].contents[0].strip())
        league = ''
        try:
            league = select(soup, "div.next-match p span.tournament")[0].contents[0].strip()
        except:
            league = select(soup, "div.next-match p span")[2].contents[0].strip()            
        matchtime = datetime.datetime.strptime(select(soup, "div.next-match p")[1].contents[-1].strip(), u"%d.%m.%Y %H:%M")
        timestr = matchtime.strftime(u"%a, %d.%m.%Y %H:%M")
        dontgo = u"U42/U46/Kreuzviertel/Borsigplatz/Uni-Parkplatz" if u"BVB" == home else u"Kneipen mit TV in Dortmund"
        location = u"Heim" if u"BVB" == home else u"Auswaerts"
        out = u"WARNUNG! %s: %s vs %s (%s/%s). Meide %s." % (timestr, home, guest, location, league, dontgo)
    except IndexError:
        # This means: No next game on the webpage.
        sys.exit(1)
    except Exception as e:
        #print(traceback.format_exc())
        raise Exception(u"ERRBVB while parsing bvb.de: %s" % e)
    return out, matchtime
Author: orithena, Project: sportswarnbot, Lines of code: 34, Source file: bvb.py

Example 9: sees_an_element

    def sees_an_element(self, doc, element=None, css_class=None, id=None, css_selector=None):
        """ Tests for the presence of a specified element on the current page...

        self.alice.sees_an_element(doc, id="element_id")
        self.alice.sees_an_element(doc, "element")
        self.alice.sees_an_element(doc, "div", "element_css_class")
        self.alice.sees_an_element(doc, selector="#myid element.bar")
        """
        selector = "any"
        if id:
            displayed_element = doc.find(id=id)
            selector = id
        elif css_selector:
            displayed_elements = select(doc, css_selector)
            displayed_element = displayed_elements[0] if displayed_elements else None
            selector = css_selector
        else:
            if css_class:
                selector = "%s.%s" % (element, css_class)
                displayed_element = select(doc, selector)
            else:
                displayed_element = doc.find(element)
                selector = element
        self.failUnless(displayed_element, "Could not find %s" % (selector))
        return displayed_element
Author: emlprime, Project: wizards_duel, Lines of code: 25, Source file: tests.py

Example 10: Loop_Through_Messages

def Loop_Through_Messages(i): #i = start ID - 1
    
    while i < MaxMSG:
        i += 1
        
        Humanize(2) #Humanize the program by sleeping 0-2 seconds
        
        try:
            soup = Make_Soup("http://groups.yahoo.com/group/freecycledc/message/" + str(i))

            MSG_Title = select(soup, 'title')[0].text.replace('\n', '~n-break~')

            msgbodyhtml = select(soup, '.msgarea')[0]
            MSG_Body = unicode.join(u' ',map(unicode,msgbodyhtml)).replace('<br />', '~break~').replace('\n', '~n-break~')
            
            if MSG_Title == '': MSG_Title = '(none)'
            if MSG_Body == '': MSG_Body = '(none)'
            
            Message_Data_to_Table(i, MSG_Title, MSG_Body)
            
            print i, "of", MaxMSG
        except:
            print "ERROR: SCRAPE FAIL ON POSTING ID", i
            
            Check_Column("Title", MSG_Title)
            Check_Column("Body HTML", msgbodyhtml)
            Check_Column("Body Text", MSG_Body)
            
            if MSG_Title == 'freecycledc' or 'message' not in MSG_Title.lower():
                Message_Data_to_Table(i, 'Message does not exist', 'NOTHING TO SEE HERE, FOLKS')
            else:
                Message_Data_to_Table(i, 'FAIL', 'FAIL')
Author: matthew-reilly, Project: freecycle, Lines of code: 32, Source file: Freecycle_ETL.py

Example 11: expand

	def expand(self,header,content,markup=None,config=None):
		"""
		General header/content expansion replacing expandDocument and expandScss
		"""
		lists = {
			"offline": [],
		}

		if "charset" not in header and markup is not None:
			header["charset"] = config["charset"]
		parent_doc = None
		if "document" in header:
			parent_doc = self.partDocument(header["document"],config)
			header = parent_doc.get_collapsed_header(header=header)

		if markup == "scss":
			content = self.expandScss(header,content,config=config)
		elif markup in ("text","xml"):
			pass #TODO consider what to do
		elif markup == "html":
			soup = None
			if parent_doc:
				soup = parent_doc.expandSoup(content)
			else:
				soup = BeautifulSoup(content,"html5lib")

			if "lang" in header:
				pass #TODO mark html element

			# print soup.head
			stateful_doc = "stateful" in header and header["stateful"] is True

			if stateful_doc:
				script = parent_doc.statefulConfigScript()
				if script:
					script_tag = soup.new_tag("script")
					script_tag["type"] = "application/config"
					script_tag.string = script
					soup.body.append(script_tag)

			# fill in meta tags
			self._applyMetaAndTitle(soup,header,config)

			if config["appcache"] == False:
				for h in select(soup,"html"):
					del h["manifest"]
			elif "manifest" in header:
				for h in select(soup,"html"):
					h["manifest"] = header["manifest"]

			if "Content-Language" in header:
				for h in select(soup,"html"):
					h["lang"] = header["Content-Language"]

			# offline markers
			lists["offline"] = self._getOfflineList(soup,header)
			content = soup.encode()

		return header, content, lists
Author: thepian, Project: thepian-pages, Lines of code: 59, Source file: browsers.py

Example 12: get_games

def get_games(page=1):
    def select_first(soup, selector):
        result = select(soup, selector)
        if result and len(result) > 0:
            return result[0]
        else:
            return None

    def inner_text(soup):
        if isinstance(soup, NavigableString):
            return unicode(soup)
        elif soup.contents:
            return u"".join(inner_text(c) for c in soup.contents)
        else:
            return unicode(soup)

    result = []

    soup = BeautifulSoup(urllib.urlopen(search_result_url(page)))
    games = select(soup, "a.search_result_row")
    for game in games:
        href = str(game["href"])
        if re.search("http://store.steampowered.com/app/(\\d+)/", href):
            id = re.search("http://store.steampowered.com/app/(\\d+)/", href).group(1)
        else:
            logging.error("Error extracting ID, skipping")
            continue
        name = inner_text(select(game, "h4")[0])
        price = select_first(game, ".search_price")
        if price and price.contents:
            price = price.contents[-1].lower()

            if price.find("free") != -1:
                price = float(0)
            elif price.startswith("&#36;"):
                # Grab the last node, which is either the price or the "reduced
                # price"
                try:
                    price = float(price[5:])
                except:
                    logging.error("Price conversion error for %s: '%s'" % (name, price))
                    price = None
            else:
                logging.error("Price parse error for %s: '%s'" % (name, price))
                price = None
        else:
            price = None

        metascore = select_first(game, ".search_metascore")
        if metascore and metascore.string:
            metascore = int(metascore.string)
        else:
            metascore = None

        result.append(Game(id=id, name=name, price=price, metascore=metascore))

    return result
Author: porkbuns, Project: steam-price-graph, Lines of code: 57, Source file: SteamApi.py

Example 13: raw_events

def raw_events(file):
    match = open(file, 'r')
    soup = BeautifulSoup(match.read())
    events = select(soup, 'div#live-text-commentary-wrapper div#live-text')
    more_events = select(soup, 'div#live-text-commentary-wrapper div#more-live-text')
    for event in events + more_events:
        for child in event.children:
            if type(child) is bs4.element.Tag:
                yield child.getText().strip()
Author: mneedham, Project: neo4j-bbc, Lines of code: 9, Source file: extractor.py

Example 14: get_resources

 def get_resources(self, doc):
     resources = []
     for a in select(doc, 'a'):
         url = a.get('href')
         img = select(a, 'img[src]')[0]
         src = img.get('src')
         f_type = REG_URL_FILE.search(src).group(1).lower()
         resources.append((url, f_type))
     return resources
Author: dedsm, Project: coursera, Lines of code: 9, Source file: coursera.py

Example 15: find_footnotes_and_anchors

def find_footnotes_and_anchors(soup):
    selector = '.sdfootnoteanc'
    footnote_anchors = select(soup, selector)
    #print '\n'.join([str(anc) for anc in footnote_anchors])

    footnotes = []
    for i in range(len(footnote_anchors)):
        selector = '#sdfootnote%s' % (i+1)
        footnotes.extend(select(soup, selector))
    #print '\n'.join([str(f) for f in footnotes])

    return footnote_anchors, footnotes
Author: eaudeweb, Project: naaya.content.talkback, Lines of code: 12, Source file: Convertor2.py


Note: The soupselect.select examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by their respective authors; copyright remains with the original authors, and redistribution or use should follow the corresponding project's license. Please do not republish without permission.