This article collects typical usage examples of the soupselect.select function in Python. If you have been wondering how to call select, what its arguments look like, or what real code that uses it looks like, the curated examples below should help.
The following 15 code examples of select are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
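Every example below follows the same basic pattern: build a soup object from the page HTML, then call select(soup, css_selector) to get back a list of matching tags. Here is a minimal sketch of that pattern; the HTML string, the selector, and the BeautifulSoup 3 style import are purely illustrative (some of the projects below use bs4 instead):

    from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3; some examples use bs4
    from soupselect import select

    html = '<div class="courseInfo"><span class="courseTitle">Algorithms</span></div>'
    soup = BeautifulSoup(html)
    # select() takes the soup and a CSS selector and returns a list of tags
    for tag in select(soup, '.courseInfo span.courseTitle'):
        print(tag.string)  # -> Algorithms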
Example 1: extractPage
def extractPage(url, pagination=True):
    print 'Extracting : %s' % url
    result = []
    page = request(url)
    soup = BeautifulSoup(page)
    info = select(soup, '.courseInfo')
    for record in info:
        courseNumber = record.find('span', {'class': 'courseNumber'}).text
        courseTitle = record.find('span', {'class': 'courseTitle'}).text
        courseAttrs = record.find('div', {'class': 'courseAttributes'}).text
        terms = [x for x in courseAttrs.split('|') if 'terms' in x.lower()]
        if terms:
            courseTime = str(terms[0].split(':')[1]).strip()
        else:
            courseTime = "not given this year"
        obj = {
            'title': courseTitle,
            'number': courseNumber,
            'time': courseTime
        }
        result.append(obj)
    subresults = []
    if pagination:
        pages = select(soup, '#pagination a')
        pagesLinks = href(pages)
        for l in set(pagesLinks):
            subresults.extend(extractPage(BASE + l, False))
        if subresults:
            result.extend(subresults)
    return result
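Note that request, href, and BASE are helpers defined elsewhere in the project this example was taken from. Hypothetical stand-ins, shown here only to make the example self-contained (the real definitions may differ), could look like this:

    import urllib2

    BASE = 'http://courses.example.edu'  # assumed site root, placeholder only

    def request(url):
        # fetch the raw HTML for a URL
        return urllib2.urlopen(url).read()

    def href(tags):
        # collect the href attribute of each <a> tag returned by select()
        return [tag['href'] for tag in tags]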
Example 2: parse_obituary
def parse_obituary(url, category):
    """
    Extracts the necessary information from a single obituary page
    """
    page = requests.get(url)
    soup = Soup(page.text)
    try:
        date = select(soup, 'p strong')[0].contents[0]
        date = date[date.rfind('died ')+5:].strip()
        cal = pdt.Calendar()
        print >> sys.stderr, 'parsing', date
        date = cal.parseDateText(date)
    except:
        print >> sys.stderr, 'failed to parse'
        return
    date = str('%s/%s/%s' % (date[2], date[1], date[0]))
    publisher = 'Telegraph'
    type = 'obituaries'
    name = select(soup, '.storyHead h1')[0].contents[0]
    content = ''
    for para in select(soup, '#mainBodyArea p'):
        if len(para.contents) > 0:
            content = content + para.contents[0]
    content = content.strip().replace('"', '\'')
    content = content.strip().replace('\n', '')
    print >> sys.stdout, '%s,%s,%s,%s,"%s","%s"' % (date.encode("UTF-8"),
                                                    publisher.encode("UTF-8"),
                                                    type.encode("UTF-8"),
                                                    name.encode("UTF-8"),
                                                    content.encode("UTF-8"),
                                                    category.encode("UTF-8"))
Example 3: get_raw_boxscore_data
def get_raw_boxscore_data(self, boxscore_soup):
    # Load boxscore data. No logic here, just splitting from HTML into more
    # processable data.
    boxscore_data = []
    boxscore_rows = select(boxscore_soup, '#my-players-table tbody tr')
    for player_data in boxscore_rows:
        cells = select(player_data, 'td')
        if len(cells) == 13:
            # This order should match the boxscore table on espn
            (player_name, minutes, fgma, tpma, ftma, oreb, reb, ast, stl, blk,
             to, pf, pts) = [
                cell.text for cell in cells
            ]
            if not player_name:
                continue
            fgm, fga = fgma.split('-')
            tpm, tpa = tpma.split('-')
            ftm, fta = ftma.split('-')
            (minutes, fgm, fga, tpm, tpa, ftm, fta, oreb, reb, ast, stl, blk, to,
             pf, pts) = map(int, [
                minutes, fgm, fga, tpm, tpa, ftm, fta, oreb, reb, ast, stl, blk, to,
                pf, pts
            ])
            boxscore_data.append({
                'name': player_name, 'minutes': minutes, 'fgm': fgm, 'fga': fga,
                'tpm': tpm, 'tpa': tpa, 'ftm': ftm, 'fta': fta,
                'oreb': oreb, 'reb': reb,
                'ast': ast, 'stl': stl, 'blk': blk, 'to': to, 'pf': pf, 'pts': pts,
            })
    return boxscore_data
Example 4: _extract_predictions
def _extract_predictions(self, html):
    if '<p class="predictHead"><nobr><span id=\'i18n_en\'>No current prediction' in html:
        return None
    else:
        predictions = []
        soup = BeautifulSoup(html)
        # get the primary/imminent prediction
        try:
            minutes = self._clean_prediction_html(select(soup, '.predictionNumberForFirstPred')[0])
        except:
            return None
        if ('departing' in minutes.lower()) or ('arriving' in minutes.lower()):
            predictions.append(0)
        else:
            predictions.append(int(minutes))
        # get the other predictions
        for m in select(soup, '.predictionNumberForOtherPreds'):
            m = self._clean_prediction_html(m)
            try:
                predictions.append(int(m))
            except:
                pass
        return predictions
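The _clean_prediction_html helper belongs to the surrounding class and is not shown on this page. A plausible sketch, purely an assumption based on how it is used above, is that it strips the markup from a prediction cell and returns the bare text:

    import re

    def _clean_prediction_html(self, element):
        # hypothetical: drop any tags and surrounding whitespace,
        # leaving e.g. "4", "Arriving" or "Departing"
        return re.sub(r'<[^>]+>', '', str(element)).strip()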
Example 5: expandDocument
def expandDocument(self, header, content, config=None):
    raise Exception("obsolete")
    part = self.partDocument(header["document"], config)
    soup = part.expandSoup(content)
    header = part.get_collapsed_header(header=header)
    stateful_doc = "stateful" in header and header["stateful"] is True
    if stateful_doc:
        script = part.statefulConfigScript()
        if script:
            script_tag = soup.new_tag("script")
            script_tag["type"] = "application/config"
            script_tag.string = script
            soup.body.append(script_tag)
    # fill in meta tags
    self._applyMetaAndTitle(soup, header, config)
    if config["appcache"] == False:
        for h in select(soup, "html"):
            del h["manifest"]
    elif "manifest" in header:
        for h in select(soup, "html"):
            h["manifest"] = header["manifest"]
    if "Content-Language" in header:
        for h in select(soup, "html"):
            h["lang"] = header["Content-Language"]
    # offline markers
    lists = {
        "offline": self._getOfflineList(soup, header),
    }
    return soup.prettify(), lists
Example 6: fetch_review_counts
def fetch_review_counts(appid):
    class FetchError(StandardError):
        pass
    url = 'http://store.steampowered.com/app/%i/' % appid
    request = urllib.urlopen(url)
    if request.code < 200 or request.code > 299:
        raise FetchError('Unable to fetch %s' % url, {'appid': appid, 'status': request.code})
    soup = BeautifulSoup(request)
    positive_count = ''
    positive_count_elements = select(soup, '#ReviewsTab_positive .user_reviews_count')
    if len(positive_count_elements) > 0:
        positive_count = get_count(positive_count_elements[0])
    if not positive_count:
        print >>sys.stderr, "Warning: Unable to find positive user review count on page %s" % url
    negative_count = ''
    negative_count_elements = select(soup, '#ReviewsTab_negative .user_reviews_count')
    if len(negative_count_elements) > 0:
        negative_count = get_count(negative_count_elements[0])
    if not negative_count:
        print >>sys.stderr, "Warning: Unable to find negative user review count on page %s" % url
    return positive_count, negative_count
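A hedged usage sketch for this function (the appid and the output format are illustrative, and get_count is another helper from the same project that extracts the number from the review-count element):

    positive, negative = fetch_review_counts(440)  # 440 is Team Fortress 2's Steam appid
    print('%s positive / %s negative reviews' % (positive, negative))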
Example 7: scrapeBlog
def scrapeBlog(blog):
    global completed
    blogurl = blog['postUrl']
    blogData = {}
    try:
        soup = Soup(urllib2.urlopen(blogurl))
        post = select(soup, 'div.post-body')
        title = select(soup, 'h1.title')
        titleNoTags = Soup(str(title))
        rawTitle = ''.join(filter(visible, titleNoTags.findAll(text=True))).strip()
        #print rawTitle
        noScript = Soup(str(post))
        rawText = ''.join(filter(visible, noScript.findAll(text=True))).strip()
        #print raw_text
        blogData['source'] = str(rawTitle)
        blogData['title'] = blog['titleNoFormatting']
        blogData['content'] = str(rawText)
        blogData['date'] = blog['publishedDate']
        blogData['url'] = str(blogurl)
    except Exception:
        pass
    with dataLock:
        data.append(blogData)
        completed += 1
Example 8: fetch_data
def fetch_data():
    def bvbreplace(s):
        return "BVB" if "Dortmund" in s else s

    doc = None
    try:
        doc, errs = tidy_document(urllib2.urlopen('http://www.bvb.de/').read(), tidyoptions)
        soup = Soup(doc)
    except Exception as e:
        raise Exception(u"Error fetching/parsing website: %s" % e)
    out = ''
    matchtime = datetime.datetime.now() + datetime.timedelta(hours=25)
    timestr = ''
    try:
        home = bvbreplace(select(soup, "div.next-match p span")[0].contents[0].strip())
        guest = bvbreplace(select(soup, "div.next-match p span")[1].contents[0].strip())
        league = ''
        try:
            league = select(soup, "div.next-match p span.tournament")[0].contents[0].strip()
        except:
            league = select(soup, "div.next-match p span")[2].contents[0].strip()
        matchtime = datetime.datetime.strptime(select(soup, "div.next-match p")[1].contents[-1].strip(), u"%d.%m.%Y %H:%M")
        timestr = matchtime.strftime(u"%a, %d.%m.%Y %H:%M")
        dontgo = u"U42/U46/Kreuzviertel/Borsigplatz/Uni-Parkplatz" if u"BVB" == home else u"Kneipen mit TV in Dortmund"
        location = u"Heim" if u"BVB" == home else u"Auswaerts"
        out = u"WARNUNG! %s: %s vs %s (%s/%s). Meide %s." % (timestr, home, guest, location, league, dontgo)
    except IndexError:
        # This means: No next game on the webpage.
        sys.exit(1)
    except Exception as e:
        #print(traceback.format_exc())
        raise Exception(u"ERRBVB while parsing bvb.de: %s" % e)
    return out, matchtime
Example 9: sees_an_element
def sees_an_element(self, doc, element=None, css_class=None, id=None, css_selector=None):
    """ Tests for the presence of a specified element on the current page...
        self.alice.sees_an_element(doc, id="element_id")
        self.alice.sees_an_element(doc, "element")
        self.alice.sees_an_element(doc, "div", "element_css_class")
        self.alice.sees_an_element(doc, css_selector="#myid element.bar")
    """
    selector = "any"
    if id:
        displayed_element = doc.find(id=id)
        selector = id
    elif css_selector:
        displayed_elements = select(doc, css_selector)
        displayed_element = displayed_elements[0] if displayed_elements else None
        selector = css_selector
    else:
        if css_class:
            selector = "%s.%s" % (element, css_class)
            displayed_element = select(doc, selector)
        else:
            displayed_element = doc.find(element)
            selector = element
    self.failUnless(displayed_element, "Could not find %s" % (selector))
    return displayed_element
Example 10: Loop_Through_Messages
def Loop_Through_Messages(i):  # i = start ID - 1
    while i < MaxMSG:
        i += 1
        Humanize(2)  # Humanize the program by sleeping 0-2 seconds
        try:
            soup = Make_Soup("http://groups.yahoo.com/group/freecycledc/message/" + str(i))
            MSG_Title = select(soup, 'title')[0].text.replace('\n', '~n-break~')
            msgbodyhtml = select(soup, '.msgarea')[0]
            MSG_Body = unicode.join(u' ', map(unicode, msgbodyhtml)).replace('<br />', '~break~').replace('\n', '~n-break~')
            if MSG_Title == '': MSG_Title = '(none)'
            if MSG_Body == '': MSG_Body = '(none)'
            Message_Data_to_Table(i, MSG_Title, MSG_Body)
            print i, "of", MaxMSG
        except:
            print "ERROR: SCRAPE FAIL ON POSTING ID", i
            Check_Column("Title", MSG_Title)
            Check_Column("Body HTML", msgbodyhtml)
            Check_Column("Body Text", MSG_Body)
            if MSG_Title == 'freecycledc' or 'message' not in MSG_Title.lower():
                Message_Data_to_Table(i, 'Message does not exist', 'NOTHING TO SEE HERE, FOLKS')
            else:
                Message_Data_to_Table(i, 'FAIL', 'FAIL')
Example 11: expand
def expand(self, header, content, markup=None, config=None):
    """
    General header/content expansion replacing expandDocument and expandScss
    """
    lists = {
        "offline": [],
    }
    if "charset" not in header and markup is not None:
        header["charset"] = config["charset"]
    parent_doc = None
    if "document" in header:
        parent_doc = self.partDocument(header["document"], config)
        header = parent_doc.get_collapsed_header(header=header)
    if markup == "scss":
        content = self.expandScss(header, content, config=config)
    elif markup in ("text", "xml"):
        pass  # TODO consider what to do
    elif markup == "html":
        soup = None
        if parent_doc:
            soup = parent_doc.expandSoup(content)
        else:
            soup = BeautifulSoup(content, "html5lib")
        if "lang" in header:
            pass  # TODO mark html element
        # print soup.head
        stateful_doc = "stateful" in header and header["stateful"] is True
        if stateful_doc:
            script = parent_doc.statefulConfigScript()
            if script:
                script_tag = soup.new_tag("script")
                script_tag["type"] = "application/config"
                script_tag.string = script
                soup.body.append(script_tag)
        # fill in meta tags
        self._applyMetaAndTitle(soup, header, config)
        if config["appcache"] == False:
            for h in select(soup, "html"):
                del h["manifest"]
        elif "manifest" in header:
            for h in select(soup, "html"):
                h["manifest"] = header["manifest"]
        if "Content-Language" in header:
            for h in select(soup, "html"):
                h["lang"] = header["Content-Language"]
        # offline markers
        lists["offline"] = self._getOfflineList(soup, header)
        content = soup.encode()
    return header, content, lists
Example 12: get_games
def get_games(page=1):
    def select_first(soup, selector):
        result = select(soup, selector)
        if result and len(result) > 0:
            return result[0]
        else:
            return None

    def inner_text(soup):
        if isinstance(soup, NavigableString):
            return unicode(soup)
        elif soup.contents:
            return u"".join(inner_text(c) for c in soup.contents)
        else:
            return unicode(soup)

    result = []
    soup = BeautifulSoup(urllib.urlopen(search_result_url(page)))
    games = select(soup, "a.search_result_row")
    for game in games:
        href = str(game["href"])
        if re.search("http://store.steampowered.com/app/(\\d+)/", href):
            id = re.search("http://store.steampowered.com/app/(\\d+)/", href).group(1)
        else:
            logging.error("Error extracting ID, skipping")
            continue
        name = inner_text(select(game, "h4")[0])
        price = select_first(game, ".search_price")
        if price and price.contents:
            price = price.contents[-1].lower()
            if price.find("free") != -1:
                price = float(0)
            elif price.startswith("$"):
                # Grab the last node, which is either the price or the "reduced
                # price"
                try:
                    price = float(price[5:])
                except:
                    logging.error("Price conversion error for %s: '%s'" % (name, price))
                    price = None
            else:
                price = None
                logging.error("Price parse error for %s: '%s'" % (name, price))
        else:
            price = None
        metascore = select_first(game, ".search_metascore")
        if metascore and metascore.string:
            metascore = int(metascore.string)
        else:
            metascore = None
        result.append(Game(id=id, name=name, price=price, metascore=metascore))
    return result
Example 13: raw_events
def raw_events(file):
    match = open(file, 'r')
    soup = BeautifulSoup(match.read())
    events = select(soup, 'div#live-text-commentary-wrapper div#live-text')
    more_events = select(soup, 'div#live-text-commentary-wrapper div#more-live-text')
    for event in events + more_events:
        for child in event.children:
            if type(child) is bs4.element.Tag:
                yield child.getText().strip()
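Because raw_events is a generator, a caller would typically just iterate over it; the file path below is hypothetical:

    for commentary_line in raw_events('data/match_report.html'):
        print(commentary_line)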
Example 14: get_resources
def get_resources(self, doc):
    resources = []
    for a in select(doc, 'a'):
        url = a.get('href')
        img = select(a, 'img[src]')[0]
        src = img.get('src')
        f_type = REG_URL_FILE.search(src).group(1).lower()
        resources.append((url, f_type))
    return resources
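REG_URL_FILE is a module-level regular expression defined elsewhere in that project. Judging from how it is used above, a plausible (assumed) definition captures the file extension of the image URL:

    import re

    # hypothetical pattern: capture the extension of e.g. ".../thumb.JPG?v=2" -> "JPG"
    REG_URL_FILE = re.compile(r'\.(\w+)(?:\?.*)?$')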
Example 15: find_footnotes_and_anchors
def find_footnotes_and_anchors(soup):
    selector = '.sdfootnoteanc'
    footnote_anchors = select(soup, selector)
    #print '\n'.join([str(anc) for anc in footnote_anchors])
    footnotes = []
    for i in range(len(footnote_anchors)):
        selector = '#sdfootnote%s' % (i+1)
        footnotes.extend(select(soup, selector))
    #print '\n'.join([str(f) for f in footnotes])
    return footnote_anchors, footnotes