本文整理汇总了Python中pattern.web.DOM.by_id方法的典型用法代码示例。如果您正苦于以下问题:Python DOM.by_id方法的具体用法?Python DOM.by_id怎么用?Python DOM.by_id使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pattern.web.DOM
的用法示例。
在下文中一共展示了DOM.by_id方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: getRandomHistoryDOM
# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_id [as 别名]
def getRandomHistoryDOM(language):
url = URL("http://"+language+".wikipedia.org/wiki/Special:Random")
#Gets the url only of the page this redirects to
redirectUrl = url.redirect
try:
#Grab the name of the wikipedia article from the url
urlComponents = string.split(redirectUrl, '/')
except AttributeError:
#Use some recursion if we encounter a page with no history, or some other error
return getRandomHistoryDOM(language)
#Get the history section of the article
redirectUrl = "http://"+language+".wikipedia.org/w/index.php?title="+urlComponents[4]+"&action=history"
print "Current article is: " +str(urlComponents[4])
#print redirectUrl
url = URL(redirectUrl);
dom = DOM(url.download(cached=False))
try:
historyList = dom.by_id("pagehistory").by_tag("li")
return historyList, urlComponents[4]
except AttributeError:
#Use some recursion if we encounter a page with no history, or some other error
dom = getRandomHistoryDOM(language)
return getRandomHistoryDOM(language)
示例2: scrape_education
# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_id [as 别名]
def scrape_education(county_num):
if county_num<10:
county_num = '0' + str(county_num)
else:
county_num = str(county_num)
print county_num
#url = 'http://dq.cde.ca.gov/dataquest/Staff/StaffEduLvl.aspx?cYear=2011-12&cChoice=CoEduc&TheCounty=01,ALAMEDA&cType=T&cGender=&Submit=1'
url = 'http://dq.cde.ca.gov/dataquest/Staff/StaffEduLvl.aspx?cYear=2011-12&cChoice=CoEduc&TheCounty=' + county_num + '01,ALAMEDA&cType=T&cGender=&Submit=1'
abs_url = URL(string = url)
dom = DOM(abs_url.download(cached=True))#download the DOM
other = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[12].content.replace(',','')
associates = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[11].content.replace(',','')
bachelors = str(int(dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[9].content.replace(',','')) + int(dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[10].content.replace(',','')))
masters = str(int(dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[4].content.replace(',','')) + int(dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[5].content.replace(',','')))
jurisdoctor = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[3].content.replace(',','')
doctorate = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("td")[2].content.replace(',','')
bachelors_and_less = str(int(bachelors) + int(associates) + int(other))
post_grad = str(int(masters) + int(jurisdoctor) + int(doctorate))
county = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal").by_tag("a")[0].content
# write all the collected data to a new row of the output file
writer.writerow([county, bachelors_and_less, post_grad, associates, bachelors, masters, jurisdoctor, doctorate])
示例3: getTitle
# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_id [as 别名]
def getTitle(self, link):
    """Return the article title found in the #main-article-info element
    of the page at `link`, or '' when that element is absent."""
    page = DOM(URL(link).download())
    info = page.body.by_id("main-article-info")
    if not info:
        return ''
    # Title text is the second child of the info node.
    return info.children[1].content.strip()
示例4: getVisByCountry
# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_id [as 别名]
def getVisByCountry(site):
    """Scrape the 'visitors by country' table for `site` and record it in
    the module-level `sites` dict.

    Stores {country_name: percentage} under sites[site]; stores {} when
    the page has no #visitors-by-country section.
    """
    countries = {}
    dom = DOM(URL(base + site).download(cached=True))
    # Look the section up once (the original called by_id twice and
    # re-initialized `countries` a second time inside the branch).
    vis = dom.by_id("visitors-by-country")
    if vis is not None:
        for row in vis.by_class("tr1"):
            link = row.by_tag("a")[0]
            # Skip the "show more countries" toggle row.
            if link.attributes.get("id") == "toggleMoreCountryVisits":
                continue
            country = link.content.split(" ")[1].strip()
            # Percentage text ends with '%'; drop it before parsing.
            pct = float(row.by_tag("p")[1].content[0:-1])
            countries[country] = pct
    sites[site] = countries
示例5: htmlParser
# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_id [as 别名]
def htmlParser(self, link):
    """Download `link` and return the plain text of its #content element
    (empty string when the element is absent)."""
    dom = DOM(URL(link).download())
    content_node = dom.body.by_id("content")
    if not content_node:
        return ''
    text = plaintext(content_node.content, linebreaks=2, indentation=True)
    return unicode(text.strip())
示例6: load
# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_id [as 别名]
def load(year, pagenum, pagerank):
    """Return a URL for the IMDb title page of the `pagerank`-th result
    on search-results page starting at item `pagenum`, restricted to
    feature films released in `year`."""
    y = str(year)
    search_url = URL(
        "http://www.imdb.com/search/title?at=0&sort=moviemeter,asc&start="
        + str(pagenum) + "&title_type=feature&year=" + y + "," + y)
    dom = DOM(search_url.download(timeout=30, cached=True))
    # Pick the requested entry out of the results list.
    title_node = dom.by_id("main").by_class("results")[0].by_class("title")[pagerank]
    source = title_node.by_tag("a")[0].source
    # Extract the /title/tt…/ path from the anchor's raw HTML.
    match = re.search(r'/title/+[t0-9]+/', source)
    return URL("http://www.imdb.com" + match.group(0))
示例7: getTextAboutResturants
# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_id [as 别名]
def getTextAboutResturants(self):
# get text about resturants
i=0
for rs in self.conn.resturants.find():
if not rs.get('information'):
information = {}
request = DOM(URL(rs['url']).download())
# Tags
if request.by_id('LocationMetaData'):
source = str(request.by_id('LocationMetaData').source.encode('cp1252', 'ignore'))
tags = Element(source[source.find('<b>Tags: </b>'):]).by_tag('a')
if tags:
information['parsedTags'] = [ (tag.attributes['href'], tag.content) for tag in tags]
# Review
if request.by_id('LocationDescription'):
information["review"] = plaintext(request.by_id('LocationDescription').content)
# Details
if request.by_id('LocationRestaurantDetails'):
information["details"] = request.by_id('LocationRestaurantDetails').by_tag('p')[0].content
rs['details'] = information
print information
self.conn.resturants.save(rs)
else:
print i, rs['name']
i +=1
示例8: getResturants
# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_id [as 别名]
def getResturants(self):
    """Walk all 47 search-result pages and insert each restaurant's name
    and URL into the module-level 'resturants' collection."""
    for page_number in range(1, 48):
        params["page"] = page_number
        dom = DOM(URL(url, query=params).download())
        listings = dom.by_id('searchResults').by_class('locationListing clearfix')
        for listing in listings:
            # The last anchor inside the <h4> carries the restaurant link.
            anchor = listing.by_tag('h4')[0].by_tag('a')[-1]
            conn['resturants'].insert({
                'name': plaintext(anchor.content),
                'url': anchor.attributes['href'],
            })
示例9: get_search_string
# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_id [as 别名]
def get_search_string(search, proxy):
    """Build a 1channel.ch search URL for a movie title.

    Long or punctuation-heavy titles are first mapped to short phrases
    the site's search engine actually matches. The search form's action
    URL and its hidden `key` / `search_section` fields are scraped live
    so the generated query string stays valid.

    search -- movie title (str or unicode).
    proxy  -- proxy setting forwarded to URL.download().
    Returns the fully assembled search URL string.
    """
    # Titles whose exact form gives poor results on the site.
    # (Replaces a 15-branch if-chain; no replacement value collides with
    # another key, so the sequential-if semantics are preserved.)
    overrides = {
        "Schindler's List": "Schindler",
        "One Flew Over the Cuckoo's Nest": "one flew over",
        "It's a Wonderful Life": "wonderful life",
        u"L\xe9on: The Professional": "the professional",
        "Terminator 2: Judgment Day": "Terminator 2",
        u"Am\xe9lie": "Amelie",
        "L.A. Confidential": "Confidential",
        "Pan's Labyrinth": "pan",
        "A Few Dollars More": "dollars",
        "The Secret in Their Eyes": "El secreto de sus ojos",
        "The King's Speech": "the king",
        "Howl's Moving Castle": "howl",
        "Harry Potter and the Deathly Hallows: Part 2": "harry potter",
        "Who's Afraid of Virginia Woolf?": "virginia woolf",
        "Rosemary's Baby": "rosemary",
    }
    search = overrides.get(search, search)
    url = URL("http://1channel.ch")
    dom = DOM(url.download(cached=False, timeout=20, proxy=proxy))
    form = dom.by_id("searchform")
    s_base = form.attributes.get("action")
    s_text = "_keywords=" + search.replace(" ", "+")
    # Hidden anti-bot fields must be echoed back with the query.
    key = form.by_attribute(name="key")[0].attributes.get("value")
    s_section = form.by_attribute(name="search_section")[0].attributes.get("value")
    return s_base + s_text + "&key=" + key + "&search_section=" + s_section
示例10: scrape_truancy
# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_id [as 别名]
def scrape_truancy(county_num):
if county_num<10:
county_num = '0' + str(county_num)
else:
county_num = str(county_num)
print county_num
#url = 'http://dq.cde.ca.gov/dataquest/SuspExp/suspexplrate.aspx?cYear=2011-12&cType=ALL&cCDS=01000000000000&cName=ALAMEDA&cLevel=County&cChoice=cSusExpRt'
url = 'http://dq.cde.ca.gov/dataquest/SuspExp/suspexplrate.aspx?cYear=2011-12&cType=ALL&cCDS=' + county_num + '000000000000&cName=ALAMEDA&cLevel=County&cChoice=cSusExpRt'
abs_url = URL(string = url)
dom = DOM(abs_url.download(cached=True))#download the DOM
county = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[1].by_tag("a")[0].content
total_enrollment = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[1].by_tag("td")[3].content
suspensions = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[1].by_tag("td")[4].content
suspension_rate = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[1].by_tag("td")[5].content
expulsions = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[1].by_tag("td")[6].content
expulsion_rate = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[1].by_tag("td")[7].content
truants = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[1].by_tag("td")[8].content
trauncy_rate = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[1].by_tag("td")[9].content
#For the first county only, also grab the statewide totals
if county_num=='01':
state_total_enrollment = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[2].by_tag("td")[3].content
state_suspensions = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[2].by_tag("td")[4].content
state_suspension_rate = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[2].by_tag("td")[5].content
state_expulsions = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[2].by_tag("td")[6].content
state_expulsion_rate = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[2].by_tag("td")[7].content
state_truants = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[2].by_tag("td")[8].content
state_trauncy_rate = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")[2].by_tag("td")[9].content
# write the statewide total data to the top row of the output file
writer.writerow(["California Total", state_total_enrollment,state_suspensions, state_suspension_rate, state_expulsions, state_expulsion_rate, state_truants, state_trauncy_rate])
# write all the collected data to a new row of the output file
writer.writerow([county, total_enrollment,suspensions, suspension_rate, expulsions, expulsion_rate, truants, trauncy_rate])
示例11: dfun
# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_id [as 别名]
return
dfunv = dfun(dom2)
if dfunv == "":
return
writer.writerow([tfun(data), runfun(data), gfun(data), dfun(data),
wfun(data), afun(data), msfun(data), rtfun(data),
rtnmfun(data), bfun(text), bousfun(text),
bowfun(text), mpaafun(data), dfun(dom2)])
#this handles tags
year = 2000
while year < 2011:
pagenum = 1
while pagenum < 101:
pagerank = 0
while pagerank < 50:
url2 = load(year, pagenum, pagerank)
dom2 = DOM(url2.download(timeout=30, cached=True))
data = dom2.by_id("overview-top")
text = loadbus(url2)
print dfun(data)
entrytest(data,text,dom2)
pagerank += 1
pagenum += 50
year += 1
output.close()
示例12: loadbus
# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_id [as 别名]
def loadbus(url):
    """Download the IMDb 'business' page for title URL `url` and return
    the text of its #tn15content element as an ASCII byte string
    (non-ASCII characters dropped)."""
    bus_url = URL(str(url) + "business?ref_=tt_dt_bus")
    bus_dom = DOM(bus_url.download(timeout=30, cached=True))
    content = bus_dom.by_id("tn15content").content
    return content.encode('ascii', 'ignore')
示例13:
# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_id [as 别名]
#build the final list of variables called season_label_container
roster_labels_container=[]
for label in roster_labels:
roster_labels_container.append(label.content.encode("utf8"))
#add in a column for the team acronym to act as a key
roster_labels_container.insert(0,"team_id")
roster_labels_container.insert(1,"Season")
#the roster_container holds all of the players for the specified team/year
roster_container = []
roster_container.append(roster_labels_container)
print roster_labels_container
# this is so powerful - I just needed to look and find the id for the roster table
all_divs = dom.by_id("roster")
#roster_trs holds a list of players info
roster_trs = all_divs.by_tag("tr")
#iterate through each player in the roster
for trs in roster_trs:
#this will hold the final encoded info/stats pulled from the current player
this_roster_farian = []
#now add the team id and the season
this_roster_farian.append(this_team_acronym)
this_roster_farian.append(this_season)
for t in trs.by_tag("td"):
#the player name has a link to the player,
#e.g. '<a href="/players/m/milledr01.html">Drew Miller</a>'
示例14: int
# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_id [as 别名]
elif year_name[0:2] == "Fr":
grad_year = int(neu_year) + 4
# split name into first and last (adjusting for title rows)
name = cells[0].by_tag("a")
name = cells[0].content.split(" ", 1) if len(name) == 0 else cells[0].by_tag("a")[0].content.split(" ", 1)
# reorder name if from Columbia or Princeton
if school in (2,3):
name.reverse()
# add swimmer (last name, first name, graduating year, school) to array
swimmers.append([name[1].encode('ascii', 'ignore').strip(",").strip(), name[0].encode('ascii', 'ignore').strip(), grad_year, neu_schools[school]])
# Get all Cornell Roster id numbers for the URLs
url = URL("http://www.cornellbigred.com/roster.aspx?roster=847")
dom = DOM(url.download(cached=True))
options = dom.by_id("ctl00_cplhMainContent_ddlPastRosters").by_tag("option")
base_url = "http://www.cornellbigred.com/roster.aspx?roster="
cornell_roster_ids = []
for option in options:
cornell_roster_ids.append(str(option.attrs["value"]))
# define years array
cornell_years = []
for i in range(YEARS_TO_SCRAPE):
cornell_years.append(str(year-i))
counter = 0
for cornell_year in cornell_years:
print counter
print "Cornell" + " " + cornell_year
url_string = base_url + cornell_roster_ids[counter]
示例15: open
# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_id [as 别名]
from pattern.web import URL, DOM
from pattern.db import Datasheet
import glob, re
urls = glob.glob('/Users/tnatoli/Desktop/pages/*.html')
headers = ['player', 'pos', 'team', 'owner']
f = open('player_table.txt', 'w')
f.write('\t'.join(headers) + '\n')
for u in urls:
url = URL(u)
dom = DOM(url.download(cached=False))
tbody = dom.by_id('statTable0').by_tag('tbody')[0]
for tr in tbody.by_tag('tr'):
pname = tr.by_class('ysf-player-name')[0].by_tag('a')[0].content
team_pos = tr.by_class('ysf-player-team-pos')[0].by_tag('span')[0].content
team = re.sub('\(', '', team_pos.split(' - ')[0])
pos = re.sub('\)', '', team_pos.split(' - ')[1])
owner_links = tr.by_class('owner')[0].by_tag('a')
if owner_links:
owner = owner_links[0].content
else:
owner = 'FA'
line = '\t'.join([pname, team, pos, owner])
print line
for l in line:
try:
l.encode('ascii')