This article collects and summarizes typical usage examples of the Python method pattern.web.DOM.by_class. If you have been wondering how exactly to use DOM.by_class in Python, or what it looks like in real code, the curated examples below may help. You can also explore further usage examples of its containing class, pattern.web.DOM.
The following presents 15 code examples of the DOM.by_class method, sorted by popularity by default. You can upvote any example you like or find useful; your ratings help the system recommend better Python code samples.
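Before diving into the examples, here is a minimal sketch of the basic pattern the samples share: download a page, parse it into a DOM, and select elements by CSS class. The URL and the "quote" class name are placeholders chosen purely for illustration; the examples below target real pages.

# A minimal DOM.by_class sketch (Python 2, pattern 2.6).
# The URL and the "quote" class name are hypothetical placeholders.
from pattern.web import URL, DOM

html = URL("http://example.com/").download(cached=True)
dom = DOM(html)
# by_class() returns a list of Element nodes whose class attribute matches.
for node in dom.by_class("quote"):
    print node.content  # the inner HTML of the matched element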
Example 1: process_page
# Required imports: from pattern.web import DOM [as alias]
# Or alternatively: from pattern.web.DOM import by_class [as alias]
def process_page():
    url = URL("http://www.imdb.com/search/title?num_votes=5000,&sort=user_rating,desc&start=1&title_type=tv_series")
    dom = DOM(url.download(cached=True))
    domIndex = 0
    for title in dom.by_class("title"):
        theTitle = str(title.by_tag("a")[0].content).encode('ascii', 'replace')
        # titleCatalog and Title are defined elsewhere in the enclosing script.
        titleCatalog.append(Title(theTitle))
        try:
            match = re.search("^(\d+).*$", str(dom.by_class("runtime")[domIndex].content).encode('ascii', 'replace'))
            titleCatalog[domIndex].addRunTime(match.group(1))
        except Exception:
            pass
        try:
            titleCatalog[domIndex].addRank(str(dom.by_class("value")[domIndex].content).encode('ascii', 'replace'))
        except Exception:
            pass
        # Bug fix: advance the index, otherwise every title reads the data at index 0.
        domIndex += 1
Example 2: extract_percentages
# Required imports: from pattern.web import DOM [as alias]
# Or alternatively: from pattern.web.DOM import by_class [as alias]
def extract_percentages(dom):
    # Note: the dom argument is ignored; the page is re-downloaded from TARGET_URL.
    file_url = URL(TARGET_URL)
    file_dom = DOM(file_url.download())
    percentage_list = []
    if file_dom.by_class('percentage'):
        for item in file_dom.by_class('percentage'):
            percentage_list.append(item.content.encode('utf-8'))
        return percentage_list[0]
    else:
        return "nodata"
Example 3: scrape_api
# Required imports: from pattern.web import DOM [as alias]
# Or alternatively: from pattern.web.DOM import by_class [as alias]
def scrape_api(county_num):
    # Zero-pad single-digit county numbers, e.g. 2 -> '02'.
    if county_num < 10:
        county_num = '0' + str(county_num)
    else:
        county_num = str(county_num)
    print county_num
    # Example: 'http://dq.cde.ca.gov/dataquest/Acnt2012/2011Base_Co.aspx?cYear=&cSelect=02'
    url = 'http://dq.cde.ca.gov/dataquest/Acnt2012/2011Base_Co.aspx?cYear=&cSelect=' + county_num
    abs_url = URL(string=url)
    dom = DOM(abs_url.download(cached=True))  # download the DOM
    # Grab the value for each district and sum them up to obtain the county total value.
    districts = dom.by_class('medium\+_left')
    num_students_county_total = 0
    api_county_total = 0
    for n in districts:
        # Grab and sum the number of students.
        district_num_students = n.parent.by_class("medium_center")[0].content
        # Skip rows whose student count is blank (apparently a non-breaking space placeholder).
        if u"\xa0" not in district_num_students:
            # Cast to int.
            district_num_students = int(district_num_students.replace(',', ''))
            num_students_county_total += district_num_students
            # Grab and sum the API for each district.
            district_api = n.parent.by_class("medium_center")[1].content
            # Remove any asterisks.
            district_api = district_api.replace('*', '')
            # Cast to int.
            district_api = int(district_api.replace(',', ''))
            # Add the API weighted by the number of students in the current district.
            api_county_total += district_api * district_num_students
    # Divide the weighted sum of APIs by the total number of students in the county
    # (note: integer division in Python 2).
    average_api = api_county_total / num_students_county_total
    API_num_students = dom.by_class('medium\+_left')[0].parent.by_class("medium_center")[0].content
    # Use the county number as a placeholder for the county name for now,
    # as the county name is not easily scrapable.
    county = county_num
    # Write all the collected data to a new row of the output file.
    writer.writerow([str(county), str(num_students_county_total), str(average_api)])
Example 4: get_by_year
# Required imports: from pattern.web import DOM [as alias]
# Or alternatively: from pattern.web.DOM import by_class [as alias]
def get_by_year(year):
    url = URL("http://www.imdb.com/event/ev0000003/" + str(year))
    dom = DOM(url.download(cached=True))
    dictAll = {}
    awards = dom.by_class('award')
    awardTitles = awards[0].by_tag('h2')
    awardList = []
    for award in awardTitles:
        awardList.append(award.content)
    prize = awards[0].by_tag('blockquote')
    for index, title in enumerate(prize[1:25]):
        winner = title.by_tag('strong')[0].by_tag('a')[0].content
        winner_id = str(title.by_tag('strong')[0].by_tag('a')[0].attrs['href'][-8:-1])
        nomineeList = []
        for each in title.by_tag('strong')[1::]:
            name = each.by_tag('a')[0].content
            id = str(each.by_tag('a')[0].attrs['href'][-8:-1])
            nomineeList.append((clean_unicode(name), id))
        winnersAndNominees = {}
        winnersAndNominees['winner'] = (clean_unicode(winner), winner_id)
        winnersAndNominees['nominees'] = nomineeList
        dictAll[awardList[index]] = winnersAndNominees
    return dictAll
Example 5: scrape_top_250
# Required imports: from pattern.web import DOM [as alias]
# Or alternatively: from pattern.web.DOM import by_class [as alias]
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.
    Args:
        url: pattern.web.URL instance pointing to the top 250 index page
    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB. Note that these URLs must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # Download the html of the url and turn it into a DOM.
    html = url.download()
    dom = DOM(html)
    # Every movie URL starts with this root; it is needed for the absolute path.
    root = 'http://www.imdb.com'
    # The url of each movie sits in a td tag with class titleColumn.
    for movie in dom.by_class("titleColumn"):
        # Build a DOM from the content between the td tags so we can search inside it.
        movieinfo = DOM(movie.content)
        # The relative path of each movie is the 'href' value of the first 'a' tag.
        # Concatenate the root and the relative path to get the absolute path, then append it to movie_urls.
        movie_urls.append(root + movieinfo.by_tag("a")[0].attrs.get("href", ""))
    # Return the list of URLs of each movie's page on IMDB.
    return movie_urls
Example 6: scrape_starrtest
# Required imports: from pattern.web import DOM [as alias]
# Or alternatively: from pattern.web.DOM import by_class [as alias]
def scrape_starrtest(county_num):
    if county_num < 10:
        county_num = '0' + str(county_num)
    else:
        county_num = str(county_num)
    print county_num
    # Example: 'http://star.cde.ca.gov/star2012/ViewReport.aspx?ps=true&lstTestYear=2012&lstTestType=X&lstCounty=01&lstDistrict=&lstSchool=&lstGroup=1&lstSubGroup=1'
    url = 'http://star.cde.ca.gov/star2012/ViewReport.aspx?ps=true&lstTestYear=2012&lstTestType=X&lstCounty=' + str(county_num) + '&lstDistrict=&lstSchool=&lstGroup=1&lstSubGroup=1'
    abs_url = URL(string=url)
    dom = DOM(abs_url.download(cached=True))  # download the DOM
    #sciend_num = dom.by_class("rm")[4].content
    scicst_num = dom.by_class("rm")[3].content
    math_num = dom.by_class("rm")[2].content
    hist_num = dom.by_class("rm")[1].content
    ela_num = dom.by_class("rm")[0].content
    #sciend_percent = dom.by_class("rs")[4].content[:5]
    scicst_percent = dom.by_class("rs")[3].content[:5]
    math_percent = dom.by_class("rs")[2].content[:5]
    hist_percent = dom.by_class("rs")[1].content[:5]
    ela_percent = dom.by_class("rs")[0].content[:5]
    county = dom.by_tag("h2")[0].content
    # Write all the collected data to a new row of the output file.
    writer.writerow([county, ela_num, ela_percent, hist_num, hist_percent, math_num, math_percent, scicst_num, scicst_percent])
Example 7: process_page
# Required imports: from pattern.web import DOM [as alias]
# Or alternatively: from pattern.web.DOM import by_class [as alias]
def process_page():
    url = URL("http://www.imdb.com/search/title?num_votes=5000,&sort=user_rating,desc&start=1&title_type=tv_series")
    dom = DOM(url.download(cached=True))
    domIndex = 0
    for title in dom.by_class("title"):
        theTitle = str(title.by_tag("a")[0].content).encode('ascii', 'replace')
        titleCatalog.append(Title(theTitle))
        try:
            titleCatalog[domIndex].addRunTime(str(dom.by_class("runtime")[domIndex].content).encode('ascii', 'replace'))
        except:
            pass
        try:
            titleCatalog[domIndex].addRank(str(dom.by_class("value")[domIndex].content).encode('ascii', 'replace'))
        except:
            pass
        try:
            for genre in dom.by_class("genre")[domIndex].by_tag("a"):
                titleCatalog[domIndex].addGenre(str(genre.content).encode('ascii', 'replace'))
        except:
            pass
        try:
            for credit in dom.by_class("credit")[domIndex].by_tag("a"):
                titleCatalog[domIndex].addActors(str(credit.content).encode('ascii', 'replace'))
        except:
            pass
        domIndex += 1
Example 8: scrape_page
# Required imports: from pattern.web import DOM [as alias]
# Or alternatively: from pattern.web.DOM import by_class [as alias]
def scrape_page(url):
    html = url.download()
    dom = DOM(html)
    table = DOM(dom.by_class("wikitable")[0].content)
    countrylist = table.by_tag("tr")[1:]
    pointsdict = {}
    for c in countrylist:
        infodict = {}
        infodict["name"] = c.by_tag("a")[-1].content.encode("utf-8")
        infodict["Overall"] = int(c.by_tag("td")[2].content.encode("utf-8"))
        infodict["Female"] = int(c.by_tag("td")[4].content.encode("utf-8"))
        infodict["Male"] = int(c.by_tag("td")[6].content.encode("utf-8"))
        if infodict["Overall"] > 80:
            infodict["fillKey"] = "HIGH"
        elif infodict["Overall"] > 70:
            infodict["fillKey"] = "ABVAVG"
        elif infodict["Overall"] > 60:
            infodict["fillKey"] = "AVG"
        elif infodict["Overall"] > 50:
            infodict["fillKey"] = "BELAVG"
        else:
            infodict["fillKey"] = "LOW"
        code = ""
        for countryCode in countryCodes:
            if infodict["name"] == countryCode[2]:
                code = countryCode[1]
                break
        # If no code was found, skip this country.
        if code == "":
            pass
        # Otherwise use the code as a key, with the country's infodict as its value.
        else:
            pointsdict[code] = infodict
    json.dump(pointsdict, open("lifeexpectancy.json", "wb"))
Example 9: scrape_top_250
# Required imports: from pattern.web import DOM [as alias]
# Or alternatively: from pattern.web.DOM import by_class [as alias]
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.
    Args:
        url: pattern.web.URL instance pointing to the top 250 index page
    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB. Note that these URLs must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    url = URL(url)
    dom = DOM(url.download())
    for e in dom.by_class('titleColumn'):
        for href in e('a')[:1]:
            movie_urls.append("http://www.imdb.com" + href.attrs["href"])
    # Return the list of URLs of each movie's page on IMDB.
    return movie_urls
Example 10: get_info
# Required imports: from pattern.web import DOM [as alias]
# Or alternatively: from pattern.web.DOM import by_class [as alias]
def get_info(baseurl, out_filename, npages=200):
    output = open(out_filename, "w")
    w = writer.UnicodeWriter(output)
    # TODO: fix this header
    w.writerow(
        [
            "Title",
            "Rating",
            "Calories (kcal)",
            "Cholesterol (mg)",
            "Fat (g)",
            "Protein (g)",
            "Fiber (g)",
            "Sodium (mg)",
            "Cook Time",
            "Ingredients",
            "Full Ingredients",
        ]
    )
    for page in range(1, npages):
        try:
            url = URL(baseurl + "?Page=%d" % page)
            dom = DOM(url.download(cached=True))
            links = dom.by_class("rectitlediv")
            # Go through the 20 recipes on a given page.
            for index in range(len(links)):
                # Get the link name.
                title = links[index].content.split("/recipe/")[1].split("/detail")[0]
                # Download the individual recipe.
                rpage = URL(os.path.join(base, title, end))
                pdom = DOM(rpage.download(cached=True))
                # Average rating value.
                rating = pdom.by_attribute(itemprop="ratingValue")[0].source.split('"')[3]
                # List of nutrition elements.
                nut_list = pdom.by_class("nutrSumWrap")[0].by_class("nutrSumList")
                nut_vals = []
                for i in range(len(nut_list)):
                    val = nut_list[i].by_attribute(id="lblNutrientValue")[0].content
                    nut_vals.append(val)
                nuts = "\t".join(nut_vals)
                # Time needed to cook.
                try:
                    cook_hours = pdom.by_attribute(id="cookHoursSpan")[0].content
                    cook_hours = cook_hours.replace("<em>", " ").replace("</em>", " ")
                except:
                    cook_hours = "0"
                try:
                    cook_mins = pdom.by_attribute(id="cookMinsSpan")[0].content
                    cook_mins = cook_mins.replace("<em>", " ").replace("</em>", " ")
                except:
                    cook_mins = "0"
                mins = str(int(cook_hours.split()[0]) * 60 + int(cook_mins.split()[0]))
                # Ingredients: get the block containing both the amount and the ingredient name.
                all_ings = pdom.by_attribute(itemprop="ingredients")
                ing_units = []
                ing_vals = []
                for ing_index in range(len(all_ings)):
                    tmp_ing = all_ings[ing_index].by_id("lblIngName").content
                    # Skip placeholder rows (apparently marked with a non-breaking space).
                    if u"\xa0" in all_ings[ing_index].content:
                        continue
                    try:
                        tmp_amount = all_ings[ing_index].by_id("lblIngAmount").content
                    except:
                        tmp_amount = ""  # empty placeholder for a missing amount
                    ing_units.append(tmp_amount)
                    ing_vals.append(tmp_ing)
                ings = ";".join(ing_vals)
                ing_units = [x + "|" for x in ing_units]
                str_ings = [str(x) for x in zip(ing_units, ing_vals)]
                str_ings = [x.replace(",", " ") for x in str_ings]
                full_ings = ";".join(str_ings)
                full_ings = (
                    full_ings.replace("u'", "")
                    .replace("'", "")
                    .replace(", u", "")
                    .replace("(", "")
                    .replace(")", "")
                    .replace("  ", " ")  # collapse double spaces
                )
                assert len(ing_vals) == len(ing_units)
                w.writerow([title, rating, nuts, mins, ings, full_ings])
        except:
            pass
    output.close()
Example 11: open
# Required imports: from pattern.web import DOM [as alias]
# Or alternatively: from pattern.web.DOM import by_class [as alias]
        self.writerow(row)  # tail of a helper class definition truncated by this listing

# Create the csv output file for writing, and define the writer.
output = open("data_output_WIKI_EA.csv", "wb")
writer = UnicodeWriter(output)
# Add a header row.
writer.writerow(["State", "Rank", "EA", "Degree"])
# Get the DOM object to scrape for links.
url = URL("http://en.wikipedia.org/wiki/List_of_U.S._states_by_educational_attainment")
dom = DOM(url.download(cached=True))
# Get the tables where all the info is contained.
all_data_tables = dom.by_class("wikitable")
# Define the variable to store all the WIKI data.
all_wiki_data = []
# Loop through each table, determining which degree its header describes.
for ind_data_table in all_data_tables:
    degree = ""
    for ind_data_header in ind_data_table.by_tag("th"):
        if "H.S. Graduate" in plaintext(ind_data_header.content):
            degree = "High School"
        if "Bachelor's Degree" in plaintext(ind_data_header.content):
            degree = "Undergraduate"
        if "Advanced Degree" in plaintext(ind_data_header.content):
Example 12: get_title_attributes
# Required imports: from pattern.web import DOM [as alias]
# Or alternatively: from pattern.web.DOM import by_class [as alias]
def get_title_attributes(title, titleLink):
    url = URL(titleLink)
    dom = DOM(url.download(cached=True))
    titleObj = Title(title.encode('ascii', 'replace'))
    print "Movie: ", title
    # Get directors.
    print "-> About to print directors... "
    directors = dom.by_attribute(itemprop="director")[0]
    directorNames = directors.by_tag("a")
    for director in directorNames:
        print director.content
        dirName = unicodedata.normalize('NFD', director.content).encode('ascii', 'replace')
        print "Director ===> ", dirName
        titleObj.addDirectors(dirName)
    # Get writers.
    print "-> About to print writers... "
    try:
        writers = dom.by_attribute(itemprop="writer")
        for writer in writers:
            titleObj.addWriters(str(writer[1][1].content).encode('ascii', 'replace'))
    except:
        pass
    print "--> About to get actors... "
    try:
        actors = dom.by_attribute(itemprop="actors")
        for actor in actors:
            titleObj.addActors(str(actor[1][1].content).encode('ascii', 'replace'))
    except:
        pass
    print "--> About to get rating information... "
    try:
        ratingsInfo = dom.by_class("star-box-giga-star")
        for rating in ratingsInfo:
            titleObj.addRating(str(rating.content).encode('ascii', 'replace'))
    except:
        pass
    print "--> About to print other stuff... "
    for item in dom.by_class("infobar"):
        try:
            objMatch = re.search("(\d+)", item.by_tag("time")[0].content)
            if objMatch:
                titleObj.addRunTime(str(objMatch.group(1)).encode('ascii', 'replace'))
        except:
            pass
        for genreItem in item.by_tag("a"):
            try:
                objMatch = re.search("genre", genreItem.attributes['href'])
                if objMatch:
                    titleObj.addGenre(str(genreItem.content).encode('ascii', 'replace'))
            except:
                pass
    return titleObj
Example 13: open
# Required imports: from pattern.web import DOM [as alias]
# Or alternatively: from pattern.web.DOM import by_class [as alias]
# Create the csv and add a header row.
output = open("races_data.csv", "wb")
writer = csv.writer(output)
writer.writerow(["Race", "Year", "POS", "Num", "rider ID", "Rider URL", "Rider01", "rider02", "Machine", "Time", "Speed"])
# Set up the base URL and main URL. Era 5 = 1991 - 2012.
eras = ["1", "2", "3", "4", "5"]
for era in eras:
    print "Era:" + era
    url = URL("http://www.iomtt.com/TT-Database/Events.aspx?meet_code=TT2012&era=" + era)
    text_url = "http://www.iomtt.com"
    # Get hold of the DOM, then grab each year's URL, which is embedded in li tags.
    dom = DOM(url.download(cached=True))
    years = dom.by_class("ttDatabasePipeSeparator floatleft")[0].by_tag("li")
    # Iterate over each year.
    for year in years:
        # Print commands are useful for monitoring progress.
        print("year:")
        print year.by_tag("a")[0].attributes.get('href', '')
        # Find the current year's URL and download its DOM.
        new_url = URL(text_url + year.by_tag("a")[0].attributes.get('href', ''))
        year_url = URL(new_url)
        year_dom = DOM(year_url.download(cached=True))
        #races = year_dom.by_class("panelinner clearfix")[0].by_tag("ul")[0].by_tag("li")
        races_div = races = year_dom.by_class("ttDatabase")[0].by_class("panelinner")[1].by_tag("ul")
        if len(races_div) > 1:
            races = races_div[0].by_tag("li")
Example 14: URL
# Required imports: from pattern.web import DOM [as alias]
# Or alternatively: from pattern.web.DOM import by_class [as alias]
# Ratings
# Number of ratings
page_urls = []
tableRows = dom.by_id('main').by_tag('table')[1].by_tag('tr')
for tr in tableRows[1:]:
    a = tr.by_tag('a')[0]
    page_urls.append(clean_unicode(abs_url(a.attributes.get('href', ''), url.string)))
for p in page_urls:
    p_url = URL(p)
    p_dom = DOM(p_url.download(cached=True))
    title = clean_unicode(p_dom.by_class('header')[0].content)
    title = plaintext(strip_between('<span', '</span>', title))
    runtime = clean_unicode(p_dom.by_class('infobar')[0].by_tag('time')[0].content)
    genres = []
    for genre in p_dom.by_class('infobar')[0].by_tag('a')[:-1]:
        genres.append(clean_unicode(genre.content))
    directors = []
    writers = []
    actors = []
    text_blocks = p_dom.by_class('txt-block')[:3]
    for t in text_blocks:
        spans = t.by_tag('span')
Example 15: URL
# Required imports: from pattern.web import DOM [as alias]
# Or alternatively: from pattern.web.DOM import by_class [as alias]
movieUrl = URL(movieTitleLinks.group(0))
movieDom = DOM(movieUrl.download(cached=True))
#=======================================================================
# Get the title
#=======================================================================
for movie in movieDom.by_tag("title"):
    title = re.sub(' \(\d+\) - IMDb', '', movie.content.encode('ascii', 'ignore').strip())
#=======================================================================
# Get the runtime
#=======================================================================
for movie in movieDom.by_class("infobar"):
    time = re.search('\d+ min', movie.content.encode('ascii', 'ignore').strip())
    runtime = re.sub(' min', '', time.group(0))
    #===================================================================
    # Get the genres
    #===================================================================
    genre = []
    for g in movie.by_tag('a'):
        type = re.sub('\n|\d+.*|\(.*\)', '', g.content.encode('ascii', 'ignore').strip('\r\n'))
        if ((type != ' \n') and not (re.match('^\s+', type))):
            genre.append(type)