本文整理汇总了 Python 中 pattern.web.DOM.by_attribute 方法的典型用法代码示例。如果您正苦于以下问题：Python DOM.by_attribute 方法的具体用法？Python DOM.by_attribute 怎么用？Python DOM.by_attribute 使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 pattern.web.DOM 的用法示例。
在下文中一共展示了 DOM.by_attribute 方法的 3 个代码示例，这些例子默认根据受欢迎程度排序。注意：这些示例使用 Python 2 语法（例如 print 语句）。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1:
# 需要导入模块: from pattern.web import DOM [as 别名]
# 注意: by_attribute 是 DOM 对象的方法, 无法单独导入; 请在 DOM 实例上调用, 例如 dom.by_attribute(itemprop="director")
# NOTE(review): excerpt from the middle of a larger routine. `g`, `genre` and
# `movieDom` are bound before this excerpt, and the original indentation was
# lost in extraction.
# Clean one genre entry: drop non-ASCII characters, trim CR/LF at both ends,
# then delete newlines, any digit run together with the rest of its line, and
# parenthesised text.
# NOTE(review): `type` shadows the builtin of the same name.
type = re.sub('\n|\d+.*|\(.*\)','', g.content.encode('ascii', 'ignore').strip('\r\n'))
# Keep the entry unless it is exactly ' \n' or begins with whitespace.
if ((type != ' \n') and not (re.match('^\s+', type))):
genre.append(type)
# All kept genres are stored as a single ';'-separated string.
genresStr = ';'.join(genre)
#=======================================================================
# Get the directors
#=======================================================================
directors = []
for movie in movieDom.by_attribute(itemprop="director"):
# Drop non-ASCII characters, trim leading CR/LF, then strip the HTML tags.
# NOTE(review): `dir` shadows the builtin of the same name.
dir = re.sub('<[a-zA-Z\/][^>]*>','', movie.content.encode('ascii','ignore').lstrip('\r\n'))
# Remove any remaining newline characters.
dirs = re.sub('\n', '', dir)
# Directors for other movies have leading spaces - don't add them
if not re.match('^\s+', dirs):
directors.append(dirs)
# All kept directors are stored as a single ';'-separated string.
directorsStr = ';'.join(directors)
#=======================================================================
示例2: get_info
# 需要导入模块: from pattern.web import DOM [as 别名]
# 注意: by_attribute 是 DOM 对象的方法, 无法单独导入; 请在 DOM 实例上调用, 例如 dom.by_attribute(itemprop="director")
def get_info(baseurl, out_filename, npages=200):
    """Scrape recipe listing pages and write one row per recipe to a file.

    Listing pages ``baseurl + "?Page=<n>"`` are downloaded for every ``n`` in
    ``range(1, npages)``. For each element of class ``rectitlediv`` found on
    a listing page, the recipe detail page is downloaded and the row
    ``[title, rating, nutrients, cook minutes, ingredients, full
    ingredients]`` is written through ``writer.UnicodeWriter``.

    Scraping is best-effort: when anything fails while handling a listing
    page, the failure is logged and the remaining recipes of that page are
    skipped; the next page is still attempted. ``KeyboardInterrupt`` and
    ``SystemExit`` are not swallowed, so the run can be interrupted.

    Relies on names defined outside this function: ``URL``, ``DOM``,
    ``writer``, and the module-level ``base`` / ``end`` URL parts.

    Args:
        baseurl: URL of the recipe listing; ``?Page=<n>`` is appended to it.
        out_filename: Path of the output file, opened with mode ``"w"``.
        npages: Exclusive upper bound of the page numbers to fetch.
    """
    # Function-scope imports keep this block independent of the file header.
    import logging
    import posixpath

    log = logging.getLogger(__name__)

    # ``with`` closes the file even when an exception escapes (for example
    # while writing the header).
    with open(out_filename, "w") as output:
        w = writer.UnicodeWriter(output)
        # TODO: fix this header (it names 11 columns, but every data row
        # written below has 6 fields)
        w.writerow(
            [
                "Title",
                "Rating",
                "Calories (kcal)",
                "Cholesterol (mg)",
                "Fat (g)",
                "Protein (g)",
                "Fiber (g)",
                "Sodium (mg)",
                "Cook Time",
                "Ingredients",
                "Full Ingredients",
            ]
        )
        for page in range(1, npages):
            try:
                url = URL(baseurl + "?Page=%d" % page)
                dom = DOM(url.download(cached=True))
                # one "rectitlediv" element per recipe link on the page
                for link in dom.by_class("rectitlediv"):
                    # the recipe name sits between "/recipe/" and "/detail"
                    title = link.content.split("/recipe/")[1].split("/detail")[0]
                    # posixpath.join always joins with "/", so the URL is
                    # built correctly on every platform (os.path.join would
                    # insert backslashes on Windows)
                    rpage = URL(posixpath.join(base, title, end))
                    pdom = DOM(rpage.download(cached=True))
                    # average rating: fourth '"'-separated piece of the
                    # element's source
                    rating = pdom.by_attribute(itemprop="ratingValue")[0].source.split('"')[3]
                    # nutrition values, tab-separated inside a single field
                    nut_list = pdom.by_class("nutrSumWrap")[0].by_class("nutrSumList")
                    nuts = "\t".join(
                        [nut.by_attribute(id="lblNutrientValue")[0].content for nut in nut_list]
                    )
                    mins = _cook_minutes(pdom)
                    ings, full_ings = _ingredient_columns(pdom)
                    w.writerow([title, rating, nuts, mins, ings, full_ings])
            except Exception:
                # Deliberately best-effort, but no longer silent.
                log.warning("get_info: skipping the rest of page %d", page, exc_info=True)


def _span_text(pdom, span_id):
    """Return the content of the first element with id *span_id*, or "0"."""
    try:
        text = pdom.by_attribute(id=span_id)[0].content
        # <em> tags are replaced by spaces so the value splits on whitespace
        return text.replace("<em>", " ").replace("</em>", " ")
    except Exception:
        # element missing or unreadable: count it as zero
        return "0"


def _cook_minutes(pdom):
    """Return the recipe's cook time in minutes, as a string."""
    cook_hours = _span_text(pdom, "cookHoursSpan")
    cook_mins = _span_text(pdom, "cookMinsSpan")
    return str(int(cook_hours.split()[0]) * 60 + int(cook_mins.split()[0]))


def _ingredient_columns(pdom):
    """Return the ``(ingredients, full ingredients)`` columns for a recipe.

    The first value is the ';'-joined ingredient names. The second is the
    ';'-joined ``amount|`` / name pairs, rendered from each pair's tuple
    representation with the quoting, ``u'`` prefixes, commas and parentheses
    removed.
    """
    ing_units = []
    ing_vals = []
    for ing in pdom.by_attribute(itemprop="ingredients"):
        tmp_ing = ing.by_id("lblIngName").content
        # NOTE(review): as written, any ingredient block whose content
        # contains a space is skipped; the literal may originally have been
        # a non-breaking-space marker -- confirm against the upstream code.
        if " " in ing.content:
            continue
        try:
            tmp_amount = ing.by_id("lblIngAmount").content
        except Exception:
            tmp_amount = ""  # LET THIS BE THE EMPTY CHAR we decide on
        ing_units.append(tmp_amount)
        ing_vals.append(tmp_ing)
    ings = ";".join(ing_vals)
    # Both lists grow together above, so zip() pairs every amount with its name.
    str_ings = [
        str(pair).replace(",", " ")
        for pair in zip([unit + "|" for unit in ing_units], ing_vals)
    ]
    full_ings = ";".join(str_ings)
    # NOTE(review): the final replace is a no-op as written; it may originally
    # have collapsed a double space -- confirm against the upstream code.
    full_ings = (
        full_ings.replace("u'", "")
        .replace("'", "")
        .replace(", u", "")
        .replace("(", "")
        .replace(")", "")
        .replace(" ", " ")
    )
    return ings, full_ings
示例3: get_title_attributes
# 需要导入模块: from pattern.web import DOM [as 别名]
# 注意: by_attribute 是 DOM 对象的方法, 无法单独导入; 请在 DOM 实例上调用, 例如 dom.by_attribute(itemprop="director")
def get_title_attributes(title, titleLink):
    """Download a movie page and collect its attributes into a ``Title``.

    Directors, writers, actors, ratings, run time and genres are read from
    the page and added to a new ``Title`` object. Every section is
    best-effort: a section that cannot be read is logged at debug level and
    skipped, and the remaining sections are still processed.

    All text handed to the ``Title`` object is ASCII-encoded with
    ``errors='replace'``. The encoding is applied to the text directly;
    wrapping it in ``str()`` first would raise ``UnicodeEncodeError`` under
    Python 2 for any non-ASCII name, before 'replace' could take effect.

    Relies on names defined outside this function: ``URL``, ``DOM``,
    ``Title``, ``re`` and ``unicodedata``.

    Args:
        title: Movie title; it is ASCII-encoded for the ``Title`` object.
        titleLink: URL of the movie page to download.

    Returns:
        The populated ``Title`` object.
    """
    # Function-scope import keeps this block independent of the file header.
    import logging

    log = logging.getLogger(__name__)

    url = URL(titleLink)
    dom = DOM(url.download(cached=True))
    titleObj = Title(title.encode('ascii', 'replace'))
    # Single-argument print(...) parses both as a Python 2 print statement
    # and as a Python 3 call; the text matches the original two-item prints.
    print("Movie:  %s" % title)

    # Get Directors
    print("-> About to print directors... ")
    director_blocks = dom.by_attribute(itemprop="director")
    # A page without a director element is skipped instead of raising
    # IndexError, consistent with the other best-effort sections below.
    if director_blocks:
        for director in director_blocks[0].by_tag("a"):
            print(director.content)
            dirName = unicodedata.normalize('NFD', director.content).encode('ascii', 'replace')
            print("Director ===>  %s" % dirName)
            titleObj.addDirectors(dirName)

    # Get writers
    print("-> About to print writers... ")
    try:
        for writer_el in dom.by_attribute(itemprop="writer"):
            titleObj.addWriters(writer_el[1][1].content.encode('ascii', 'replace'))
    except Exception:
        log.debug("get_title_attributes: writers unavailable for %s", titleLink, exc_info=True)

    print("--> About to get actors... ")
    try:
        for actor in dom.by_attribute(itemprop="actors"):
            titleObj.addActors(actor[1][1].content.encode('ascii', 'replace'))
    except Exception:
        log.debug("get_title_attributes: actors unavailable for %s", titleLink, exc_info=True)

    print("--> Aboutb to get rating information... ")
    try:
        for rating in dom.by_class("star-box-giga-star"):
            titleObj.addRating(rating.content.encode('ascii', 'replace'))
    except Exception:
        log.debug("get_title_attributes: rating unavailable for %s", titleLink, exc_info=True)

    print("--> About to print other stuff... ")
    for item in dom.by_class("infobar"):
        # run time: first run of digits inside the first <time> tag
        try:
            objMatch = re.search("(\d+)", item.by_tag("time")[0].content)
            if objMatch:
                titleObj.addRunTime(objMatch.group(1).encode('ascii', 'replace'))
        except Exception:
            log.debug("get_title_attributes: run time unavailable for %s", titleLink, exc_info=True)

        # genres: every link whose href contains "genre"
        for genreItem in item.by_tag("a"):
            try:
                if re.search("genre", genreItem.attributes['href']):
                    titleObj.addGenre(genreItem.content.encode('ascii', 'replace'))
            except Exception:
                log.debug("get_title_attributes: genre link skipped for %s", titleLink, exc_info=True)

    return titleObj