This article collects typical usage examples of the DOM class from Python's pattern.web module. If you have been wondering what the DOM class is for or how to use it, the curated class examples here may help.
The sections below present 15 code examples of the DOM class, sorted by popularity by default.
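As a quick orientation, the sketch below shows the workflow most of these examples share: download a page with pattern.web.URL, parse it with DOM, then select elements with by_tag, by_class, or by_id. This is a minimal sketch, not one of the collected examples; the example.com URL is only a placeholder.

from pattern.web import URL, DOM

# Download a page (placeholder URL) and parse it into a DOM tree.
html = URL("http://www.example.com").download(cached=True)
dom = DOM(html)
# Selections return Element objects with .content (inner HTML) and
# .attrs / .attributes (a dictionary of the tag's attributes).
for link in dom.by_tag("a"):
    print link.attrs.get("href", "")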
Example 1: scrape_top_250
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB; note that these URLs must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    # Grab the HTML from the URL and turn it into a DOM.
    html = url.download()
    dom = DOM(html)
    # Every movie URL starts with this root; it is needed for the absolute path.
    root = 'http://www.imdb.com'
    # The URL of each movie sits in a td tag with class titleColumn.
    for movie in dom.by_class("titleColumn"):
        # Build a DOM from the content between the td tags so we can search inside it.
        movieinfo = DOM(movie.content)
        # The relative path of each movie is the 'href' value of the first 'a' tag;
        # concatenate the root and the relative path into an absolute path and
        # append it to movie_urls.
        movie_urls.append(root + movieinfo.by_tag("a")[0].attrs.get("href", ""))

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
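A hedged usage sketch for the function above; the chart URL matches the one hard-coded in Example 5 below:

from pattern.web import URL

top_250_url = URL("http://www.imdb.com/chart/top")
urls = scrape_top_250(top_250_url)
print len(urls)  # ideally 250
print urls[0]    # an absolute http://www.imdb.com/title/... link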
Example 2: getRandomHistoryDOM
def getRandomHistoryDOM(language):
    url = URL("http://" + language + ".wikipedia.org/wiki/Special:Random")
    # Get only the URL of the page this redirects to
    redirectUrl = url.redirect
    try:
        # Grab the name of the wikipedia article from the url
        urlComponents = redirectUrl.split('/')
    except AttributeError:
        # Use some recursion if we encounter a page with no history, or some other error
        return getRandomHistoryDOM(language)
    # Get the history section of the article
    redirectUrl = "http://" + language + ".wikipedia.org/w/index.php?title=" + urlComponents[4] + "&action=history"
    print "Current article is: " + str(urlComponents[4])
    # print redirectUrl
    url = URL(redirectUrl)
    dom = DOM(url.download(cached=False))
    try:
        historyList = dom.by_id("pagehistory").by_tag("li")
        return historyList, urlComponents[4]
    except AttributeError:
        # Use some recursion if we encounter a page with no history, or some other error
        return getRandomHistoryDOM(language)
Example 3: scrape_beer_info_urls
def scrape_beer_info_urls(url):
    '''
    Scrape the top 30 beer discounts from Yenom.com
    '''
    # Download the HTML file
    html = url.download()
    # Parse the HTML file into a DOM representation
    dom = DOM(html)
    table = dom.by_tag("table.hikashop_products_table adminlist table table-striped table-hover")[0]
    i = 0
    info_urls = []
    # Loop through all beer discounts
    for listItem in table.by_tag("tr")[1:]:
        print
        print i
        i += 1
        print
        # Get URL
        links = listItem.by_tag("a")
        # Some of the rows in the table are separators between supermarkets, so they do not have a link
        if len(links) > 0:
            # print links[0].content.encode("utf-8")
            print HOME_URL + links[0].attrs["href"]
            info_urls.append(HOME_URL + links[0].attrs["href"])

    # return the list of URLs for each info page
    return info_urls
"""
Example 4: getReviews
def getReviews(self):
    params = {
        'id': "comments",
        'oid': 0,
        'showAll': 'yes'
    }
    reviews = []
    i = 0
    for rs in self.conn.resturants.find():
        reviews = []
        if not rs.get('reviews'):
            oid = str(rs['url']).split('=')[1]
            params['oid'] = oid
            req = DOM(URL(self.xmlUrl, query=params).download())
            for item in req.by_tag('item'):
                if item.by_tag('description'):
                    content = plaintext(item.by_tag('description')[0].content)
                    reviews.append(self.parseReview(content))
            # print reviews[0:3]
            rs['reviews'] = reviews
            self.conn.resturants.save(rs)
            print 'saved reviews for', rs['name']
        else:
            print 'already have reviews for', rs['name']
Example 5: scrape_top_250
def scrape_top_250(url):
    """
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB; note that these URLs must be absolute (i.e. include the http
        part, the domain part and the path part).
    """
    # The DOM is built from a hard-coded URL here because this version does not
    # use the url argument.
    TOP_250_URL = "http://www.imdb.com/chart/top"
    top_250_url = URL(TOP_250_URL)
    top_250_html = top_250_url.download(cached=True)
    dom = DOM(top_250_html)
    movie_urls = []
    # Search the HTML of the top 250 page for the URLs of the individual movie
    # pages, using CSS selectors, and place them in a list.
    for e in dom.by_tag("td.titleColumn"):
        for a in e.by_tag("a")[:1]:
            main = "http://www.imdb.com"
            locallink = main + a.attrs["href"]
            movie_urls.append(locallink)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example 6: scrape_top_250
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB; note that these URLs must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    top_250_url = URL(url)
    top_250_html = top_250_url.download(cached=True)
    top_250_dom = DOM(top_250_html)
    # Take only the first link in each of the 250 title cells.
    for a in top_250_dom.by_tag("td.titleColumn"):
        for b in a.by_tag("a")[:1]:
            link_ext = b.attrs["href"].encode("utf-8")
            link_base = "http://www.imdb.com"
            link = link_base + link_ext
            movie_urls.append(link)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example 7: scrape_top_250
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB; note that these URLs must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # Grab web page
    movie_html = URL(url).download(cached=True)
    # Extract relevant information for each movie
    movie_dom = DOM(movie_html)
    for a in movie_dom.by_tag("td.titleColumn"):
        for b in a.by_tag("a"):
            # Crude string parsing: the href is the second piece when the
            # serialized tag is split on double quotes.
            href = str(b).split('"')[1]
            movie_urls.append("http://www.imdb.com" + href)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example 8: scrape_top_250
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB; note that these URLs must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    # Create a DOM of the URL.
    html = url.download(cached=True)
    dom = DOM(html)
    for movie_table in dom.by_tag("table.chart full-width"):
        # The first row is a header, so start from index 1.
        for movie_table_row in movie_table.by_tag("tr")[1:251]:
            for movie_table_row_cell in movie_table_row.by_tag("td.titleColumn"):
                for a in movie_table_row_cell.by_tag("a"):
                    # Obtain the path of the movie's page, build an absolute URL,
                    # and append it to the list 'movie_urls'.
                    movie_url_path = a.attrs["href"]
                    absolute_movie_url = "http://www.imdb.com" + movie_url_path
                    movie_urls.append(absolute_movie_url)

    # Return the list of URLs of each movie's page on IMDB.
    return movie_urls
Example 9: scrape_top_250
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB; note that these URLs must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    from pattern.web import abs
    movie_urls = []
    html = url.download(cached=True)
    dom = DOM(html)
    for a in dom.by_tag("tbody.lister-list"):
        for b in a.by_tag("td.titleColumn"):
            for c in b.by_tag("a"):
                link = c.attrs.get("href", "")
                # Resolve the relative href against the page's own URL.
                link = abs(link, base=url.redirect or url.string)
                movie_urls.append(link)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
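Example 9 is the only variant that resolves relative links with pattern.web's abs() helper instead of concatenating strings by hand. A small standalone sketch of that helper (the path is made up for illustration):

from pattern.web import abs

# abs(url, base) joins a relative path onto a base URL.
print abs("/title/tt0111161/", base="http://www.imdb.com/chart/top")
# http://www.imdb.com/title/tt0111161/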
Example 10: scrape_top_250
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB; note that these URLs must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    # Initialize movie_html and movie_dom from the IMDB top 250 page.
    movie_html = URL(url).download(cached=True)
    movie_dom = DOM(movie_html)
    # Search the page for the td.titleColumn cells that contain the links.
    for films in movie_dom.by_tag("td.titleColumn"):
        # Find the link inside td.titleColumn.
        link = films.by_tag('a')[0]
        # Build the absolute path and add it to the list movie_urls.
        link = "http://www.imdb.com" + link.attrs.get("href", "")
        movie_urls.append(link)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example 11: scrape_top_250
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB; note that these URLs must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # Build the DOM of the index page.
    dom = DOM(URL(url).download())
    # Loop over the movies.
    for td in dom.by_tag("td.titleColumn")[:250]:
        for a in td.by_tag("a"):
            # The href is the second piece when the serialized tag is split
            # on double quotes.
            a = str(a)
            a = a.split('"')
            link = "http://www.imdb.com" + a[1]
            movie_urls.append(link)
    print movie_urls

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example 12: process_page
def process_page():
    url = URL("http://www.imdb.com/search/title?num_votes=5000,&sort=user_rating,desc&start=1&title_type=tv_series")
    dom = DOM(url.download(cached=True))
    domIndex = 0
    for title in dom.by_class("title"):
        theTitle = str(title.by_tag("a")[0].content).encode('ascii', 'replace')
        titleCatalog.append(Title(theTitle))
        try:
            # The runtime text starts with its number of minutes; keep just the digits.
            match = re.search("^(\d+).*$", str(dom.by_class("runtime")[domIndex].content).encode('ascii', 'replace'))
            # print match.group(1)
            titleCatalog[domIndex].addRunTime(match.group(1))
        except Exception, e:
            pass
        try:
            titleCatalog[domIndex].addRank(str(dom.by_class("value")[domIndex].content).encode('ascii', 'replace'))
        except Exception, e:
            pass
        # Advance to the next runtime/value cell so each title gets its own data.
        domIndex += 1
Example 13: scrape_top_250
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB; note that these URLs must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    url = URL(url)
    html = url.download()
    dom = DOM(html)
    homeUrl = 'http://www.imdb.com'
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    for e in dom.by_tag("td.titleColumn"):
        absoluteUrl = ''
        for a in e.by_tag("a"):
            link = a.attributes.get("href", "")
            absoluteUrl = homeUrl + link
            movie_urls.append(absoluteUrl)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example 14: scrape_top_250
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB; note that these URLs must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    movie_urls = []
    index_html = URL(url).download(cached=True)
    index_dom = DOM(index_html)
    # Walk the title cells on the index page.
    for i in index_dom.by_tag("td.titleColumn")[:250]:
        # Take the first link in each cell and build the absolute URL.
        for j in i.by_tag("a")[:1]:
            url = j.attributes["href"]
            movie_urls.append("http://www.imdb.com" + url)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example 15: make_json
def make_json(url):
    json_dict = {}
    # Give the data a title
    json_dict["data"] = "percentage renewable energy"
    # Grab the DOM of the table with all the countries
    html = url.download()
    dom = DOM(DOM(html).by_class("wikitable")[1].content)
    # Make a list with one row of info per country
    countrylist = dom.by_tag("tr")[1:]
    # Empty list to append the data points to
    pointslist = []
    for countryinfo in countrylist:
        # Empty list to append the country and its percentage of renewable energy to
        infopair = []
        # Take the country's name and append it to infopair
        infopair.append(DOM(countryinfo.content).by_tag("a")[0].attrs.get("title", "").encode("utf-8"))
        # Take the country's percentage of renewable energy and append it to infopair
        infopair.append(DOM(countryinfo.content).by_tag("td")[8].content.encode("utf-8"))
        # Append the pair to pointslist, building a nested list
        pointslist.append(infopair)
    # Give the dictionary the key 'points' with the nested list pointslist as its value
    json_dict["points"] = pointslist
    # Dump the dictionary as JSON to the text file json.txt
    json.dump(json_dict, open("json.txt", "wb"))
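A hedged usage sketch for make_json(): the function only works on a page whose second table with class "wikitable" lists countries with a percentage in the ninth column, so the Wikipedia article below is an illustrative guess, not a confirmed target. The function body also assumes json, URL, and DOM are already imported.

import json
from pattern.web import URL, DOM

# Assumed target page; any page matching the selectors above would do.
make_json(URL("http://en.wikipedia.org/wiki/List_of_countries_by_renewable_electricity_production"))
# json.txt now holds {"data": ..., "points": [[country, percentage], ...]}
print open("json.txt").read()[:200]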