Python DOM.by_tag方法代码示例

本文整理汇总了Python中pattern.web.DOM.by_tag方法的典型用法代码示例。如果您正苦于以下问题：Python DOM.by_tag方法的具体用法？Python DOM.by_tag怎么用？Python DOM.by_tag使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pattern.web.DOM的用法示例。

在下文中一共展示了DOM.by_tag方法的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: extract_tvseries

# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_tag [as 别名]
def extract_tvseries(dom):
    '''
    Extract a list of highest ranking TV series from DOM (of IMDB page).

    Each TV series entry contains the following fields, in this order:
    - TV Title
    - Ranking
    - Genres (comma separated if more than one)
    - Actors/actresses (comma separated if more than one)
    - Runtime (only a number!)

    Args:
        dom: parsed DOM of the IMDB page. When None, the page at TARGET_URL
            is downloaded and parsed instead.

    Returns:
        A list with NUMBER_OF_SERIES entries, each of the form
        [title, rank, genre, credit, time].

    Raises:
        IndexError: if the page holds fewer than NUMBER_OF_SERIES matches
            for any of the selectors used below.
    '''
    # Work on the DOM handed in by the caller; download only as a fallback.
    if dom is None:
        dom = DOM(URL(TARGET_URL).download())

    # Run every document-wide selector once instead of once per series.
    rows = dom.by_tag("tr.detailed")
    runtime_spans = dom.by_tag("span.runtime")
    genre_spans = dom.by_tag("span.genre")
    credit_spans = dom.by_tag("span.credit")

    series_list = []

    # collect all info of the series, one series at a time
    for index in range(NUMBER_OF_SERIES):
        row = rows[index]

        rank = row.by_tag("span.value")[0].content

        # The last five characters of the runtime text are cut off
        # (presumably the unit suffix -- verify against the page markup).
        time = plaintext(runtime_spans[index].content)[:-5]

        # Comma-separated names, with non-ASCII characters dropped.
        genre = ", ".join(a.content for a in genre_spans[index].by_tag("a"))
        genre = genre.encode('ascii', 'ignore').decode('ascii')

        credit = ", ".join(a.content for a in credit_spans[index].by_tag("a"))
        credit = credit.encode('ascii', 'ignore').decode('ascii')

        # The title is read from the second anchor of the row.
        title = row.by_tag("a")[1].content

        series_list.append([title, rank, genre, credit, time])

    return series_list

开发者ID:PatrickJonk，项目名称:DataProcessing，代码行数:51，代码来源:tvscraper.py

示例2: scrape_top_250

# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_tag [as 别名]
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).

    Raises:
        IndexError: if an element with class "titleColumn" contains no
            anchor tag.
    '''
    # Every movie link on the page is relative; this root makes it absolute.
    root = 'http://www.imdb.com'

    # Download the page and parse it into a DOM.
    dom = DOM(url.download())

    movie_urls = []
    # Each movie's link sits in an element with class "titleColumn".
    for movie in dom.by_class("titleColumn"):
        # Search the element directly (no need to re-parse its HTML); the
        # relative path is the 'href' of its first anchor.
        first_anchor = movie.by_tag("a")[0]
        movie_urls.append(root + first_anchor.attrs.get("href", ""))

    # return the list of URLs of each movie's page on IMDB
    return movie_urls

开发者ID:Lesliedao，项目名称:DataProcessing，代码行数:35，代码来源:imdb-crawler.py

示例3: getReviews

# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_tag [as 别名]
	def getReviews(self):
		"""Fetch and store reviews for every restaurant that has none yet.

		Iterates over the documents returned by ``self.conn.resturants.find()``.
		For each document without a truthy ``reviews`` entry, the ``oid`` query
		parameter is taken from the document's ``url`` field, the feed at
		``self.xmlUrl`` is downloaded, the first ``description`` of every
		``item`` is converted to plain text and passed through
		``self.parseReview``, and the resulting list is saved on the document.
		A status line is printed for every document.

		Raises:
			IndexError: if a document's ``url`` contains no '=' character.
		"""
		params = {
			'id' : "comments",
			'oid' : 0,
			'showAll' : 'yes'
		}

		for rs in self.conn.resturants.find():
			if rs.get('reviews'):
				print('already have reviews for %s' % (rs['name'],))
				continue

			# The oid is the text between the first and second '=' of the url.
			params['oid'] = str(rs['url']).split('=')[1]
			feed = DOM(URL(self.xmlUrl, query=params).download())

			reviews = []
			for item in feed.by_tag('item'):
				# Query once and reuse the result; items without a
				# description are skipped.
				descriptions = item.by_tag('description')
				if descriptions:
					content = plaintext(descriptions[0].content)
					reviews.append(self.parseReview(content))

			rs['reviews'] = reviews
			self.conn.resturants.save(rs)
			print('saved reviews for %s' % (rs['name'],))

开发者ID:debovis，项目名称:python-analysis，代码行数:28，代码来源:scrape.py

示例4: scrape_top_250

# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_tag [as 别名]
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    The resulting list is also printed to standard output.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page.
            When None, the module-level TOP_250_URL is used instead.

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    # Honour the caller's URL; fall back to the module-level default.
    source = URL(url) if url is not None else URL(TOP_250_URL)
    dom = DOM(source.download())

    movie_urls = []
    # Only the first 250 title cells are considered.
    for td in dom.by_tag("td.titleColumn")[:250]:
        for anchor in td.by_tag("a"):
            # Read the attribute itself rather than slicing the serialised
            # tag, so the order of attributes in the markup does not matter.
            href = anchor.attrs.get("href")
            if href:
                movie_urls.append("http://www.imdb.com" + href)
    print(movie_urls)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls

开发者ID:casbootuva，项目名称:DataProcessing，代码行数:37，代码来源:imdb-crawler.py

示例5: scrape_beer_info_urls

# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_tag [as 别名]
def scrape_beer_info_urls(url):
    '''
    Scrape the top 30 beer discounts from Yenom.com

    Progress is printed to standard output: for every table row a blank
    line, the row counter and another blank line, followed by the info URL
    when the row has one.

    Args:
        url: object whose download() method returns the HTML of the
            discount overview page.

    Returns:
        A list of strings, HOME_URL joined with the href of the first link
        of each table row that contains a link.

    Raises:
        IndexError: if the page contains no table matching the selector.
    '''
    # Download the HTML file and parse it into a DOM representation
    dom = DOM(url.download())
    table = dom.by_tag("table.hikashop_products_table adminlist table table-striped table-hover")[0]

    info_urls = []
    # Loop through all beer discounts; the first table row is skipped.
    for index, list_item in enumerate(table.by_tag("tr")[1:]):
        print('')
        print(index)
        print('')
        links = list_item.by_tag("a")
        # Some of the rows in the table are separators between supermarkets
        # so they do not have a link
        if links:
            info_url = HOME_URL + links[0].attrs["href"]
            print(info_url)
            info_urls.append(info_url)

    # return the list of URLs for each info page
    return info_urls

    """

开发者ID:A-meerdervan，项目名称:DataProcessingGit，代码行数:32，代码来源:yenom-beer-crawler.py

示例6: scrape_top_250

# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_tag [as 别名]
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).

    Raises:
        KeyError: if an anchor inside a title cell has no href attribute.
    '''
    link_base = "http://www.imdb.com"

    top_250_url = URL(url)
    top_250_html = top_250_url.download(cached=True)
    top_250_dom = DOM(top_250_html)

    movie_urls = []
    # Visit every title cell on the page, not just the first one.
    for cell in top_250_dom.by_tag("td.titleColumn"):
        for anchor in cell.by_tag("a"):
            # Keep the href as text so it can be joined to the base.
            movie_urls.append(link_base + anchor.attrs["href"])

    # return the list of URLs of each movie's page on IMDB
    return movie_urls

开发者ID:zkkeser，项目名称:data_processing，代码行数:32，代码来源:imdb-crawler.py

示例7: scrape_starrtest

# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_tag [as 别名]
def scrape_starrtest(county_num):
	"""Scrape one county's STAR 2012 report and write it as a row.

	The county number is printed, the report page for that county is
	downloaded (cached), and one row is written to the module-level
	``writer``: the content of the first ``h2`` element followed by count
	and percentage pairs taken from the first four elements of class "rm"
	and "rs".

	Args:
		county_num: county number; values below 10 are left-padded with
			a zero to form the two-character code used in the query string.

	Raises:
		IndexError: if the page has fewer than four "rm" or "rs" elements
			or no ``h2`` element.
	"""
	if county_num<10:
		county_num = '0' + str(county_num)
	else:
		county_num = str(county_num)

	print(county_num)
	url = 'http://star.cde.ca.gov/star2012/ViewReport.aspx?ps=true&lstTestYear=2012&lstTestType=X&lstCounty=' + county_num + '&lstDistrict=&lstSchool=&lstGroup=1&lstSubGroup=1'
	abs_url = URL(string = url)
	dom = DOM(abs_url.download(cached=True))

	# Query each class once and index into the results; only the first
	# four entries of each list are used.
	counts = dom.by_class("rm")
	percents = dom.by_class("rs")

	scicst_num = counts[3].content
	math_num = counts[2].content
	hist_num = counts[1].content
	ela_num = counts[0].content

	# Only the first five characters of each percentage cell are kept.
	scicst_percent = percents[3].content[:5]
	math_percent = percents[2].content[:5]
	hist_percent = percents[1].content[:5]
	ela_percent = percents[0].content[:5]

	county = dom.by_tag("h2")[0].content

	# write all the collected data to a new row of the output file
	writer.writerow([county, ela_num,ela_percent, hist_num, hist_percent, math_num, math_percent,scicst_num, scicst_percent])

开发者ID:lisayao，项目名称:Education-in-California，代码行数:32，代码来源:Education_Scraper.py

示例8: scrape_top_250

# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_tag [as 别名]
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).

    Raises:
        IndexError: if a title cell contains no anchor tag.
    '''
    # Imported under an alias so the builtin abs() is not shadowed.
    from pattern.web import abs as absolute_url

    dom = DOM(url.download())

    # Relative hrefs are resolved against the IMDB site root (or the
    # address it redirects to).
    site = URL("http://imdb.com")
    base = site.redirect or site.string

    movie_urls = []
    for cell in dom.by_tag("td.titleColumn"):
        href = cell.by_tag("a")[0].attrs.get("href", "")
        # Collect the absolute URL so the function actually returns it.
        movie_urls.append(absolute_url(href, base=base))

    # return the list of URLs of each movie's page on IMDB
    return movie_urls

开发者ID:jordi1992，项目名称:DataProcessing，代码行数:30，代码来源:imdb-crawler.py

示例9: scrape_top_250

# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_tag [as 别名]
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    # Grab web page
    movie_html = URL(url).download(cached=True)

    # Extract relevant information for each movie
    movie_dom = DOM(movie_html)

    movie_urls = []
    for cell in movie_dom.by_tag("td.titleColumn"):
        for anchor in cell.by_tag("a"):
            # Read the attribute itself rather than slicing the serialised
            # tag, so the order of attributes in the markup does not matter.
            href = anchor.attrs.get("href")
            if href:
                movie_urls.append("http://www.imdb.com" + href)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls

开发者ID:MounirHader，项目名称:Dataprocessing，代码行数:32，代码来源:imdb-crawler.py

示例10: scrape_top_250

# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_tag [as 别名]
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).

    Raises:
        KeyError: if an anchor inside a title cell has no href attribute.
    '''
    movie_urls = []

    # Create a DOM of the URL.
    html = url.download(cached=True)
    dom = DOM(html)

    for movie_table in dom.by_tag("table.chart full-width"):
        # The first row is redundant, so start from index 1.
        for movie_table_row in movie_table.by_tag("tr")[1:251]:
            for movie_table_row_cell in movie_table_row.by_tag("td.titleColumn"):
                for a in movie_table_row_cell.by_tag("a"):
                    # Strip the path's leading slash so joining it to the
                    # root does not produce a double slash.
                    movie_url_path = a.attrs["href"].lstrip("/")
                    movie_urls.append("http://www.imdb.com/" + movie_url_path)

    # Return the list of URLs of each movie's page on IMDB.
    return movie_urls

开发者ID:vincent-erich，项目名称:DataProcessing，代码行数:35，代码来源:imdb-crawler.py

示例11: scrape_top_250

# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_tag [as 别名]
def scrape_top_250(url):
    '''
    Collect the movie page URLs listed on the IMDB top 250 index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of absolute URLs, one per "td.titleColumn" cell, each built
        from the IMDB root and the href of the first anchor in that cell
        (an empty string when the anchor has no href).

    Raises:
        IndexError: if a title cell contains no anchor tag.
    '''
    # Download the index page (cached) and parse it.
    page_dom = DOM(URL(url).download(cached=True))

    # The link to each movie is the first anchor of its title cell; prefix
    # the site root to turn the relative href into an absolute URL.
    movie_urls = [
        "http://www.imdb.com" + title_cell.by_tag('a')[0].attrs.get("href","")
        for title_cell in page_dom.by_tag("td.titleColumn")
    ]

    return movie_urls

开发者ID:rubenpostma，项目名称:data-processing2，代码行数:32，代码来源:imdbmoviescraper.py

示例12: obtain_data

# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_tag [as 别名]
def obtain_data(url):
	'''
	Collect country/population pairs from the Wikipedia page.

	Args:
		url: pattern.web.URL instance pointing to the Wikipedia page

	Returns:
		A list of two-element lists, one per data row of every
		"wikitable sortable" table: the country name and the population
		size, both as strings.

	Raises:
		IndexError: if a data row has fewer than three td cells.
	'''
	# Download (cached) and parse the page.
	page = DOM(url.download(cached=True))

	data_points = []

	for table in page.by_tag("table.wikitable sortable"):
		# Index 0 is the header row, so iteration starts at index 1.
		for row in table.by_tag("tr")[1:]:
			# Plain-text content of every cell in this row.
			cells = [unicode(plaintext(cell.content)) for cell in row.by_tag("td")]
			# Country: cell 1, cut at the first "[" and then at the first " (".
			country = cells[1].partition("[")[0].partition(" (")[0]
			# Population: cell 2 with all commas removed.
			population = cells[2].replace(",", "")
			data_points.append([country, population])

	return data_points

开发者ID:vincent-erich，项目名称:DataProcessing，代码行数:35，代码来源:wikipedia_crawler.py

示例13: fbMessageDump

# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_tag [as 别名]
class fbMessageDump(MessageDump):
	"""MessageDump that builds threads from an HTML dump.

	The dump is expected to contain ``div.thread`` elements holding
	``span.profile fn`` participant names and ``div.message`` entries.
	"""

	def __init__(self, dump, p1, p2 = None, **kwargs):
		# p2 is accepted but not forwarded to the base class.
		super(fbMessageDump, self).__init__(dump, p1, **kwargs)

	def construct_dump(self):
		"""Replace the path stored in self.dump with the parsed DOM of that file."""
		# The with-block closes the file even if reading or parsing fails.
		with open(self.dump, "r") as f:
			self.dump = DOM(f.read())

	def construct_threads(self):
		"""Build a Thread per ``div.thread`` and merge it into self.threads.

		A thread whose other participant (p2) matches an existing thread is
		combined into every such thread; otherwise it is appended.
		"""
		for thread_div in self.dump.by_tag("div.thread"):
			cur_thread = msg_classes.Thread()
			cur_thread.p1 = self.p1

			# Query the participant names once per thread. The other
			# participant is whichever of the first two names is not p1.
			names = thread_div.by_tag("span.profile fn")
			first_name = plaintext(names[0].content)
			if first_name == self.p1:
				cur_thread.p2 = plaintext(names[1].content)
			else:
				cur_thread.p2 = first_name
			# TODO if p1 and p2 have the same name, error!
			# assert cur_thread.p1 != cur_thread.p2
			for message in thread_div.by_tag("div.message"):
				cur_thread.add_message(
						plaintext(message.by_tag("div.from")[0].content).encode("utf-8"),
						message.by_tag("abbr.time published")[0].attributes['title'].encode("utf-8"),
						plaintext(message.by_tag("div.msgbody")[0].content).encode("utf-8")
						)
			cur_thread.construct_conversations()

			thread_exists = False
			for t in self.threads:
				if t.p2 == cur_thread.p2:
					thread_exists = True
					t.combine(cur_thread)

			if not thread_exists:
				self.threads.append(cur_thread)

开发者ID:alex-mcleod，项目名称:py_msg，代码行数:36，代码来源:read.py

示例14: scrape_top_250

# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_tag [as 别名]
def scrape_top_250(url):
    """
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page.
            When None, "http://www.imdb.com/chart/top" is used.

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).

    Raises:
        KeyError: if the first anchor of a title cell has no href attribute.
    """
    # Honour the caller's URL; fall back to the public chart page.
    if url is None:
        url = "http://www.imdb.com/chart/top"
    top_250_html = URL(url).download(cached=True)
    dom = DOM(top_250_html)

    main = "http://www.imdb.com"
    movie_urls = []

    # Each title cell's first anchor links to the movie page; at most one
    # anchor per cell is used.
    for cell in dom.by_tag("td.titleColumn"):
        for a in cell.by_tag("a")[:1]:
            movie_urls.append(main + a.attrs["href"])

    # return the list of URLs of each movie's page on IMDB
    return movie_urls

开发者ID:thomasjurriaan，项目名称:datapro，代码行数:34，代码来源:crawling+-+Dit+is+de+oude+code+die+dus+niet+werkt.py

示例15: extract_tvseries

# 需要导入模块: from pattern.web import DOM [as 别名]
# 或者: from pattern.web.DOM import by_tag [as 别名]
def extract_tvseries(dom):
    """
    Extract one row of data per TV series from the DOM of the series page.

    Args:
        dom: parsed DOM of the page. When None, the page at TARGET_URL is
            downloaded (cached) and parsed instead.

    Returns:
        A list of rows, one per "td.title" element, each of the form
        [title, ranking, genres, actors, runtime]. genres and actors are
        lists. All text is ASCII-encoded with other characters dropped.
        runtime is the string "Unknown" when the series has no
        "span.runtime" element.

    Raises:
        IndexError: if a series lacks an anchor, a "span.value", a
            "span.genre" or a "span.credit" element.
    """
    # Work on the DOM handed in by the caller; download only as a fallback.
    if dom is None:
        dom = DOM(URL(TARGET_URL).download(cached=True))

    def ascii_text(element):
        # Content of the element with every non-ASCII character dropped.
        return element.content.encode('ascii', 'ignore')

    csv_row = []
    for series in dom.by_tag('td.title'):
        title = ascii_text(series.by_tag('a')[0])
        ranking = ascii_text(series.by_tag('span.value')[0])
        genres = [ascii_text(g) for g in series.by_tag('span.genre')[0].by_tag('a')]
        actors = [ascii_text(a) for a in series.by_tag('span.credit')[0].by_tag('a')]

        # Only a missing runtime falls back to "Unknown"; any other error
        # is allowed to surface.
        runtime_spans = series.by_tag('span.runtime')
        runtime = ascii_text(runtime_spans[0]) if runtime_spans else "Unknown"

        csv_row.append([title, ranking, genres, actors, runtime])

    return csv_row

开发者ID:jordi1992，项目名称:DataProcessing，代码行数:32，代码来源:tvscraper.py

注：本文中的pattern.web.DOM.by_tag方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。