

Python DOM.by_id Method Code Examples

This article collects typical code examples of the Python method pattern.web.DOM.by_id. If you have been struggling with questions such as: What exactly does DOM.by_id do? How do I call DOM.by_id? Where can I find examples of DOM.by_id in use? then the hand-picked examples below may help. You can also explore further usage examples of pattern.web.DOM, the class this method belongs to.


The following presents 15 code examples of the DOM.by_id method, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
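
Before the examples, a minimal self-contained sketch of the basic pattern may help: DOM parses downloaded HTML into a tree, and by_id returns the first element whose id attribute matches, or None when nothing matches (the URL and the element id below are placeholders for illustration):

from pattern.web import URL, DOM

# download a page and parse it into a DOM tree
html = URL("http://www.example.com/").download(cached=True)
dom = DOM(html)

# by_id returns the first element with a matching id attribute, or None
node = dom.by_id("content")
if node is not None:
    print node.content  # inner HTML of the matched element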

Example 1: getRandomHistoryDOM

# Required import: from pattern.web import DOM [as alias]
# Or: from pattern.web.DOM import by_id [as alias]
def getRandomHistoryDOM(language):
    url = URL("http://"+language+".wikipedia.org/wiki/Special:Random")
    #Get only the URL of the page this redirects to
    redirectUrl = url.redirect
    try:
        #Grab the name of the wikipedia article from the url
        urlComponents = redirectUrl.split('/')
    except AttributeError:
        #Use some recursion if we encounter a page with no history, or some other error
        return getRandomHistoryDOM(language)

    #Get the history section of the article
    redirectUrl = "http://"+language+".wikipedia.org/w/index.php?title="+urlComponents[4]+"&action=history"
    print "Current article is: " +str(urlComponents[4])
    #print redirectUrl
    url = URL(redirectUrl)
    dom = DOM(url.download(cached=False))
    try:
        historyList = dom.by_id("pagehistory").by_tag("li")
        return historyList, urlComponents[4]
    except AttributeError:
        #Use some recursion if we encounter a page with no history, or some other error
        return getRandomHistoryDOM(language)
Developer: eucalyptustree, Project: wikiMap, Lines: 27, Source file: wikiScraper1.2.py
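
A quick usage sketch for this helper (a hypothetical call, not taken from the project): on success it returns the history-page <li> nodes together with the article name:

historyList, articleName = getRandomHistoryDOM("en")
print "Found " + str(len(historyList)) + " revisions for " + articleName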

Example 2: scrape_education

# Required import: from pattern.web import DOM [as alias]
# Or: from pattern.web.DOM import by_id [as alias]
def scrape_education(county_num):
	county_num = str(county_num).zfill(2)  # zero-pad to two digits
	
	print county_num
	#url = 'http://dq.cde.ca.gov/dataquest/Staff/StaffEduLvl.aspx?cYear=2011-12&cChoice=CoEduc&TheCounty=01,ALAMEDA&cType=T&cGender=&Submit=1'
	url = 'http://dq.cde.ca.gov/dataquest/Staff/StaffEduLvl.aspx?cYear=2011-12&cChoice=CoEduc&TheCounty=' + county_num + '01,ALAMEDA&cType=T&cGender=&Submit=1'
	abs_url = URL(string=url)
	dom = DOM(abs_url.download(cached=True))  # download the DOM

	# cache the totals table once instead of re-querying the DOM for every cell
	totals = dom.by_id("ctl00_ContentPlaceHolder1_gdTotal")
	cells = totals.by_tag("td")

	other = cells[12].content.replace(',', '')
	associates = cells[11].content.replace(',', '')
	bachelors = str(int(cells[9].content.replace(',', '')) + int(cells[10].content.replace(',', '')))

	masters = str(int(cells[4].content.replace(',', '')) + int(cells[5].content.replace(',', '')))
	jurisdoctor = cells[3].content.replace(',', '')
	doctorate = cells[2].content.replace(',', '')

	bachelors_and_less = str(int(bachelors) + int(associates) + int(other))

	post_grad = str(int(masters) + int(jurisdoctor) + int(doctorate))

	county = totals.by_tag("a")[0].content

	# write all the collected data to a new row of the output file
	writer.writerow([county, bachelors_and_less, post_grad, associates, bachelors, masters, jurisdoctor, doctorate])
Developer: lisayao, Project: Education-in-California, Lines: 32, Source file: Education_Scraper.py

Example 3: getTitle

# Required import: from pattern.web import DOM [as alias]
# Or: from pattern.web.DOM import by_id [as alias]
 def getTitle(self, link):
     html = URL(link).download()
     body = DOM(html).body
     node = body.by_id("main-article-info")
     if node:
         title = node.children[1].content.strip()
     else:
         title = ''
     return title
Developer: Carlosmr, Project: WhooshSearcher, Lines: 11, Source file: spiders.py

Example 4: getVisByCountry

# Required import: from pattern.web import DOM [as alias]
# Or: from pattern.web.DOM import by_id [as alias]
def getVisByCountry(site):
    countries = {}
    url = URL(base + site)

    aDom = DOM(url.download(cached=True))
    vis = aDom.by_id("visitors-by-country")
    if vis is not None:
        for r in vis.by_class("tr1"):
            link = r.by_tag("a")[0]
            # skip the "show more countries" toggle row
            if link.attributes.get("id") == "toggleMoreCountryVisits":
                continue
            country = link.content.split("&nbsp; ")[1].strip()
            pct = float(r.by_tag("p")[1].content[0:-1])
            countries[country] = pct
    sites[site] = countries
Developer: acheson, Project: cs171pj2, Lines: 21, Source file: alexa.py

Example 5: htmlParser

# Required import: from pattern.web import DOM [as alias]
# Or: from pattern.web.DOM import by_id [as alias]
 def htmlParser(self, link):
     html = URL(link).download()
     body = DOM(html).body
     content = body.by_id("content")
     if content:
         plaincontent = plaintext(content.content, linebreaks=2, indentation=True)
         pretty = unicode(plaincontent.strip())
     else:
         pretty = ''
     return pretty
Developer: Carlosmr, Project: WhooshSearcher, Lines: 12, Source file: spiders.py

Example 6: load

# Required import: from pattern.web import DOM [as alias]
# Or: from pattern.web.DOM import by_id [as alias]
import re

def load(year, pagenum, pagerank):
	strnum = str(year)
	url = URL("http://www.imdb.com/search/title?at=0&sort=moviemeter,asc&start="
			   +str(pagenum)+"&title_type=feature&year="+strnum+","+strnum)
	dom = DOM(url.download(timeout=30, cached=True))
	htmlsource = dom.by_id("main").by_class("results")[0].by_class("title")[pagerank].by_tag("a")[0].source
	urlpiece = re.search(r'/title/+[t0-9]+/', htmlsource)
	finalurl = "http://www.imdb.com" + urlpiece.group(0)
	url2 = URL(finalurl)
	return url2
Developer: kennyzlei, Project: movietrends-cs171, Lines: 12, Source file: datascrape.py
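
A usage sketch, mirroring how Example 11 below drives this helper (the year, page offset, and rank are chosen arbitrarily):

url2 = load(2005, 1, 2)  # 3rd-ranked 2005 title on the first results page
dom2 = DOM(url2.download(timeout=30, cached=True))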

Example 7: getTextAboutResturants

# Required import: from pattern.web import DOM [as alias]
# Or: from pattern.web.DOM import by_id [as alias]
	def getTextAboutResturants(self):
		# get text about each restaurant
		i = 0
		for rs in self.conn.resturants.find():
			if not rs.get('information'):
				information = {}
				request = DOM(URL(rs['url']).download())
				# Tags
				if request.by_id('LocationMetaData'):
					source = str(request.by_id('LocationMetaData').source.encode('cp1252', 'ignore'))
					tags = Element(source[source.find('<b>Tags: </b>'):]).by_tag('a')
					if tags:
						information['parsedTags'] = [ (tag.attributes['href'], tag.content) for tag in tags]
				# Review 
				if request.by_id('LocationDescription'):
					information["review"] = plaintext(request.by_id('LocationDescription').content)
				# Details
				if request.by_id('LocationRestaurantDetails'):
					information["details"] = request.by_id('LocationRestaurantDetails').by_tag('p')[0].content
				# store under the key the guard above checks so already-scraped
				# restaurants are skipped on later runs
				rs['information'] = information
				print information
				self.conn.resturants.save(rs)
			else:
				print i, rs['name']

			i += 1
Developer: debovis, Project: python-analysis, Lines: 28, Source file: scrape.py

Example 8: getResturants

# Required import: from pattern.web import DOM [as alias]
# Or: from pattern.web.DOM import by_id [as alias]
	def getResturants(self):
		# get all resturants and urls
		for page in range(1,48):
			params["page"] = page
			request = DOM(URL(url, query=params).download())
			searchResults = request.by_id('searchResults')
			pageResults = searchResults.by_class('locationListing clearfix')
			for item in pageResults:
				link = item.by_tag('h4')[0].by_tag('a')[-1]
				name = plaintext(link.content)
				address = link.attributes['href']
				resturant = { 'name' : name, 'url' : address}
				conn['resturants'].insert(resturant)
Developer: debovis, Project: python-analysis, Lines: 15, Source file: scrape.py

Example 9: get_search_string

# Required import: from pattern.web import DOM [as alias]
# Or: from pattern.web.DOM import by_id [as alias]
def get_search_string(search, proxy):
    if search == "Schindler's List":
        search = "Schindler"
    if search == "One Flew Over the Cuckoo's Nest":
        search = "one flew over"
    if search == "It's a Wonderful Life":
        search = "wonderful life"
    if search == u"L\xe9on: The Professional":
        search = "the professional"
    if search == "Terminator 2: Judgment Day":
        search = "Terminator 2"
    if search == u"Am\xe9lie":
        search = "Amelie"
    if search == "L.A. Confidential":
        search = "Confidential"
    if search == "Pan's Labyrinth":
        search = "pan"
    if search == "A Few Dollars More":
        search = "dollars"
    if search == "The Secret in Their Eyes":
        search = "El secreto de sus ojos"
    if search == "The King's Speech":
        search = "the king"
    if search == "Howl's Moving Castle":
        search = "howl"
    if search == "Harry Potter and the Deathly Hallows: Part 2":
        search = "harry potter"
    if search == "Who's Afraid of Virginia Woolf?":
        search = "virginia woolf"
    if search == "Rosemary's Baby":
        search = "rosemary"
    url = URL("http://1channel.ch")
    dom = DOM(url.download(cached=False, timeout=20, proxy=proxy))
    a = dom.by_id("searchform")
    s_base = a.attributes.get("action")
    s_text = "_keywords=" + search.replace(" ","+")
    key = a.by_attribute(name="key")[0].attributes.get("value")
    s_section = a.by_attribute(name="search_section")[0].attributes.get("value")
    search_string = s_base + s_text + "&key=" + key + "&search_section=" + s_section
    return search_string
Developer: acheson, Project: cs171pj2, Lines: 42, Source file: ch1.py
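
A hedged usage sketch (the title is arbitrary; passing None for proxy simply downloads without a proxy):

search_url = get_search_string("The Matrix", None)
print search_url  # a 1channel.ch search URL built from the form's attributes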

Example 10: scrape_truancy

# Required import: from pattern.web import DOM [as alias]
# Or: from pattern.web.DOM import by_id [as alias]
def scrape_truancy(county_num):
	county_num = str(county_num).zfill(2)  # zero-pad to two digits
	
	print county_num
	#url = 'http://dq.cde.ca.gov/dataquest/SuspExp/suspexplrate.aspx?cYear=2011-12&cType=ALL&cCDS=01000000000000&cName=ALAMEDA&cLevel=County&cChoice=cSusExpRt'
	url = 'http://dq.cde.ca.gov/dataquest/SuspExp/suspexplrate.aspx?cYear=2011-12&cType=ALL&cCDS=' + county_num + '000000000000&cName=ALAMEDA&cLevel=County&cChoice=cSusExpRt'
	abs_url = URL(string=url)
	dom = DOM(abs_url.download(cached=True))  # download the DOM

	# cache the table rows once instead of re-querying the DOM for every cell
	rows = dom.by_id("ContentPlaceHolder1_gdTotal").by_tag("tr")
	county_cells = rows[1].by_tag("td")

	county = rows[1].by_tag("a")[0].content
	total_enrollment = county_cells[3].content
	suspensions = county_cells[4].content
	suspension_rate = county_cells[5].content
	expulsions = county_cells[6].content
	expulsion_rate = county_cells[7].content
	truants = county_cells[8].content
	trauncy_rate = county_cells[9].content

	#For the first county only, also grab the statewide totals
	if county_num == '01':
		state_cells = rows[2].by_tag("td")
		state_total_enrollment = state_cells[3].content
		state_suspensions = state_cells[4].content
		state_suspension_rate = state_cells[5].content
		state_expulsions = state_cells[6].content
		state_expulsion_rate = state_cells[7].content
		state_truants = state_cells[8].content
		state_trauncy_rate = state_cells[9].content

		# write the statewide total data to the top row of the output file
		writer.writerow(["California Total", state_total_enrollment, state_suspensions, state_suspension_rate, state_expulsions, state_expulsion_rate, state_truants, state_trauncy_rate])

	# write all the collected data to a new row of the output file
	writer.writerow([county, total_enrollment, suspensions, suspension_rate, expulsions, expulsion_rate, truants, trauncy_rate])
Developer: lisayao, Project: Education-in-California, Lines: 42, Source file: Education_Scraper.py

Example 11: dfun

# Required import: from pattern.web import DOM [as alias]
# Or: from pattern.web.DOM import by_id [as alias]
		return
	dfunv = dfun(dom2)
	if dfunv == "":
		return
	writer.writerow([tfun(data), runfun(data), gfun(data), dfun(data),
				 wfun(data), afun(data), msfun(data), rtfun(data),
				 rtnmfun(data), bfun(text), bousfun(text), 
				 bowfun(text), mpaafun(data), dfun(dom2)])

	
#this handles tags

year = 2000
while year < 2011:
	pagenum = 1
	while pagenum < 101:
		pagerank = 0
		while pagerank < 50:
			url2 = load(year, pagenum, pagerank)
			dom2 = DOM(url2.download(timeout=30, cached=True))
			data = dom2.by_id("overview-top")
			text = loadbus(url2)
			print dfun(data)
			entrytest(data, text, dom2)
			pagerank += 1
		pagenum += 50
	year += 1


output.close()
Developer: kennyzlei, Project: movietrends-cs171, Lines: 32, Source file: datascrape.py

Example 12: loadbus

# Required import: from pattern.web import DOM [as alias]
# Or: from pattern.web.DOM import by_id [as alias]
def loadbus(url):
	url = URL(str(url)+"business?ref_=tt_dt_bus")
	dom = DOM(url.download(timeout=30, cached=True))
	return (dom.by_id("tn15content").content).encode('ascii', 'ignore')
Developer: kennyzlei, Project: movietrends-cs171, Lines: 6, Source file: datascrape.py

Example 13:

# Required import: from pattern.web import DOM [as alias]
# Or: from pattern.web.DOM import by_id [as alias]
#build the final list of column labels, called roster_labels_container
roster_labels_container=[]
for label in roster_labels:
    roster_labels_container.append(label.content.encode("utf8"))

#add in a column for the team acronym to act as a key
roster_labels_container.insert(0,"team_id")
roster_labels_container.insert(1,"Season")

#the roster_container holds all of the players for the specified team/year
roster_container = []
roster_container.append(roster_labels_container)
print roster_labels_container

# look up the roster table by its id
roster_table = dom.by_id("roster")
#roster_trs holds the list of player rows
roster_trs = roster_table.by_tag("tr")

#iterate through each player in the roster
for trs in roster_trs:

    #this will hold the final encoded info/stats pulled from the current player
    this_roster_farian = []
    #now add the team id and the season
    this_roster_farian.append(this_team_acronym) 
    this_roster_farian.append(this_season) 
    
    for t in trs.by_tag("td"):
        #the player name has a link to the player,
        #e.g. '<a href="/players/m/milledr01.html">Drew Miller</a>'
Developer: mercicle, Project: Scraping-And-Visualizing-NHL-Data, Lines: 33, Source file: roster_by_year.py
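
Example 13 is an excerpt: it assumes a dom, roster_labels, this_team_acronym, and this_season prepared earlier in the script. A minimal sketch of that setup, assuming a hockey-reference.com roster page whose table has id "roster" (the URL pattern, team key, and season are illustrative guesses, not taken from the project):

from pattern.web import URL, DOM

this_team_acronym = "DET"  # hypothetical team key
this_season = "2013"       # hypothetical season
roster_url = URL("http://www.hockey-reference.com/teams/"
                 + this_team_acronym + "/" + this_season + ".html")
dom = DOM(roster_url.download(cached=True))

# the header cells of the roster table supply the column labels
roster_labels = dom.by_id("roster").by_tag("th")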

Example 14: int

# Required import: from pattern.web import DOM [as alias]
# Or: from pattern.web.DOM import by_id [as alias]
            elif year_name[0:2] == "Fr":
                grad_year = int(neu_year) + 4
            # split name into first and last (adjusting for title rows)
            name = cells[0].by_tag("a")
            name = cells[0].content.split(" ", 1) if len(name) == 0 else cells[0].by_tag("a")[0].content.split(" ", 1)
            # reorder name if from Columbia or Princeton
            if school in (2,3):
                name.reverse()
            # add swimmer (last name, first name, graduating year, school) to array
            swimmers.append([name[1].encode('ascii', 'ignore').strip(",").strip(), name[0].encode('ascii', 'ignore').strip(), grad_year, neu_schools[school]])


# Get all Cornell Roster id numbers for the URLs
url = URL("http://www.cornellbigred.com/roster.aspx?roster=847")
dom = DOM(url.download(cached=True))
options = dom.by_id("ctl00_cplhMainContent_ddlPastRosters").by_tag("option")
base_url = "http://www.cornellbigred.com/roster.aspx?roster="
cornell_roster_ids = []
for option in options:
    cornell_roster_ids.append(str(option.attributes["value"]))

# define years array
cornell_years = []
for i in range(YEARS_TO_SCRAPE):
    cornell_years.append(str(year - i))

counter = 0 
for cornell_year in cornell_years:
    print counter
    print "Cornell" + " " + cornell_year
    url_string = base_url + cornell_roster_ids[counter]
Developer: 2dpodcast, Project: cs109-3, Lines: 33, Source file: roster_scraper.py

Example 15: open

# Required import: from pattern.web import DOM [as alias]
# Or: from pattern.web.DOM import by_id [as alias]
from pattern.web import URL, DOM
from pattern.db import Datasheet
import glob, re


urls = glob.glob('/Users/tnatoli/Desktop/pages/*.html')
headers = ['player', 'pos', 'team', 'owner']

f = open('player_table.txt', 'w')
f.write('\t'.join(headers) + '\n')

for u in urls:
    url = URL(u)
    dom = DOM(url.download(cached=False))
    tbody = dom.by_id('statTable0').by_tag('tbody')[0]
    for tr in tbody.by_tag('tr'):
        pname = tr.by_class('ysf-player-name')[0].by_tag('a')[0].content
        team_pos = tr.by_class('ysf-player-team-pos')[0].by_tag('span')[0].content
        team = re.sub(r'\(', '', team_pos.split(' - ')[0])
        pos = re.sub(r'\)', '', team_pos.split(' - ')[1])
        owner_links = tr.by_class('owner')[0].by_tag('a')
        if owner_links:
            owner = owner_links[0].content
        else:
            owner = 'FA'
        line = '\t'.join([pname, team, pos, owner])
        print line
        for l in line:
            try:
                l.encode('ascii')
Developer: tnat1031, Project: fantasy_sports_scrape, Lines: 32, Source file: baseball_scrape.py


Note: The pattern.web.DOM.by_id examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms; the snippets were selected from open-source projects contributed by many programmers. Copyright in each snippet remains with its original author; consult the corresponding project's License before distributing or using the code. Do not reproduce this compilation without permission.