This article collects typical usage examples of the URL class from Python's pattern.web module. If you have been wondering what the URL class does and how to use it, the curated examples below may help.
Fifteen code examples of the URL class are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
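Before the examples, here is a minimal sketch of the pattern they all share (assuming the pattern library is installed; example.com is only a placeholder for a real, reachable page): wrap a URL string in URL, fetch the page with download(), and hand the HTML to DOM for parsing.

from pattern.web import URL, DOM

# Minimal sketch: fetch a page and parse its DOM (example.com is a placeholder).
url = URL("http://www.example.com")
html = url.download(cached=True)  # cached=True reuses a locally cached copy when available
dom = DOM(html)
print dom.by_tag("title")[0].content

The examples below build on this, adding redirect checks (url.redirect), MIME-type checks (url.mimetype), query parameters, and error handling.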
Example 1: scrape_starrtest
def scrape_starrtest(county_num):
    if county_num < 10:
        county_num = '0' + str(county_num)
    else:
        county_num = str(county_num)
    print county_num
    #url = 'http://star.cde.ca.gov/star2012/ViewReport.aspx?ps=true&lstTestYear=2012&lstTestType=X&lstCounty=01&lstDistrict=&lstSchool=&lstGroup=1&lstSubGroup=1'
    url = 'http://star.cde.ca.gov/star2012/ViewReport.aspx?ps=true&lstTestYear=2012&lstTestType=X&lstCounty=' + str(county_num) + '&lstDistrict=&lstSchool=&lstGroup=1&lstSubGroup=1'
    abs_url = URL(string=url)
    dom = DOM(abs_url.download(cached=True))  # download and parse the DOM
    #sciend_num = dom.by_class("rm")[4].content
    scicst_num = dom.by_class("rm")[3].content
    math_num = dom.by_class("rm")[2].content
    hist_num = dom.by_class("rm")[1].content
    ela_num = dom.by_class("rm")[0].content
    #sciend_percent = dom.by_class("rs")[4].content[:5]
    scicst_percent = dom.by_class("rs")[3].content[:5]
    math_percent = dom.by_class("rs")[2].content[:5]
    hist_percent = dom.by_class("rs")[1].content[:5]
    ela_percent = dom.by_class("rs")[0].content[:5]
    county = dom.by_tag("h2")[0].content
    # write all the collected data to a new row of the output file
    writer.writerow([county, ela_num, ela_percent, hist_num, hist_percent, math_num, math_percent, scicst_num, scicst_percent])
Example 2: scrape_top_250
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB. Note that these URLs must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    #absolute_url = 'http://www.imdb.com'
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    url = URL(url)
    dom = DOM(url.download(cached=True))
    # resolve each title link against the index page so the URLs are absolute
    for e in dom('.titleColumn'):
        for link in e('a'):
            movie_urls.append(abs(link.attributes.get('href'), base=url.redirect or url.string))
    # return the list of movie URLs
    return movie_urls
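For reference, pattern.web also provides an abs() helper (imported explicitly in Example 6 below) that resolves a relative href against the URL of the page it was found on. A small hedged sketch, where the relative path is only illustrative:

from pattern.web import abs

# abs() joins a relative link with the page URL it came from.
print abs("/title/tt0111161/", base="http://www.imdb.com/chart/top")
# expected output: http://www.imdb.com/title/tt0111161/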
Example 3: download_single_image
def download_single_image(url_link, pic_prefix_str, target_folder, image_size):
    """ Download image data from the given url link.

        Args:
            url_link (str): url of the image.
            pic_prefix_str (str): prefix used to uniquely label the picture.
    """
    file_ext = os.path.splitext(url_link)[1]  # used to check for a valid pic extension
    temp_filename = pic_prefix_str + ".jpg"
    temp_filename_full_path = os.path.join(target_folder, temp_filename)
    valid_image_ext_list = ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff']  # not comprehensive
    url = URL(url_link)
    if url.redirect:
        return  # skip links that redirect
    if file_ext not in valid_image_ext_list:
        return  # skip links without a valid image extension
    print url_link
    try:
        response = url.download()
        img = resize_image(response, image_size)
        img.save(temp_filename_full_path, "JPEG")
    except Exception as e:
        #if self.__print_download_fault:
        print 'Problem with processing this data: ', str(e), url_link
Example 4: downloading_csv
def downloading_csv(self, download_type='hist'):
    """ Download the csv information for a particular stock.
        download_type can be hist or div. If hist, download the historical prices.
        If div, download the dividend history.

        Kwargs:
            download_type (str): hist or div (default hist).
    """
    self.download_fault = 0
    if download_type == 'hist':
        target_url = self.hist_quotes_full_url
        sav_filename = os.path.join(self.hist_quotes_csvfile_path, 'hist_stock_price_' + self.individual_stock_sym + '.csv')
    elif download_type == 'div':
        target_url = self.div_history_full_url
        sav_filename = os.path.join(self.hist_quotes_csvfile_path, 'div_hist_' + self.individual_stock_sym + '.csv')
    else:
        raise ValueError('wrong download type: must be "hist" or "div"')
    url = URL(target_url)
    f = open(self.tempfile_sav_location, 'wb')  # temporary file for the raw download
    try:
        f.write(url.download())  # skip the file if the download fails
    except:
        if self.__print_download_fault:
            print 'Problem with processing this data: ', target_url
        self.download_fault = 1
    f.close()
    if not self.download_fault:
        if self.enable_save_raw_file:
            shutil.copyfile(self.tempfile_sav_location, sav_filename)
Example 5: convertMapData
def convertMapData():
    print '[2/2] Convert map data'
    # output dictionary
    d3mapData = {}
    # download the file
    url = URL(DATASET3)
    data = url.download()
    # parse the JSON payload into a list of rows
    data = list(json.loads(data))
    # fill output dictionary
    for dataRow in data:
        if dataRow['Year'] == '2014':
            population = dataRow['Value']
            fillColor = defineColor(dataRow['Value'])
            d3mapData[dataRow['Country Code']] = {'population': population, 'fillKey': fillColor}
    print '[2/2] Write to json'
    # write output dictionary to json file
    with open('D3LinkedViews/data_map.json', 'wb') as output_file:
        json.dump(d3mapData, output_file)
    print '[2/2] Finish'
Example 6: scrape_top_250
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB. Note that these URLs must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    from pattern.web import abs
    url = URL("http://www.imdb.com/chart/top")
    dom = DOM(url.download(cached=True))
    for e in dom.by_tag("td.titleColumn")[:250]:
        for link in e.by_tag("a"):
            link = link.attrs.get("href", "")
            link = abs(link, base=url.redirect or url.string)
            movie_urls.append(link)
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example 7: process_page
def process_page():
    url = URL("http://www.imdb.com/search/title?num_votes=5000,&sort=user_rating,desc&start=1&title_type=tv_series")
    dom = DOM(url.download(cached=True))
    domIndex = 0
    for title in dom.by_class("title"):
        theTitle = str(title.by_tag("a")[0].content).encode('ascii', 'replace')
        titleCatalog.append(Title(theTitle))
        try:
            match = re.search("^(\d+).*$", str(dom.by_class("runtime")[domIndex].content).encode('ascii', 'replace'))
            #print match.group(1)
            # titleCatalog[domIndex].addRunTime( str(dom.by_class("runtime")[domIndex].content).encode('ascii', 'replace'))
            titleCatalog[domIndex].addRunTime(match.group(1))
        except Exception, e:
            pass
        try:
            titleCatalog[domIndex].addRank(str(dom.by_class("value")[domIndex].content).encode('ascii', 'replace'))
        except Exception, e:
            pass
        # keep the runtime/rank lookups aligned with the current title
        domIndex += 1
Example 8: extract_data
def extract_data(stock_ticker):
    url_base = 'http://financials.morningstar.com/ajax/exportKR2CSV.html?&callback=?&t='
    url_end = '&region=usa&culture=en-US&cur=&order=asc'
    # May add more exchanges later on, but these cover the main US stock exchanges:
    # Nasdaq, New York SE, and Pink Sheets (OTC stocks), respectively.
    # Loop through the main stock exchanges to get the proper URL for data extraction.
    stock_exchange_list = ['XNAS:', 'XNYS:', 'PINX:']
    for exchange in stock_exchange_list:
        test = URL(url_base + exchange + stock_ticker + url_end)
        # A broken URL produces an empty string, which has memory size 33;
        # size 35 allows for minor variation in the size.
        if sys.getsizeof(test.download()) > 35:
            break
    temp_data = 'C:/Users/Owner/Documents/temp.csv'
    f = open(temp_data, mode='w')
    try:
        f.write(test.download())
    except:
        f.close()
        raise IOError('There was an error processing this data')
    f.close()
    try:
        stock_data_df = pd.read_csv(temp_data, header=2, thousands=',', index_col=0,
                                     skiprows=[19, 20, 31, 41, 42, 43, 48, 58, 53, 64, 65, 72, 73, 95, 101, 102])
    except:
        os.remove(temp_data)
        raise IOError('Problem downloading files')
    os.remove(temp_data)
    stock_data_df = stock_data_df.transpose()
    return stock_data_df
Example 9: get_patent
def get_patent(url):
    url = URL(url + "/fulltext")
    html = url.download()
    dom = DOM(html)
    title = plaintext(dom("h3 a")[0].content)
    body = plaintext(dom("#contents")[0].content)
    return [title, body]
Example 10: cats
def cats(self, namespace=0, start=None, acmin=1, count=100, cached=True, **kwargs):
    """ Returns an iterator over all category names.
    """
    kwargs.setdefault("unicode", True)
    kwargs.setdefault("throttle", self.throttle)
    # Fetch article titles (default) or a custom id.
    id = kwargs.pop("_id", "title")
    id = "*"
    # Loop endlessly (= until the last request no longer yields an "accontinue").
    # See: http://www.mediawiki.org/wiki/API:Allpages
    while start != -1:
        url = URL(self._url, method=GET, query={
            "action": "query",
            "list": "allcategories",
            "acfrom": start or "",
            "aclimit": min(count, 500),
            "acprop": "size",
            "acmin": max(1, acmin),
            "format": "json"
        })
        data = url.download(cached=cached, **kwargs)
        data = json.loads(data)
        for x in data.get("query", {}).get("allcategories", {}):
            # print(x)
            if x.get(id):
                # yield x[id]
                x['name'] = x.pop('*')
                yield x
        start = data.get("query-continue", {}).get("allcategories", {})
        start = start.get("accontinue", start.get("acfrom", -1))
    raise StopIteration
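A hypothetical usage sketch for the generator above, assuming it is a method on a MediaWiki-style wrapper object named wiki (the wrapper name is not part of the original snippet):

# Iterate category records; each yielded dict has a 'name' key plus the
# 'size' property requested via acprop="size".
for category in wiki.cats(acmin=5, count=200):
    print category['name'], category.get('size')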
Example 11: scrape_top_250
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB. Note that these URLs must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    # Download the HTML file
    url = URL(url)
    html = url.download()
    # Parse the HTML file into a DOM representation
    dom = DOM(html)
    # Iterate through all 250 table rows on the index page
    for movies in dom('.lister-list > tr'):
        # take the movie's href attribute and put it in href
        href = movies('td.titleColumn a')[0].attrs["href"]
        # prepend http://www.imdb.com to make the URL absolute
        movie_urls.append("http://www.imdb.com" + href)
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Example 12: getRandomHistoryDOM
def getRandomHistoryDOM(language):
    url = URL("http://" + language + ".wikipedia.org/wiki/Special:Random")
    # Gets the url only of the page this redirects to
    redirectUrl = url.redirect
    try:
        # Grab the name of the wikipedia article from the url
        urlComponents = string.split(redirectUrl, '/')
    except AttributeError:
        # Use some recursion if we encounter a page with no history, or some other error
        return getRandomHistoryDOM(language)
    # Get the history section of the article
    redirectUrl = "http://" + language + ".wikipedia.org/w/index.php?title=" + urlComponents[4] + "&action=history"
    print "Current article is: " + str(urlComponents[4])
    #print redirectUrl
    url = URL(redirectUrl)
    dom = DOM(url.download(cached=False))
    try:
        historyList = dom.by_id("pagehistory").by_tag("li")
        return historyList, urlComponents[4]
    except AttributeError:
        # Use some recursion if we encounter a page with no history, or some other error
        return getRandomHistoryDOM(language)
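A hypothetical usage sketch for this helper; the "en" language code is an assumption, and any Wikipedia language prefix should work:

# Fetch the revision list of a random English-language article.
historyList, articleName = getRandomHistoryDOM("en")
print "Got %d revisions for %s" % (len(historyList), articleName)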
Example 13: download_pdfs
def download_pdfs():
    """download pdfs from fda"""
    # where to save pdfs
    path = 'classifier_docs/pdfs/'
    # create directory if it doesn't exist
    if not os.path.exists(path):
        os.makedirs(path)
    # load in non-standard pdf urls from 2012 to serve as control text
    # note: had to look up urls manually
    # drugs are erivedge (203388) and sirturo (204384)
    # also, menhibrix (125363) has no medical review available
    urls = ['http://www.accessdata.fda.gov/drugsatfda_docs/nda/2012/203388Orig1s000MedRpdf.pdf',
            'http://www.accessdata.fda.gov/drugsatfda_docs/nda/2012/204384Orig1s000MedR_.pdf']
    for url in urls:
        m = re.search('20..\/(\d{6})', url)
        app_num = m.group(1)
        url = URL(url)
        # make sure that url points to a PDF, print an error otherwise
        if url.mimetype in MIMETYPE_PDF:
            # write pdf for medical review if it doesn't exist
            fn = path + app_num + '.pdf'
            if not os.path.exists(fn):
                print "writing {} from {}".format(fn, url)
                f = open(fn, 'w')
                f.write(url.download(cached=False))
                f.close()
            else:
                print "{} already exists".format(fn)
        else:
            print "warning: {} did not resolve to pdf".format(url)
    return
Example 14: downloadPDFs
def downloadPDFs(dictListJSON, state, jsonExists=False):
    #state = dictListJSON[0, 2]
    dlJSONFile = open(dictListJSON, "r")
    dictJSON = json.load(dlJSONFile)
    dlJSONFile.close()
    # check whether the list of PDF links already exists in the JSON
    if jsonExists:
        pdfDictList = dictJSON
    else:
        pdfDictList = findPDFLinks(dictJSON, state)
    count = 0
    for dict in pdfDictList:
        #test if date > 01/01/13
        fileName = "".join(str(dict["AdvertiserInfo"]).split())
        print "Writing to " + fileName
        url = dict["PDFLink"]
        url = re.sub(' ', '%20', url)
        print url
        if url != "NO URL":
            urlOpened = URL(url)
            f = open(fileName, 'wb')
            #download to state pdfs directory
            f.write(urlOpened.download(cached=False))
            f.close()
            count += 1
        if count > 4:
            break
Example 15: get_patent_urls
def get_patent_urls(keyword, limit=10):
    keyword = urllib.quote_plus(keyword)
    base_url = "http://www.lens.org"
    url = URL(base_url + "/lens/search?ft=true&l=en&st=true&n=" + str(limit) + "&q=" + keyword)
    dom = DOM(url.download())
    links = [base_url + a.attributes.get("href") for a in dom("a.link")]
    return links