This article collects typical usage examples of the Browser.retrieve method from the Python mechanize module. If you have been wondering how exactly Python's Browser.retrieve is used in practice, the curated code examples below may help. You can also explore further examples of the containing class, mechanize.Browser.
The following presents 15 code examples of Browser.retrieve, sorted by popularity by default. You can vote up the examples you like or find useful; your feedback helps the system recommend better Python code examples.
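Before the examples, here is a minimal sketch of the basic call pattern (the URL and filename below are hypothetical). mechanize is a Python 2 library, and Browser.retrieve behaves like urllib.urlretrieve: it downloads the resource behind a URL to a local file and returns a (filename, headers) tuple; when no filename is given, the download goes to a temporary file.

from mechanize import Browser

br = Browser()
# Both arguments are hypothetical; the second names the local target file.
filename, headers = br.retrieve('http://example.com/data.zip', 'data.zip')
print filename  # the path the file was saved to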
Example 1: download
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
# Also needed here: import os, time
def download(url, filename="", saveto="", overwrite=2, suffix=""):
    try:
        if filename == "":
            # derive the filename from the URL, dropping any query string
            filename = url.split("/")[-1]
            filename = filename.split("?")[0]
        do_download = True
        if not saveto.endswith("/"):
            saveto = saveto + "/"
        if overwrite == 2 and os.path.isfile(saveto + filename):
            # re-download only if the remote copy is newer than the local one
            br = Browser()
            br.open(url)
            remote_time = time.strptime(br.response().info()["last-modified"],
                                        "%a, %d %b %Y %H:%M:%S GMT")
            local_time = time.gmtime(os.stat(saveto + filename + suffix).st_mtime)
            do_download = (remote_time > local_time)
        elif overwrite == 0 and os.path.isfile(saveto + filename):
            do_download = False
        if do_download:
            br = Browser()
            os.chdir(saveto)
            br.retrieve(url, filename + suffix)
            print("Downloaded " + url + " successfully")
        else:
            print(url + " exists already")
    except:
        print("Failed: " + url)
Example 2: go
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
# Also needed here: from os import popen; from urllib import urlencode
def go(thread_num):
    global connections
    global successful
    global errors
    file_name = str(thread_num)
    br = Browser()
    while True:
        try:
            # fetch the CAPTCHA image and run it through an external OCR script
            br.retrieve('http://szuku.pl/jcaptcha/jpeg/imageCaptcha', file_name + '.jpeg')
            ocr = popen('./convert.sh ' + file_name).read()
            captcha = only_letters_or_digits(ocr)
            br.open('http://szuku.pl/teaser/save',
                    urlencode({'email': rand_mail(),
                               'captcha': captcha}))
            connections += 1
            if ok_regex.search(br.response().read()):
                successful += 1
        except Exception, e:
            print e
            errors += 1
Example 3: GetServerConfigFile
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
def GetServerConfigFile(url_aastra, return_file):
    br = Browser()
    # register HTTP auth credentials for the phone's web interface
    br.add_password(url_aastra, "admin", "22222")
    try:
        br.retrieve(url_aastra + "/servercfg.html", return_file)
        return True
    except:
        log.warn("Maybe this isn't an Aastra phone? Are you sure?")
        return False
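A hypothetical call, with the phone's address and the output path made up for illustration; add_password registers the HTTP auth credentials that the subsequent retrieve uses:

ok = GetServerConfigFile("http://192.168.1.20", "/tmp/servercfg.html")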
Example 4: main
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
# Also needed here: import datetime, os, re, sys, time
def main(page, regex, path):
    start_time = time.time()
    br = Browser()
    br.set_handle_robots(False)
    br.open(page)
    #br.open('http://storage.googleapis.com/books/ngrams/books/datasetsv2.html')
    eng_all = re.compile(regex)
    #eng_all = re.compile('.*googlebooks-eng-all.*20120701.*')
    #print page, regex, path
    n = 0
    maxlen = 0
    link_list = []
    for link in br.links():
        if eng_all.match(link.url):
            n += 1
            maxlen = max(len(os.path.basename(link.url)), maxlen)
            link_list.append(link.url)
            sys.stderr.write('Found Link: %s\n' % link.url)
    answer = raw_input("\n\nAre you sure you want to download the above %i file(s)? (Y/N): " % n)
    if answer == 'N' or answer == 'n':
        sys.exit(0)
    sys.stderr.write('\n\nDownloading files to: %s\n' % path)
    digits = len('%d' % n)
    disp_time = datetime.datetime.now
    for i, link in enumerate(link_list):
        download_start = time.time()
        file_name = os.path.basename(link)
        full_path = os.path.join(path, file_name)
        if os.path.exists(full_path):
            sys.stderr.write('%s exists, not downloading\n' % full_path)
            continue
        try:
            sys.stderr.write('[%s] Downloading (%-*i of %i): %*s'
                             % (str(disp_time().time())[:8], digits, i + 1, n,
                                maxlen + 2, file_name))
            br.retrieve(link, filename=full_path)
        except:
            sys.stderr.write('\n\nSomething happened, deleting last file: %s\n' % full_path)
            os.remove(full_path)
            sys.exit(0)
        sys.stderr.write(' of size %s MB in %5.2f min\n'
                         % ("{:7.2f}".format(float(os.stat(full_path).st_size) / 1000000),
                            (time.time() - download_start) / 60))
        # drop history so memory use stays flat across many downloads
        br.clear_history()
    sys.stderr.write('\ndownloaded %i files to %s directory in %15f seconds\n' % (n, path, time.time() - start_time))
Example 5: down_image
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
# Also needed here: import tempfile; from cookielib import CookieJar
def down_image(self, img):
    print "downloading image from " + img
    down_br = Browser()
    down_cj = CookieJar()
    down_br.set_cookiejar(down_cj)
    fn = tempfile.mktemp(suffix='.png')
    # retrieve() returns a (filename, headers) tuple; return the saved path
    return down_br.retrieve(img, filename=fn)[0]
Example 6: take_action
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
# Also needed here: import os, ConfigParser; from lxml import html
def take_action(self, parsed_args):
    config_dir = '~/.kaggle-cli'
    config_dir = os.path.expanduser(config_dir)
    if os.path.isdir(config_dir):
        config = ConfigParser.ConfigParser(allow_no_value=True)
        config.readfp(open(config_dir + '/config'))
    if parsed_args.username:
        username = parsed_args.username
    else:
        username = config.get('user', 'username')
    if parsed_args.password:
        password = parsed_args.password
    else:
        password = config.get('user', 'password')
    if parsed_args.competition:
        competition = parsed_args.competition
    else:
        competition = config.get('user', 'competition')
    base = 'https://www.kaggle.com'
    login_url = base
    data_url = '/'.join([base, 'c', competition, 'data'])
    browser = Browser()
    browser.open(login_url)
    browser.select_form(nr=0)
    browser['UserName'] = username
    browser['Password'] = password
    browser.submit()
    browser.open(data_url)
    data_page = html.fromstring(browser.response().read())
    src_urls = map(
        lambda x: base + x.attrib['href'],
        data_page.cssselect('#data-files a'))
    for url in src_urls:
        self.app.stdout.write('downloading %s\n' % url)
        browser.retrieve(url, url.split('/')[-1])
Example 7: main
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
# Also needed here: from zipfile import ZipFile; assumes module-level
# constants LOGIN_URL, USERNAME, SECRET, BASE_URL and UNZIPPED_DEST
def main():
    ## create a browser object
    ## NWEA has a pretty aggressive robots.txt
    ## here's what we'll do about that: ignore it
    br = Browser()
    #br.set_handle_redirect(False)
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    ## open the login page; the form is called loginForm
    br.open(LOGIN_URL)
    br.select_form(name="loginForm")
    br['username'] = USERNAME
    br['password'] = SECRET
    response = br.submit()  ## submit and store the response
    print 'credentials successful, logged in'
    #print response.read()
    # once logged in, navigate to the reports page
    br.open(BASE_URL + '/report/home/map')
    # CDF file looks like "https://kippteamschools-admin.mapnwea.org/report/download/cdf/7492"
    # get the matching cdf and build the full url
    cdf_string = '/report/download/cdf/[0-9]+'
    file_target = br.find_link(url_regex=cdf_string)
    file_loc = BASE_URL + file_target.url
    print 'cdf is located at %s' % (file_loc)
    # retrieve() fetches the file at that location and saves it to a temp directory
    cdf_zipped = br.retrieve(file_loc)[0]
    print 'temp file is located at %s' % cdf_zipped
    sourceZip = ZipFile(cdf_zipped, 'r')
    print
    print 'beginning unzip'
    for name in sourceZip.namelist():
        print 'extracted %s...' % (name)
        sourceZip.extract(name, UNZIPPED_DEST)
    sourceZip.close()
Example 8: CourseraDownloader
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
#......... some code omitted here .........
            logging.critical("couldn't authenticate")
            sys.exit(1)
        logging.info("successfully authenticated")

    def set_csrf_token(self):
        self.csrf_token = self.get_cookie_value('csrf_token')

    def set_session(self):
        self.session = self.get_cookie_value('CAUTH')

    def get_cookie_value(self, search_name):
        for cookie in self.br._ua_handlers['_cookies'].cookiejar:
            if cookie.name == search_name:
                return cookie.value

    def set_auth_headers(self):
        self.br.addheaders = [
            ('Cookie', 'csrftoken=%s' % self.csrf_token),
            ('Referer', 'https://accounts.coursera.org/signin'),
            ('X-CSRFToken', self.csrf_token),
        ]

    def set_download_headers(self):
        self.br.addheaders = [
            (
                'Cookie',
                'csrftoken=%s;CAUTH=%s' % (self.csrf_token, self.session)
            ),
        ]

    def is_authenticated(self, test_page):
        m = re.search(
            'https://class.coursera.org/%s/auth/logout' % self.course_name,
            test_page
        )
        return m is not None

    def download(self):
        course_dir = os.path.join(TARGETDIR, self.course_name)
        if not os.path.exists(course_dir):
            os.mkdir(course_dir)
        page = self.br.open(self.lectures_url)
        doc = BeautifulSoup(page)
        parts, part_titles = self.get_parts(doc)
        for idx, part in enumerate(parts):
            if self.item_is_needed(self.parts_ids, idx):
                part_dir = os.path.join(
                    course_dir,
                    '%02d - %s' % (
                        (idx + 1),
                        self.escape_name(part_titles[idx].text).strip()
                    )
                )
                self.download_part(part_dir, part)

    def download_part(self, dir_name, part):
        if not os.path.exists(dir_name):
            os.mkdir(dir_name)
        rows, row_names = self.get_rows(part)
        for idx, row in enumerate(rows):
            if self.item_is_needed(self.rows_ids, idx):
                self.download_row(
                    dir_name,
                    '%02d - %s' % (
                        (idx + 1),
                        row_names[idx].text.strip()
                    ),
                    row
                )

    def download_row(self, dir_name, name, row):
        resources = self.get_resources(row)
        for resource in resources:
            if self.item_is_needed(self.types, resource[1]):
                self.download_resource(dir_name, name, resource)

    def download_resource(self, dir_name, name, resource):
        res_url = resource[0]
        res_type = resource[1]
        url, content_type = self.get_real_resource_info(res_url)
        ext = self.get_file_ext(url, content_type, res_type)
        if ext:
            filename = self.get_file_name(dir_name, name, ext)
            self.retrieve(url, filename)

    def retrieve(self, url, filename):
        if os.path.exists(filename) and not self.force:
            logging.info("skipping file '%s'" % filename)
        else:
            logging.info("downloading file '%s'" % filename)
            logging.debug("URL: %s" % url)
            try:
                self.br.retrieve(url, filename, reporter)
            except KeyboardInterrupt:
                if os.path.exists(filename): os.remove(filename)
                raise
            except Exception, ex:
                if os.path.exists(filename): os.remove(filename)
                logging.debug(ex)
                logging.info("couldn't download the file")
Example 9: CoreEmulator
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
class CoreEmulator(Emulator):
    def __init__(self, username, password):
        super(CoreEmulator, self).__init__(username, password)
        self.setup_emulator()

    def setup_emulator(self):
        self.browser = Browser()
        self.browser.set_handle_robots(False)
        self.browser.addheaders = moodle.USER_AGENT
        self.cookiejar = CookieJar()
        self.browser.set_cookiejar(self.cookiejar)

    def session_expired(self):
        return self.browser.geturl().endswith(moodle.LOGIN_LOCATION)

    @throws_moodlefuse_error(exception.LoginException)
    def login(self):
        self.open_login_page(self.browser.open)
        self.browser.select_form(
            predicate=lambda form: form.attrs.get('id') == attributes.LOGIN
        )
        self.browser.form.set_value(self.username, name='username')
        self.browser.form.set_value(self.password, name='password')
        resp = self.browser.submit()
        if resp.geturl().endswith(moodle.LOGIN_LOCATION):
            raise Exception

    @throws_moodlefuse_error(resource_errors.UnableToDownloadResource)
    def download(self, destination, source):
        source = str(source)
        if not source.startswith('http://') and not source.startswith('file://'):
            source = config['TEST_DATA'] + '/' + source
        self.browser.retrieve(source, destination)

    def open_link(self, url):
        response = self.browser.open(url)
        return BeautifulSoup(response.read())

    def check_form_checkbox(self, checkboxname):
        self.browser.find_control(checkboxname).items[0].selected = True

    def uncheck_form_checkbox(self, checkboxname):
        self.browser.find_control(checkboxname).items[0].selected = False

    def add_form_content(self, inputname, content):
        self.browser.form.set_value(content, name=inputname)

    def close_form(self):
        self.browser.submit()

    def set_form_to_first_form(self):
        self.browser.select_form(nr=0)

    def set_form_to_form_with_control_value(self, value):
        for form in self.browser.forms():
            for control in form.controls:
                if control.value == value:
                    self.browser.form = form

    @throws_moodlefuse_error(exception.UnableToToggleEditing)
    def turn_course_editing_on(self):
        self.set_form_to_form_with_control_value(moodle.EDIT_ON_MOODLE_BUTTON_TEXT)
        response = self.browser.submit()
        return BeautifulSoup(response.read())

    def _setup_assignments_for_parsing(self, submission_filter):
        self.set_form_to_form_with_control_value('Save and update table')
        self.browser.form["filter"] = [submission_filter]
        self.browser.form["perpage"] = ["100"]
        self.uncheck_form_checkbox('quickgrading')
        response = self.browser.submit()
        return BeautifulSoup(response.read())

    def filter_assignment_submissions(self):
        return self._setup_assignments_for_parsing("submitted")

    def unfilter_assignment_submissions(self):
        return self._setup_assignments_for_parsing("")

    @throws_moodlefuse_error(exception.UnableToToggleEditing)
    def turn_course_editing_off(self):
        self.set_form_to_form_with_control_value(moodle.EDIT_OFF_MOODLE_BUTTON_TEXT)
        response = self.browser.submit()
        return BeautifulSoup(response.read())

    @throws_moodlefuse_error(course_errors.InvalidMoodleIndex)
    def get_courses(self):
        return self.open_link(config['MOODLE_INDEX_ADDRESS'])

    @throws_moodlefuse_error(course_errors.UnableToObtainCategoryList)
    def get_course_categories(self, url):
        return self.open_link(url)

    @throws_moodlefuse_error(resource_errors.UnableToObtainResourceList)
    def get_course_resource_names(self, url):
        return self.open_link(url)
#......... some code omitted here .........
Example 10: RegPublDownloader
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
class RegPublDownloader(LegalSource.Downloader):
    def __init__(self, baseDir="data"):
        self.dir = baseDir + "/regpubl/downloaded"
        if not os.path.exists(self.dir):
            Util.mkdir(self.dir)
        self.config = ConfigObj("%s/%s.ini" % (self.dir, __moduledir__))
        # Why does this say "super() argument 1 must be type, not classobj"
        # super(RegPublDownloader,self).__init__()
        self.browser = Browser()

    def DownloadAll(self):
        # we use mechanize instead of our own Robot class to list
        # available documents since we can't get the POST/cookie based
        # search to work.
        doctype = '160'
        log.info(u'Selecting documents of type %s' % doctype)
        self.browser.open("http://www.regeringen.se/sb/d/108/action/browse/c/%s" % doctype)
        log.info(u'Posting search form')
        self.browser.select_form(nr=1)
        self.browser.submit()
        pagecnt = 1
        done = False
        while not done:
            log.info(u'Result page #%s' % pagecnt)
            for l in self.browser.links(url_regex=r'/sb/d/108/a/\d+'):
                self._downloadSingle(l.absolute_url)
                self.browser.back()
            try:
                self.browser.find_link(text='N\xe4sta sida')
                self.browser.follow_link(text='N\xe4sta sida')
            except LinkNotFoundError:
                log.info(u'No next page link found, this was the last page')
                done = True
            pagecnt += 1
        self.config['last_update'] = datetime.date.today()
        self.config.write()

    def DownloadNew(self):
        if 'last_update' in self.config:
            then = datetime.datetime.strptime(self.config['last_update'], '%Y-%m-%d')
        else:
            # assume last update was more than a year ago
            then = datetime.datetime.now() - datetime.timedelta(-367)
        now = datetime.datetime.now()
        if (now - then).days > 30:
            pass
            # post a "last 30 days" query
        elif (now - then).days > 365:
            pass
            # post a "last 12 months" query
        else:
            # post a full query
            self.DownloadAll()

    def _downloadSingle(self, url):
        docid = re.match(r'http://www.regeringen.se/sb/d/108/a/(\d+)', url).group(1)
        fname = "%s/%s/index.html" % (self.dir, docid)
        log.info(u' Loading docidx %s' % url)
        self.browser.open(url)
        if not os.path.exists(fname):
            Util.ensureDir(fname)
            self.browser.retrieve(url, fname)
        for l in self.browser.links(url_regex=r'/download/(\w+\.pdf).*'):
            filename = re.match(r'http://www.regeringen.se/download/(\w+\.pdf).*', l.absolute_url).group(1)
            # note: the url goes to a redirect script; however that
            # part of the URL tree (/download/*) is off-limits for
            # robots. But we can figure out the actual URL anyway!
            if len(docid) > 4:
                path = "c6/%02d/%s/%s" % (int(docid[:-4]), docid[-4:-2], docid[-2:])
            else:
                path = "c4/%02d/%s" % (int(docid[:-2]), docid[-2:])
            fileurl = "http://regeringen.se/content/1/%s/%s" % (path, filename)
            df = "%s/%s/%s" % (self.dir, docid, filename)
            if not os.path.exists(df):
                log.info(u' Downloading %s' % (fileurl))
                self.browser.retrieve(fileurl, df)
            else:
                log.info(u' Already downloaded %s' % (fileurl))
Example 11: CourseraDownloader
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
class CourseraDownloader(object):
    login_url = ''
    home_url = ''
    lectures_url = ''
    course_name = ''

    def __init__(self, config):
        self.parts_ids = config['parts']
        self.rows_ids = config['rows']
        self.types = config['types']
        self.force = config['force']
        self.br = Browser()
        self.br.set_handle_robots(False)

    def authenticate(self):
        self.br.open(self.login_url)
        self.br.form = self.br.forms().next()
        self.br['email'] = EMAIL
        self.br['password'] = PASSWORD
        self.br.submit()
        home_page = self.br.open(self.home_url)
        if not self.is_authenticated(home_page.read()):
            log("couldn't authenticate")
            sys.exit(1)
        log("successfully authenticated")

    def is_authenticated(self, test_page):
        m = re.search(
            'https://class.coursera.org/%s/auth/logout' % self.course_name,
            test_page)
        return m is not None

    def download(self):
        course_dir = os.path.join(TARGETDIR, self.course_name)
        if not os.path.exists(course_dir):
            os.mkdir(course_dir)
        page = self.br.open(self.lectures_url)
        doc = BeautifulSoup(page)
        parts, part_titles = self.get_parts(doc)
        for idx, part in enumerate(parts):
            if self.item_is_needed(self.parts_ids, idx):
                part_dir = os.path.join(
                    course_dir,
                    '%02d - %s' % ((idx + 1),
                                   part_titles[idx].text.strip()))
                self.download_part(part_dir, part)

    def download_part(self, dir_name, part):
        if not os.path.exists(dir_name):
            os.mkdir(dir_name)
        rows, row_names = self.get_rows(part)
        for idx, row in enumerate(rows):
            if self.item_is_needed(self.rows_ids, idx):
                self.download_row(dir_name, '%02d - %s' % ((idx + 1),
                                  row_names[idx].text.strip()), row)

    def download_row(self, dir_name, name, row):
        resources = self.get_resources(row)
        for resource in resources:
            if self.item_is_needed(self.types, resource[1]):
                self.download_resource(dir_name, name, resource)

    def download_resource(self, dir_name, name, resource):
        res_url = resource[0]
        res_type = resource[1]
        url, content_type = self.get_real_resource_info(res_url)
        ext = self.get_file_ext(url, content_type, res_type)
        filename = self.get_file_name(dir_name, name, ext)
        self.retrieve(url, filename)

    def retrieve(self, url, filename):
        if os.path.exists(filename) and not self.force:
            log("skipping file '%s'" % filename)
        else:
            log("downloading file '%s'" % filename)
            try:
                self.br.retrieve(url, filename)
            except KeyboardInterrupt:
                raise
            except:
                log("couldn't download the file")

    def item_is_needed(self, etalons, sample):
        return (len(etalons) == 0) or (sample in etalons)

    def get_file_name(self, dir_name, name, ext):
        name = self.escape_name(name)
        return ('%s.%s' % (os.path.join(dir_name, name), ext))

    def escape_name(self, name):
        return name.replace('/', '_').replace('\\', '_')

    def get_real_resource_info(self, res_url):
        try:
            src = self.br.open(res_url)
            try:
                url = src.geturl()
                content_type = src.info().get('content-type', '')
                return (url, content_type)
            finally:
#......... some code omitted here .........
Example 12: process_sites
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
def process_sites(sites_dict, rent_src):
    br = Browser()
    for site in sites_dict.keys():
        print 'Searching site ', site
        all_rent_links = []
        for site_urls in sites_dict.get(site):
            all_locs_rent_links = []
            for loc in site_urls.keys():
                print >> sys.stderr, 'Looking for new rent for a price range in ', site, loc
                time.sleep(3)
                result = br.retrieve(site_urls.get(loc))
                temp_file_name = result[0]
                html_f = open(temp_file_name)
                soup = BeautifulSoup(html_f)
                rent_links = get_rent_links(soup, site)
                all_locs_rent_links.extend(rent_links)
            all_rent_links.extend(all_locs_rent_links)
        print >> sys.stderr, len(all_rent_links), 'links found!'
        #TODO: any better way to do below
        try:
            f = open(site + '_' + rent_src, 'r')
        except IOError:
            f = open(site + '_' + rent_src, 'w')
            pickle.dump([], f)
            f.close()
        f = open(site + '_' + rent_src, 'r')
        old_pure_rent_links = pickle.load(f)
        f.close()
        print >> sys.stderr, len(old_pure_rent_links), 'old links stored in file!'
        #below actually IS list of string!
        #new_all_rent_links = list(set(all_rent_links) - set(old_all_rent_links))
        new_all_rent_links = []
        #old_pure_links = [link[0] for link in old_all_rent_links]
        new_pure_rent_links = []
        for l in all_rent_links:
            if not l[0] in old_pure_rent_links and not l[0] in new_pure_rent_links:
                new_all_rent_links.append(l)
                #only the link info is stored in the file to save the space
                new_pure_rent_links.append(l[0])
        print >> sys.stderr, len(new_all_rent_links), ' of new rents found!'
        #think of using lambda TODO:
        if new_all_rent_links:
            info = ('\n\n'.join([' '.join(link) for link in new_all_rent_links])).encode('utf-8')
            print 'info', info
            last_parag = "\n\n\n 去http://91biji.com/groups/%E7%A7%9F%E6%88%BF/bookmarkbook/管理你的看房记录(如果点击上面链接后看到想继续考虑的房子,你可以用91biji.com将其加入你在的租房小组。) \n\n 91biji.com 敬上"
            content = '从 '+site+' 新找到的租房信息:\n\n'+info+last_parag
            mailserver = get_mail_server()
            sendEmail(mailserver, u'[email protected]', to_addr_list, '('+rent_src+')'+' 从 '+site+' 新找到的'+str(len(new_all_rent_links))+'条租房信息', content)
            mailserver.close()
            old_pure_rent_links.extend(new_pure_rent_links)
            f = open(site + '_' + rent_src, 'w')
            pickle.dump(old_pure_rent_links, f)
            f.close()
Example 13: CourseraDownloader
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
#......... some code omitted here .........
        # get the headers
        headers = r.info()
        # get the content length (if present)
        clen = int(headers['Content-Length']) if 'Content-Length' in headers else -1
        # build the absolute path we are going to write to
        fname = target_fname or sanitiseFileName(CourseraDownloader.getFileName(headers)) or CourseraDownloader.getFileNameFromURL(url)
        filepath = os.path.join(target_dir, fname)
        dl = True
        if os.path.exists(filepath):
            if clen > 0:
                fs = os.path.getsize(filepath)
                delta = clen - fs
                # all we know is that the current filesize may be shorter than it
                # should be and the content length may be incorrect
                # overwrite the file if the reported content length is bigger than
                # what we have already by at least k bytes (arbitrary)
                # TODO this is still not foolproof as the fundamental problem is that
                # the content length cannot be trusted, so this really needs to be
                # avoided and replaced by something else, eg., explicitly storing
                # what downloaded correctly
                if delta > 2:
                    print ' - "%s" seems incomplete, downloading again' % fname
                else:
                    print ' - "%s" already exists, skipping' % fname
                    dl = False
            else:
                # missing or invalid content length
                # assume all is ok...
                dl = False
        try:
            if dl: self.browser.retrieve(url, filepath)
        except Exception as e:
            print "Failed to download url %s to %s: %s" % (url, filepath, e)

    def download_course(self, cname, dest_dir="."):
        """Download all the contents (quizzes, videos, lecture notes, ...) of the course to the given destination directory (defaults to .)"""
        # Ensure we are logged in
        self.login(cname)
        # get the lecture url
        course_url = self.lecture_url_from_name(cname)
        (weeklyTopics, allClasses) = self.get_downloadable_content(course_url)
        print '* Got all downloadable content for ' + cname
        course_dir = os.path.abspath(os.path.join(dest_dir, cname))
        # ensure the target dir exists
        if not os.path.exists(course_dir):
            os.mkdir(course_dir)
        print "* " + cname + " will be downloaded to " + course_dir
        # ensure the course directory exists
        if not os.path.exists(course_dir):
            os.makedirs(course_dir)
        # download the standard pages
        print " - Downloading lecture/syllabus pages"
        self.download(self.HOME_URL % cname, target_dir=course_dir, target_fname="index.html")
        self.download(course_url, target_dir=course_dir, target_fname="lectures.html")
Example 14: str
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
import re
import sys
import time
import os
from mechanize import Browser

filenm = str((sys.argv)[1])
url = "http://smog.rice.edu"
br = Browser()
br.set_handle_robots(False)
br.open(url + "/cgi-bin/GenTopGro.pl")
br.select_form(nr=0)
fp = open(filenm, "r")
br.form.add_file(fp, "text/plain", filenm, name='uploaded_file')
br["nickname"] = "mark"
br.submit()
fp.close()
time.sleep(5)
# scan the result page for the link containing the submitted nickname
for link in br.links():
    loc = link.url.find("mark")
    if loc > 0:
        f = link.url[loc:]
        br.retrieve(url + link.url, os.path.join(os.getcwd(), f))
        break
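A hypothetical way to run the script above, assuming it is saved as gentopgro.py (both the script name and the input file are made up):

python gentopgro.py structure.pdb

It uploads the given file to the SMOG form, waits briefly for the server to generate output, then retrieves the first result link whose URL contains the submitted nickname.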
Example 15: CourseraDownloader
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
class CourseraDownloader(object):
    login_url = ''
    lectures_url = ''

    def __init__(self, parts_ids=[], rows_ids=[], types=[]):
        self.parts_ids = parts_ids
        self.rows_ids = rows_ids
        self.types = types
        self.br = Browser()
        self.br.set_handle_robots(False)

    def authenticate(self):
        self.br.open(self.login_url)
        self.br.form = self.br.forms().next()
        self.br['email'] = EMAIL
        self.br['password'] = PASSWORD
        self.br.submit()

    def download(self):
        page = self.br.open(self.lectures_url)
        doc = BeautifulSoup(page)
        parts = self.get_parts(doc)
        for idx, part in enumerate(parts):
            if self.item_is_needed(self.parts_ids, idx):
                self.download_part('%02d' % (idx + 1), part)

    def download_part(self, dir_name, part):
        if not os.path.exists(dir_name):
            os.mkdir(dir_name)
        rows = self.get_rows(part)
        for idx, row in enumerate(rows):
            if self.item_is_needed(self.rows_ids, idx):
                self.download_row(dir_name, '%02d' % (idx + 1), row)

    def download_row(self, dir_name, name, row):
        resources = self.get_resources(row)
        for resource in resources:
            if self.item_is_needed(self.types, resource[1]):
                self.download_resource(dir_name, name, resource)

    def item_is_needed(self, etalons, sample):
        return (len(etalons) == 0) or (sample in etalons)

    def download_resource(self, dir_name, name, resource):
        res_url = resource[0]
        res_type = resource[1]
        url, content_type = self.get_real_resource_info(res_url)
        ext = self.get_file_ext(url, content_type, res_type)
        filename = self.get_file_name(dir_name, name, ext)
        self.br.retrieve(url, filename)
        # Download subtitles in .srt format together with .txt.
        if res_type == 'txt':
            m = REG_TXT_RES.match(url)
            if m:
                ext = 'srt'
                url = '%s=%s' % (m.group(1), ext)
                filename = self.get_file_name(dir_name, name, ext)
                try:
                    self.br.retrieve(url, filename)
                except:
                    # Ignore if there are no subtitles in .srt format.
                    pass

    def get_file_name(self, dir_name, name, ext):
        return ('%s.%s' % (os.path.join(dir_name, name), ext)).lower()

    def get_real_resource_info(self, res_url):
        try:
            src = self.br.open(res_url)
            try:
                url = src.geturl()
                content_type = src.info().get('content-type', '')
                return (url, content_type)
            finally:
                src.close()
        except:
            return (res_url, '')

    def get_file_ext(self, url, content_type, res_type):
        m = REG_URL_FILE.search(url)
        if m:
            return m.group(2)
        m = REG_CONT_TYPE_EXT.match(content_type)
        if m:
            return m.group(1)
        return DEFAULT_EXT[res_type]

    def get_parts(self, doc):
        return select(doc, 'ul.item_section_list')

    def get_rows(self, doc):
        return select(doc, 'div.item_resource')

    def get_resources(self, doc):
        resources = []
        for a in select(doc, 'a'):
            url = a.get('href')
            img = select(a, 'img[src]')[0]
            src = img.get('src')
#......... some code omitted here .........