

Python Browser.retrieve Method Code Examples

This article collects typical usage examples of the mechanize.Browser.retrieve method in Python. If you are unsure exactly what Browser.retrieve does or how to call it, the hand-picked examples below should help. You can also browse further usage examples of the class it belongs to, mechanize.Browser.


A total of 15 code examples of Browser.retrieve are shown below, ordered roughly by popularity.
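For orientation before the examples, here is a minimal sketch of the method (the URL and output filename are placeholders, not taken from any of the examples below): Browser.retrieve works much like urllib's urlretrieve, downloading a URL straight to a local file and returning a (filename, headers) tuple.

from mechanize import Browser

br = Browser()
br.set_handle_robots(False)  # several of the examples below also disable robots.txt handling

# Download directly to disk; retrieve() returns (local_filename, headers).
local_path, headers = br.retrieve("http://example.com/report.pdf", "report.pdf")
print(local_path)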

Example 1: download

# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
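# Downloads url into saveto/filename; with overwrite == 2, an existing file is re-downloaded only if the remote Last-Modified header is newer than the local copy.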
def download(url, filename = "", saveto = "", overwrite = 2, suffix = ""):
	try :
		if (filename == "") :
			filename = url.split("/")[-1]
			filename = filename.split("?")[0]
		do_download = True
		if( not saveto.endswith("/")) :
			saveto = saveto + "/"
		if(overwrite == 2 and os.path.isfile(saveto + filename)) :
			br = Browser()
			br.open(url)
			remote_time = time.strptime(br.response().info()["last-modified"], "%a, %d %b %Y %H:%M:%S GMT")
			local_time  = time.gmtime((os.stat(saveto + filename + suffix).st_mtime))
			do_download = (remote_time > local_time)
		elif (overwrite == 0 and os.path.isfile(saveto + filename)) :
			do_download = False
		if(do_download) :
			br = Browser()
			os.chdir(saveto)
			br.retrieve(url,filename+suffix)
			print("Downloaded " + url + " succesfully")
		else :
			print(url + " exists already")
	except:
		print("Failed: " + url)
Author: yforster | Project: pythomat | Lines: 27 | Source file: pythomat.py

Example 2: go

# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
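# Worker loop: repeatedly retrieves a CAPTCHA image, OCRs it with an external convert.sh script, and POSTs the form with a random e-mail address and the recognized text.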
def go(thread_num):
    global connections
    global successful
    global errors

    file_name = str(thread_num)
    br = Browser()
    while(True):
        try:
            br.retrieve('http://szuku.pl/jcaptcha/jpeg/imageCaptcha', file_name + '.jpeg')
            
            ocr = popen('./convert.sh ' + file_name).read()
            captcha = only_letters_or_digits(ocr)

            br.open('http://szuku.pl/teaser/save', 
                    urlencode({'email': rand_mail(),
                               'captcha': captcha}))
            
            connections += 1

            if ok_regex.search(br.response().read()):
                successful += 1

        except Exception, e:
            print e
            errors += 1
Author: dillonko | Project: python | Lines: 28 | Source file: szuku.py

Example 3: GetServerConfigFile

# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
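# Fetches servercfg.html from an Aastra phone's web interface using HTTP basic-auth credentials; returns True on success, False otherwise.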
def GetServerConfigFile(url_aastra, return_file):
    br = Browser()
    br.add_password(url_aastra, "admin", "22222")
    try:
        br.retrieve(url_aastra + "/servercfg.html", return_file)
        return True
    except:
        log.warn("Maybe isn't a aastra phone? Are you Sure?")
        return False
Author: arpagon | Project: pyaastra | Lines: 11 | Source file: WebAdmin.py

Example 4: main

# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
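# Collects every link on page that matches regex, asks for confirmation, then retrieves each file into path while reporting size and elapsed time.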
def main(page, regex, path):
    start_time = time.time()
    br = Browser()
    br.set_handle_robots(False)
    br.open(page)
    #br.open('http://storage.googleapis.com/books/ngrams/books/datasetsv2.html')

    eng_all = re.compile(regex)
    #eng_all = re.compile('.*googlebooks-eng-all.*20120701.*')

    #print page, regex, path
    n = 0
    maxlen = 0
    link_list = []
    for link in br.links():
        if eng_all.match(link.url):
            n += 1
            maxlen = max(len(os.path.basename(link.url)), maxlen)
            link_list.append(link.url)
            sys.stderr.write('Found Link: %s\n' % link.url)

    answer = raw_input("\n\nAre you sure you want to download the above %i file(s)? (Y/N):  " % n)
    if answer == 'N' or answer == 'n':
        sys.exit(0)

    sys.stderr.write('\n\nDownloading files to: %s\n' % path)

    digits = len('%d' % n)
    disp_time = datetime.datetime.now

    for i, link in enumerate(link_list):
        download_start = time.time()
        file_name = os.path.basename(link)
        full_path = os.path.join(path, file_name)
        if os.path.exists(full_path):
            sys.stderr.write('%s exists, not downloading\n' % full_path)
            continue
        try:
            sys.stderr.write('[%s] Downloading(%-*i of %i): %*s' % (str(disp_time().time())[:8], digits, i+1, n,
                                                                   maxlen + 2, file_name))
            br.retrieve(link, filename=full_path)
        except:
            sys.stderr.write('\n\nSomething happened, deleting last file: %s\n' % full_path)
            os.remove(full_path)
            sys.exit(0)
        sys.stderr.write(' of size %s MB in %5.2f min\n' % ("{:7.2f}".format(float(os.stat(full_path).st_size)/1000000),
                                                            (time.time() - download_start)/60))
        br.clear_history()

    sys.stderr.write('\ndownloaded %i files to %s directory in %15f seconds\n' % (n, path, time.time()-start_time))
Author: on2valhalla | Project: NgramViewerScraper | Lines: 52 | Source file: ngrams_scraper.py

Example 5: down_image

# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
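# Retrieves an image into a temporary .png file using a fresh Browser with its own CookieJar and returns the local file path.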
 def down_image(self, img):
     print "down image from " + img
     down_br = Browser()
     down_cj = CookieJar()
     down_br.set_cookiejar(down_cj)
     fn = tempfile.mktemp(suffix='.png')
     return down_br.retrieve(img, filename = fn)[0]
Author: lite | Project: yebob_utils | Lines: 9 | Source file: Yebob.py

Example 6: take_action

# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
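# kaggle-cli "download" command: reads credentials and the competition name from the command line or ~/.kaggle-cli/config, logs in, scrapes the competition's data page, and retrieves every listed file.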
    def take_action(self, parsed_args):
        config_dir = '~/.kaggle-cli'
        config_dir = os.path.expanduser(config_dir)

        if os.path.isdir(config_dir):
            config = ConfigParser.ConfigParser(allow_no_value=True)
            config.readfp(open(config_dir + '/config'))

            if parsed_args.username:
                username = parsed_args.username
            else:
                username = config.get('user', 'username')

            if parsed_args.password:
                password = parsed_args.password
            else:
                password = config.get('user', 'password')

            if parsed_args.competition:
                competition = parsed_args.competition
            else:
                competition = config.get('user', 'competition')

        base = 'https://www.kaggle.com'
        login_url = base
        data_url = '/'.join([base, 'c', competition, 'data'])

        browser = Browser()

        browser.open(login_url)
        browser.select_form(nr=0)

        browser['UserName'] = username
        browser['Password'] = password

        browser.submit()

        browser.open(data_url)
        data_page = html.fromstring(browser.response().read())

        src_urls = map(
            lambda x: base + x.attrib['href'],
            data_page.cssselect('#data-files a'))

        for url in src_urls:
            self.app.stdout.write('downloading %s\n' % url)
            browser.retrieve(url, url.split('/')[-1])
Author: asmith26 | Project: kaggle-cli | Lines: 49 | Source file: download.py

Example 7: main

# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
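# Logs in to the NWEA MAP reports site, finds the CDF download link by regex, retrieves it to a temporary file, and unzips the archive.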
def main():
    ## create a browser object
    ## NWEA has a pretty aggressive robots.txt
    ## here's what we'll do about that: ignore it
    br = Browser()
    #br.set_handle_redirect(False)    
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

    ## open the login page, form is called loginForm
    br.open(LOGIN_URL)
    br.select_form(name="loginForm")    
    br['username'] = USERNAME
    br['password'] = SECRET    
    response = br.submit()  ## submit and store response
    print 'credentials successful, logged in'
    #print response.read()

    #once logged in, navigate to reports page
    br.open(BASE_URL + '/report/home/map')

    #CDF file looks like "https://kippteamschools-admin.mapnwea.org/report/download/cdf/7492"
    #get the matching cdf and build the full url
    cdf_string = '/report/download/cdf/[0-9]+'
    file_target = br.find_link(url_regex=cdf_string)
    file_loc =  BASE_URL + file_target.url
    print 'cdf is located at %s' % (file_loc)

    #retrieve will get file at the location and save to a temp directory
    cdf_zipped = br.retrieve(file_loc)[0]
    print 'temp file is located at %s' % cdf_zipped

    sourceZip = ZipFile(cdf_zipped, 'r')
    print
    print 'beginning unzip'
    for name in sourceZip.namelist():
        print 'extracted %s...' % (name)
        sourceZip.extract(name, UNZIPPED_DEST)
    sourceZip.close()
Author: cbini | Project: eduextractor | Lines: 41 | Source file: map_loader.py

Example 8: CourseraDownloader

# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
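# Partial CourseraDownloader class: authenticates via csrf_token/CAUTH cookies, walks the course's parts/rows/resources, and its retrieve() helper skips existing files unless forced, deleting partial files on error.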

#......... part of the code is omitted here .........
            logging.critical("couldn't authenticate")
            sys.exit(1)
        logging.info("successfully authenticated")

    def set_csrf_token(self):
        self.csrf_token = self.get_cookie_value('csrf_token')

    def set_session(self):
        self.session = self.get_cookie_value('CAUTH')

    def get_cookie_value(self, search_name):
        for cookie in self.br._ua_handlers['_cookies'].cookiejar:
            if cookie.name == search_name:
                return cookie.value

    def set_auth_headers(self):
        self.br.addheaders = [
            ('Cookie', 'csrftoken=%s' % self.csrf_token),
            ('Referer', 'https://accounts.coursera.org/signin'),
            ('X-CSRFToken', self.csrf_token),
        ]

    def set_download_headers(self):
        self.br.addheaders = [
            (
                'Cookie',
                'csrftoken=%s;CAUTH=%s' % (self.csrf_token, self.session)
            ),
        ]

    def is_authenticated(self, test_page):
        m = re.search(
            'https://class.coursera.org/%s/auth/logout' % self.course_name,
            test_page
        )
        return m is not None

    def download(self):
        course_dir = os.path.join(TARGETDIR, self.course_name)
        if not os.path.exists(course_dir):
            os.mkdir(course_dir)
        page = self.br.open(self.lectures_url)
        doc = BeautifulSoup(page)
        parts, part_titles = self.get_parts(doc)
        for idx, part in enumerate(parts):
            if self.item_is_needed(self.parts_ids, idx):
                part_dir = os.path.join(
                    course_dir,
                    '%02d - %s' % (
                        (idx + 1),
                        self.escape_name(part_titles[idx].text).strip()
                    )
                )
                self.download_part(part_dir, part)

    def download_part(self, dir_name, part):
        if not os.path.exists(dir_name):
            os.mkdir(dir_name)
        rows, row_names = self.get_rows(part)
        for idx, row in enumerate(rows):
            if self.item_is_needed(self.rows_ids, idx):
                self.download_row(
                    dir_name,
                    '%02d - %s' % (
                        (idx + 1),
                        row_names[idx].text.strip()
                    ),
                    row
                )

    def download_row(self, dir_name, name, row):
        resources = self.get_resources(row)
        for resource in resources:
            if self.item_is_needed(self.types, resource[1]):
                self.download_resource(dir_name, name, resource)

    def download_resource(self, dir_name, name, resource):
        res_url = resource[0]
        res_type = resource[1]
        url, content_type = self.get_real_resource_info(res_url)
        ext = self.get_file_ext(url, content_type, res_type)
        if ext:
            filename = self.get_file_name(dir_name, name, ext)
            self.retrieve(url, filename)

    def retrieve(self, url, filename):
        if os.path.exists(filename) and not self.force:
            logging.info("skipping file '%s'" % filename)
        else:
            logging.info("downloading file '%s'" % filename)
            logging.debug("URL: %s" % url)
            try:
                self.br.retrieve(url, filename, reporter)
            except KeyboardInterrupt:
                if os.path.exists(filename): os.remove(filename)
                raise
            except Exception, ex:
                if os.path.exists(filename): os.remove(filename)
                logging.debug(ex)
                logging.info("couldn't download the file")
Author: Mondego | Project: pyreco | Lines: 104 | Source file: allPythonContent.py

Example 9: CoreEmulator

# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
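# MoodleFUSE browser emulator: form-based login, file download via retrieve(), and helpers for toggling course editing and filtering assignment submissions.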
class CoreEmulator(Emulator):

    def __init__(self, username, password):
        super(CoreEmulator, self).__init__(username, password)
        self.setup_emulator()

    def setup_emulator(self):
        self.browser = Browser()
        self.browser.set_handle_robots(False)
        self.browser.addheaders = moodle.USER_AGENT
        self.cookiejar = CookieJar()
        self.browser.set_cookiejar(self.cookiejar)

    def session_expired(self):
        return self.browser.geturl().endswith(moodle.LOGIN_LOCATION)

    @throws_moodlefuse_error(exception.LoginException)
    def login(self):
        self.open_login_page(self.browser.open)
        self.browser.select_form(
            predicate=lambda form: form.attrs.get('id') == attributes.LOGIN
        )
        self.browser.form.set_value(self.username, name='username')
        self.browser.form.set_value(self.password, name='password')
        resp = self.browser.submit()

        if resp.geturl().endswith(moodle.LOGIN_LOCATION):
            raise Exception

    @throws_moodlefuse_error(resource_errors.UnableToDownloadResource)
    def download(self, destination, source):
        source = str(source)
        if not source.startswith('http://') and not source.startswith('file://'):
            source = config['TEST_DATA'] + '/' + source

        self.browser.retrieve(source, destination)

    def open_link(self, url):
        response = self.browser.open(url)
        return BeautifulSoup(response.read())

    def check_form_checkbox(self, checkboxname):
        self.browser.find_control(checkboxname).items[0].selected = True

    def uncheck_form_checkbox(self, checkboxname):
        self.browser.find_control(checkboxname).items[0].selected = False

    def add_form_content(self, inputname, content):
        self.browser.form.set_value(content, name=inputname)

    def close_form(self):
        self.browser.submit()

    def set_form_to_first_form(self):
        self.browser.select_form(nr=0)

    def set_form_to_form_with_control_value(self, value):
        for form in self.browser.forms():
            for control in form.controls:
                if control.value == value:
                    self.browser.form = form

    @throws_moodlefuse_error(exception.UnableToToggleEditing)
    def turn_course_editing_on(self):
        self.set_form_to_form_with_control_value(moodle.EDIT_ON_MOODLE_BUTTON_TEXT)
        response = self.browser.submit()
        return BeautifulSoup(response.read())

    def _setup_assignments_for_parsing(self, submission_filter):
        self.set_form_to_form_with_control_value('Save and update table')
        self.browser.form["filter"] = [submission_filter]
        self.browser.form["perpage"] = ["100"]
        self.uncheck_form_checkbox('quickgrading')
        response = self.browser.submit()
        return BeautifulSoup(response.read())

    def filter_assignment_submissions(self):
        return self._setup_assignments_for_parsing("submitted")

    def unfilter_assignment_submissions(self):
        return self._setup_assignments_for_parsing("")

    @throws_moodlefuse_error(exception.UnableToToggleEditing)
    def turn_course_editing_off(self):
        self.set_form_to_form_with_control_value(moodle.EDIT_OFF_MOODLE_BUTTON_TEXT)
        response = self.browser.submit()
        return BeautifulSoup(response.read())

    @throws_moodlefuse_error(course_errors.InvalidMoodleIndex)
    def get_courses(self):
        return self.open_link(config['MOODLE_INDEX_ADDRESS'])

    @throws_moodlefuse_error(course_errors.UnableToObtainCategoryList)
    def get_course_categories(self, url):
        return self.open_link(url)

    @throws_moodlefuse_error(resource_errors.UnableToObtainResourceList)
    def get_course_resource_names(self, url):
        return self.open_link(url)

#......... part of the code is omitted here .........
Author: BroganD1993 | Project: MoodleFUSE | Lines: 103 | Source file: core_emulator.py

Example 10: RegPublDownloader

# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
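# Crawls regeringen.se search results page by page, saving each document's index page and its linked PDF files via retrieve().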
class RegPublDownloader(LegalSource.Downloader):
    
    def __init__(self,baseDir="data"):
        self.dir = baseDir + "/regpubl/downloaded"
        if not os.path.exists(self.dir):
            Util.mkdir(self.dir)
        self.config = ConfigObj("%s/%s.ini" % (self.dir, __moduledir__))

        # Why does this say "super() argument 1 must be type, not classobj"
        # super(RegPublDownloader,self).__init__()
        self.browser = Browser()
    
    def DownloadAll(self):
        # we use mechanize instead of our own Robot class to list
        # available documents since we can't get the POST/cookie based
        # search to work.
        doctype = '160'
        log.info(u'Selecting documents of type %s' % doctype)
        self.browser.open("http://www.regeringen.se/sb/d/108/action/browse/c/%s" % doctype)
        log.info(u'Posting search form')
        self.browser.select_form(nr=1)
        self.browser.submit()

        pagecnt = 1
        done = False
        while not done:
            log.info(u'Result page #%s' % pagecnt)
            for l in self.browser.links(url_regex=r'/sb/d/108/a/\d+'):
                self._downloadSingle(l.absolute_url)
                self.browser.back()
            try:
                self.browser.find_link(text='N\xe4sta sida')
                self.browser.follow_link(text='N\xe4sta sida')
            except LinkNotFoundError:
                log.info(u'No next page link found, this was the last page')
                done = True
            pagecnt += 1
        self.config['last_update'] = datetime.date.today()    
        self.config.write()
        
    def DownloadNew(self):
        if 'last_update' in self.config:
            then = datetime.datetime.strptime(self.config['last_update'], '%Y-%m-%d')
        else:
            # assume last update was more than a year ago
            then = datetime.datetime.now() - datetime.timedelta(-367)
        
        now =  datetime.datetime.now()
        if (now - then).days > 30:
            pass
            # post a "last 30 days" query
        elif (now - then).days > 365:
            pass
            # post a "last 12 months" query
        else:
            # post a full query
            self.DownloadAll()        
        
    def _downloadSingle(self,url):
        docid = re.match(r'http://www.regeringen.se/sb/d/108/a/(\d+)', url).group(1)

        fname = "%s/%s/index.html" % (self.dir, docid)
        log.info(u'    Loading docidx %s' % url)
        self.browser.open(url)
        if not os.path.exists(fname):
            Util.ensureDir(fname)
            self.browser.retrieve(url,fname)
        
        for l in self.browser.links(url_regex=r'/download/(\w+\.pdf).*'):
            filename = re.match(r'http://www.regeringen.se/download/(\w+\.pdf).*',l.absolute_url).group(1)
            # note; the url goes to a redirect script; however that
            # part of the URL tree (/download/*) is off-limits for
            # robots. But we can figure out the actual URL anyway!
            if len(docid) > 4:
                path = "c6/%02d/%s/%s" % (int(docid[:-4]),docid[-4:-2],docid[-2:])
            else:
                path = "c4/%02d/%s" % (int(docid[:-2]),docid[-2:])
            fileurl = "http://regeringen.se/content/1/%s/%s" % (path,filename)
            
            df = "%s/%s/%s" % (self.dir,docid, filename)
            if not os.path.exists(df):
                log.info(u'        Downloading %s' % (fileurl))
                self.browser.retrieve(fileurl, df)
            else:
                log.info(u'        Already downloaded %s' % (fileurl))
Author: staffanm | Project: legacy.lagen.nu | Lines: 87 | Source file: RegPubl.py

Example 11: CourseraDownloader

# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
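# CourseraDownloader variant: logs in with e-mail and password, walks the lecture page's parts/rows/resources, and retrieves each file, skipping existing files unless force is set.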
class CourseraDownloader(object):
    login_url = ''
    home_url = ''
    lectures_url = ''
    course_name = ''

    def __init__(self, config):
        self.parts_ids = config['parts']
        self.rows_ids = config['rows']
        self.types = config['types']
        self.force = config['force']
        self.br = Browser()
        self.br.set_handle_robots(False)

    def authenticate(self):
        self.br.open(self.login_url)
        self.br.form = self.br.forms().next()
        self.br['email'] = EMAIL
        self.br['password'] = PASSWORD
        self.br.submit()
        home_page = self.br.open(self.home_url)
        if not self.is_authenticated(home_page.read()):
            log("couldn't authenticate")
            sys.exit(1)
        log("successfully authenticated")

    def is_authenticated(self, test_page):
        m = re.search(
            'https://class.coursera.org/%s/auth/logout' % self.course_name,
            test_page)
        return m is not None

    def download(self):
        course_dir = os.path.join(TARGETDIR, self.course_name)
        if not os.path.exists(course_dir):
            os.mkdir(course_dir)
        page = self.br.open(self.lectures_url)
        doc = BeautifulSoup(page)
        parts, part_titles = self.get_parts(doc)
        for idx, part in enumerate(parts):
            if self.item_is_needed(self.parts_ids, idx):
                part_dir = os.path.join(
                    course_dir,
                    '%02d - %s' % ((idx + 1),
                    part_titles[idx].text.strip()))
                self.download_part(part_dir, part)

    def download_part(self, dir_name, part):
        if not os.path.exists(dir_name):
            os.mkdir(dir_name)
        rows, row_names = self.get_rows(part)
        for idx, row in enumerate(rows):
            if self.item_is_needed(self.rows_ids, idx):
                self.download_row(dir_name, '%02d - %s' % ((idx + 1),
                                  row_names[idx].text.strip()), row)

    def download_row(self, dir_name, name, row):
        resources = self.get_resources(row)
        for resource in resources:
            if self.item_is_needed(self.types, resource[1]):
                self.download_resource(dir_name, name, resource)

    def download_resource(self, dir_name, name, resource):
        res_url = resource[0]
        res_type = resource[1]
        url, content_type = self.get_real_resource_info(res_url)
        ext = self.get_file_ext(url, content_type, res_type)
        filename = self.get_file_name(dir_name, name, ext)
        self.retrieve(url, filename)

    def retrieve(self, url, filename):
        if os.path.exists(filename) and not self.force:
            log("skipping file '%s'" % filename)
        else:
            log("downloading file '%s'" % filename)
            try:
                self.br.retrieve(url, filename)
            except KeyboardInterrupt:
                raise
            except:
                log("couldn't download the file")

    def item_is_needed(self, etalons, sample):
        return (len(etalons) == 0) or (sample in etalons)

    def get_file_name(self, dir_name, name, ext):
        name = self.escape_name(name)
        return ('%s.%s' % (os.path.join(dir_name, name), ext))

    def escape_name(self, name):
        return name.replace('/', '_').replace('\\', '_')

    def get_real_resource_info(self, res_url):
        try:
            src = self.br.open(res_url)
            try:
                url = src.geturl()
                content_type = src.info().get('content-type', '')
                return (url, content_type)
            finally:
#......... part of the code is omitted here .........
Author: jpxw | Project: coursera | Lines: 103 | Source file: coursera.py

Example 12: process_sites

# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
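# For each rental-listing site, retrieves the search pages, extracts rent links with BeautifulSoup, diffs them against links pickled on previous runs, and e-mails any new listings.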
def process_sites(sites_dict, rent_src):
    br = Browser()    
    for site in sites_dict.keys(): 
        print 'Searching site ', site
        all_rent_links = []  
        for site_urls in sites_dict.get(site):             
            all_locs_rent_links = []          
            for loc in site_urls.keys(): 
                print  >> sys.stderr, 'Looking for new rent for a price range in ', site, loc
                time.sleep(3)                
                result = br.retrieve(site_urls.get(loc))                 
                temp_file_name = result[0]
                html_f = open(temp_file_name)
                soup = BeautifulSoup(html_f)                
                rent_links = get_rent_links(soup,site)                
                all_locs_rent_links.extend(rent_links) 
            all_rent_links.extend(all_locs_rent_links)    

                             
        print >> sys.stderr, len(all_rent_links), 'links found!'        
        
        #TODO: any better way to do below
        try:
            f = open(site+'_'+rent_src,'r')
        except IOError:
            f = open(site+'_'+rent_src,'w')
            pickle.dump([],f)
            f.close()
            f = open(site+'_'+rent_src,'r')
                
        old_pure_rent_links = pickle.load(f)
        f.close()
        
        print >> sys.stderr, len(old_pure_rent_links), 'old links stored in file!'
        
        #below actually IS list of string!
        #new_all_rent_links = list(set(all_rent_links) - set(old_all_rent_links))
        new_all_rent_links = [] 
        #old_pure_links = [link[0] for link in old_all_rent_links]       
        new_pure_rent_links = []
        for l in all_rent_links:
            if not l[0] in old_pure_rent_links and not l[0] in new_pure_rent_links: 
                new_all_rent_links.append(l)
                #only the link info is stored in the file to save the space
                new_pure_rent_links.append(l[0])
        
        
        print >> sys.stderr, len(new_all_rent_links), ' of new rents found!'
        
        #think of using lambda TODO:
        if new_all_rent_links:            
            info = ('\n\n'.join(['   '.join(link) for link in new_all_rent_links])).encode('utf-8')
            print 'info', info 
            last_parag = "\n\n\n 去http://91biji.com/groups/%E7%A7%9F%E6%88%BF/bookmarkbook/管理你的看房记录(如果点击上面链接后看到想继续考虑的房子,你可以用91biji.com将其加入你在的租房小组。) \n\n 91biji.com 敬上" 
            content = '从 '+site+' 新找到的租房信息:\n\n'+info+last_parag
            mailserver = get_mail_server()  
            sendEmail(mailserver, u'[email protected]', to_addr_list, '('+rent_src+')'+' 从 '+site+' 新找到的'+str(len(new_all_rent_links))+'条租房信息', content) 
            mailserver.close()
        
        old_pure_rent_links.extend(new_pure_rent_links)
        f = open(site+'_'+rent_src,'w')
        pickle.dump(old_pure_rent_links,f)
        f.close()
Author: yejia | Project: rent_alert | Lines: 65 | Source file: rent.py

Example 13: CourseraDownloader

# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
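# Partial coursera-dl downloader: compares the Content-Length header with an existing file's size to decide whether to re-download, then fetches the course index and lecture pages.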

#......... part of the code is omitted here .........

        # get the headers
        headers = r.info()

        # get the content length (if present)
        clen = int(headers['Content-Length']) if 'Content-Length' in headers else -1 
 
        # build the absolute path we are going to write to
        fname = target_fname or sanitiseFileName(CourseraDownloader.getFileName(headers)) or CourseraDownloader.getFileNameFromURL(url)
        filepath = os.path.join(target_dir,fname)

        dl = True
        if os.path.exists(filepath):
            if clen > 0: 
                fs = os.path.getsize(filepath)
                delta = clen - fs

                # all we know is that the current filesize may be shorter than it should be and the content length may be incorrect
                # overwrite the file if the reported content length is bigger than what we have already by at least k bytes (arbitrary)

                # TODO this is still not foolproof as the fundamental problem is that the content length cannot be trusted
                # so this really needs to be avoided and replaced by something else, eg., explicitly storing what downloaded correctly
                if delta > 2:
                    print '    - "%s" seems incomplete, downloading again' % fname
                else:
                    print '    - "%s" already exists, skipping' % fname
                    dl = False
            else:
                # missing or invalid content length
                # assume all is ok...
                dl = False

        try:
            if dl: self.browser.retrieve(url,filepath)
        except Exception as e:
            print "Failed to download url %s to %s: %s" % (url,filepath,e)

    def download_course(self,cname,dest_dir="."):
        """Download all the contents (quizzes, videos, lecture notes, ...) of the course to the given destination directory (defaults to .)"""

        # Ensure we are logged in
        self.login(cname)

        # get the lecture url
        course_url = self.lecture_url_from_name(cname)

        (weeklyTopics, allClasses) = self.get_downloadable_content(course_url)
        print '* Got all downloadable content for ' + cname

        course_dir = os.path.abspath(os.path.join(dest_dir,cname))

        # ensure the target dir exists
        if not os.path.exists(course_dir):
            os.mkdir(course_dir)

        print "* " + cname + " will be downloaded to " + course_dir

        # ensure the course directory exists
        if not os.path.exists(course_dir):
            os.makedirs(course_dir)

        # download the standard pages
        print " - Downloading lecture/syllabus pages"
        self.download(self.HOME_URL % cname,target_dir=course_dir,target_fname="index.html")
        self.download(course_url,target_dir=course_dir,target_fname="lectures.html")
Author: afu1979 | Project: coursera-dl | Lines: 69 | Source file: courseradownloader.py

Example 14: str

# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
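# Standalone script: uploads a file to smog.rice.edu's GenTopGro form, waits briefly, then retrieves the generated result whose link contains the nickname "mark".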
import re
import time
import os
from mechanize import Browser

filenm = str((sys.argv)[1])
url = "http://smog.rice.edu"

br = Browser()
br.set_handle_robots(False)
br.open(url+"/cgi-bin/GenTopGro.pl")

br.select_form(nr=0)

fp = open(filenm,"r")
br.form.add_file(fp, "text/plain", filenm, name='uploaded_file')

br["nickname"] = "mark"

br.submit()
fp.close()

time.sleep(5)

for link in br.links():
	loc = link.url.find("mark")
	if loc > 0:
		f = link.url[loc:]
		br.retrieve(url+link.url, os.path.join(os.getcwd(), f))
		break
Author: zangtw | Project: Ensemble-based-restraints | Lines: 32 | Source file: makeSBM.py

Example 15: CourseraDownloader

# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import retrieve [as alias]
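# Another CourseraDownloader variant: retrieves each lecture resource and, for txt subtitles, also tries the matching .srt URL, ignoring failures.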
class CourseraDownloader(object):
    login_url = ''
    lectures_url = ''

    def __init__(self, parts_ids=[], rows_ids=[], types=[]):
        self.parts_ids = parts_ids
        self.rows_ids = rows_ids
        self.types = types
        self.br = Browser()
        self.br.set_handle_robots(False)

    def authenticate(self):
        self.br.open(self.login_url)
        self.br.form = self.br.forms().next()
        self.br['email'] = EMAIL
        self.br['password'] = PASSWORD
        self.br.submit()

    def download(self):
        page = self.br.open(self.lectures_url)
        doc = BeautifulSoup(page)
        parts = self.get_parts(doc)
        for idx, part in enumerate(parts):
            if self.item_is_needed(self.parts_ids, idx):
                self.download_part('%02d' % (idx + 1), part)

    def download_part(self, dir_name, part):
        if not os.path.exists(dir_name):
            os.mkdir(dir_name)
        rows = self.get_rows(part)
        for idx, row in enumerate(rows):
            if self.item_is_needed(self.rows_ids, idx):
                self.download_row(dir_name, '%02d' % (idx + 1), row)

    def download_row(self, dir_name, name, row):
        resources = self.get_resources(row)
        for resource in resources:
            if self.item_is_needed(self.types, resource[1]):
                self.download_resource(dir_name, name, resource)

    def item_is_needed(self, etalons, sample):
        return (len(etalons) == 0) or (sample in etalons)

    def download_resource(self, dir_name, name, resource):
        res_url = resource[0]
        res_type = resource[1]
        url, content_type = self.get_real_resource_info(res_url)
        ext = self.get_file_ext(url, content_type, res_type)
        filename = self.get_file_name(dir_name, name, ext)
        self.br.retrieve(url, filename)

        # Download subtitles in .srt format together with .txt.
        if res_type == 'txt':
            m = REG_TXT_RES.match(url)
            if m:
                ext = 'srt'
                url = '%s=%s' % (m.group(1), ext)
                filename = self.get_file_name(dir_name, name, ext)
                try:
                    self.br.retrieve(url, filename)
                except:
                    # Ignore if there is no subtitles in .srt format.
                    pass

    def get_file_name(self, dir_name, name, ext):
        return ('%s.%s' % (os.path.join(dir_name, name), ext)).lower()

    def get_real_resource_info(self, res_url):
        try:
            src = self.br.open(res_url)
            try:
                url = src.geturl()
                content_type = src.info().get('content-type', '')
                return (url, content_type)
            finally:
                src.close()
        except:
            return (res_url, '')

    def get_file_ext(self, url, content_type, res_type):
        m = REG_URL_FILE.search(url)
        if m:
            return m.group(2)
        m = REG_CONT_TYPE_EXT.match(content_type)
        if m:
            return m.group(1)
        return DEFAULT_EXT[res_type]

    def get_parts(self, doc):
        return select(doc, 'ul.item_section_list')

    def get_rows(self, doc):
        return select(doc, 'div.item_resource')

    def get_resources(self, doc):
        resources = []
        for a in select(doc, 'a'):
            url = a.get('href')
            img = select(a, 'img[src]')[0]
            src = img.get('src')
#......... part of the code is omitted here .........
Author: dedsm | Project: coursera | Lines: 103 | Source file: coursera.py


Note: The mechanize.Browser.retrieve examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their respective developers, and copyright remains with the original authors; when redistributing or using the code, please follow the license of the corresponding project. Do not republish without permission.