

Python web.URL Class Code Examples

This article collects typical usage examples of the Python class pattern.web.URL. If you are unsure what the URL class does, how to use it, or what working code built on it looks like, the curated examples here should help.


Below, after a minimal usage sketch, fifteen code examples of the URL class are shown, sorted by popularity by default.
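
Before the collected examples, here is a minimal usage sketch of the typical pattern.web.URL workflow: construct a URL object, download its contents (optionally reusing the local cache), and parse the returned HTML into a DOM for querying. The IMDB chart address and the CSS selector are placeholders chosen only for illustration; they mirror the usage in the examples below.

from pattern.web import URL, DOM, plaintext

# build a URL object and fetch the page, reusing the local cache when possible
url = URL("http://www.imdb.com/chart/top")   # illustrative address only
html = url.download(cached=True)

# parse the HTML into a DOM and query it with a CSS-style selector
dom = DOM(html)
for link in dom('td.titleColumn a')[:5]:
    print(plaintext(link.content))   # plaintext() strips markup from the node content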

Example 1: scrape_starrtest

def scrape_starrtest(county_num):
	if county_num<10:
		county_num = '0' + str(county_num)
	else:
		county_num = str(county_num)
	
	print county_num
	#url = 'http://star.cde.ca.gov/star2012/ViewReport.aspx?ps=true&lstTestYear=2012&lstTestType=X&lstCounty=01&lstDistrict=&lstSchool=&lstGroup=1&lstSubGroup=1'
	url = 'http://star.cde.ca.gov/star2012/ViewReport.aspx?ps=true&lstTestYear=2012&lstTestType=X&lstCounty=' + str(county_num) + '&lstDistrict=&lstSchool=&lstGroup=1&lstSubGroup=1'
	abs_url = URL(string = url)
	dom = DOM(abs_url.download(cached=True))#download the DOM

	
	#sciend_num = dom.by_class("rm")[4].content
	scicst_num = dom.by_class("rm")[3].content
	math_num = dom.by_class("rm")[2].content
	hist_num = dom.by_class("rm")[1].content
	ela_num = dom.by_class("rm")[0].content
	
	#sciend_percent = dom.by_class("rs")[4].content[:5]
	scicst_percent = dom.by_class("rs")[3].content[:5]
	math_percent = dom.by_class("rs")[2].content[:5]
	hist_percent = dom.by_class("rs")[1].content[:5]
	ela_percent = dom.by_class("rs")[0].content[:5]
	
	county = dom.by_tag("h2")[0].content
	
	
	# write all the collected data to a new row of the output file
	writer.writerow([county, ela_num,ela_percent, hist_num, hist_percent, math_num, math_percent,scicst_num, scicst_percent])
Developer: lisayao, Project: Education-in-California, Lines of code: 30, Source: Education_Scraper.py

Example 2: scrape_top_250

def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.
    Args:
        url: pattern.web.URL instance pointing to the top 250 index page
    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    #absolute_url = 'http://www.imdb.com'

    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    url = URL(url)
    dom = DOM(url.download(cached=True))
    
    # abs() from pattern.web resolves a relative href against a base URL
    from pattern.web import abs

    # collect the absolute URL of every movie listed in the title column
    for e in dom('.titleColumn'):
        for link in e('a'):
            movie_urls.append(abs(link.attributes.get('href'), base=url.redirect or url.string))
            
    # return url list
    return movie_urls
Developer: Jack-Herrer, Project: DataProcessing, Lines of code: 26, Source: imdb-crawler.py

Example 3: download_single_image

def download_single_image(url_link, pic_prefix_str, target_folder, image_size):
    """ Download data according to the url link given.
        Args:
            url_link (str): url str.
            pic_prefix_str (str): pic_prefix_str for unique label the pic
    """
    file_ext = os.path.splitext(url_link)[1] #use for checking valid pic ext
    temp_filename = pic_prefix_str + ".jpg"
    temp_filename_full_path = os.path.join(target_folder, temp_filename)

    valid_image_ext_list = ['.png','.jpg','.jpeg', '.gif', '.bmp', '.tiff'] #not comprehensive

    url = URL(url_link)
    if url.redirect:
        return # if there is re-direct, return

    if file_ext not in valid_image_ext_list:
        return #return if not valid image extension

    # download the image, resize it and save it as JPEG
    print url_link
    try:
        response = url.download()
        img = resize_image(response, image_size)
        img.save(temp_filename_full_path, "JPEG")
    except Exception as e:
        #if self.__print_download_fault:
        print 'Problem with processing this data: ', str(e), url_link
Developer: Saurabhbhati, Project: mushroom_crawler, Lines of code: 28, Source: GoogleImageExtractor.py

Example 4: downloading_csv

    def downloading_csv(self, download_type = 'hist'):
        """ Download the csv information for particular stock.
            download_type can be hist or div. If hist, will download the hist price.
            If div, will download dividend history.
            Kwargs:
                download_type (str): hist or div (default hist).
        """
        self.download_fault = 0

        if download_type == 'hist':
            target_url = self.hist_quotes_full_url
            sav_filename = os.path.join(self.hist_quotes_csvfile_path,'hist_stock_price_'+ self.individual_stock_sym+ '.csv')
        elif download_type == 'div':
            target_url = self.div_history_full_url
            sav_filename = os.path.join(self.hist_quotes_csvfile_path,'div_hist_'+ self.individual_stock_sym+ '.csv')
        else:
            print 'wrong download type'
            raise ValueError("download_type must be 'hist' or 'div'")

        url = URL(target_url)
        f = open(self.tempfile_sav_location, 'wb') # write the raw download to a temporary file
        try:
            f.write(url.download())#if have problem skip
        except:
            if self.__print_download_fault: print 'Problem with processing this data: ', target_url
            self.download_fault =1
        f.close()

        if not self.download_fault:
            if self.enable_save_raw_file:
                shutil.copyfile(self.tempfile_sav_location,sav_filename )
Developer: bigdurian1029, Project: Foo-Stock, Lines of code: 31, Source: yahoo_finance_historical_data_extract.py

Example 5: convertMapData

def convertMapData():
    print '[2/2] Convert map data'

    # output dictionary
    d3mapData = {}

    # download the file
    url = URL(DATASET3)
    data = url.download()

    # create array
    data = list(json.loads(data))

    # fill output dictionary
    for dataRow in data:
        if dataRow['Year'] == '2014':
            population = dataRow['Value']
            fillColor = defineColor(dataRow['Value'])
            d3mapData[dataRow['Country Code']] = {'population': population, 'fillKey': fillColor}

    print '[2/2] Write to json'

    # write output dictionary to json file
    with open('D3LinkedViews/data_map.json', 'wb') as output_file:
        json.dump(d3mapData, output_file)

    print '[2/2] Finish'
Developer: tjonger, Project: DataProcessing, Lines of code: 27, Source: convert.py

Example 6: scrape_top_250

def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''

    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
    from pattern.web import abs
    url = URL("http://www.imdb.com/chart/top")
    dom = DOM(url.download(cached = True))
    for e in dom.by_tag("td.titleColumn")[:250]:
        for link in e.by_tag("a"):
            link = link.attrs.get("href","")
            link = abs(link, base=url.redirect or url.string)
            movie_urls.append(link)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Developer: DaanvanderThiel, Project: DataProcessing, Lines of code: 27, Source: imdb-crawler.py

Example 7: process_page

def process_page():

    url = URL("http://www.imdb.com/search/title?num_votes=5000,&sort=user_rating,desc&start=1&title_type=tv_series")
    dom = DOM(url.download(cached=True))
    domIndex = 0

    for title in dom.by_class("title"):

        theTitle = str(title.by_tag("a")[0].content).encode('ascii', 'replace')
        titleCatalog.append(Title(theTitle))
    
        try:

            match = re.search("^(\d+).*$", str(dom.by_class("runtime")[domIndex].content).encode('ascii', 'replace'))
            #print match.group(1)
            # titleCatalog[domIndex].addRunTime( str(dom.by_class("runtime")[domIndex].content).encode('ascii', 'replace'))
            titleCatalog[domIndex].addRunTime(match.group(1))

        except Exception, e:
            pass

        try:
            titleCatalog[domIndex].addRank( str(dom.by_class("value")[domIndex].content).encode('ascii', 'replace'))
        except Exception, e:
            pass
Developer: aeggermont, Project: cs171, Lines of code: 25, Source: scraper.py

Example 8: extract_data

def extract_data(stock_ticker):
    url_base = 'http://financials.morningstar.com/ajax/exportKR2CSV.html?&callback=?&t='
    url_end = '&region=usa&culture=en-US&cur=&order=asc'
    # May add more exchanges later on, but these cover the main US stock exchanges: Nasdaq, New York SE, and Pink Sheets (OTC stocks), respectively
    # Loops through main stock exchanges to get proper URL for data extraction
    stock_exchange_list = ['XNAS:','XNYS:','PINX:'] 
    for exchange in stock_exchange_list:
        test = URL(url_base+exchange+stock_ticker+url_end)
        if sys.getsizeof(test.download()) > 35: #A broken URL produces an empty string, which has memory size 33; size 35 allows for minor variation in the size
            break
    temp_data = 'C:/Users/Owner/Documents/temp.csv'
    f = open(temp_data, mode='w')
    try:
        f.write(test.download())
    except:
        raise IOError('There was an error processing this data')
    f.close()
    try:
        stock_data_df =  pd.read_csv(temp_data, header=2,thousands =',',index_col=0,skiprows=[19,20,31,41,42,43,48,58,53,64,65,72,73,95,101,102])
    except:
        # remove the temporary file, then report the failure
        os.remove(temp_data)
        raise IOError('Problem downloading files')
    os.remove(temp_data)
    stock_data_df = stock_data_df.transpose()
    return(stock_data_df)
Developer: Talleyman, Project: stock-analyzer, Lines of code: 27, Source: grab_data.py

Example 9: get_patent

def get_patent(url):
    url = URL(url + "/fulltext")
    html = url.download()
    dom = DOM(html)
    title = plaintext(dom("h3 a")[0].content)
    body = plaintext(dom("#contents")[0].content)
    return [title, body]
Developer: codeaudit, Project: patent-generator, Lines of code: 7, Source: scraper.py

Example 10: cats

    def cats(self, namespace=0, start=None, acmin=1, count=100, cached=True, **kwargs):
        """ Returns an iterator over all article titles (for a given namespace id).
        """
        kwargs.setdefault("unicode", True)
        kwargs.setdefault("throttle", self.throttle)
        # Fetch article titles (default) or a custom id.
        id = kwargs.pop("_id", "title")
        id = "*"
        # Loop endlessly (= until the last request no longer yields an "apcontinue").
        # See: http://www.mediawiki.org/wiki/API:Allpages
        while start != -1:
            url = URL(self._url, method=GET, query={
                     "action": "query",
                       "list": "allcategories",
                     "acfrom": start or "",
                    "aclimit": min(count, 500),
                    "acprop": "size",
                    "acmin": max(1, acmin),
                     "format": "json"
            })
            data = url.download(cached=cached, **kwargs)
            data = json.loads(data)
            for x in data.get("query", {}).get("allcategories", {}):
                # print(x)
                if x.get(id):
                    # yield x[id]
                    x['name'] = x.pop('*')
                    yield x

            start = data.get("query-continue", {}).get("allcategories", {})
            start = start.get("accontinue", start.get("acfrom", -1))
        raise StopIteration
Developer: simgee, Project: wikia, Lines of code: 32, Source: test2.py

Example 11: scrape_top_250

def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    # Download the HTML file
    url = URL(url)
    html = url.download()

    # Parse the HTML file into a DOM representation
    dom = DOM(html)

    # Iterate through all 250 table rows on the index page
    for movies in dom('.lister-list > tr'):
        # take the movie's href attribute and put it in href
        href = movies('td.titleColumn a')[0].attrs["href"]
        # append the href attribute to the string, but also add http://www.imdb.com/ in front of it
        movie_urls.append("http://www.imdb.com/" + href)

    # return the list of URLs of each movie's page on IMDB
    return movie_urls
Developer: ibeerepoot, Project: DataProcessing, Lines of code: 32, Source: imdb-crawler.py

Example 12: getRandomHistoryDOM

def getRandomHistoryDOM(language):
    url = URL("http://"+language+".wikipedia.org/wiki/Special:Random")
    #Gets the url only of the page this redirects to
    redirectUrl = url.redirect
    try:
        #Grab the name of the wikipedia article from the url
        urlComponents = string.split(redirectUrl, '/')
    except AttributeError:
        #Use some recursion if we encounter a page with no history, or some other error
        return getRandomHistoryDOM(language)

    #Get the history section of the article
    redirectUrl = "http://"+language+".wikipedia.org/w/index.php?title="+urlComponents[4]+"&action=history"
    print "Current article is: " +str(urlComponents[4])
    #print redirectUrl
    url = URL(redirectUrl)
    dom = DOM(url.download(cached=False))
    try:
        historyList = dom.by_id("pagehistory").by_tag("li")
        return historyList, urlComponents[4]
    except AttributeError:
        #Use some recursion if we encounter a page with no history, or some other error
        dom = getRandomHistoryDOM(language)

    return getRandomHistoryDOM(language)
Developer: eucalyptustree, Project: wikiMap, Lines of code: 25, Source: wikiScraper1.2.py

Example 13: download_pdfs

def download_pdfs():
    """download pdfs from fda"""

    # where to save pdfs
    path = 'classifier_docs/pdfs/'

    # create directory if it doesn't exist
    if not os.path.exists(path):
        os.makedirs(path)

    # load in non-standard pdf urls from 2012 to serve as control text
    # note: had to lookup urls manually
    # drugs are erivedge (203388) and sirturo (204384)
    # also, menhibrix (125363) has no medical review available 
    urls = ['http://www.accessdata.fda.gov/drugsatfda_docs/nda/2012/203388Orig1s000MedRpdf.pdf',
            'http://www.accessdata.fda.gov/drugsatfda_docs/nda/2012/204384Orig1s000MedR_.pdf']
    for url in urls:
        m = re.search('20..\/(\d{6})', url)
        app_num = m.group(1)
        url = URL(url)
        # make sure that url points to PDF, print error otherwise
        if url.mimetype in MIMETYPE_PDF:
            # write pdf for medical review if it doesn't exist
            fn = path + app_num + '.pdf'
            if not os.path.exists(fn):
                print "writing {} from {}".format(fn, url)
                f = open(fn, 'wb')
                f.write(url.download(cached = False))
                f.close()
            else:
                print "{} already exists".format(fn)
        else:
            print "warning: {} did not resolve to pdf".format(url)

    return
Developer: joepax, Project: openpharma, Lines of code: 35, Source: classifier_docs.py

Example 14: downloadPDFs

def downloadPDFs(dictListJSON, state, jsonExists = False):
    #state = dictListJSON[0, 2]
    dlJSONFile = open(dictListJSON, "r")
    dictJSON = json.load(dlJSONFile)
    dlJSONFile.close()
    #some condition to check if the JSON already exists
    if jsonExists:
        pdfDictList = dictJSON
    else:
        pdfDictList = findPDFLinks(dictJSON, state)


    count = 0
    for dict in pdfDictList:
        #test if date > 01/01/13
        fileName = "".join(str(dict["AdvertiserInfo"]).split())
        print "Writing to " + fileName
        url = dict["PDFLink"]
        url = re.sub(' ', '%20', url)
        print url
        if url != "NO URL":
            urlOpened = URL(url)
            f = open(fileName, 'wb')
            #download to state pdfs directory
            f.write(urlOpened.download(cached=False))
            f.close()
        count += 1
        if count > 4:
            break
Developer: decodyng, Project: elecTweets, Lines of code: 29, Source: sunlightScraper.py

Example 15: get_patent_urls

def get_patent_urls(keyword, limit=10):
    keyword = urllib.quote_plus(keyword)
    base_url = "http://www.lens.org"
    url = URL(base_url + "/lens/search?ft=true&l=en&st=true&n=" + str(limit) + "&q=" + keyword)
    dom = DOM(url.download())
    links = [base_url + a.attributes.get("href") for a in dom("a.link")]
    return links
Developer: codeaudit, Project: patent-generator, Lines of code: 7, Source: scraper.py


Note: The pattern.web.URL class examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by many developers; copyright in the code remains with the original authors, and any use or redistribution should follow the corresponding project's license. Please do not repost without permission.