

Python datasets.get_data_home Function Code Examples

This article collects typical usage examples of Python's sklearn.datasets.get_data_home function. If you are wondering what get_data_home does, how to call it, or what real-world usage looks like, the curated code examples below should help.


The following presents 15 code examples of the get_data_home function, ordered by popularity by default.
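Before the examples, a quick orientation: get_data_home resolves (and creates, if necessary) the folder scikit-learn uses to cache downloaded datasets, and clear_data_home deletes it. A minimal sketch of the round trip, where the custom path is purely illustrative:

from sklearn.datasets import clear_data_home, get_data_home

# With no argument, get_data_home resolves to ~/scikit_learn_data
# (or the SCIKIT_LEARN_DATA environment variable) and creates the
# folder if it does not exist yet.
print(get_data_home())

# An explicit path is created on demand and returned unchanged.
custom = get_data_home(data_home='/tmp/my_sklearn_cache')

# clear_data_home removes the cache folder and all of its contents.
clear_data_home(data_home=custom)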

Example 1: test_data_home

def test_data_home():
    # get_data_home will point to a pre-existing folder
    data_home = get_data_home(data_home=DATA_HOME)
    assert_equal(data_home, DATA_HOME)
    assert_true(os.path.exists(data_home))

    # clear_data_home will delete both the content and the folder itself
    clear_data_home(data_home=data_home)
    assert_false(os.path.exists(data_home))

    # if the folder is missing it will be created again
    data_home = get_data_home(data_home=DATA_HOME)
    assert_true(os.path.exists(data_home))
Author: Calvin-O, Project: scikit-learn, Lines: 13, Source: test_base.py
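The assert_equal/assert_true/assert_false helpers above come from scikit-learn's old nose-based test utilities, which have since been removed. On a current scikit-learn the same checks can be written with plain assert statements and pytest's tmp_path fixture; the following is a sketch, not the project's actual test:

import os

from sklearn.datasets import clear_data_home, get_data_home


def test_data_home(tmp_path):
    # get_data_home points to the pre-existing folder we hand it
    data_home = get_data_home(data_home=str(tmp_path))
    assert data_home == str(tmp_path)
    assert os.path.exists(data_home)

    # clear_data_home deletes both the content and the folder itself
    clear_data_home(data_home=data_home)
    assert not os.path.exists(data_home)

    # if the folder is missing it is created again
    data_home = get_data_home(data_home=str(tmp_path))
    assert os.path.exists(data_home)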

Example 2: setup_module

def setup_module():
    check_skip_network()

    # skip the test in rcv1.rst if the dataset is not already loaded
    rcv1_dir = os.path.join(get_data_home(), "RCV1")
    if not os.path.exists(rcv1_dir):
        raise SkipTest("Download RCV1 dataset to run this test.")
Author: 0664j35t3r, Project: scikit-learn, Lines: 7, Source: rcv1_fixture.py

Example 3: get_unclassified_data

    def get_unclassified_data(self):
        source_path = os.path.join(get_data_home(), 'tweets_unclassified', self.disease)
        file_paths = []
        for root, directories, files in os.walk(source_path):
            for filename in files:
                file_path = os.path.join(root, filename)
                file_paths.append(file_path)
        print 'unclassified data loaded from ' + str(file_paths)

        tweets = []
        for file_path in file_paths:
            line_num = 0
            with codecs.open(file_path, 'r') as f:
                for line in f:
                    if line_num > 0:  # the first line is a header, skip it
                        try:
                            tweets.append(Tweet(line))
                        except:
                            # assumes "import sys" at the top of the module
                            print "Unexpected error in line " + str(line_num) + ":", sys.exc_info()[0]
                    line_num += 1
        print 'unclassified tweets loaded ' + str(len(tweets))
        return tweets
Author: yuravariat, Project: TweetsClassifier, Lines: 25, Source: data.py

Example 4: setup_working_with_text_data

def setup_working_with_text_data():
    if IS_PYPY and os.environ.get('CI', None):
        raise SkipTest('Skipping too slow test with PyPy on CI')
    check_skip_network()
    cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)
    if not exists(cache_path):
        raise SkipTest("Skipping dataset loading doctests")
Author: manhhomienbienthuy, Project: scikit-learn, Lines: 7, Source: conftest.py

Example 5: fetch_vega_spectrum

def fetch_vega_spectrum(data_home=None):
    data_home = get_data_home(data_home)
    refspec_file = os.path.join(data_home, REFSPEC_URL.split('/')[-1])
    if not os.path.exists(refspec_file):
        print "downnloading from %s" % REFSPEC_URL
        F = urllib2.urlopen(REFSPEC_URL)
        open(refspec_file, 'w').write(F.read())

    F = open(refspec_file)

    data = np.loadtxt(F)
    return data
Author: kickbean, Project: TextMiningWithSklearn, Lines: 12, Source: plot_sdss_filters.py

Example 6: fetch_filter

def fetch_filter(filter, data_home=None):
    data_home = get_data_home(data_home)
    assert filter in 'ugriz'
    url = URL % filter
    loc = os.path.join(data_home, '%s.dat' % filter)
    if not os.path.exists(loc):
        print "downloading from %s" % url
        F = urllib2.urlopen(url)
        open(loc, 'w').write(F.read())

    F = open(loc)

    data = np.loadtxt(F)
    return data
Author: kickbean, Project: TextMiningWithSklearn, Lines: 14, Source: plot_sdss_filters.py
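Examples 5 and 6 are Python 2 code (print statements, urllib2). Under Python 3 the same download-once-then-cache pattern could look like the sketch below; fetch_cached and its url parameter are illustrative names, not part of the original scripts:

import os
from urllib.request import urlopen

from sklearn.datasets import get_data_home


def fetch_cached(url, data_home=None):
    # Download url into the scikit-learn data home once, then reuse it.
    data_home = get_data_home(data_home)
    local_file = os.path.join(data_home, os.path.basename(url))
    if not os.path.exists(local_file):
        print("downloading from %s" % url)
        with urlopen(url) as response, open(local_file, 'wb') as out:
            out.write(response.read())
    return local_file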

Example 7: fetch_sdss_spec_data

def fetch_sdss_spec_data(data_home=None):
    data_home = get_data_home(data_home)

    local_file = os.path.join(data_home, os.path.basename(DATA_URL))

    # the data directory is password-protected, so the public can't access it
    password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
    password_mgr.add_password(None, DATA_URL, 'pydata', 'astroML')
    handler = urllib2.HTTPBasicAuthHandler(password_mgr)
    opener = urllib2.build_opener(handler)

    # download training data
    if not os.path.exists(local_file):
        fhandle = opener.open(DATA_URL)
        open(local_file, 'w').write(fhandle.read())

    return np.load(local_file)
Author: kickbean, Project: TextMiningWithSklearn, Lines: 17, Source: plot_sdss_specPCA.py

Example 8: stream_reuters_documents

def stream_reuters_documents(data_path=None):
    """Iterate over documents of the Reuters dataset.

    The Reuters archive will automatically be downloaded and uncompressed if
    the `data_path` directory does not exist.

    Documents are represented as dictionaries with 'body' (str),
    'title' (str), 'topics' (list(str)) keys.

    """

    DOWNLOAD_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
                    'reuters21578-mld/reuters21578.tar.gz')
    ARCHIVE_FILENAME = 'reuters21578.tar.gz'

    if data_path is None:
        data_path = os.path.join(get_data_home(), "reuters")
    if not os.path.exists(data_path):
        """Download the dataset."""
        print("downloading dataset (once and for all) into %s" %
              data_path)
        os.mkdir(data_path)

        def progress(blocknum, bs, size):
            total_sz_mb = '%.2f MB' % (size / 1e6)
            current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
            if _not_in_sphinx():
                print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb),
                      end='')

        archive_path = os.path.join(data_path, ARCHIVE_FILENAME)
        urllib.request.urlretrieve(DOWNLOAD_URL, filename=archive_path,
                                   reporthook=progress)
        if _not_in_sphinx():
            print('\r', end='')
        print("untarring Reuters dataset...")
        tarfile.open(archive_path, 'r:gz').extractall(data_path)
        print("done.")

    parser = ReutersParser()
    for filename in glob(os.path.join(data_path, "*.sgm")):
        for doc in parser.parse(open(filename, 'rb')):
            yield doc
Author: mbonaventura, Project: aa2015, Lines: 44, Source: my_plot_out_of_core_classification.py
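A natural way to consume this generator is in fixed-size batches, which is what the out-of-core classification example it was adapted from does. A minimal sketch, assuming stream_reuters_documents and its helpers (ReutersParser, _not_in_sphinx) from the snippet above are in scope, with an arbitrary n_features:

from itertools import islice

from sklearn.feature_extraction.text import HashingVectorizer

vectorizer = HashingVectorizer(n_features=2 ** 18, alternate_sign=False)

# Vectorize the bodies of the first 100 documents.
batch = list(islice(stream_reuters_documents(), 100))
X = vectorizer.transform(doc['body'] for doc in batch)
print(X.shape)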

Example 9: create_data

    def create_data(self):
        data_home = get_data_home()
        cache_path = os.path.join(data_home, 'cache', self.disease + self._cl_cut, self.cache_name)

        if os.path.exists(cache_path):
            return

        # e.g. C:\Users\[user]\scikit_learn_data\hiv
        # disease_path = os.path.join(data_home, self.disease)
        # e.g. C:\Users\[user]\scikit_learn_data\tweets\hiv
        tweets_path = os.path.join(data_home, 'tweets', self.disease + self._cl_cut)
        if not os.path.exists(tweets_path):
            return
        '''
        *** Manual process:
        Save annotation files as 'Text (MS-DOS)(*.txt)', e.g. tweets1.txt (all annotation files should keep the same format)

        *** Automated process:
        1. Get file names from the C:\Users\[user]\scikit_learn_data\tweets\hiv
        2. For each file read all tweets line by line (only those where the category is not empty)
        3. For each tweet generate a unique file
        '''

        train_path = os.path.join(tweets_path, self.train_folder)
        train_output_path = os.path.join(data_home, self.train_folder,  self.disease + self._cl_cut)
        if not os.path.exists(train_output_path):
            os.makedirs(train_output_path)

        test_path = os.path.join(tweets_path, self.test_folder)
        test_output_path = os.path.join(data_home, self.test_folder,  self.disease + self._cl_cut)
        if not os.path.exists(test_output_path):
            os.makedirs(test_output_path)

        train_tweets = self._load_tweets(train_path)
        self._generate_singular_tweet_files(train_tweets, train_output_path)
        test_tweets = self._load_tweets(test_path)
        self._generate_singular_tweet_files(test_tweets, test_output_path)
Author: yuravariat, Project: TweetsClassifier, Lines: 37, Source: data.py

Example 10: _fetch_drug_protein

def _fetch_drug_protein(data_home=None):
    """Fetch drug-protein dataset from the server"""

    base_url = "http://cbio.ensmp.fr/~yyamanishi/substr-domain/"

    # check if this data set has been already downloaded
    data_home = get_data_home(data_home)
    data_home = os.path.join(data_home, 'drug-protein')
    if not os.path.exists(data_home):
        os.makedirs(data_home)

    for base_name in ["drug_repmat.txt", "target_repmat.txt",
                      "inter_admat.txt"]:
        filename = os.path.join(data_home, base_name)

        if not os.path.exists(filename):
            urlname = base_url + base_name

            print("Download data at {}".format(urlname))

            try:
                url = urlopen(urlname)
            except HTTPError as e:
                if e.code == 404:
                    e.msg = "Dataset drug-protein '%s' not found." % base_name
                raise

            try:
                with open(filename, 'w+b') as fhandle:
                    shutil.copyfileobj(url, fhandle)
            except:
                os.remove(filename)
                raise

            url.close()

    return data_home
Author: arjoly, Project: random-output-trees, Lines: 37, Source: datasets.py

Example 11: setup_module

def setup_module(module):
    data_home = get_data_home()
    if not exists(join(data_home, '20news_home')):
        raise SkipTest("Skipping dataset loading doctests")
Author: AlexandreAbraham, Project: scikit-learn, Lines: 4, Source: twenty_newsgroups_fixture.py

Example 12: Memory

import os

import numpy as np

from sklearn.datasets import fetch_mldata
from sklearn.datasets import get_data_home
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.externals.joblib import Memory
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.metrics import zero_one_loss
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import check_array

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
memory = Memory(os.path.join(get_data_home(), 'mnist_benchmark_data'),
                mmap_mode='r')


@memory.cache
def load_data(dtype=np.float32, order='F'):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    ## Load dataset
    print("Loading dataset...")
    data = fetch_mldata('MNIST original')
    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    # Normalize features
    X = X / 255
Author: 1TTT9, Project: scikit-learn, Lines: 31, Source: bench_mnist.py
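fetch_mldata relied on the now-defunct mldata.org service and has been removed from scikit-learn; on recent versions the equivalent MNIST download goes through fetch_openml. A sketch of what load_data's body might become, reusing np, check_array, and memory from the snippet above:

from sklearn.datasets import fetch_openml


@memory.cache
def load_data(dtype=np.float32, order='F'):
    # fetch_openml caches under get_data_home(), as fetch_mldata did
    X, y = fetch_openml('mnist_784', version=1, return_X_y=True,
                        as_frame=False)
    X = check_array(X, dtype=dtype, order=order) / 255  # normalize to [0, 1]
    return X, y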

Example 13: fetch_olivetti_faces

from sklearn.datasets import fetch_olivetti_faces
from sklearn.datasets import fetch_lfw_people
from sklearn.datasets import get_data_home


if __name__ == "__main__":
    fetch_olivetti_faces()

    print("Loading Labeled Faces Data (~200MB)")
    fetch_lfw_people(min_faces_per_person=70, resize=0.4)
    print("=> Success!")
    print("Data saved in %s" % get_data_home())
Author: JeanKossaifi, Project: workshop_python, Lines: 12, Source: fetch_data.py

Example 14: zip

plt.legend(loc='upper right')
plt.show()
'''
x_index = 0
y_index = 3
'''
for label,color in zip(range(len(d1.target_names)),colors):
    plt.scatter(d1.data[d1.target == label, x_index], d1.data[d1.target == label, y_index], label=d1.target_names[label], color=color)  # scatter plot

plt.xlabel(d1.feature_names[x_index])
plt.ylabel(d1.feature_names[y_index])
plt.legend(loc='upper left')
plt.show()

'''

'''
fig = plt.figure(figsize=(6,6))
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)

for i in range(64):
    ax = fig.add_subplot(8,8,i+1,xticks=[],yticks=[])
    ax.imshow(d3.images[i],cmap=plt.cm.binary,interpolation="nearest")
    ax.text(0,7,str(d3.target[i]))
plt.show()
'''

#china = datasets.load_sample_image('china.jpg')

print(datasets.get_data_home())
Author: xxwei, Project: TraderCode, Lines: 30, Source: sklearnData.py

Example 15: list

    chunk_size = 1000
    data_chunks = list(partition(chunk_size, testData))

    print ('start prediction')

    for i,chunk in enumerate(data_chunks):
        t0 = time()
        predicted = clf.classifier.predict(list(chunk))
        ranTime = time() - t0
        print ('progress ' + str(round((i+1)/float(len(data_chunks)) * 100,2)) + '% last_predict_time=' + str(ranTime))
        for j in range(len(chunk)):
            testData[i*chunk_size+j].talk_about = str(clf.labels[predicted[j]])

    print ('predict done')

    file_dir = os.path.join(get_data_home(), 'output', disease, cl_cut)

    if not os.path.exists(file_dir):
        os.makedirs(file_dir)

    file_path = os.path.join(file_dir, 'output.txt')

    with codecs.open(file_path, "w", "utf-8") as text_file:
        for i in range(len(testData)):
            try:
                tweet = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\t{15}\n". \
                    format(testData[i].tweet_id,
                           testData[i].query,
                           testData[i].disease,
                           testData[i].created_at,
                           testData[i].screen_name,
Author: yuravariat, Project: TweetsClassifier, Lines: 31, Source: Classifier_aboutself_vs_aboutothers.py


Note: The sklearn.datasets.get_data_home examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by various developers, and copyright remains with the original authors; please consult each project's license before distributing or using the code. Do not reproduce without permission.