This article collects typical usage examples of the Python function sklearn.datasets.get_data_home. If you have been wondering what get_data_home does, how to call it, or what real-world usage looks like, the hand-picked code samples below should help.
The following shows 15 code examples of the get_data_home function, sorted by popularity by default.
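Before the examples, here is a minimal usage sketch (not taken from the examples below; the /tmp path is only illustrative): get_data_home returns the path of the scikit-learn data cache folder, creating it if it does not exist, and clear_data_home deletes it.

from sklearn.datasets import get_data_home, clear_data_home

# Resolve (and create, if missing) the default cache folder,
# e.g. ~/scikit_learn_data, or whatever SCIKIT_LEARN_DATA points to.
data_home = get_data_home()
print(data_home)

# An explicit location can be passed; the folder is created if absent.
custom_home = get_data_home(data_home='/tmp/my_sklearn_cache')

# clear_data_home removes the cache folder and all of its contents.
clear_data_home(data_home=custom_home)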
Example 1: test_data_home
def test_data_home():
    # get_data_home will point to a pre-existing folder
    data_home = get_data_home(data_home=DATA_HOME)
    assert_equal(data_home, DATA_HOME)
    assert_true(os.path.exists(data_home))

    # clear_data_home will delete both the content and the folder itself
    clear_data_home(data_home=data_home)
    assert_false(os.path.exists(data_home))

    # if the folder is missing it will be created again
    data_home = get_data_home(data_home=DATA_HOME)
    assert_true(os.path.exists(data_home))
Example 2: setup_module
def setup_module():
    check_skip_network()

    # skip the test in rcv1.rst if the dataset is not already loaded
    rcv1_dir = os.path.join(get_data_home(), "RCV1")
    if not os.path.exists(rcv1_dir):
        raise SkipTest("Download RCV1 dataset to run this test.")
Example 3: get_unclassified_data
def get_unclassified_data(self):
    source_path = os.path.join(get_data_home(), 'tweets_unclassified\\' + self.disease)
    file_paths = []
    for root, directories, files in os.walk(source_path):
        for filename in files:
            file_path = os.path.join(root, filename)
            file_paths.append(file_path)
    print 'unclassified data loaded from ' + str(file_paths)

    tweets = []
    for file_path in file_paths:
        line_num = 0
        with codecs.open(file_path, 'r') as f:
            # skip the header row (line_num == 0), then parse one tweet per line
            for line in f:
                if line_num > 0:
                    try:
                        tweets.append(Tweet(line))
                        line_num += 1
                    except:
                        print "Unexpected error in line " + str(line_num) + ":", pickle.sys.exc_info()[0]
                else:
                    line_num += 1
        # the with-statement closes the file; no explicit close is needed
    print 'unclassified tweets loaded ' + str(len(tweets))
    return tweets
Example 4: setup_working_with_text_data
def setup_working_with_text_data():
    if IS_PYPY and os.environ.get('CI', None):
        raise SkipTest('Skipping too slow test with PyPy on CI')
    check_skip_network()
    cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)
    if not exists(cache_path):
        raise SkipTest("Skipping dataset loading doctests")
Example 5: fetch_vega_spectrum
def fetch_vega_spectrum(data_home=None):
    data_home = get_data_home(data_home)
    refspec_file = os.path.join(data_home, REFSPEC_URL.split('/')[-1])

    if not os.path.exists(refspec_file):
        print "downloading from %s" % REFSPEC_URL
        F = urllib2.urlopen(REFSPEC_URL)
        open(refspec_file, 'w').write(F.read())

    F = open(refspec_file)
    data = np.loadtxt(F)
    return data
Example 6: fetch_filter
def fetch_filter(filter, data_home=None):
    data_home = get_data_home(data_home)
    assert filter in 'ugriz'
    url = URL % filter
    loc = os.path.join(data_home, '%s.dat' % filter)

    if not os.path.exists(loc):
        print "downloading from %s" % url
        F = urllib2.urlopen(url)
        open(loc, 'w').write(F.read())

    F = open(loc)
    data = np.loadtxt(F)
    return data
Example 7: fetch_sdss_spec_data
def fetch_sdss_spec_data(data_home=None):
    data_home = get_data_home(data_home)
    local_file = os.path.join(data_home, os.path.basename(DATA_URL))

    # data directory is password protected so the public can't access it
    password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
    password_mgr.add_password(None, DATA_URL, 'pydata', 'astroML')
    handler = urllib2.HTTPBasicAuthHandler(password_mgr)
    opener = urllib2.build_opener(handler)

    # download training data
    if not os.path.exists(local_file):
        fhandle = opener.open(DATA_URL)
        open(local_file, 'w').write(fhandle.read())

    return np.load(local_file)
Example 8: stream_reuters_documents
def stream_reuters_documents(data_path=None):
    """Iterate over documents of the Reuters dataset.

    The Reuters archive will automatically be downloaded and uncompressed if
    the `data_path` directory does not exist.

    Documents are represented as dictionaries with 'body' (str),
    'title' (str), 'topics' (list(str)) keys.
    """
    DOWNLOAD_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
                    'reuters21578-mld/reuters21578.tar.gz')
    ARCHIVE_FILENAME = 'reuters21578.tar.gz'

    if data_path is None:
        data_path = os.path.join(get_data_home(), "reuters")
    if not os.path.exists(data_path):
        """Download the dataset."""
        print("downloading dataset (once and for all) into %s" %
              data_path)
        os.mkdir(data_path)

        def progress(blocknum, bs, size):
            total_sz_mb = '%.2f MB' % (size / 1e6)
            current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
            if _not_in_sphinx():
                print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb),
                      end='')

        archive_path = os.path.join(data_path, ARCHIVE_FILENAME)
        urllib.request.urlretrieve(DOWNLOAD_URL, filename=archive_path,
                                   reporthook=progress)
        if _not_in_sphinx():
            print('\r', end='')
        print("untarring Reuters dataset...")
        tarfile.open(archive_path, 'r:gz').extractall(data_path)
        print("done.")

    parser = ReutersParser()
    for filename in glob(os.path.join(data_path, "*.sgm")):
        for doc in parser.parse(open(filename, 'rb')):
            # print(doc)
            yield doc
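As a rough usage sketch (assuming ReutersParser and _not_in_sphinx from the full script are available alongside the function above), the generator can be consumed lazily:

from itertools import islice

# Pull only the first three parsed documents; the archive is downloaded on
# first use and each item is a dict with 'title', 'body' and 'topics' keys.
for doc in islice(stream_reuters_documents(), 3):
    print(doc['title'], doc['topics'])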
Example 9: create_data
def create_data(self):
    data_home = get_data_home()
    cache_path = os.path.join(data_home, 'cache\\' + self.disease + self._cl_cut + '\\' + self.cache_name)
    if os.path.exists(cache_path):
        return

    # e.g. C:\Users\[user]\scikit_learn_data\hiv
    # disease_path = os.path.join(data_home, self.disease)
    # e.g. C:\Users\[user]\scikit_learn_data\tweets\hiv
    tweets_path = os.path.join(data_home, 'tweets', self.disease + self._cl_cut)
    if not os.path.exists(tweets_path):
        return

    '''
    *** Manual process:
    Save annotation files as 'Text (MS-DOS)(*.txt)', e.g. tweets1.txt
    (all annotation files should keep the same format)
    *** Automated process:
    1. Get file names from C:\Users\[user]\scikit_learn_data\tweets\hiv
    2. For each file, read all tweets line by line (only those where the category is not empty)
    3. For each tweet, generate a unique file
    (a rough, hypothetical sketch of steps 2-3 follows this example)
    '''
    train_path = os.path.join(tweets_path, self.train_folder)
    train_output_path = os.path.join(data_home, self.train_folder, self.disease + self._cl_cut)
    if not os.path.exists(train_output_path):
        os.makedirs(train_output_path)

    test_path = os.path.join(tweets_path, self.test_folder)
    test_output_path = os.path.join(data_home, self.test_folder, self.disease + self._cl_cut)
    if not os.path.exists(test_output_path):
        os.makedirs(test_output_path)

    train_tweets = self._load_tweets(train_path)
    self._generate_singular_tweet_files(train_tweets, train_output_path)
    test_tweets = self._load_tweets(test_path)
    self._generate_singular_tweet_files(test_tweets, test_output_path)
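The helper methods _load_tweets and _generate_singular_tweet_files are not shown in this listing. Purely as a hypothetical illustration of the commented workflow above (read the annotated lines, keep only rows with a non-empty category, write one file per tweet), a standalone sketch might look like this; the column layout and file naming are assumptions, not the project's actual code:

import codecs
import os

def generate_singular_tweet_files(input_path, output_path):
    """Hypothetical sketch: write one output file per annotated tweet line."""
    for name in os.listdir(input_path):
        with codecs.open(os.path.join(input_path, name), 'r', 'utf-8') as f:
            for i, line in enumerate(f):
                fields = line.rstrip('\n').split('\t')
                # assume the annotation category is the last tab-separated field
                category = fields[-1].strip()
                if i == 0 or not category:  # skip the header and unannotated rows
                    continue
                out_name = '%s_%d.txt' % (os.path.splitext(name)[0], i)
                with codecs.open(os.path.join(output_path, out_name), 'w', 'utf-8') as out:
                    out.write(line)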
Example 10: _fetch_drug_protein
def _fetch_drug_protein(data_home=None):
    """Fetch drug-protein dataset from the server"""
    base_url = "http://cbio.ensmp.fr/~yyamanishi/substr-domain/"

    # check if this data set has been already downloaded
    data_home = get_data_home(data_home)
    data_home = os.path.join(data_home, 'drug-protein')
    if not os.path.exists(data_home):
        os.makedirs(data_home)

    for base_name in ["drug_repmat.txt", "target_repmat.txt",
                      "inter_admat.txt"]:
        filename = os.path.join(data_home, base_name)
        if not os.path.exists(filename):
            urlname = base_url + base_name
            print("Download data at {}".format(urlname))
            try:
                url = urlopen(urlname)
            except HTTPError as e:
                if e.code == 404:
                    e.msg = "Dataset drug-protein '%s' not found." % base_name
                raise
            try:
                with open(filename, 'w+b') as fhandle:
                    shutil.copyfileobj(url, fhandle)
            except:
                os.remove(filename)
                raise
            url.close()

    return data_home
Example 11: setup_module
def setup_module(module):
    data_home = get_data_home()
    if not exists(join(data_home, '20news_home')):
        raise SkipTest("Skipping dataset loading doctests")
Example 12: Memory
# imports needed by this snippet
import os

import numpy as np

from sklearn.datasets import fetch_mldata
from sklearn.datasets import get_data_home
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.externals.joblib import Memory
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.metrics import zero_one_loss
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import check_array

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
memory = Memory(os.path.join(get_data_home(), 'mnist_benchmark_data'),
                mmap_mode='r')


@memory.cache
def load_data(dtype=np.float32, order='F'):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    ## Load dataset
    print("Loading dataset...")
    data = fetch_mldata('MNIST original')
    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    # Normalize features
    X = X / 255
Example 13: fetch_olivetti_faces
from sklearn.datasets import fetch_olivetti_faces
from sklearn.datasets import fetch_lfw_people
from sklearn.datasets import get_data_home

if __name__ == "__main__":
    fetch_olivetti_faces()

    print("Loading Labeled Faces Data (~200MB)")
    fetch_lfw_people(min_faces_per_person=70, resize=0.4)
    print("=> Success!")
    print("Data saved in %s" % get_data_home())
Example 14: zip
plt.legend(loc='upper right')
plt.show()
'''
x_index = 0
y_index = 3
'''
for label, color in zip(range(len(d1.target_names)), colors):
    plt.scatter(d1.data[d1.target == label, x_index],
                d1.data[d1.target == label, y_index],
                label=d1.target_names[label], color=color)  # scatter plot
plt.xlabel(d1.feature_names[x_index])
plt.ylabel(d1.feature_names[y_index])
plt.legend(loc='upper left')
plt.show()
'''
'''
fig = plt.figure(figsize=(6, 6))
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
for i in range(64):
    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
    ax.imshow(d3.images[i], cmap=plt.cm.binary, interpolation="nearest")
    ax.text(0, 7, str(d3.target[i]))
plt.show()
'''
# china = datasets.load_sample_image('china.jpg')
print(datasets.get_data_home())
Example 15: list
chunk_size = 1000
data_chunks = list(partition(chunk_size, testData))

print('start prediction')
for i, chunk in enumerate(data_chunks):
    t0 = time()
    predicted = clf.classifier.predict(list(chunk))
    ranTime = time() - t0
    print('progress ' + str(round((i + 1) / float(len(data_chunks)) * 100, 2)) + '% last_predict_time=' + str(ranTime))
    for j in range(len(chunk)):
        testData[i * chunk_size + j].talk_about = str(clf.labels[predicted[j]])
print('predict done')

file_dir = os.path.join(get_data_home(), 'output', disease, cl_cut)
if not os.path.exists(file_dir):
    os.makedirs(file_dir)
file_path = os.path.join(file_dir, 'output.txt')

with codecs.open(file_path, "w", "utf-8") as text_file:
    for i in range(len(testData)):
        try:
            tweet = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\t{15}\n". \
                format(testData[i].tweet_id,
                       testData[i].query,
                       testData[i].disease,
                       testData[i].created_at,
                       testData[i].screen_name,