This article collects typical usage examples of the load_svmlight_file method from Python's sklearn.datasets module. If you have been wondering what datasets.load_svmlight_file does, how to use it, or what idiomatic calls look like, the curated examples below should help. You can also explore the sklearn.datasets module further for related functionality.
Fifteen code examples of datasets.load_svmlight_file are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python examples.
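Before diving into the collected examples, here is a minimal, self-contained sketch of the basic round trip (the file name tiny.svm is a placeholder invented for this illustration):

# A minimal sketch: write a small dataset in svmlight/libsvm format, then read
# it back. "tiny.svm" is a placeholder file name for this example only.
import numpy as np
from sklearn.datasets import dump_svmlight_file, load_svmlight_file

X = np.array([[1.0, 0.0, 3.0], [0.0, 2.0, 0.0]])
y = np.array([1, -1])
dump_svmlight_file(X, y, "tiny.svm", zero_based=True)

# load_svmlight_file returns the features as a scipy.sparse CSR matrix.
X2, y2 = load_svmlight_file("tiny.svm", n_features=3)
print(X2.toarray())  # dense view of the sparse feature matrix
print(y2)            # [ 1. -1.]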
Example 1: read_year_prediction_data
# Required imports: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def read_year_prediction_data(fileName):
    feature_dim = 90
    print("Reading data from disk...")
    train_features, train_labels = load_svmlight_file(fileName, n_features=feature_dim, dtype=np.float32)
    train_features = train_features.todense()

    # normalize the data: subtract means and divide by standard deviations
    label_mean = train_labels.mean()
    label_std = np.sqrt(np.square(train_labels - label_mean).mean())
    feature_means = train_features.mean(axis=0)
    feature_stds = np.sqrt(np.square(train_features - feature_means).mean(axis=0))

    train_features = (train_features - feature_means) / feature_stds
    train_labels = (train_labels - label_mean) / label_std
    return feature_dim, train_features, train_labels
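A quick usage sketch for the helper above (the path is a placeholder; the YearPredictionMSD file must already exist on disk). After z-score normalization, every feature column has roughly zero mean and unit standard deviation, which is easy to sanity-check:

# Hypothetical call; "YearPredictionMSD" is a placeholder path.
dim, X, y = read_year_prediction_data("YearPredictionMSD")
print(X.mean(axis=0))     # all approximately 0
print(X.std(axis=0))      # all approximately 1
print(y.mean(), y.std())  # approximately 0.0 and 1.0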
Example 2: load_data
# Required imports: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def load_data(path, dense=False):
    """Load data from a CSV, LibSVM or HDF5 file based on the file extension.

    Args:
        path (str): A path to the CSV, LibSVM or HDF5 format file.
        dense (boolean): An optional variable indicating if the returned matrix
            should be dense. By default, it is false.

    Returns:
        Data matrix X and target vector y
    """
    catalog = {'.csv': load_csv, '.sps': load_svmlight_file, '.h5': load_hdf5}
    ext = os.path.splitext(path)[1]
    func = catalog[ext]
    X, y = func(path)
    if dense and sparse.issparse(X):
        X = X.todense()
    return X, y
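A brief usage sketch, assuming load_csv and load_hdf5 (referenced in the catalog) are defined elsewhere in the same module and that train.sps is a placeholder path:

# Hypothetical call; the .sps extension dispatches to load_svmlight_file.
X, y = load_data("train.sps", dense=True)  # X is densified if it came back sparse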
Example 3: test_dump
# Required imports: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def test_dump(self):
    tmpfile = "tmp_dump.txt"
    try:
        # load from file
        Xs, y = load_svmlight_file(datafile)

        # dump to file
        dump_svmlight_file(Xs, y, tmpfile, zero_based=False)

        # load it back as a CSR matrix
        X2, y2 = sk_load_svmlight_file(tmpfile)
        X3 = np.ndarray(shape=X2.shape, dtype=X2.dtype)
        X2.toarray(out=X3)

        # check assertions
        assert_array_almost_equal(Xs, X3)
        assert_array_almost_equal(y, y2)
    finally:
        if os.path.exists(tmpfile):
            os.remove(tmpfile)
Example 4: test_dump_qid
# Required imports: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def test_dump_qid(self):
    tmpfile = "/tmp/tmp_dump.txt"
    try:
        # load from file
        Xs, y, q = load_svmlight_file(qid_datafile, query_id=True)

        # dump to file
        dump_svmlight_file(Xs, y, tmpfile, query_id=list(q), zero_based=False)

        # load it back as a CSR matrix with scikit-learn
        X2, y2, q2 = sk_load_svmlight_file(tmpfile, query_id=True)
        X3 = np.ndarray(shape=X2.shape, dtype=X2.dtype)
        X2.toarray(out=X3)

        # check assertions
        assert_array_almost_equal(Xs, X3)
        assert_array_almost_equal(y, y2)
        assert_array_equal(q, q2)
    finally:
        if os.path.exists(tmpfile):
            os.remove(tmpfile)
Example 5: test
# Required imports: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def test():
    url_zip_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/rcv1_train.binary.bz2'
    urllib.request.urlretrieve(url_zip_train, filename='train.bz2')

    f_svm = open('train.svm', 'wt')
    with bz2.open('train.bz2', 'rb') as f_zip:
        data = f_zip.read()
        f_svm.write(data.decode('utf-8'))
    f_svm.close()

    X, y = load_svmlight_file('train.svm')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    pipeline = make_pipeline(FeatureGradientSelector(n_epochs=1, n_features=10), LogisticRegression())
    # pipeline = make_pipeline(SelectFromModel(ExtraTreesClassifier(n_estimators=50)), LogisticRegression())
    pipeline.fit(X_train, y_train)
    print("Pipeline Score: ", pipeline.score(X_train, y_train))
Example 6: load_realsim
# Required imports: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def load_realsim(folder=REALSIM, one_hot=True, partitions_proportions=None, shuffle=False, as_tensor=True):
    X, y = sk_dt.load_svmlight_file(folder + "/real-sim")
    y = np.array([int(yy) for yy in y])
    if one_hot:
        y = to_one_hot_enc(y)
    res = [Dataset(data=X, target=y)]
    if partitions_proportions:
        res = redivide_data(res, shuffle=shuffle, partition_proportions=partitions_proportions)
        res = Datasets.from_list(res)
    if as_tensor:
        [dat.convert_to_tensor() for dat in res]
    return res

# noinspection PyPep8Naming
Example 7: _read_svmlight
# Required imports: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def _read_svmlight(lines, out_blocks, col_size, n_features, store_sparse):
    from tempfile import SpooledTemporaryFile
    from sklearn.datasets import load_svmlight_file

    # Creating a tmp file to use the load_svmlight_file method should be more
    # efficient than parsing the lines manually
    tmp_file = SpooledTemporaryFile(mode="wb+", max_size=2e8)
    tmp_file.writelines(lines)
    tmp_file.seek(0)

    x, y = load_svmlight_file(tmp_file, n_features)
    if not store_sparse:
        x = x.toarray()

    # Also tried converting to csc/ndarray first for faster splitting, but it's
    # not worth it. Position 0 contains the X blocks
    for i in range(ceil(n_features / col_size)):
        out_blocks[0][i] = x[:, i * col_size:(i + 1) * col_size]

    # Position 1 contains the y block
    out_blocks[1][0] = y.reshape(-1, 1)
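The column-blocking loop above can be illustrated in isolation. A minimal sketch with invented sizes (4 samples, n_features=5, col_size=2); note the last block is narrower when col_size does not divide n_features evenly:

from math import ceil
import numpy as np

x = np.arange(20).reshape(4, 5)  # stand-in for the loaded feature matrix
col_size, n_features = 2, 5
blocks = [x[:, i * col_size:(i + 1) * col_size]
          for i in range(ceil(n_features / col_size))]
print([b.shape for b in blocks])  # [(4, 2), (4, 2), (4, 1)]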
Example 8: test_load_svmlight_file
# Required imports: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def test_load_svmlight_file(self):
    """ Tests loading a LibSVM file """
    file_ = "tests/files/libsvm/1"
    x_np, y_np = load_svmlight_file(file_, n_features=780)

    # Load the SVM file and store in sparse format
    x, y = ds.load_svmlight_file(file_, (25, 100), n_features=780,
                                 store_sparse=True)
    self.assertTrue(_equal_arrays(x.collect(), x_np))
    self.assertTrue(_equal_arrays(y.collect(), y_np))

    # Load the SVM file and store in dense format
    x, y = ds.load_svmlight_file(file_, (25, 100), n_features=780,
                                 store_sparse=False)
    self.assertTrue(_equal_arrays(x.collect(), x_np.toarray()))
    self.assertTrue(_equal_arrays(y.collect(), y_np))
Example 9: retrieve_dataset
# Required imports: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def retrieve_dataset(dataset, **kwargs):
    # if the data is not yet extracted, download the zip and extract it
    outdirname = "datasets.1.17.2019"
    if not os.path.exists(outdirname):
        try:
            from urllib import urlretrieve
        except ImportError:
            from urllib.request import urlretrieve
        import zipfile
        zipfilename = outdirname + ".zip"
        urlretrieve(
            "https://publictestdatasets.blob.core.windows.net/data/" + zipfilename,
            zipfilename,
        )
        with zipfile.ZipFile(zipfilename, "r") as unzip:
            unzip.extractall(".")
    extension = os.path.splitext(dataset)[1]
    filepath = os.path.join(outdirname, dataset)
    if extension == ".npz":
        # sparse format file
        import scipy.sparse as sparse
        return sparse.load_npz(filepath)
    elif extension == ".svmlight":
        from sklearn import datasets
        return datasets.load_svmlight_file(filepath)
    elif extension == ".json":
        import json
        with open(filepath, encoding="utf-8") as f:
            dataset = json.load(f)
        return dataset
    elif extension == ".csv":
        import pandas as pd
        return pd.read_csv(filepath, **kwargs)
    else:
        raise Exception("Unrecognized file extension: " + extension)
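A hedged usage sketch; the file name below is hypothetical, and the call downloads and extracts the archive on first use:

# Hypothetical call; dispatches on the .svmlight extension and returns (X, y)
# as produced by datasets.load_svmlight_file.
X, y = retrieve_dataset("example_data.svmlight")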
Example 10: train
# Required imports: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def train(self, depgraphs, modelfile):
    """
    :param depgraphs : list of DependencyGraph as the training data
    :type depgraphs : DependencyGraph
    :param modelfile : file name to save the trained model
    :type modelfile : str
    """
    try:
        input_file = tempfile.NamedTemporaryFile(
            prefix='transition_parse.train',
            dir=tempfile.gettempdir(),
            delete=False)

        if self._algorithm == self.ARC_STANDARD:
            self._create_training_examples_arc_std(depgraphs, input_file)
        else:
            self._create_training_examples_arc_eager(depgraphs, input_file)

        input_file.close()

        # Use the temporary file to train the libsvm classifier
        x_train, y_train = load_svmlight_file(input_file.name)

        # The parameters are set according to the paper
        # "Algorithms for Deterministic Incremental Dependency Parsing" by Joakim Nivre
        # TODO: probability=True makes training very slow due to
        # cross-validation; the speed here needs to be improved
        model = svm.SVC(
            kernel='poly',
            degree=2,
            coef0=0,
            gamma=0.2,
            C=0.5,
            verbose=True,
            probability=True)

        model.fit(x_train, y_train)

        # Save the model under the given file name (as a pickle)
        pickle.dump(model, open(modelfile, 'wb'))
    finally:
        remove(input_file.name)
Example 11: import_libsvm_sparse
# Required imports: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def import_libsvm_sparse(filename):
    """Imports a dataset file in libsvm sparse format"""
    from sklearn.datasets import load_svmlight_file
    X, y = load_svmlight_file(filename)
    return Dataset(X.toarray(), y)
Example 12: setUp
# Required imports: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def setUp(self):
    dataset_filepath = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'datasets/yeast_train.svm')
    X, y = load_svmlight_file(dataset_filepath, multilabel=True)
    self.X = X.todense().tolist()
    self.y = MultiLabelBinarizer().fit_transform(y).tolist()
    self.quota = 10
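With multilabel=True, load_svmlight_file returns y as a list of label tuples rather than a 1-D array, which is why the example above runs it through MultiLabelBinarizer. A minimal sketch (the two inline rows are invented for illustration):

import io
from sklearn.datasets import load_svmlight_file

data = b"1,2 3:0.5 7:1.0\n0 1:2.0\n"
X, y = load_svmlight_file(io.BytesIO(data), multilabel=True)
print(y)  # [(1.0, 2.0), (0.0,)]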
Example 13: read_data
# Required imports: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def read_data(filename, header=True, dtype='float32', zero_based=True):
    """Read data in sparse format

    Arguments
    ---------
    filename: str
        input file name
    header: bool, default=True
        whether a header line is present
    dtype: str, default='float32'
        data type of the values
    zero_based: bool, default=True
        zero-based indices?

    Returns
    -------
    features: csr_matrix
        feature matrix
    labels: csr_matrix
        label matrix
    num_samples: int
        number of instances
    num_feat: int
        number of features
    num_labels: int
        number of labels
    """
    with open(filename, 'rb') as f:
        _l_shape = None
        if header:
            line = f.readline().decode('utf-8').rstrip("\n")
            line = line.split(" ")
            num_samples, num_feat, num_labels = int(
                line[0]), int(line[1]), int(line[2])
            _l_shape = (num_samples, num_labels)
        else:
            num_samples, num_feat, num_labels = None, None, None
        features, labels = load_svmlight_file(f, multilabel=True)
        labels = ll_to_sparse(
            labels, dtype=dtype, zero_based=zero_based, shape=_l_shape)
    return features, labels, num_samples, num_feat, num_labels
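For reference, a sketch of the file layout read_data expects when header=True: the first line carries the three counts, and each following line is a standard multilabel svmlight row (all values below are invented for illustration):

3 10 4
0,2 1:0.5 7:1.2
1 3:2.0
2,3 0:0.1 9:0.4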
Example 14: load_svmlight_file
# Required imports: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def load_svmlight_file(self):
    """
    Use sklearn.datasets.load_svmlight_file to load data.svmlight.
    """
    file_name = os.path.join(self.data_dir, "data.svmlight")
    datasets.load_svmlight_file(file_name)
Example 15: get_year_prediction_data
# Required imports: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def get_year_prediction_data(dirname=None):
    feature_dim = 90
    if dirname is None:
        dirname = os.path.join(os.path.dirname(__file__), 'data')
    filename = 'YearPredictionMSD'
    download_filename = os.path.join(dirname, "%s.bz2" % filename)
    extracted_filename = os.path.join(dirname, filename)

    if not os.path.isfile(download_filename):
        print("Downloading data...")
        mx.test_utils.download('https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/%s.bz2' % filename, dirname=dirname)

    if not os.path.isfile(extracted_filename):
        print("Extracting data...")
        with bz2.BZ2File(download_filename) as fr, open(extracted_filename, "wb") as fw:
            shutil.copyfileobj(fr, fw)

    print("Reading data from disk...")
    train_features, train_labels = load_svmlight_file(extracted_filename, n_features=feature_dim, dtype=np.float32)
    train_features = train_features.todense()

    # normalize the data: subtract means and divide by standard deviations
    label_mean = train_labels.mean()
    label_std = np.sqrt(np.square(train_labels - label_mean).mean())
    feature_means = train_features.mean(axis=0)
    feature_stds = np.sqrt(np.square(train_features - feature_means).mean(axis=0))

    train_features = (train_features - feature_means) / feature_stds
    train_labels = (train_labels - label_mean) / label_std
    return feature_dim, train_features, train_labels