

Python datasets.load_svmlight_file Method Code Examples

This article collects typical usage examples of the sklearn.datasets.load_svmlight_file method in Python. If you are wondering what exactly datasets.load_svmlight_file does, how to call it, or what real-world usage looks like, the hand-picked code examples here may help. You can also explore further usage examples from the method's home module, sklearn.datasets.


Below are 15 code examples of the datasets.load_svmlight_file method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
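Before the examples, here is a minimal usage sketch of the method itself. It is not taken from any project below, and it assumes a local file named data.svmlight in svmlight/LibSVM format; the file name is purely illustrative:

from sklearn.datasets import load_svmlight_file

# X comes back as a scipy.sparse CSR matrix; y is a dense numpy array
X, y = load_svmlight_file("data.svmlight")
print(X.shape, y.shape)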

Example 1: read_year_prediction_data

# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def read_year_prediction_data(fileName):
    feature_dim = 90
    print("Reading data from disk...")
    train_features, train_labels = load_svmlight_file(fileName, n_features=feature_dim, dtype=np.float32)
    train_features = train_features.todense()

    # normalize the data: subtract means and divide by standard deviations
    label_mean = train_labels.mean()
    label_std = np.sqrt(np.square(train_labels - label_mean).mean())
    feature_means = train_features.mean(axis=0)
    feature_stds = np.sqrt(np.square(train_features - feature_means).mean(axis=0))

    train_features = (train_features - feature_means) / feature_stds
    train_labels = (train_labels - label_mean) / label_std

    return feature_dim, train_features, train_labels 
Author: awslabs, Project: dynamic-training-with-apache-mxnet-on-aws, Lines: 18, Source: data_reader.py

Example 2: load_data

# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def load_data(path, dense=False):
    """Load data from a CSV, LibSVM or HDF5 file based on the file extension.

    Args:
        path (str): A path to the CSV, LibSVM or HDF5 format file.
        dense (bool): Whether the returned matrix should be dense.
                      Defaults to False.

    Returns:
        Data matrix X and target vector y
    """

    catalog = {'.csv': load_csv, '.sps': load_svmlight_file, '.h5': load_hdf5}

    ext = os.path.splitext(path)[1]
    func = catalog[ext]
    X, y = func(path)

    if dense and sparse.issparse(X):
        X = X.todense()

    return X, y 
Author: jeongyoonlee, Project: Kaggler, Lines: 24, Source: data_io.py
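For illustration, a hypothetical call to the helper above might look like the following; 'train.sps' is an assumed file name, and its .sps extension routes it through load_svmlight_file via the catalog:

# Hypothetical usage sketch; 'train.sps' is an assumed svmlight-format file
X, y = load_data('train.sps', dense=True)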

Example 3: test_dump

# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def test_dump(self):
        tmpfile = "tmp_dump.txt"
        try:
            # loads from file
            Xs, y = load_svmlight_file(datafile)

            # dumps to file
            dump_svmlight_file(Xs, y, tmpfile, zero_based=False)

            # load them back as a CSR matrix
            X2, y2 = sk_load_svmlight_file(tmpfile)

            X3 = np.ndarray(shape=X2.shape, dtype=X2.dtype)
            X2.toarray(out=X3)

            # check assertions
            assert_array_almost_equal(Xs, X3)
            assert_array_almost_equal(y, y2)
        finally:
            if os.path.exists(tmpfile):
                os.remove(tmpfile) 
Author: hpclab, Project: rankeval, Lines: 23, Source: test_svmlight_format.py

Example 4: test_dump_qid

# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def test_dump_qid(self):
        tmpfile = "/tmp/tmp_dump.txt"
        try:
            # loads from file
            Xs, y, q = load_svmlight_file(qid_datafile, query_id=True)

            # dumps to file
            dump_svmlight_file(Xs, y, tmpfile, query_id=list(q), zero_based=False)

            # load them back as a CSR matrix with scikit-learn
            X2, y2, q2 = sk_load_svmlight_file(tmpfile, query_id=True)

            X3 = np.ndarray(shape=X2.shape, dtype=X2.dtype)
            X2.toarray(out=X3)

            # check assertions
            assert_array_almost_equal(Xs, X3)
            assert_array_almost_equal(y, y2)
            assert_array_equal(q, q2)
        finally:
            if os.path.exists(tmpfile):
                os.remove(tmpfile) 
Author: hpclab, Project: rankeval, Lines: 24, Source: test_svmlight_format.py

Example 5: test

# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def test():
    url_zip_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/rcv1_train.binary.bz2'
    urllib.request.urlretrieve(url_zip_train, filename='train.bz2')

    f_svm = open('train.svm', 'wt')
    with bz2.open('train.bz2', 'rb') as f_zip:
        data = f_zip.read()
        f_svm.write(data.decode('utf-8'))
    f_svm.close()


    X, y = load_svmlight_file('train.svm')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


    pipeline = make_pipeline(FeatureGradientSelector(n_epochs=1, n_features=10), LogisticRegression())
    # pipeline = make_pipeline(SelectFromModel(ExtraTreesClassifier(n_estimators=50)), LogisticRegression())

    pipeline.fit(X_train, y_train)

    print("Pipeline Score: ", pipeline.score(X_train, y_train)) 
Author: microsoft, Project: nni, Lines: 23, Source: sklearn_test.py

Example 6: load_realsim

# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def load_realsim(folder=REALSIM, one_hot=True, partitions_proportions=None, shuffle=False, as_tensor=True):
    X, y = sk_dt.load_svmlight_file(folder + "/real-sim")
    y = np.array([int(yy) for yy in y])
    if one_hot:
        y = to_one_hot_enc(y)
    res = [Dataset(data=X, target=y)]
    if partitions_proportions:
        res = redivide_data(res, shuffle=shuffle, partition_proportions=partitions_proportions)
        res = Datasets.from_list(res)

    if as_tensor: [dat.convert_to_tensor() for dat in res]

    return res


# noinspection PyPep8Naming 
Author: lucfra, Project: RFHO, Lines: 18, Source: datasets.py

Example 7: _read_svmlight

# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def _read_svmlight(lines, out_blocks, col_size, n_features, store_sparse):
    from tempfile import SpooledTemporaryFile
    from sklearn.datasets import load_svmlight_file

    # Creating a tmp file and using the load_svmlight_file method should be
    # more efficient than parsing the lines manually
    tmp_file = SpooledTemporaryFile(mode="wb+", max_size=2e8)
    tmp_file.writelines(lines)
    tmp_file.seek(0)

    x, y = load_svmlight_file(tmp_file, n_features)
    if not store_sparse:
        x = x.toarray()

    # Also tried converting to csc/ndarray first for faster splitting, but it's
    # not worth it. Position 0 contains the X blocks
    for i in range(ceil(n_features / col_size)):
        out_blocks[0][i] = x[:, i * col_size:(i + 1) * col_size]

    # Position 1 contains the y block
    out_blocks[1][0] = y.reshape(-1, 1) 
Author: bsc-wdc, Project: dislib, Lines: 23, Source: io.py

Example 8: test_load_svmlight_file

# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def test_load_svmlight_file(self):
        """ Tests loading a LibSVM file  """
        file_ = "tests/files/libsvm/1"

        x_np, y_np = load_svmlight_file(file_, n_features=780)

        # Load SVM and store in sparse
        x, y = ds.load_svmlight_file(file_, (25, 100), n_features=780,
                                     store_sparse=True)

        self.assertTrue(_equal_arrays(x.collect(), x_np))
        self.assertTrue(_equal_arrays(y.collect(), y_np))

        # Load SVM and store in dense
        x, y = ds.load_svmlight_file(file_, (25, 100), n_features=780,
                                     store_sparse=False)

        self.assertTrue(_equal_arrays(x.collect(), x_np.toarray()))
        self.assertTrue(_equal_arrays(y.collect(), y_np)) 
Author: bsc-wdc, Project: dislib, Lines: 21, Source: test_array.py

Example 9: retrieve_dataset

# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def retrieve_dataset(dataset, **kwargs):
    # if data not extracted, download zip and extract
    outdirname = "datasets.1.17.2019"
    if not os.path.exists(outdirname):
        try:
            from urllib import urlretrieve
        except ImportError:
            from urllib.request import urlretrieve
        import zipfile

        zipfilename = outdirname + ".zip"
        urlretrieve(
            "https://publictestdatasets.blob.core.windows.net/data/" + zipfilename,
            zipfilename,
        )
        with zipfile.ZipFile(zipfilename, "r") as unzip:
            unzip.extractall(".")
    extension = os.path.splitext(dataset)[1]
    filepath = os.path.join(outdirname, dataset)
    if extension == ".npz":
        # sparse format file
        import scipy.sparse as sparse

        return sparse.load_npz(filepath)
    elif extension == ".svmlight":
        from sklearn import datasets

        return datasets.load_svmlight_file(filepath)
    elif extension == ".json":
        import json

        with open(filepath, encoding="utf-8") as f:
            dataset = json.load(f)
        return dataset
    elif extension == ".csv":
        import pandas as pd

        return pd.read_csv(filepath, **kwargs)
    else:
        raise Exception("Unrecognized file extension: " + extension) 
Author: interpretml, Project: interpret-text, Lines: 42, Source: datasets.py

Example 10: train

# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def train(self, depgraphs, modelfile):
        """
        :param depgraphs : list of DependencyGraph as the training data
        :type depgraphs : DependencyGraph
        :param modelfile : file name to save the trained model
        :type modelfile : str
        """

        try:
            input_file = tempfile.NamedTemporaryFile(
                prefix='transition_parse.train',
                dir=tempfile.gettempdir(),
                delete=False)

            if self._algorithm == self.ARC_STANDARD:
                self._create_training_examples_arc_std(depgraphs, input_file)
            else:
                self._create_training_examples_arc_eager(depgraphs, input_file)

            input_file.close()
            # Using the temporary file to train the libsvm classifier
            x_train, y_train = load_svmlight_file(input_file.name)
            # The parameters are set according to the paper:
            # "Algorithms for Deterministic Incremental Dependency Parsing" by Joakim Nivre
            # TODO: probability=True makes training very slow because it
            # triggers internal cross-validation; the speed needs improvement
            model = svm.SVC(
                kernel='poly',
                degree=2,
                coef0=0,
                gamma=0.2,
                C=0.5,
                verbose=True,
                probability=True)

            model.fit(x_train, y_train)
            # Save the model to file name (as pickle)
            pickle.dump(model, open(modelfile, 'wb'))
        finally:
            remove(input_file.name) 
Author: rafasashi, Project: razzy-spinner, Lines: 42, Source: transitionparser.py

Example 11: import_libsvm_sparse

# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def import_libsvm_sparse(filename):
    """Imports dataset file in libsvm sparse format"""
    from sklearn.datasets import load_svmlight_file
    X, y = load_svmlight_file(filename)
    return Dataset(X.toarray(), y) 
Author: ntucllab, Project: libact, Lines: 7, Source: dataset.py

Example 12: setUp

# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def setUp(self):
        dataset_filepath = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            'datasets/yeast_train.svm')
        X, y = load_svmlight_file(dataset_filepath, multilabel=True)
        self.X = X.todense().tolist()
        self.y = MultiLabelBinarizer().fit_transform(y).tolist()
        self.quota = 10 
Author: ntucllab, Project: libact, Lines: 10, Source: test_multilabel_realdata.py

Example 13: read_data

# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def read_data(filename, header=True, dtype='float32', zero_based=True):
    """Read data in sparse format

    Arguments
    ---------
    filename: str
        input file name
    header: bool, default=True
        whether a header line is present
    dtype: str, default='float32'
        data type of values
    zero_based: bool, default=True
        whether indices are zero-based

    Returns
    --------
    features: csr_matrix
        features matrix
    labels: csr_matrix
        labels matrix
    num_samples: int
        #instances
    num_feat: int
        #features
    num_labels: int
        #labels
    """
    with open(filename, 'rb') as f:
        _l_shape = None
        if header:
            line = f.readline().decode('utf-8').rstrip("\n")
            line = line.split(" ")
            num_samples, num_feat, num_labels = int(
                line[0]), int(line[1]), int(line[2])
            _l_shape = (num_samples, num_labels)
        else:
            num_samples, num_feat, num_labels = None, None, None
        features, labels = load_svmlight_file(f, multilabel=True)
        labels = ll_to_sparse(
            labels, dtype=dtype, zero_based=zero_based, shape=_l_shape)
    return features, labels, num_samples, num_feat, num_labels 
Author: kunaldahiya, Project: pyxclib, Lines: 43, Source: data_utils.py
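As a hedged illustration of the input this reader expects (the file name and contents below are made up): with header=True, the first line carries the counts and the remaining lines are multilabel svmlight rows:

# Hypothetical input layout for read_data(filename, header=True):
#   line 1   : "<num_samples> <num_features> <num_labels>", e.g. "3 1000 25"
#   line 2.. : multilabel svmlight rows, e.g. "3,7 1:0.5 4:1.2"
features, labels, n_samples, n_feat, n_labels = read_data('train.txt')  # 'train.txt' is illustrative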

Example 14: load_svmlight_file

# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def load_svmlight_file(self):
        """
        Use sklearn.datasets.load_svmlight_file to load data.svmlight.
        """
        file_name = os.path.join(self.data_dir, "data.svmlight")
        datasets.load_svmlight_file(file_name) 
Author: recipy, Project: recipy, Lines: 8, Source: run_sklearn.py

Example 15: get_year_prediction_data

# Required import: from sklearn import datasets [as alias]
# Or: from sklearn.datasets import load_svmlight_file [as alias]
def get_year_prediction_data(dirname=None):
    feature_dim = 90
    if dirname is None:
        dirname = os.path.join(os.path.dirname(__file__), 'data')
    filename = 'YearPredictionMSD'
    download_filename = os.path.join(dirname, "%s.bz2" % filename)
    extracted_filename = os.path.join(dirname, filename)
    if not os.path.isfile(download_filename):
        print("Downloading data...")
        mx.test_utils.download('https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/%s.bz2' % filename, dirname=dirname)
    if not os.path.isfile(extracted_filename):
        print("Extracting data...")
        with bz2.BZ2File(download_filename) as fr, open(extracted_filename, "wb") as fw:
            shutil.copyfileobj(fr, fw)
    print("Reading data from disk...")
    train_features, train_labels = load_svmlight_file(extracted_filename, n_features=feature_dim, dtype=np.float32)
    train_features = train_features.todense()

    # normalize the data: subtract means and divide by standard deviations
    label_mean = train_labels.mean()
    label_std = np.sqrt(np.square(train_labels - label_mean).mean())
    feature_means = train_features.mean(axis=0)
    feature_stds = np.sqrt(np.square(train_features - feature_means).mean(axis=0))

    train_features = (train_features - feature_means) / feature_stds
    train_labels = (train_labels - label_mean) / label_std

    return feature_dim, train_features, train_labels 
Author: mlperf, Project: training_results_v0.6, Lines: 30, Source: data_reader.py


Note: The sklearn.datasets.load_svmlight_file examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers; copyright of the source code belongs to the original authors, and distribution and use should follow each project's license. Please do not reproduce without permission.