当前位置: 首页>>代码示例>>Python>>正文


Python datasets.load_svmlight_files函数代码示例

本文整理汇总了Python中sklearn.datasets.load_svmlight_files函数的典型用法代码示例。如果您正苦于以下问题：Python load_svmlight_files函数的具体用法？Python load_svmlight_files怎么用？Python load_svmlight_files使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了load_svmlight_files函数的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_load_svmlight_files

def test_load_svmlight_files():
    """Check that loading the same file several times gives consistent results.

    ``datafile`` is loaded twice as float32 (contents and dtype of both
    copies must match) and three times as float64 (all three matrices must
    share the requested dtype).
    """
    pair = load_svmlight_files([datafile] * 2, dtype=np.float32)
    X_train, y_train, X_test, y_test = pair
    assert_array_equal(X_train.toarray(), X_test.toarray())
    assert_array_equal(y_train, y_test)
    for matrix in (X_train, X_test):
        assert_equal(matrix.dtype, np.float32)

    triple = load_svmlight_files([datafile] * 3, dtype=np.float64)
    # Only the feature matrices (even positions) are inspected here.
    X1, X2, X3 = triple[0], triple[2], triple[4]
    assert_equal(X1.dtype, X2.dtype)
    assert_equal(X2.dtype, X3.dtype)
    assert_equal(X3.dtype, np.float64)
开发者ID:kkuunnddaann,项目名称:scikit-learn,代码行数:11,代码来源:test_svmlight_format.py

示例2: classify_test

def _write_predictions(file_path, predictions):
    """Write one prediction per line to ``file_path``, closing it even on error."""
    with open(file_path, 'w') as fout:
        for y in predictions:
            # ``(y,)`` keeps %-formatting safe if a prediction is itself a tuple.
            fout.write('%s\n' % (y,))


def classify_test(feature_list=None, classifiers=None, root_path='./'):
    """Train and evaluate every classifier on every feature set.

    This is a generator: nothing (data loading, directory creation,
    training) happens until it is iterated.

    Parameters
    ----------
    feature_list : iterable of str, optional
        Feature-set names. For each name the files
        ``./feature/<name>_train.txt`` and ``./feature/<name>_test.txt``
        (relative to the current working directory, not ``root_path``) are
        loaded with ``load_svmlight_files``. ``None`` means no feature sets.
    classifiers : iterable, optional
        Objects exposing ``fit(X, y)`` and ``predict(X)``. ``None`` means no
        classifiers.
    root_path : str
        Directory under which a ``results`` sub-directory is created (with
        any missing parents) to hold the prediction files.

    Yields
    ------
    tuple
        ``(combine_name, feature_name, clf_name, info)`` where ``info`` maps
        ``'training_time'``, ``'training_error'``, ``'test_time'`` and
        ``'test_error'`` to floats. Predictions are also written to
        ``<results>/<combine_name>_train.txt`` and ``..._test.txt``.

    Raises
    ------
    AssertionError
        If ``<root_path>/results`` exists but is not a directory.
    OSError
        If the results directory cannot be created.
    """
    # Local import: only ``path``/``system`` are known to be imported from os
    # at module level, and ``makedirs`` replaces the shell call.
    import os

    feature_list = [] if feature_list is None else feature_list
    classifiers = [] if classifiers is None else classifiers

    # load data set
    datasets = []
    for name in feature_list:
        logging.debug('loading data: %s ...', name)
        filenames = tuple('./feature/%s_%s' % (name, tag)
                          for tag in ('train.txt', 'test.txt'))
        X_train, y_train, X_test, y_test = load_svmlight_files(filenames)
        datasets.append((name, X_train, y_train, X_test, y_test))

    # make directory to store results
    result_path = path.join(root_path, 'results')
    if path.exists(result_path):
        assert path.isdir(result_path), 'data must be a directory!'
    else:
        # No shell involved: safe for paths with spaces or metacharacters,
        # and a failure raises instead of being silently ignored.
        os.makedirs(result_path)

    for clf in classifiers:
        clf_name = clf.__class__.__name__
        for feature_name, X_train, y_train, X_test, y_test in datasets:
            combine_name = feature_name + '_' + clf_name
            info = {}

            logging.debug('classification test: %s ...', combine_name)

            logging.debug('training...')
            t0 = time()
            clf.fit(X_train, y_train)
            info['training_time'] = time() - t0

            logging.debug('testing on training...')
            pred_y = clf.predict(X_train)
            training_error = 1.0 - accuracy_score(y_train, pred_y)
            logging.debug('error rate on training set: %f', training_error)
            info['training_error'] = training_error
            _write_predictions(
                path.join(result_path, combine_name + '_train.txt'), pred_y)

            logging.debug('testing...')
            t0 = time()
            pred_y = clf.predict(X_test)
            info['test_time'] = time() - t0
            test_error = 1.0 - accuracy_score(y_test, pred_y)
            logging.debug('error rate on test set: %f', test_error)
            info['test_error'] = test_error
            _write_predictions(
                path.join(result_path, combine_name + '_test.txt'), pred_y)

            yield combine_name, feature_name, clf_name, info
开发者ID:defaultstr,项目名称:movie_review,代码行数:55,代码来源:base.py

示例3: pCoverX

def pCoverX(featureFamily, data_root="C:\\Users\\Vaibhav\\Desktop\\dir_data\\dir_data\\"):
    """Compute posteriors, entropies and accuracies for one feature family.

    For every ``<featureFamily>*.gz`` file found in ``<data_root>/train``,
    the same-named files in the ``train``, ``test`` and ``validation``
    folders are loaded, rows labelled 31 are dropped, features are selected,
    and class-conditional probabilities are turned into posteriors.

    Side effect: the process working directory is changed to
    ``<data_root>/train``.

    Parameters
    ----------
    featureFamily : str
        Filename prefix used to glob the gzipped svmlight files.
    data_root : str, optional
        Folder containing the ``train``, ``test`` and ``validation``
        sub-folders. Defaults to the path hard-coded originally.

    Returns
    -------
    tuple
        ``(train_post_array, test_post_array, val_post_array,
        train_entropy_array, test_entropy_array, val_entropy_array, data_df)``
        -- six lists with one entry per file, plus a DataFrame whose rows are
        ``[file name, train accuracy, test accuracy, validation accuracy]``.
    """
    os.chdir(os.path.join(data_root, "train"))
    n_gauss = 2
    train_post_array = []
    test_post_array = []
    val_post_array = []
    train_entropy_array = []
    test_entropy_array = []
    val_entropy_array = []
    accuracy_rows = []

    for file_name in glob.glob(featureFamily + '*.gz'):
        print(file_name)
        # Context managers guarantee the three gzip handles are closed.
        with gzip.open(os.path.join(data_root, "train", file_name)) as train_fh, \
                gzip.open(os.path.join(data_root, "test", file_name)) as test_fh, \
                gzip.open(os.path.join(data_root, "validation", file_name)) as val_fh:
            X_train, y_train, X_test, y_test, X_val, y_val = load_svmlight_files(
                (train_fh, test_fh, val_fh))

        # Drop every sample labelled 31 from all three splits.
        keep_train = y_train != 31
        keep_test = y_test != 31
        keep_val = y_val != 31
        X_train, y_train = X_train[keep_train], y_train[keep_train]
        X_test, y_test = X_test[keep_test], y_test[keep_test]
        X_val, y_val = X_val[keep_val], y_val[keep_val]

        # ---- Feature selection (helper is called with tech='LinearSVC') ----
        X_train_new, X_test_new, X_val_new = featureSelection(
            X_train, X_test, X_val, y_train, log=True, tech='LinearSVC')

        # ---- Mixture of Gaussians (presumably p(x|c); see pXoverC) ----
        train_prob, test_prob, val_prob = pXoverC(
            X_train_new, y_train, X_test_new, y_test, X_val_new, y_val, n_gauss)

        # ---- Prior (from training labels only), posterior and entropy ----
        prr = prior(y_train)

        train_post = posterior(train_prob, prr)
        train_post_array.append(train_post)
        train_entropy_array.append(entropy(train_post))

        test_post = posterior(test_prob, prr)
        test_post_array.append(test_post)
        test_entropy_array.append(entropy(test_post))

        val_post = posterior(val_prob, prr)
        val_post_array.append(val_post)
        val_entropy_array.append(entropy(val_post))

        # checkAccuracy returns a pair; its second item is not needed here.
        train_acc, _ = checkAccuracy(train_post, y_train)
        test_acc, _ = checkAccuracy(test_post, y_test)
        val_acc, _ = checkAccuracy(val_post, y_val)
        accuracy_rows.append([file_name, train_acc, test_acc, val_acc])

    # Build the frame once: DataFrame.append copied the whole frame on every
    # iteration and no longer exists in pandas 2.x.
    data_df = pd.DataFrame(accuracy_rows)

    return (train_post_array, test_post_array, val_post_array,
            train_entropy_array, test_entropy_array, val_entropy_array, data_df)
开发者ID:sahuvaibhav,项目名称:Capstone,代码行数:53,代码来源:test4.py

示例4: test_load_zero_based_auto

def test_load_zero_based_auto():
    """Check ``zero_based="auto"`` for a single file and for a set of files.

    ``data1`` uses feature indices 1..3 and ``data2`` uses indices 0..1.
    Loaded alone, ``data1`` must give 3 columns; loaded together with
    ``data2`` both matrices must have 4 columns.
    """
    # Bytes literals: BytesIO rejects text strings on Python 3 (on Python 2
    # ``b"..."`` is the same as a plain str, so nothing changes there).
    data1 = b"-1 1:1 2:2 3:3\n"
    data2 = b"-1 0:0 1:1\n"

    f1 = BytesIO(data1)
    X, y = load_svmlight_file(f1, zero_based="auto")
    assert_equal(X.shape, (1, 3))

    # Fresh buffer: the first one has already been consumed.
    f1 = BytesIO(data1)
    f2 = BytesIO(data2)
    X1, y1, X2, y2 = load_svmlight_files([f1, f2], zero_based="auto")
    assert_equal(X1.shape, (1, 4))
    assert_equal(X2.shape, (1, 4))
开发者ID:kkuunnddaann,项目名称:scikit-learn,代码行数:13,代码来源:test_svmlight_format.py

示例5: test_load_with_qid

def test_load_with_qid():
    """Check loading of svmlight data that carries a ``qid`` attribute.

    With ``query_id=False`` only ``(X, y)`` is returned. With
    ``query_id=True`` both ``load_svmlight_files`` (given one file) and
    ``load_svmlight_file`` must return ``X``, ``y`` and the query ids.
    """
    # Bytes literal: BytesIO rejects text strings on Python 3.
    data = b"""
    3 qid:1 1:0.53 2:0.12
    2 qid:1 1:0.13 2:0.1
    7 qid:2 1:0.87 2:0.12"""
    expected_X = [[0.53, 0.12], [0.13, 0.1], [0.87, 0.12]]

    X, y = load_svmlight_file(BytesIO(data), query_id=False)
    assert_array_equal(y, [3, 2, 7])
    assert_array_equal(X.todense(), expected_X)

    res1 = load_svmlight_files([BytesIO(data)], query_id=True)
    res2 = load_svmlight_file(BytesIO(data), query_id=True)
    for X, y, qid in (res1, res2):
        assert_array_equal(y, [3, 2, 7])
        assert_array_equal(qid, [1, 1, 2])
        assert_array_equal(X.todense(), expected_X)
开发者ID:kkuunnddaann,项目名称:scikit-learn,代码行数:15,代码来源:test_svmlight_format.py

示例6: select_feature

def select_feature(trainfilename, testfilename):
    """Write chi-square-selected versions of a train/test svmlight pair.

    The two files are loaded with ``multilabel=True``. For ``i`` in 1..20 the
    top ``i * (n_features // 20)`` features by ``chi2`` are selected on the
    training set, the same selection is applied to the test set, and both are
    dumped to ``<original name>_<k>`` with one-based indices.

    Parameters
    ----------
    trainfilename, testfilename : str
        Paths of the svmlight files; also the prefixes of the output names.
    """
    X_train, y_train, X_test, y_test = load_svmlight_files(
        (trainfilename, testfilename), multilabel=True)

    # Floor division keeps ``k`` an integer on Python 3 too ("/" only floored
    # on Python 2).
    # NOTE(review): with fewer than 20 features ``step`` is 0 and every
    # iteration asks for k=0 -- confirm whether that case matters.
    step = X_train.shape[1] // 20
    for i in range(1, 21):
        selectNum = step * i
        print("selecting %d features" % selectNum)
        selector = SelectKBest(chi2, k=selectNum)
        X_train_new = selector.fit_transform(X_train, y_train)
        X_test_new = selector.transform(X_test)
        suffix = '_' + str(selectNum)
        dump_svmlight_file(X_train_new, y_train, trainfilename + suffix, zero_based=False)
        dump_svmlight_file(X_test_new, y_test, testfilename + suffix, zero_based=False)
开发者ID:junjiek,项目名称:cmu-exp,代码行数:17,代码来源:chi_square_max_multilabel.py

示例7: select_feature_multilabel

def select_feature_multilabel(trainfilename, testfilename):
    """Write feature-subset versions of a multilabel train/test svmlight pair.

    Feature scores come from ``randomValues(X_train, y_train)`` and are
    computed once; ``SelectKBest`` then keeps the top
    ``i * (n_features // 20)`` features for ``i`` in 1..20. Each selection is
    applied to both sets and dumped to ``<original name>_<k>`` with one-based
    indices.

    Parameters
    ----------
    trainfilename, testfilename : str
        Paths of the svmlight files; also the prefixes of the output names.
    """
    def returnIG(X, y):
        # Score function for SelectKBest: ignores its arguments and returns
        # the scores/p-values precomputed below.
        return randval, p

    X_train, y_train, X_test, y_test = load_svmlight_files(
        (trainfilename, testfilename), multilabel=True)

    featurenum = X_train.shape[1]
    randval = randomValues(X_train, y_train)
    # Placeholder p-values (all ones), one row per feature.
    p = np.ones((featurenum, 1), int)

    # Floor division keeps ``k`` an integer on Python 3 too.
    step = featurenum // 20
    for i in range(1, 21):
        selectNum = step * i
        print("selecting %d features" % selectNum)
        selector = SelectKBest(returnIG, k=selectNum)
        X_train_new = selector.fit_transform(X_train, y_train)
        X_test_new = selector.transform(X_test)
        suffix = '_' + str(selectNum)
        dump_svmlight_file(X_train_new, y_train, trainfilename + suffix, zero_based=False)
        dump_svmlight_file(X_test_new, y_test, testfilename + suffix, zero_based=False)
开发者ID:junjiek,项目名称:cmu-exp,代码行数:20,代码来源:random_selection_ml.py

示例8: select_feature

def select_feature(trainfilename, testfilename):
    """Write information-gain-selected versions of a train/test svmlight pair.

    Feature scores come from ``information_gain(X_train, y_train)`` and are
    computed once; ``SelectKBest`` then keeps the top
    ``i * (n_features // 20)`` features for ``i`` in 1..20. Each selection is
    applied to both sets and dumped to ``<original name>_<k>`` with one-based
    indices.

    Parameters
    ----------
    trainfilename, testfilename : str
        Paths of the svmlight files; also the prefixes of the output names.
    """
    def returnIG(X, y):
        # Score function for SelectKBest: ignores its arguments and returns
        # the scores/p-values precomputed below.
        return ig, p

    X_train, y_train, X_test, y_test = load_svmlight_files((trainfilename, testfilename))

    featurenum = X_train.shape[1]
    ig = information_gain(X_train, y_train).reshape(featurenum,)
    # Placeholder p-values (all ones), one row per feature. The earlier
    # ``p.reshape(featurenum, 1)`` discarded its result, so the column shape
    # it asked for was never applied; build that shape directly.
    p = np.ones((featurenum, 1), int)

    # Floor division keeps ``k`` an integer on Python 3 too.
    step = featurenum // 20
    for i in range(1, 21):
        selectNum = step * i
        print("selecting %d features" % selectNum)
        selector = SelectKBest(returnIG, k=selectNum)
        X_train_new = selector.fit_transform(X_train, y_train)
        X_test_new = selector.transform(X_test)
        suffix = '_' + str(selectNum)
        sklearn.datasets.dump_svmlight_file(X_train_new, y_train, trainfilename + suffix, zero_based=False)
        sklearn.datasets.dump_svmlight_file(X_test_new, y_test, testfilename + suffix, zero_based=False)
开发者ID:junjiek,项目名称:cmu-exp,代码行数:21,代码来源:ig_selection.py

示例9: run_nblcr

def run_nblcr(train, test, outfn, grams='123', clf=None):
    """Train a classifier on ratio-weighted n-gram features and predict.

    Training rows are split by ``label`` (1 vs 0), n-gram counts are built
    for each side, ``compute_ratio`` derives the dictionary and ratio vector,
    and both sets are written as svmlight files (``<outfn>-train.txt`` and
    ``<outfn>-test.txt``) before being loaded back for fitting.

    Parameters
    ----------
    train, test : DataFrame-like
        ``train`` must support ``iterrows()`` with ``'label'`` and ``'text'``
        fields; both are passed to ``generate_svmlight_file``.
    outfn : str
        Prefix of the two svmlight files written to disk.
    grams : str
        Digits naming the n-gram orders, e.g. ``'123'`` -> ``[1, 2, 3]``.
    clf : estimator, optional
        Object with ``fit``/``predict``. Defaults to a *new*
        ``LogisticRegression(class_weight="auto")`` per call; the previous
        default instance was created once at import time and then shared
        (and refit) across calls.

    Returns
    -------
    tuple
        ``(y_pred, y_prob)``. ``y_prob`` is ``clf.predict_proba(X_test)``
        when available; otherwise a two-column array built from the hard
        predictions as ``[1 - y_pred, y_pred]``.
    """
    if clf is None:
        clf = LogisticRegression(class_weight="auto")

    f_train = outfn + '-train.txt'
    f_test = outfn + '-test.txt'

    ngram = [int(i) for i in grams]
    ptrain = []
    ntrain = []

    for _, row in train.iterrows():
        if row['label'] == 1:
            ptrain.append(tokenize(row['text'], ngram))
        elif row['label'] == 0:
            ntrain.append(tokenize(row['text'], ngram))

    pos_counts = build_dict(ptrain, ngram)
    neg_counts = build_dict(ntrain, ngram)

    dic, r = compute_ratio(pos_counts, neg_counts)

    generate_svmlight_file(train, dic, r, ngram, f_train)
    generate_svmlight_file(test, dic, r, ngram, f_test)

    X_train, y_train, X_test, _ = load_svmlight_files((f_train, f_test))

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    try:
        y_prob = clf.predict_proba(X_test)
    except (AttributeError, NotImplementedError):
        # Estimator exposes no probability estimates (e.g. an SVM fitted
        # without them): fall back to hard predictions as pseudo-probabilities.
        # The previous ``set_params(probability=True)`` call was dropped: with
        # no refit it did not affect ``predict`` and it raised for estimators
        # that have no such parameter.
        y_prob_pos = y_pred
        y_prob_neg = np.ones(X_test.shape[0]) - y_prob_pos
        y_prob = np.column_stack((y_prob_neg, y_prob_pos))

    return y_pred, y_prob
    
    
    
开发者ID:bluedrone,项目名称:psb-adr,代码行数:37,代码来源:maxent_nblcr.py

示例10: load_amazon

def load_amazon(source_name, target_name, data_folder=None, verbose=False):
    """
    Read the Amazon sentiment data for one source/target pair (svmlight files).

    Three files are loaded together: ``<source>_train``, ``<target>_train``
    and ``<target>_test`` (each with the ``.svmlight`` extension), with
    names formed by plain concatenation onto ``data_folder``.

    inputs:
        source_name : name of the source dataset
        target_name : name of the target dataset
        data_folder : prefix prepended to the file names ('data/' if None)
        verbose : when true, print the three file paths
    outputs:
        xs : training source data matrix (dense 2D array)
        ys : training source label vector
        xt : training target data matrix (dense 2D array)
        yt : training target label vector
        xtest : testing target data matrix (dense 2D array)
        ytest : testing target label vector
    """
    folder = 'data/' if data_folder is None else data_folder

    paths = [
        folder + source_name + '_train.svmlight',
        folder + target_name + '_train.svmlight',
        folder + target_name + '_test.svmlight',
    ]

    if verbose:
        captions = ('source file:', 'target file:', 'test file:  ')
        for caption, file_path in zip(captions, paths):
            print(caption, file_path)

    loaded = load_svmlight_files(paths)

    # Feature matrices sit at even positions: densify them into 2D arrays.
    dense = [np.array(matrix.todense()) for matrix in loaded[0::2]]

    # Label vectors sit at odd positions: map {-1, 1} onto {0, 1}.
    labels = [np.array((vector + 1) / 2, dtype=int) for vector in loaded[1::2]]

    return dense[0], labels[0], dense[1], labels[1], dense[2], labels[2]
开发者ID:GRAAL-Research,项目名称:domain_adversarial_neural_network,代码行数:37,代码来源:experiments_amazon.py

示例11: LDA

    probs.append(score_i)
      
  return probs
  
# Command-line options of the script.
parser = argparse.ArgumentParser()
#parser.add_argument( "train_file" )
parser.add_argument( "-p", "--predict", help = "if is to make predictions in a test file", default = None )
# NOTE(review): --predict_file reuses the --predict help text and is not read anywhere in this excerpt.
parser.add_argument( "-t", "--predict_file", help = "if is to make predictions in a test file", default = None )
parser.add_argument( "-c", "--cross_validation", help = "if have make cross-validation", default = None )

args = parser.parse_args()

# Classifier handed to predict() below; the random-forest alternative is left disabled.
classifier = LDA(n_components=2)
#classifier = RandomForestClassifier()

# Load the four training views (url, title, body, all) in one call.
# Of the four label vectors only `y` (from url_train.txt) is used later in this excerpt.
X_url, y, X_title, y_t, X_body, y_b, X_a, y_a = load_svmlight_files(("url_train.txt", "title_train.txt", "body_train.txt", "all_train.txt"))
# Training matrices keyed by view name.
X = {"url":X_url, "title": X_title, "body": X_body, "all": X_a}

if(args.predict):
  print "Predicting"
  # Load the matching test views; y_t, y_b and y_a are rebound to the test-file labels here,
  # and `t` (labels from url_test.txt) is passed on to predict().
  T_url, t, T_title, y_t, T_body, y_b, T_a, y_a = load_svmlight_files(("url_test.txt", "title_test.txt", "body_test.txt", "all_test.txt"))
  # Test matrices keyed by view name, mirroring X.
  T = {"url": T_url, "title": T_title, "body": T_body, "all": T_a}
  probs = predict(classifier, X, y, T, t)
  
  # Write the output file: a "label" header line followed by one %f-formatted value per line.
  f = open("sub_31-08_01h15.txt","w")
  f.write("label\n")
  for p in probs:
    line = "%f\n" % p
    f.write(line)
  f.close()
elif(args.cross_validation):
开发者ID:rloliveirajr,项目名称:kaggle_stumbleupon,代码行数:31,代码来源:train.py

示例12: svm_skin

def svm_skin(X_train, y_train, X_test, y_test):
    """Learn the skin data set with a linear SVM and report test accuracy.

    Parameters
    ----------
    X_train, X_test : array-like or sparse matrix
        Samples. ``X_test`` is scored as given (no densification), so both
        sparse and dense inputs are accepted.
    y_train, y_test : array-like
        Labels.

    Returns
    -------
    float
        ``100 * clf.score(X_test, y_test)``, i.e. the score as a percentage.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('SVM w/ Linear kernel')
    clf = svm.LinearSVC()
    clf.fit(X_train, y_train)
    score = 100 * clf.score(X_test, y_test)

    print('SVM score: %.2f%%' % score)
    return score


if __name__ == '__main__':
    # Disabled preparation step: split_libsvm_dataset(path='skin.txt',
    # data_size=None), where `data_size` is an integer limiting the data set
    # size and None means the whole data set.

    # Train and test samples (X) with their labels (y), read in one call.
    dataset_paths = ('skin-train.libsvm', 'skin-test.libsvm')
    X_train, y_train, X_test, y_test = load_svmlight_files(dataset_paths)

    svm_skin(X_train, y_train, X_test, y_test)

    # Disabled follow-up experiment:
    # iterations, scores = adaboost_skin(X_train, y_train, X_test, y_test)
    # graph = plot_success_per_size(iterations, scores)
    # show()
开发者ID:oryband,项目名称:homework,代码行数:29,代码来源:q5.py

示例13: test_load_invalid_file2

def test_load_invalid_file2():
    """Load three files at once, with ``invalidfile`` between two valid ones.

    NOTE(review): presumably this call is expected to raise; no
    expected-exception decorator or check is visible in this excerpt.
    """
    paths = [datafile, invalidfile, datafile]
    load_svmlight_files(paths)
开发者ID:kkuunnddaann,项目名称:scikit-learn,代码行数:2,代码来源:test_svmlight_format.py

示例14: load_svmlight_files

    # remove axis spines
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.spines["bottom"].set_visible(False)
    ax.spines["left"].set_visible(False)

    plt.grid()
    plt.tight_layout
    plt.show()
    
    
    
# Raw string: the Windows path contains backslash sequences (\A, \I, \C, \d)
# that are invalid escapes in a normal literal. The resulting path is the same.
os.chdir(r"F:\Analytics\ISB Study\Capstone\dir_data\dir_data")

# Load the train / test / validation splits of one feature family together.
X_train, y_train, X_test, y_test, X_val, y_val = load_svmlight_files((
    "train\\vision_cuboids_histogram.txt",
    "test\\vision_cuboids_histogram.txt",
    "validation\\vision_cuboids_histogram.txt",
))

# LDA on the densified training data.
sklearn_lda = LDA(n_components=30)
X_lda_sklearn = sklearn_lda.fit_transform(X_train.todense(), y_train)
plot_scikit_lda(X_lda_sklearn, title='LDA vision_cuboids_histogram')

# PCA
sklearn_pca = sklearnPCA(n_components=30)
X_pca = sklearn_pca.fit_transform(X_train.todense())
# NOTE(review): plot_pca is not given X_pca explicitly -- presumably it reads
# the module-level variable; confirm against its definition.
plot_pca(title = 'PCA vision_cuboids_histogram')

# PCA applied on top of the LDA projection (refits the same PCA object).
X_ldapca_sklearn = sklearn_pca.fit_transform(X_lda_sklearn)
plot_scikit_lda(X_ldapca_sklearn, title='LDA+PCA LDA vision_cuboids_histogram', mirror=(-1))
开发者ID:sahuvaibhav,项目名称:Capstone,代码行数:29,代码来源:pcalda.py

示例15: documentFrequency

from sklearn.datasets import load_svmlight_files


def documentFrequency(X, y):
    """Score each feature by the sum of its column, for use with SelectKBest.

    Parameters
    ----------
    X : sparse matrix or 2-D array of shape (n_samples, n_features)
        Feature matrix. The column sum equals a document count only if the
        entries are 0/1 indicators -- TODO confirm the input encoding.
    y : ignored
        Present only to match the ``score_func(X, y)`` signature.

    Returns
    -------
    scores : ndarray of shape (n_features,)
        Column sums of ``X``.
    pvalues : ndarray of shape (n_features, 1)
        Placeholder p-values, all ones.
    """
    featurenum = X.shape[1]
    # Vectorised column sum; builtin sum() added the rows one at a time in
    # Python and failed on a matrix with zero rows.
    column_totals = np.asarray(X.sum(axis=0)).reshape(featurenum)
    p = np.ones((featurenum, 1), int)
    return column_totals, p


if __name__ == "__main__":
    # Expected arguments: threshold, training file, test file.
    if len(sys.argv) != 4:
        print("Usage: python threshold trainfilename testfilename")
        sys.exit(1)
    # Parse the threshold first so a bad value fails before any data is loaded.
    threshold = int(sys.argv[1])
    trainfilename = sys.argv[2]
    testfilename = sys.argv[3]
    X_train, y_train, X_test, y_test = load_svmlight_files((trainfilename, testfilename))

    # Per-feature column sums over the training set (vectorised).
    df = np.asarray(X_train.sum(axis=0)).ravel()
    # Number of features whose column sum reaches the threshold; SelectKBest
    # then keeps exactly that many top-scoring features.
    cnt = int(np.count_nonzero(df >= threshold))
    selector = SelectKBest(documentFrequency, k=cnt)
    X_train = selector.fit_transform(X_train, y_train)
    X_test = selector.transform(X_test)
    # Outputs are named "<input>_<cnt>" and written with one-based indices.
    sklearn.datasets.dump_svmlight_file(X_train, y_train, trainfilename + "_" + str(cnt), zero_based=False)
    sklearn.datasets.dump_svmlight_file(X_test, y_test, testfilename + "_" + str(cnt), zero_based=False)
    print("%d features selected" % cnt)
开发者ID:junjiek,项目名称:cmu-exp,代码行数:30,代码来源:filterbyDF.py


注：本文中的sklearn.datasets.load_svmlight_files函数示例由纯净天空整理自GitHub/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。