This article collects typical usage examples of the load_svmlight_files function from Python's sklearn.datasets module. If you are wondering what exactly load_svmlight_files does, how to call it, or where to find working examples, the curated code samples below should help.
Fifteen code examples of load_svmlight_files are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
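Before the examples, here is a minimal, self-contained sketch of the function's core contract: every file passed in a single call is parsed into a common feature space, so all returned matrices share their number of columns. The in-memory byte buffers below are illustrative stand-ins for real svmlight/libsvm files.

from io import BytesIO
from sklearn.datasets import load_svmlight_files

# Two tiny "files" in svmlight/libsvm format: <label> <index>:<value> ...
train = BytesIO(b"1 1:0.5 3:1.0\n-1 2:2.0\n")
test = BytesIO(b"-1 1:1.5\n")

# One call parses both sources; X_train and X_test share a feature space.
X_train, y_train, X_test, y_test = load_svmlight_files([train, test])
print(X_train.shape, X_test.shape)  # (2, 3) (1, 3): same column count
print(y_train, y_test)              # [ 1. -1.] [-1.]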
Example 1: test_load_svmlight_files

def test_load_svmlight_files():
    # Passing the same file twice yields two identical (X, y) pairs,
    # and the requested dtype applies to every returned matrix.
    X_train, y_train, X_test, y_test = load_svmlight_files(
        [datafile] * 2, dtype=np.float32)
    assert_array_equal(X_train.toarray(), X_test.toarray())
    assert_array_equal(y_train, y_test)
    assert_equal(X_train.dtype, np.float32)
    assert_equal(X_test.dtype, np.float32)

    X1, y1, X2, y2, X3, y3 = load_svmlight_files(
        [datafile] * 3, dtype=np.float64)
    assert_equal(X1.dtype, X2.dtype)
    assert_equal(X2.dtype, X3.dtype)
    assert_equal(X3.dtype, np.float64)
Example 2: classify_test

def classify_test(feature_list=[], classifiers=[], root_path='./'):
    # load data sets
    datasets = []
    for name in feature_list:
        logging.log(logging.DEBUG, 'loading data: %s ...' % name)
        filenames = tuple('./feature/%s_%s' % (name, tag)
                          for tag in ['train.txt', 'test.txt'])
        X_train, y_train, X_test, y_test = load_svmlight_files(filenames)
        datasets.append((name, X_train, y_train, X_test, y_test))
    # make a directory to store results
    result_path = path.join(root_path, 'results')
    if path.exists(result_path):
        assert path.isdir(result_path), 'results must be a directory!'
    else:
        system('mkdir ' + result_path)
    for clf in classifiers:
        for feature in datasets:
            clf_name = clf.__class__.__name__
            feature_name, X_train, y_train, X_test, y_test = feature
            combine_name = feature_name + '_' + clf_name
            info = {}
            logging.log(logging.DEBUG, 'classification test: %s ...' % combine_name)
            logging.log(logging.DEBUG, 'training...')
            t0 = time()
            clf.fit(X_train, y_train)
            t1 = time()
            info['training_time'] = t1 - t0
            logging.log(logging.DEBUG, 'testing on training...')
            pred_y = clf.predict(X_train)
            training_acc = accuracy_score(y_train, pred_y)
            logging.log(logging.DEBUG, 'error rate on training set: %f' % (1.0 - training_acc))
            info['training_error'] = 1.0 - training_acc
            with open(path.join(result_path, combine_name + '_train.txt'), 'w') as fout:
                for y in pred_y:
                    print(y, file=fout)
            logging.log(logging.DEBUG, 'testing...')
            t0 = time()
            pred_y = clf.predict(X_test)
            t1 = time()
            info['test_time'] = t1 - t0
            test_acc = accuracy_score(y_test, pred_y)
            logging.log(logging.DEBUG, 'error rate on test set: %f' % (1.0 - test_acc))
            info['test_error'] = 1.0 - test_acc
            with open(path.join(result_path, combine_name + '_test.txt'), 'w') as fout:
                for y in pred_y:
                    print(y, file=fout)
            yield combine_name, feature_name, clf_name, info
Example 3: pCoverX

def pCoverX(featureFamily):
    os.chdir("C:\\Users\\Vaibhav\\Desktop\\dir_data\\dir_data\\train")
    path = "C:\\Users\\Vaibhav\\Desktop\\dir_data\\dir_data\\"
    data_df = pd.DataFrame()
    n_guass = 2
    train_post_array = []
    test_post_array = []
    val_post_array = []
    train_entropy_array = []
    test_entropy_array = []
    val_entropy_array = []
    fileType = featureFamily + '*.gz'
    for file in glob.glob(fileType):
        print(file)
        X_train, y_train, X_test, y_test, X_val, y_val = load_svmlight_files(
            (gzip.open(path + "train\\" + file),
             gzip.open(path + "test\\" + file),
             gzip.open(path + "validation\\" + file)))
        # X_train, y_train, X_test, y_test, X_val, y_val = load_svmlight_files(
        #     ("train\\vision_cuboids_histogram.txt", "test\\vision_cuboids_histogram.txt",
        #      "validation\\vision_cuboids_histogram.txt"))
        # drop samples of class 31
        X_train = X_train[y_train != 31]
        X_test = X_test[y_test != 31]
        X_val = X_val[y_val != 31]
        y_train = y_train[y_train != 31]
        y_test = y_test[y_test != 31]
        y_val = y_val[y_val != 31]
        # ========================= Feature Selection using Variance Threshold =========================
        X_train_new, X_test_new, X_val_new = featureSelection(
            X_train, X_test, X_val, y_train, log=True, tech='LinearSVC')
        # ========================= Mixture of Gaussians =========================
        train_prob, test_prob, val_prob = pXoverC(
            X_train_new, y_train, X_test_new, y_test, X_val_new, y_val, n_guass)
        # ========================= Calculating Prior, Posterior and Entropy =========================
        prr = prior(y_train)
        train_post = posterior(train_prob, prr)
        train_entropy = entropy(train_post)
        train_post_array.append(train_post)
        train_entropy_array.append(train_entropy)
        test_post = posterior(test_prob, prr)
        test_entropy = entropy(test_post)
        test_post_array.append(test_post)
        test_entropy_array.append(test_entropy)
        val_post = posterior(val_prob, prr)
        val_entropy = entropy(val_post)
        val_post_array.append(val_post)
        val_entropy_array.append(val_entropy)
        train_acc, c_mat = checkAccuracy(train_post, y_train)
        test_acc, c_mat = checkAccuracy(test_post, y_test)
        val_acc, c_mat = checkAccuracy(val_post, y_val)
        temp = pd.DataFrame([[file, train_acc, test_acc, val_acc]])
        data_df = data_df.append(temp, ignore_index=True)
    return (train_post_array, test_post_array, val_post_array,
            train_entropy_array, test_entropy_array, val_entropy_array, data_df)
Example 4: test_load_zero_based_auto

def test_load_zero_based_auto():
    data1 = b"-1 1:1 2:2 3:3\n"
    data2 = b"-1 0:0 1:1\n"

    # loaded alone, data1 has no zero index, so it is read as one-based
    f1 = BytesIO(data1)
    X, y = load_svmlight_file(f1, zero_based="auto")
    assert_equal(X.shape, (1, 3))

    # loaded together with data2 (which uses index 0), both become zero-based
    f1 = BytesIO(data1)
    f2 = BytesIO(data2)
    X1, y1, X2, y2 = load_svmlight_files([f1, f2], zero_based="auto")
    assert_equal(X1.shape, (1, 4))
    assert_equal(X2.shape, (1, 4))
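Note that zero_based="auto" is resolved jointly for all files in one load_svmlight_files call: if any file contains a feature index 0, every file is treated as zero-based, so here the maximum index 3 implies four columns in both matrices. Files loaded in separate calls can have the heuristic resolve differently per file, which is why loading train and test in one call (or passing explicit n_features/zero_based) matters for consistency.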
Example 5: test_load_with_qid

def test_load_with_qid():
    # load an svmlight file with a qid attribute
    data = b"""
    3 qid:1 1:0.53 2:0.12
    2 qid:1 1:0.13 2:0.1
    7 qid:2 1:0.87 2:0.12"""
    X, y = load_svmlight_file(BytesIO(data), query_id=False)
    assert_array_equal(y, [3, 2, 7])
    assert_array_equal(X.todense(), [[0.53, 0.12], [0.13, 0.1], [0.87, 0.12]])

    res1 = load_svmlight_files([BytesIO(data)], query_id=True)
    res2 = load_svmlight_file(BytesIO(data), query_id=True)
    for X, y, qid in (res1, res2):
        assert_array_equal(y, [3, 2, 7])
        assert_array_equal(qid, [1, 1, 2])
        assert_array_equal(X.todense(), [[0.53, 0.12], [0.13, 0.1], [0.87, 0.12]])
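Note the return arity: with query_id=True, every loaded file contributes three values (X, y, qid) instead of two, so a single file passed to load_svmlight_files unpacks to the same (X, y, qid) triple that load_svmlight_file returns. The qid column groups rows by query, which learning-to-rank tools use to keep documents of one query together.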
Example 6: select_feature

def select_feature(trainfilename, testfilename):
    def returnCHI(X, y):
        # would reuse the chi2 scores precomputed below
        return chivalue
    X_train, y_train, X_test, y_test = load_svmlight_files(
        (trainfilename, testfilename), multilabel=True)
    featureNum = X_train.get_shape()[1]
    chivalue = chi2(X_train, y_train)
    step = featureNum // 20
    for i in range(1, 21):
        selectNum = step * i
        print("selecting", selectNum, "features")
        selector = SelectKBest(chi2, k=selectNum)
        X_train_new = selector.fit_transform(X_train, y_train)
        X_test_new = selector.transform(X_test)
        dump_svmlight_file(X_train_new, y_train, trainfilename + '_' + str(selectNum), zero_based=False)
        dump_svmlight_file(X_test_new, y_test, testfilename + '_' + str(selectNum), zero_based=False)
Example 7: select_feature_multilabel

def select_feature_multilabel(trainfilename, testfilename):
    def returnIG(X, y):
        # score function for SelectKBest: precomputed scores plus dummy p-values
        return randval, p
    X_train, y_train, X_test, y_test = load_svmlight_files(
        (trainfilename, testfilename), multilabel=True)
    featureNum = X_train.get_shape()[1]
    randval = randomValues(X_train, y_train)
    p = np.ones(featureNum, int)  # dummy p-values, one per feature
    step = featureNum // 20
    for i in range(1, 21):
        selectNum = step * i
        print("selecting", selectNum, "features")
        selector = SelectKBest(returnIG, k=selectNum)
        X_train_new = selector.fit_transform(X_train, y_train)
        X_test_new = selector.transform(X_test)
        dump_svmlight_file(X_train_new, y_train, trainfilename + '_' + str(selectNum), zero_based=False)
        dump_svmlight_file(X_test_new, y_test, testfilename + '_' + str(selectNum), zero_based=False)
Example 8: select_feature

def select_feature(trainfilename, testfilename):
    def returnIG(X, y):
        # score function for SelectKBest: information gain plus dummy p-values
        return ig, p
    X_train, y_train, X_test, y_test = load_svmlight_files((trainfilename, testfilename))
    featureNum = X_train.shape[1]
    ig = information_gain(X_train, y_train).reshape(featureNum,)
    p = np.ones(featureNum, int)  # dummy p-values, one per feature
    step = featureNum // 20
    for i in range(1, 21):
        selectNum = step * i
        print("selecting", selectNum, "features")
        selector = SelectKBest(returnIG, k=selectNum)
        X_train_new = selector.fit_transform(X_train, y_train)
        X_test_new = selector.transform(X_test)
        sklearn.datasets.dump_svmlight_file(X_train_new, y_train, trainfilename + '_' + str(selectNum), zero_based=False)
        sklearn.datasets.dump_svmlight_file(X_test_new, y_test, testfilename + '_' + str(selectNum), zero_based=False)
Example 9: run_nblcr

def run_nblcr(train, test, outfn, grams='123',
              clf=LogisticRegression(class_weight="balanced")):  # "auto" in older scikit-learn
    f_train = outfn + '-train.txt'
    f_test = outfn + '-test.txt'
    ngram = [int(i) for i in grams]
    ptrain = []
    ntrain = []
    for _, row in train.iterrows():
        if row['label'] == 1:
            ptrain.append(tokenize(row['text'], ngram))
        elif row['label'] == 0:
            ntrain.append(tokenize(row['text'], ngram))
    pos_counts = build_dict(ptrain, ngram)
    neg_counts = build_dict(ntrain, ngram)
    dic, r = compute_ratio(pos_counts, neg_counts)
    generate_svmlight_file(train, dic, r, ngram, f_train)
    generate_svmlight_file(test, dic, r, ngram, f_test)
    X_train, y_train, X_test, _ = load_svmlight_files((f_train, f_test))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    try:
        y_prob = clf.predict_proba(X_test)
    except (AttributeError, NotImplementedError):
        # fallback for an SVM without probability estimates enabled
        clf.set_params(probability=True)
        y_prob_pos = clf.predict(X_test)
        y_prob_neg = np.ones(X_test.shape[0]) - y_prob_pos
        y_prob = np.column_stack((y_prob_neg, y_prob_pos))
    return y_pred, y_prob
Example 10: load_amazon

def load_amazon(source_name, target_name, data_folder=None, verbose=False):
    """Load the Amazon sentiment datasets from svmlight-format files.

    inputs:
        source_name : name of the source dataset
        target_name : name of the target dataset
        data_folder : path to the folder containing the files
    outputs:
        xs : training source data matrix
        ys : training source label vector
        xt : training target data matrix
        yt : training target label vector
        xtest : testing target data matrix
        ytest : testing target label vector
    """
    if data_folder is None:
        data_folder = 'data/'
    source_file = data_folder + source_name + '_train.svmlight'
    target_file = data_folder + target_name + '_train.svmlight'
    test_file = data_folder + target_name + '_test.svmlight'
    if verbose:
        print('source file:', source_file)
        print('target file:', target_file)
        print('test file: ', test_file)
    xs, ys, xt, yt, xtest, ytest = load_svmlight_files([source_file, target_file, test_file])
    # Convert sparse matrices to dense numpy 2D arrays
    xs, xt, xtest = (np.array(X.todense()) for X in (xs, xt, xtest))
    # Convert {-1, 1} labels to {0, 1} labels
    ys, yt, ytest = (np.array((y + 1) / 2, dtype=int) for y in (ys, yt, ytest))
    return xs, ys, xt, yt, xtest, ytest
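A hypothetical call for illustration; the 'books' and 'dvd' domain names and the data/ layout are assumptions, not part of the excerpt:

# assumed files: data/books_train.svmlight, data/dvd_train.svmlight, data/dvd_test.svmlight
xs, ys, xt, yt, xtest, ytest = load_amazon('books', 'dvd', data_folder='data/', verbose=True)
print(xs.shape, ys[:5])  # dense source matrix, {0, 1} labels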
Example 11: LDA

# (tail of a predict() helper; the beginning of this excerpt is missing)
        probs.append(score_i)
    return probs

parser = argparse.ArgumentParser()
# parser.add_argument("train_file")
parser.add_argument("-p", "--predict", help="make predictions on a test file", default=None)
parser.add_argument("-t", "--predict_file", help="test file to predict on", default=None)
parser.add_argument("-c", "--cross_validation", help="run cross-validation", default=None)
args = parser.parse_args()

classifier = LDA(n_components=2)
# classifier = RandomForestClassifier()
X_url, y, X_title, y_t, X_body, y_b, X_a, y_a = load_svmlight_files(
    ("url_train.txt", "title_train.txt", "body_train.txt", "all_train.txt"))
X = {"url": X_url, "title": X_title, "body": X_body, "all": X_a}
if args.predict:
    print("Predicting")
    T_url, t, T_title, y_t, T_body, y_b, T_a, y_a = load_svmlight_files(
        ("url_test.txt", "title_test.txt", "body_test.txt", "all_test.txt"))
    T = {"url": T_url, "title": T_title, "body": T_body, "all": T_a}
    probs = predict(classifier, X, y, T, t)
    with open("sub_31-08_01h15.txt", "w") as f:
        f.write("label\n")
        for p in probs:
            f.write("%f\n" % p)
elif args.cross_validation:
    # (cross-validation branch truncated in this excerpt)
Example 12: svm_skin

def svm_skin(X_train, y_train, X_test, y_test):
    """Learn the skin data sets with an SVM with a linear kernel.

    X_*: samples.
    y_*: labels.
    """
    print('SVM w/ Linear kernel')
    clf = svm.LinearSVC()
    clf.fit(X_train, y_train)
    score = 100 * clf.score(X_test.toarray(), y_test)
    print('SVM score: %.2f%%' % score)
    return score


if __name__ == '__main__':
    # `data_size` is an integer which controls how big the data set is;
    # use None to take the whole dataset.
    # split_libsvm_dataset(path='skin.txt', data_size=None)

    # Load train and test samples (X) plus labels (y).
    X_train, y_train, X_test, y_test = load_svmlight_files(
        ('skin-train.libsvm', 'skin-test.libsvm'))
    svm_skin(X_train, y_train, X_test, y_test)
    # iterations, scores = adaboost_skin(X_train, y_train, X_test, y_test)
    # graph = plot_success_per_size(iterations, scores)
    # show()
Example 13: test_load_invalid_file2

def test_load_invalid_file2():
    # an invalid file anywhere in the list makes the whole call raise ValueError
    load_svmlight_files([datafile, invalidfile, datafile])
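The excerpt shows only the offending call; in scikit-learn's test suite this case is asserted to raise ValueError. A minimal sketch of the same check with pytest, assuming datafile and invalidfile point at a valid and a malformed svmlight file as in the surrounding tests:

import pytest
from sklearn.datasets import load_svmlight_files

def test_load_invalid_file2():
    # the malformed second file aborts parsing of the whole batch
    with pytest.raises(ValueError):
        load_svmlight_files([datafile, invalidfile, datafile])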
Example 14: load_svmlight_files

    # (tail of a plotting helper; the beginning of this excerpt is missing)
    # remove axis spines
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.spines["bottom"].set_visible(False)
    ax.spines["left"].set_visible(False)
    plt.grid()
    plt.tight_layout()
    plt.show()

os.chdir(r"F:\Analytics\ISB Study\Capstone\dir_data\dir_data")
X_train, y_train, X_test, y_test, X_val, y_val = load_svmlight_files(
    ("train\\vision_cuboids_histogram.txt",
     "test\\vision_cuboids_histogram.txt",
     "validation\\vision_cuboids_histogram.txt"))
np.unique(y_train)

# LDA
sklearn_lda = LDA(n_components=30)
X_lda_sklearn = sklearn_lda.fit_transform(X_train.todense(), y_train)
plot_scikit_lda(X_lda_sklearn, title='LDA vision_cuboids_histogram')

# PCA
sklearn_pca = sklearnPCA(n_components=30)
X_pca = sklearn_pca.fit_transform(X_train.todense())
plot_pca(title='PCA vision_cuboids_histogram')

# LDA followed by PCA
X_ldapca_sklearn = sklearn_pca.fit_transform(X_lda_sklearn)
plot_scikit_lda(X_ldapca_sklearn, title='LDA+PCA vision_cuboids_histogram', mirror=(-1))
Example 15: documentFrequency

import sys

import numpy as np
import sklearn.datasets
from sklearn.datasets import load_svmlight_files
from sklearn.feature_selection import SelectKBest


def documentFrequency(X, y):
    # score function for SelectKBest: column sums (document frequencies
    # for binary features) as scores, plus dummy p-values
    featurenum = X.shape[1]
    s = sum(X).toarray()
    p = np.ones((1, featurenum), int)
    return s.reshape(featurenum), p.reshape(featurenum, 1)


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: python threshold trainfilename testfilename")
        exit(1)
    trainfilename = sys.argv[2]
    testfilename = sys.argv[3]
    X_train, y_train, X_test, y_test = load_svmlight_files((trainfilename, testfilename))
    # count features whose document frequency reaches the threshold
    df = sum(X_train).toarray()[0]
    cnt = 0
    threshold = int(sys.argv[1])
    for i in range(0, len(df)):
        if df[i] >= threshold:
            cnt = cnt + 1
    selector = SelectKBest(documentFrequency, k=cnt)
    X_train = selector.fit_transform(X_train, y_train)
    X_test = selector.transform(X_test)
    sklearn.datasets.dump_svmlight_file(X_train, y_train, trainfilename + "_" + str(cnt), zero_based=False)
    sklearn.datasets.dump_svmlight_file(X_test, y_test, testfilename + "_" + str(cnt), zero_based=False)
    print(cnt, "features selected")