This article collects typical usage examples of the Python function scipy.io.arff.loadarff. If you have been wondering what exactly loadarff does, how to call it, or where to find real examples of it in use, the hand-picked code samples below should help.
A total of 15 loadarff code examples are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
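Before the project examples, here is a minimal sketch of the loadarff API itself; the file name example.arff is a placeholder:

from scipy.io.arff import loadarff
import pandas as pd

# loadarff returns a pair: a NumPy structured array with the rows and a
# MetaData object describing the attributes declared in the ARFF header.
data, meta = loadarff('example.arff')   # placeholder path
print(meta.names())                     # attribute names
print(meta.types())                     # attribute types, e.g. 'numeric' or 'nominal'
df = pd.DataFrame(data)                 # the structured array converts directly to a DataFrame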
Example 1: main
def main(k=3, normalize=False, distance=True, base='mt_', ks=[]):
    train, mtrain = loadarff(base + 'train.arff')
    train = DataFrame(train)
    test, mtest = loadarff(base + 'test.arff')
    test = DataFrame(test)
    # columns declared as numeric in the ARFF header
    cols = [col for col in mtrain.names() if mtrain[col][0] == 'numeric']
    if normalize:
        norms(test, train, cols)
    learner = NearestNeighbor(mtrain, train, mtrain.names()[-1], distance=distance)
    learner.calc(test)
    import time
    print 'testing', [k]
    start = time.time()
    err = learner.validate(test, k)
    print 'Err:', err, 'Acc:', 1 - err
    print 'Time', time.time() - start
    if not ks: return err
    errs = {}
    errs[k] = err
    for ok in ks:
        print 'testing', ok
        start = time.time()
        err = learner.validate(test, ok)
        print 'Err:', err, 'Acc:', 1 - err
        print 'Time', time.time() - start
        errs[ok] = err
    return errs
Example 2: compatibility_check
def compatibility_check(self):
    c1_data, c1_meta = arff.loadarff(os.path.join(self.c1_folder, 'data', 'features.arff'))
    c2_data, c2_meta = arff.loadarff(os.path.join(self.c2_folder, 'data', 'features.arff'))
    testres = {}
    # check that both datasets declare the same feature names
    if collections.Counter(c1_meta.names()) == collections.Counter(c2_meta.names()):
        testres['features'] = True
    else:
        testres['features'] = False
    # check that both datasets use the same class labels
    classes_c1 = list(set([x[-1] for x in c1_data]))
    classes_c2 = list(set([x[-1] for x in c2_data]))
    if collections.Counter(classes_c1) == collections.Counter(classes_c2):
        testres['classes'] = True
    else:
        testres['classes'] = False
    print 'Compatibility report:'
    print 'features: ', testres['features']
    print 'classes: ', testres['classes']
    return testres
Example 3: initial
def initial():
    global traindata, trainmeta, attr, row, col, testdata, testmeta, trow, tcol
    traindata, trainmeta = arff.loadarff(sys.argv[1])
    attr = trainmeta.names()
    row = len(traindata)
    col = len(traindata[0])
    testdata, testmeta = arff.loadarff(sys.argv[2])
    trow = len(testdata)
    tcol = len(testdata[0])
    return sys.argv[3] == 'n'
Example 4: main
def main():
    # create the training & test sets, skipping the header row with [1:]
    fnc_data, fnc_meta = loadarff(open('Train/train_FNC_attrSelected.arff', 'r'))
    sbm_data, sbm_meta = loadarff(open('Train/train_SBM_attrSelected.arff', 'r'))
    testf = genfromtxt(open('Test/test_FNC.csv', 'r'), delimiter=',', dtype='f8')[1:]
    tests = genfromtxt(open('Test/test_SBM.csv', 'r'), delimiter=',', dtype='f8')[1:]
    # last ARFF attribute is the class label, the rest are features
    X_train = np.array([list(row)[:-1] for row in fnc_data], dtype='f8')
    y_train = np.array([row[-1] for row in fnc_data])
    gnb = GaussianNB()
    y_pred = gnb.fit(X_train, y_train).predict(X_train)
    # assumes the columns of testf line up with the FNC training features
    predicted_probs = [[index + 1, x[1]] for index, x in enumerate(gnb.predict_proba(testf))]
    savetxt('Data/submission.csv', predicted_probs, delimiter=',', fmt='%d,%f',
            header='MoleculeId,PredictedProbability', comments='')
Example 5: main
def main(k=3, normalize=False, distance=True, base='mt_', ks=[], regress=False, recycle=False, maxerr=.1):
    train, mtrain = loadarff(base + 'train.arff')
    train = DataFrame(train)
    test, mtest = loadarff(base + 'test.arff')
    test = DataFrame(test)
    cols = [col for col in mtrain.names() if mtrain[col][0] == 'numeric']
    if normalize:
        norms(test, train, cols)
    target = mtrain.names()[-1]
    if recycle:
        print len(train)
        if regress:
            removed = reduce_regress(target, train, k, True, maxerr=maxerr)
        else:
            removed = reuse_recycle(target, train, k, True)
        # print removed
        ixs = list(train.index)
        for n in removed:
            ixs.remove(n)
        train = train.loc[ixs]
        print len(train)
        # print train.index
    learner = NearestNeighbor(mtrain, train, target, distance=distance)
    learner.calc(test)
    tester = learner.regress if regress else learner.validate
    import time
    print 'testing', [k]
    start = time.time()
    err = tester(test, k)
    print 'Err:', err, 'Acc:', 1 - err
    print 'Time', time.time() - start
    if not ks: return err
    errs = {}
    errs[k] = err
    for ok in ks:
        print 'testing', ok
        start = time.time()
        err = tester(test, ok)
        print 'Err:', err, 'Acc:', 1 - err
        print 'Time', time.time() - start
        errs[ok] = err
    return errs
Example 6: load_data
def load_data(filename):
    """
    returns an array of floats given the specified filename.
    requires scipy.io.arff.loadarff
    """
    raw = loadarff(filename)[0]
    return np.array([[float(i) for i in row] for row in raw])
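A hypothetical usage sketch of the helper above; the imports, the file name iris_numeric.arff, and the assumption that every attribute in the file is numeric are mine, not part of the original snippet:

import numpy as np
from scipy.io.arff import loadarff

# works only for all-numeric ARFF files, since every value is passed through float()
X = load_data('iris_numeric.arff')  # placeholder file name
print(X.shape, X.dtype)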
Example 7: preprocess
def preprocess(self):
    if not os.path.exists(self.outputFolder):
        try:
            os.makedirs(self.outputFolder)
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise exc
            pass
    metadata = dict()
    if not self.parameters:
        self.parameters['parameter'] = 'default'
    metadata['preprocessing_params'] = self.parameters
    yaml.dump(metadata, open(self.outputFolder + '/PreProcessing.yaml', 'w'))
    if self.dataFile.split('.')[-1] == 'arff':
        data, meta = loadarff(self.dataFile)
        data = pd.DataFrame(data)
    else:
        data = pd.read_csv(self.dataFile)
    data = data.fillna(self.missingValue)
    if self.labelEncoding:
        data = self.labelEncode(data)
    data.to_csv(self.outputFolder + '/DataFile.csv', index=False)
Example 8: parse_arff
def parse_arff(name):
    # extract using arff package
    raw_data, metadata = arff.loadarff(open(name, 'rb'))
    # keep strings as-is, round floats to 14 decimal places
    data = [[v if type(v) is np.string_ else round(v, 14) for v in l] for l in raw_data]
    return data, metadata
Example 9: load_features_from_arff
def load_features_from_arff(path):
    data, meta = loadarff(path)
    features = pd.DataFrame(data, columns=meta.names())
    # standardize every column except the label in the last position
    features[features.columns[:-1]] = StandardScaler().fit_transform(features[features.columns[:-1]])
    return features
Example 10: load_data
def load_data(filename):
    """
    load numeric data from arff file using scipy.io.arff.loadarff
    returns a numpy array
    """
    data = loadarff(open(filename, 'r'))[0]
    return np.array([list(row) for row in data])
Example 11: test
def test():
    vec = DictVectorizer()
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    for filename in glob.glob(r'../dataset/UCI/*.arff'):
        basename = re.sub(r'(\..*?)$', '', os.path.basename(filename))
        print basename
        if basename != DS:
            continue
        # cost_matrix = pickle.load(open('../dataset/UCI/'+basename+'_cost_matrix.pkl', 'rb'))
        data = arff.loadarff(filename)[0]
        X = vec.fit_transform(np.array([{str(i): value for i, value in enumerate(list(row)[:-1])} for row in data])).toarray()
        imp.fit(X)
        X = imp.transform(X)
        labels = np.array([row[-1] for row in data])
        y = np.array([{v: k for k, v in enumerate(list(set(labels)))}[label] for label in labels])
        random = np.random.permutation(range(len(X)))
        print 'dataset ratio\t%s' % ('\t'.join([alg + " "*(12-len(alg)) for alg in sorted(ALG.keys())]))
        for iteration in xrange(10):
            X, y, class_num, kf = X[random], y[random], set(labels), KFold(len(X), n_folds=10)
            for train, test in kf:
                length, train_size = len(train), 0.1
                X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
                X_label, X_unlabel, y_label, y_unlabel = train_test_split(X_train, y_train, test_size=1.0-train_size, random_state=0)
                for R in xrange(2, 10):
                    ones_matrix, cost_matrix = np.array([[1,1],[1,1]]), np.array([[1,1],[R,R]])
                    # print "%s R=%d" % (basename, R),
                    cross_validation("%s R=%d" % (basename, R), X_label, X_unlabel, y_label, y_unlabel, ones_matrix, cost_matrix)
        exit()
Example 12: getPurityMissingValues
def getPurityMissingValues(filename):
    # clusters = int(filename.split('=')[1].split('.')[0])
    countdict = {}
    try:
        x = loadarff(filename)
        # count, per cluster, how many rows take each value of attribute 'f2'
        for row in x[0]:
            clusterid = row['Cluster']
            if clusterid not in countdict:
                countdict[clusterid] = {}
            if row['f2'] not in countdict[clusterid]:
                countdict[clusterid][row['f2']] = 1
            else:
                countdict[clusterid][row['f2']] += 1
        maxtotal = 0
        alltotal = 0
        for cluster in countdict:
            if cluster != '?':
                maxtotal += max(countdict[cluster].values())
                alltotal += sum(countdict[cluster].values())
        purity = float(maxtotal) / alltotal
    except:
        purity = -1
    return purity
Example 13: split
def split(filename, train_size, reverse=False):
    data, meta = arff.loadarff(filename)
    orig_data = []
    for line in data:
        orig_data.append(list(line)[0:-1])
    if reverse:
        train_size = len(orig_data) - train_size
    return generateTrain(tuple(orig_data), train_size)
Example 14: read_dense_arff_dataset
def read_dense_arff_dataset(train_path, test_path, number_of_labels):
    train_dataset, meta_train = loadarff(open(train_path, 'r'))
    test_dataset, meta_test = loadarff(open(test_path, 'r'))
    meta_names = meta_train.names()
    attributes = meta_names[0:-number_of_labels]
    classes = meta_names[-number_of_labels:len(meta_names)]
    x_train = np.asarray(train_dataset[:][attributes].tolist(), dtype=np.float32)
    y_train = np.asarray(train_dataset[:][classes].tolist(), dtype=np.float32)
    x_test = np.asarray(test_dataset[:][attributes].tolist(), dtype=np.float32)
    y_test = np.asarray(test_dataset[:][classes].tolist(), dtype=np.float32)
    return x_train, y_train, x_test, y_test
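A hypothetical call to the function above, assuming a dense multi-label dataset whose last three ARFF attributes are the binary labels; the file names and label count are placeholders, not from the original project:

x_train, y_train, x_test, y_test = read_dense_arff_dataset(
    'multilabel_train.arff', 'multilabel_test.arff', number_of_labels=3)
print(x_train.shape, y_train.shape)  # features and labels as float32 arrays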
Example 15: RunMetrics
def RunMetrics(self, options):
    Log.Info("Perform RANDOMFOREST.", self.verbose)

    opts = {}
    if "minimum_leaf_size" in options:
        opts["minimum_leaf_size"] = int(options.pop("minimum_leaf_size"))
    else:
        opts["minimum_leaf_size"] = 1
    if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

    if len(self.dataset) < 2:
        Log.Fatal("This method requires two or more datasets.")
        return -1

    # Split the command using shell-like syntax.
    cmd = shlex.split("java -classpath " + self.path + "/weka.jar" +
        ":methods/weka" + " RANDOMFOREST -t " + self.dataset[0] + " -T " +
        self.dataset[1] + " -M " + str(opts["minimum_leaf_size"]))

    # Run command with the necessary arguments and return its output as a byte
    # string. We have untrusted input so we disable all shell based features.
    try:
        s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False,
            timeout=self.timeout)
    except subprocess.TimeoutExpired as e:
        Log.Warn(str(e))
        return -2
    except Exception as e:
        Log.Fatal("Could not execute command: " + str(cmd))
        return -1

    # Datastructure to store the results.
    metrics = {}

    # Parse data: runtime.
    timer = self.parseTimer(s)
    if timer != -1:
        predictions = np.genfromtxt("weka_predicted.csv", delimiter=',')
        data, meta = arff.loadarff(self.dataset[2])
        truelabels = np.asarray(
            reduce(operator.concat, data.tolist()), dtype=np.float32)
        metrics['Runtime'] = timer.total_time

        try:
            confusionMatrix = Metrics.ConfusionMatrix(truelabels, predictions)
            metrics['ACC'] = Metrics.AverageAccuracy(confusionMatrix)
            metrics['MCC'] = Metrics.MCCMultiClass(confusionMatrix)
            metrics['Precision'] = Metrics.AvgPrecision(confusionMatrix)
            metrics['Recall'] = Metrics.AvgRecall(confusionMatrix)
            metrics['MSE'] = Metrics.SimpleMeanSquaredError(truelabels, predictions)
        except Exception as e:
            # The confusion matrix can't mix binary and continuous data.
            pass

        Log.Info(("total time: %fs" % (metrics['Runtime'])), self.verbose)

    return metrics