This page collects typical usage examples of the Python dataset.DataSet class. If you have been wondering what exactly the DataSet class does, how to use it, or where to find usage examples, the curated class code examples here may help.
Below are 15 code examples of the DataSet class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
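All of the examples assume that DataSet has already been imported. A minimal sketch of that import, assuming the dataset.DataSet path given in the title (note that the examples are drawn from different projects, so the constructor arguments vary between them):
# Assumed import, based on the dataset.DataSet path in this page's title.
from dataset import DataSet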
Example 1: eval_classifier
def eval_classifier(classifierToUse, featuresToUse, testOrTrain="train"):
    print("Chosen feature: {0}".format(featuresToUse))
    print("Chosen classifier: {0}".format(classifierToUse))
    fe = FeatureExtractor(featuresToUse)
    dataset = DataSet(fe)
    classifier = Classifier()
    evaluate = Evaluation()
    print "test or Train %s" % testOrTrain
    for feature_class, files in getTestData(testOrTrain).items():
        print "%s" % testOrTrain
        for f in files:
            dataset.addFile(feature_class, f)
    print "Dataset initialized"
    print_class_stats(dataset.classes)
    print "Test set created."
    a_train, a_test, c_train, c_test = train_test_split(dataset.featureVector, dataset.classes, test_size=0.9)
    c_pred = classifier.classification(a_train, a_test, c_train, c_test, classifierToUse)
    evaluate.evaluate(c_pred, c_test, featuresToUse, classifierToUse)
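A hedged sketch of how this helper might be called; the feature and classifier names are placeholders, since the values accepted by FeatureExtractor and Classifier are not shown here:
# Hypothetical argument values, for illustration only.
eval_classifier(classifierToUse="svm", featuresToUse="mfcc", testOrTrain="test")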
Example 2: __vectorize
def __vectorize(self, data):
    """\
    Train vectorization and subsequently vectorize. Accepts a DataSet
    or a list of dictionaries to be vectorized.
    """
    # no vectorization performed, only converted to matrix
    if self.vectorizer is None:
        if not isinstance(data, DataSet):
            data_set = DataSet()
            data_set.load_from_dict(data)
            data = data_set
        data.match_headers(self.data_headers, add_values=True)
        # TODO pre-filtering here?
        return data.as_bunch(target=self.class_attr,
                             select_attrib=self.select_attr).data
    # vectorization needed: converted to dictionary
    # and passed to the vectorizer
    if isinstance(data, DataSet):
        data = data.as_dict(select_attrib=self.select_attr,
                            mask_attrib=self.class_attr)
    else:
        data = [{key: val for key, val in inst.items()
                 if key != self.class_attr and key in self.select_attr}
                for inst in data]
    # pre-filter attributes if filter_attr is set
    if self.filter_attr:
        data = [{key: val for key, val in inst.items()
                 if self.filter_attr(key, val)} for inst in data]
    if not self.vectorizer_trained:
        self.vectorizer.fit(data)
        self.vectorizer_trained = True
    return self.vectorizer.transform(data).tocsr()
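When a plain list of feature dictionaries is passed, the first branch wraps it in a DataSet via load_from_dict. A minimal sketch of that conversion on its own; the field names are hypothetical, and DataSet.load_from_dict is taken from the branch above:
# Hedged sketch of the list-of-dictionaries input path; made-up field names.
instances = [{'word': 'red', 'pos': 'ADJ', 'label': 'colour'},
             {'word': 'run', 'pos': 'VERB', 'label': 'action'}]
data_set = DataSet()
data_set.load_from_dict(instances)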
Example 3: __init__
def __init__(self, data, interval_type=ClassIntervalType.ROOT):
    f = []
    for d in data:
        f.append(float(d))
    data = f
    DataSet.__init__(self, data)
    self.interval_type = interval_type
    if self.interval_type != ClassIntervalType.THREESIGMA:
        self.class_interval = self.calc_class_interval(interval_type, self.min, self.max, self.n)
        self.construct_bins(self.min, self.max, self.class_interval, False)
    else:
        sigma_span = 6
        min = self.mean - self.stdev * (sigma_span / 2)
        max = self.mean + self.stdev * (sigma_span / 2)
        self.class_interval = self.calc_class_interval(ClassIntervalType.THREESIGMA, min, max, sigma_span)
        self.construct_bins(min, max, self.class_interval, True)
    self.fill_bins()
    self.sort_bins()
    total = 0
    for bin in self.bins:
        total = total + bin.count()
    self.bin_contents_count = total
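This __init__ belongs to a DataSet subclass that bins its values; the class name is not shown on this page. A hedged instantiation sketch with a hypothetical class name and toy data:
# Hypothetical class name; only the subclass __init__ body is shown above.
grouped = GroupedDataSet([4.2, 1.7, 3.3, 2.8, 5.1], ClassIntervalType.ROOT)
print("class interval: %s, binned values: %d"
      % (grouped.class_interval, grouped.bin_contents_count))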
Example 4: parse
def parse(path, crawl = False):
    if crawl == True:
        raise StandardError()
    pos_filename = os.path.join(path, "pos.lst")
    neg_filename = os.path.join(path, "neg.lst")
    pos_dir = os.path.join(path, "pos")
    neg_dir = os.path.join(path, "neg")
    if not os.path.isfile(pos_filename):
        print "%s is not a file." % (pos_filename,)
        return None
    if not os.path.isfile(neg_filename):
        print "%s is not a file." % (neg_filename,)
        return None
    if not os.path.isdir(pos_dir):
        print "%s is not a directory." % (pos_dir,)
        return None
    if not os.path.isdir(neg_dir):
        print "%s is not a directory." % (neg_dir,)
        return None
    ret = DataSet()
    pos = open(pos_filename, "r")
    pos_names = [line[line.rfind("/")+1:] for line in pos.read().split()]
    pos.close()
    for name in pos_names:
        filename = os.path.join(pos_dir, name)
        ret.add_obj(name, WholeImage(name))
    neg = open(neg_filename, "r")
    neg_names = [line[line.rfind("/")+1:] for line in neg.read().split()]
    neg.close()
    for name in neg_names:
        ret.add_empty_image(name)
    return ret
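A hedged sketch of the directory layout this parser expects, inferred from the paths it builds, plus a hypothetical call:
# Expected layout (inferred from the os.path.join calls above):
#   <path>/pos.lst   one positive image path per line
#   <path>/neg.lst   one negative image path per line
#   <path>/pos/      positive images
#   <path>/neg/      negative images
dataset = parse("/data/person_detection")   # hypothetical dataset root
if dataset is None:
    raise IOError("pos.lst/neg.lst or pos/, neg/ missing under the given path")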
Example 5: extract_data
def extract_data(raw_data_file, format_data_path, n_vectors, n_components, shift=-1, n_datums=-1, test_percentage=0):
    """
    Extracts the raw data from raw_data_file and, using the given formatting parameters, builds two preformatted data files, train and test.
    A data item (datum) is a matrix of shape (n_vectors, n_components*20), where n_vectors is the number of vectors making up one data item and n_components the number of components kept from each vector.
    :param file raw_data_file: File containing the raw data
    :param str format_data_path: Path to the directory where the preformatted data will be written
    :param int n_vectors: Number of vectors that make up one data item
    :param int n_components: Number of components to keep from each vector. If 1, a vector contains only the MFCCs; if 2, the MFCCs and their first derivatives; if 3, the second derivatives as well.
    :param int shift: Shift/overlap: number of vectors shared between the last extracted data item and the next one. Warning: leave at -1 to disable overlapping. Introducing an overlap yields overestimated training results (see the internship report).
    :param int n_datums: Number of data items to read from the raw data file before stopping. -1 = read to the end of the file.
    :param float test_percentage: Desired ratio of the number of items placed in the generalization (test) set to the number of items placed in the training set. (\*100 = percentage of data used for testing)
    :return: The train and test data sets (as DataSet instances)
    """
    train = DataSet(format_data_path, "train")
    test = DataSet(format_data_path, "test")
    data = []
    datum = []
    feature_list = []
    line_count = 0
    total_line_count = 0
    for feature in raw_data_file:
        line_count += 1
        if feature[0] == ' ':
            # New data vector
            feature_list = feature.split()
            if feature_list[-1] == ']': feature_list.pop()  # remove ending "]" for the last vector of the signal
            datum.append([float(x) for x in feature_list[:(20*n_components)]])
            if len(datum) >= n_vectors:
                # Append the datum
                data.append(datum)
                # Shift the datum
                datum = datum[shift:] if shift > 0 else []
                if len(data) % 20000 == 0: print "extract data >> ", len(data), " datums extracted for", line_count, "lines read"
        else:
            # New signal
            new_str_label = feature.split('#')[0]
            if new_str_label != DataSet.str_label:
                if data:
                    # There is data to split in train/test
                    DataSet.split_train_test(data, test_percentage, train, test)
                    # Append to files
                    train.flush_buffer()
                    test.flush_buffer()
                    data = []
                    print "SPLIT : ", "train =", len(train), " - test =", len(test)
                    print "Line count for this label : ", line_count
                    print "TOTAL : ", len(train)+len(test), " datums extracted for", total_line_count + line_count, "lines read"
                    if n_datums > 0 and len(train) + len(test) >= n_datums: break
                # Update current label
                DataSet.update_label(new_str_label)
                print "New LABEL : ", DataSet.str_label, "int : ", DataSet.int_label
                total_line_count += line_count
                line_count = 0
                datum = []
    print "extract data >> GRAND TOTAL : ", (len(train) + len(test)), " datums extracted for", total_line_count + line_count, "lines read"
    return train, test
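A hedged sketch of a call to extract_data using the parameters described in the docstring; the file paths and parameter values are hypothetical:
# Hypothetical paths and parameter values, chosen only for illustration.
with open("raw_mfcc_features.txt", "r") as raw_data_file:
    train, test = extract_data(raw_data_file,
                               format_data_path="formatted/",
                               n_vectors=30,        # vectors per datum
                               n_components=3,      # MFCCs + first and second derivatives
                               shift=-1,            # no overlap between consecutive datums
                               test_percentage=0.1)
print("%d training datums, %d test datums" % (len(train), len(test)))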
Example 6: setField
def setField(self, label, arr, **kwargs):
    """Set the given array `arr` as the new array of the field specified by
    `label`."""
    DataSet.setField(self, label, arr, **kwargs)
    # refresh dimensions, in case any of these fields were modified
    if label == 'input':
        self.indim = self.getDimension('input')
    elif label == 'target':
        self.outdim = self.getDimension('target')
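A minimal usage sketch, assuming ds is an already constructed dataset that holds 'input' and 'target' fields (the variable name is hypothetical):
import numpy as np

# Replacing the 'input' field refreshes ds.indim from the new array.
new_inputs = np.zeros((100, 4))
ds.setField('input', new_inputs)
assert ds.indim == ds.getDimension('input')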
Example 7: load_training_set
def load_training_set(self, filename, encoding='UTF-8'):
    """\
    Load the given training data set into memory and strip it if
    configured to via the train_part parameter.
    """
    log_info('Loading training data set from ' + str(filename) + '...')
    train = DataSet()
    train.load_from_arff(filename, encoding)
    if self.train_part < 1:
        train = train.subset(0, int(round(self.train_part * len(train))),
                             copy=False)
    return train
Example 8: parse
def parse(path, crawl = False):
    if crawl == True:
        raise StandardError()
    ret = DataSet()
    filenames = os.listdir(path)
    for filename in filenames:
        # TODO : check validity
        (fname, width, height, chans, bboxes) \
            = parse_file(os.path.join(path, filename))
        fname = os.path.basename(fname)
        for bbox in bboxes:
            ret.add_obj(fname, bbox, height, width)
    return ret
Example 9: ResultsToXY
def ResultsToXY(sets, x, y, foreach=[]):
    """Combines observables x and y to build a list of DataSet objects with y vs. x.

    This function is used to collect data from a hierarchy of DataSet objects,
    to prepare plots or evaluation. The innermost list has to contain one DataSet
    with props['observable'] = x and one with props['observable'] = y; these form
    the x-y pair used in the collection.

    The parameters are:
      sets:    hierarchy of datasets where the innermost list must contain the x-y pair
      x:       the name of the observable to be used as the x-value of the collected results
      y:       the name of the observable to be used as the y-value of the collected results
      foreach: an optional list of properties used for grouping the results; a separate
               DataSet object is created for each unique set of values of the specified
               parameters.

    The function returns a list of DataSet objects.
    """
    dd = depth(sets)
    if dd < 2:
        raise Exception('The input hierarchy does not provide a unique x-y pair. The input structure has to be at least a list of lists. pyalps.groupSets might help you.')
    hgroups = flatten(sets, fdepth=-1)
    foreach_sets = {}
    for gg in hgroups:
        xset = None
        yset = None
        for d in gg:
            if d.props['observable'] == x:
                xset = d
            if d.props['observable'] == y:
                yset = d
        if xset is None or yset is None:
            continue
        common_props = dict_intersect([d.props for d in gg])
        fe_par_set = tuple((common_props[m] for m in foreach))
        if fe_par_set not in foreach_sets:
            foreach_sets[fe_par_set] = DataSet()
            foreach_sets[fe_par_set].props = common_props
            foreach_sets[fe_par_set].props['xlabel'] = x
            foreach_sets[fe_par_set].props['ylabel'] = y
        if len(xset.y) == len(yset.y):
            foreach_sets[fe_par_set].x = np.concatenate((foreach_sets[fe_par_set].x, xset.y))
            foreach_sets[fe_par_set].y = np.concatenate((foreach_sets[fe_par_set].y, yset.y))
        elif len(xset.y) == 1:
            foreach_sets[fe_par_set].x = np.concatenate((foreach_sets[fe_par_set].x, np.array([xset.y[0]] * len(yset.y))))
            foreach_sets[fe_par_set].y = np.concatenate((foreach_sets[fe_par_set].y, yset.y))
    for k, res in foreach_sets.items():
        order = np.argsort(res.x, kind='mergesort')
        res.x = res.x[order]
        res.y = res.y[order]
        res.props['label'] = ''
        for p in foreach:
            res.props['label'] += '%s = %s ' % (p, res.props[p])
    return foreach_sets.values()
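A hedged usage sketch: collect y-vs-x curves from a hierarchy of result sets, grouping them by one property; the observable names and the 'L' property are placeholders:
# 'sets' is assumed to be a hierarchy (list of lists) of DataSet objects where each
# inner list holds one set with props['observable'] == 'T' and one with 'Magnetization'.
curves = ResultsToXY(sets, 'T', 'Magnetization', foreach=['L'])
for curve in curves:
    print('%s: %d points' % (curve.props['label'], len(curve.x)))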
Example 10: load_test_data
def load_test_data(self, sessions_df):
    data_df = read_from_csv(self.task_core.test_data_file, self.task_core.n_seed
                            #, max_rows=50000
                            )
    cache_file = os.path.join(self.task_core.cache_dir, 'features_test_' + str(len(data_df.index)) + '.p')
    if os.path.isfile(cache_file):
        print('Loading test features from file')
        x = DataSet.load_from_file(cache_file)
    else:
        x = ds_from_df(data_df, sessions_df, True)
        print('saving test features to file')
        DataSet.save_to_file(x, cache_file)
    return x
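The load-or-build-and-cache pattern used here can be sketched on its own; DataSet.load_from_file and DataSet.save_to_file are taken from the code above, while the cache path and the builder function are hypothetical:
import os

cache_file = 'cache/features_test.p'        # hypothetical cache location
if os.path.isfile(cache_file):
    x = DataSet.load_from_file(cache_file)  # reuse previously extracted features
else:
    x = build_features()                    # hypothetical, stands in for ds_from_df above
    DataSet.save_to_file(x, cache_file)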
Example 11: __reduce__
def __reduce__(self):
    # FIXME: This does actually not feel right: We have to use the DataSet
    # method here, although we inherit from sequential dataset.
    _, _, state, _, _ = DataSet.__reduce__(self)
    creator = self.__class__
    args = self.statedim, self.actiondim
    return creator, args, state, iter([]), iter({})
Example 12: _trim_data
def _trim_data(self, extension_fraction=None, max_interval=None):
    """
    Toss out data outside of (extended) view range, and closer than max_interval seconds apart.
    """
    if extension_fraction is None:
        start_stamp = self._start_stamp
        end_stamp = self._end_stamp
    else:
        extension = rospy.Duration((self._end_stamp - self._start_stamp).to_sec() * extension_fraction)
        if extension.to_sec() >= self._start_stamp.to_sec():
            start_stamp = rospy.Time(0, 1)
        else:
            start_stamp = self._start_stamp - extension
        end_stamp = self._end_stamp + extension
    min_x = (start_stamp - self._timeline.start_stamp).to_sec()
    max_x = (end_stamp - self._timeline.start_stamp).to_sec()
    for series in list(self._data.keys()):
        points = self._data[series].points
        num_points = len(points)
        trimmed_points = []
        if num_points > 0 and points[0][0] < max_x and points[-1][0] > min_x:
            first_index = None
            last_x = None
            for i, (x, y) in enumerate(points):
                if x >= min_x:
                    trimmed_points.append((x, y))
                    first_index = i
                    last_x = x
                    break
            if first_index is not None:
                for i, (x, y) in enumerate(points[first_index + 1:]):
                    if x > max_x:
                        break
                    if (max_interval is None) or (x - last_x >= max_interval):
                        trimmed_points.append((x, y))
                        last_x = x
        new_data = DataSet()
        new_data.set(trimmed_points)
        self._data[series] = new_data
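The loop replaces each series with a fresh DataSet filled through set(); a minimal standalone sketch of that final step with toy points:
# Toy (x, y) points; DataSet.set is taken from the code above.
trimmed_points = [(0.0, 1.0), (0.5, 1.2), (1.0, 0.8)]
new_data = DataSet()
new_data.set(trimmed_points)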
Example 13: ds_from_df
def ds_from_df(data_df, sessions_df, is_test):
    print('ds_from_df <<')
    data_df = add_features(data_df)
    data_df = add_sessions_features(data_df, sessions_df)
    if not is_test:
        data_df = data_df.drop(['country_destination'], axis=1)
    print('ds_from_df >>')
    return DataSet.create_from_df(data_df)
Example 14: load_train_data
def load_train_data(self, sessions_df):
    data_df = read_from_csv(self.task_core.data_file, self.task_core.n_seed
                            #, max_rows=50000
                            )
    cache_file = os.path.join(self.task_core.cache_dir, 'features_train_' + str(len(data_df.index)) + '.p')
    if os.path.isfile(cache_file):
        print('Loading train features from file')
        x = DataSet.load_from_file(cache_file)
    else:
        x = ds_from_df(data_df, sessions_df, False)
        print('saving train features to file')
        DataSet.save_to_file(x, cache_file)
    labels = data_df['country_destination'].values
    y = le_.transform(labels)
    return x, y
Example 15: exp_
def exp_(self):
    #"""
    data = DataSet()
    self.quick = DataSet()
    data.dataimport("D:\Dropbox\St Andrews\IT\IS5189 MSc Thesis\\02 Data\InnoCentive_Challenge_9933493_training_data.csv")
    data.labelencode(columns=self.configLE)
    xtest, xtrain, ytest, ytrain = data.split(quick=True)
    self.quick.import_split(xtest, xtrain, ytest, ytrain)
    self.output_str("10 percent of original dataset loaded (into train. Testset is 90 percent).")
    rows_train = len(xtrain)
    self.feedback("Challenge data loaded. self.quick init with " + str(rows_train) + " rows.")
    correlation_list, descstats = self.quick.correlation()
    self._output_last(correlation_list)
    #print(test)
    #a = test.sort_values(by='Correlation', ascending=True).head(20)
    #b = test.sort_values(by='Correlation', ascending=False).head(20)
    #print(a)
    #print(b)
    #print(descstats)
    #self.quick.descstats()
    #"""
    #Clock.schedule_once(lambda dt: self.feedback("this is good"), -1)
    #descstats = data.descstats(self.configLE)
    ############################################################
    # df is short for DataFrame, to make it more readable when manipulating the Pandas DataFrame.
    # Might be easier (and is shorter) to read by developers as an in house var name.
    threshold = 0.7
    df = correlation_list[correlation_list['Correlation'] > threshold]
    df = df.sort_values(by='Correlation', ascending=False)
    column_a_b = df['Var1']
    column_a_b = column_a_b.append(df['Var2'])
    print(df[df['Var1'] == 'C31'])
    print(column_a_b.value_counts())
    #print(df.head(10))
    print(pd.crosstab(df['Var1'], df['Var2']))
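The correlation-threshold filtering at the end can be reproduced standalone with pandas; the toy correlation_list below mimics the Var1/Var2/Correlation columns used above, with made-up values:
import pandas as pd

# Toy stand-in for the DataFrame returned by self.quick.correlation();
# column names follow the code above, the values are invented.
correlation_list = pd.DataFrame({
    'Var1': ['C31', 'C31', 'C07'],
    'Var2': ['C12', 'C48', 'C31'],
    'Correlation': [0.91, 0.74, 0.42],
})
threshold = 0.7
df = correlation_list[correlation_list['Correlation'] > threshold]
df = df.sort_values(by='Correlation', ascending=False)
print(pd.crosstab(df['Var1'], df['Var2']))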