

Python dataset.DataSet Class Code Examples

This article collects typical usage examples of the Python dataset.DataSet class. If you are wondering what the DataSet class does, how to use it, or what real code that uses it looks like, the curated examples below should help.


Fifteen code examples of the DataSet class are shown below, sorted by popularity by default. You can upvote the examples you find useful; your ratings help the system recommend better Python code examples.

Example 1: eval_classifier

def eval_classifier(classifierToUse, featuresToUse, testOrTrain="train"):

    print("Chosen feature: {0}".format(featuresToUse) )
    print("Chosen classifier: {0}".format(classifierToUse))

    fe = FeatureExtractor(featuresToUse)
    dataset = DataSet(fe)
    classifier = Classifier()
    evaluate = Evaluation()

    print "test or Train %s" % testOrTrain
    for feature_class, files in getTestData(testOrTrain).items():
        print "%s" % testOrTrain
        for f in files:
            dataset.addFile(feature_class, f)

    print "Dataset initialized"
    print_class_stats(dataset.classes)

    print "Test set created."
    a_train, a_test, c_train, c_test = train_test_split(dataset.featureVector, dataset.classes, test_size=0.9)
    
    c_pred = classifier.classification(a_train, a_test, c_train, c_test, classifierToUse)

    evaluate.evaluate(c_pred, c_test, featuresToUse, classifierToUse)
Developer: xiao-shen | Project: keystroke | Lines: 25 | Source: runit.py
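
A quick way to exercise this function is to call it directly. A minimal sketch; the argument values ("svm", "mfcc") are assumptions, not names confirmed by the keystroke project:

# Hypothetical invocation; valid values depend on this project's
# FeatureExtractor and Classifier implementations.
eval_classifier(classifierToUse="svm", featuresToUse="mfcc", testOrTrain="train")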

Example 2: __vectorize

 def __vectorize(self, data):
     """\
     Train vectorization and subsequently vectorize. Accepts a DataSet
     or a list of dictionaries to be vectorized.
     """
     # no vectorization performed, only converted to matrix
     if self.vectorizer is None:
         if not isinstance(data, DataSet):
             data_set = DataSet()
             data_set.load_from_dict(data)
             data = data_set
         data.match_headers(self.data_headers, add_values=True)
         # TODO pre-filtering here?
         return data.as_bunch(target=self.class_attr,
                              select_attrib=self.select_attr).data
     # vectorization needed: converted to dictionary
     # and passed to the vectorizer
     if isinstance(data, DataSet):
         data = data.as_dict(select_attrib=self.select_attr,
                             mask_attrib=self.class_attr)
     else:
         data = [{key: val for key, val in inst.items()
                  if key != self.class_attr and key in self.select_attr}
                 for inst in data]
     # pre-filter attributes if filter_attr is set
     if self.filter_attr:
         data = [{key: val for key, val in inst.items()
                  if self.filter_attr(key, val)} for inst in data]
     if not self.vectorizer_trained:
         self.vectorizer.fit(data)
         self.vectorizer_trained = True
     return self.vectorizer.transform(data).tocsr()
Developer: ryancotterell | Project: flect | Lines: 32 | Source: model.py
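
The vectorizer branch above follows scikit-learn's fit-once, transform-many pattern. Below is a minimal standalone sketch of that pattern with DictVectorizer; that flect's self.vectorizer is a DictVectorizer is an assumption:

from sklearn.feature_extraction import DictVectorizer

data = [{'word': 'dog', 'pos': 'NN'}, {'word': 'runs', 'pos': 'VBZ'}]
vectorizer = DictVectorizer()                    # one-hot encodes string features
matrix = vectorizer.fit_transform(data).tocsr()  # train, then vectorize (CSR sparse)
more = vectorizer.transform([{'word': 'dog', 'pos': 'NN'}])  # reuse the trained mapping
print(matrix.shape, more.shape)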

Example 3: __init__

	def __init__(self, data, interval_type=ClassIntervalType.ROOT):
		
		data = [float(d) for d in data]

		DataSet.__init__(self, data)
		self.interval_type = interval_type

		if self.interval_type != ClassIntervalType.THREESIGMA:
			self.class_interval = self.calc_class_interval(interval_type, self.min, self.max, self.n)
			self.construct_bins(self.min, self.max, self.class_interval, False)
		else:
			# span the bins over mean +/- 3 sigma (six sigma in total)
			sigma_span = 6
			bin_min = self.mean - self.stdev * (sigma_span / 2)
			bin_max = self.mean + self.stdev * (sigma_span / 2)
			self.class_interval = self.calc_class_interval(ClassIntervalType.THREESIGMA, bin_min, bin_max, sigma_span)
			self.construct_bins(bin_min, bin_max, self.class_interval, True)
			
		self.fill_bins()
		self.sort_bins()

		self.bin_contents_count = sum(b.count() for b in self.bins)
Developer: davidbarkhuizen | Project: dart | Lines: 27 | Source: histogram.py
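
In the THREESIGMA branch the bin range is simply mean ± 3σ. A self-contained sketch of that arithmetic, with the statistics module standing in for the mean and stdev attributes inherited from DataSet (an assumption about what they hold):

import statistics

data = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]
mean = statistics.mean(data)     # what self.mean would hold (5.0 here)
stdev = statistics.pstdev(data)  # what self.stdev would hold (2.0 here)

sigma_span = 6                   # six sigma in total: mean +/- 3 sigma
bin_min = mean - stdev * (sigma_span / 2)
bin_max = mean + stdev * (sigma_span / 2)
print(bin_min, bin_max)          # -1.0 and 11.0 for this data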

Example 4: parse

def parse(path, crawl=False):
    if crawl:
        # StandardError no longer exists in Python 3
        raise NotImplementedError("crawl mode is not supported")
    pos_filename = os.path.join(path, "pos.lst")
    neg_filename = os.path.join(path, "neg.lst")
    pos_dir = os.path.join(path, "pos")
    neg_dir = os.path.join(path, "neg")
    if not os.path.isfile(pos_filename):
        print("%s is not a file." % (pos_filename,))
        return None
    if not os.path.isfile(neg_filename):
        print("%s is not a file." % (neg_filename,))
        return None
    if not os.path.isdir(pos_dir):
        print("%s is not a directory." % (pos_dir,))
        return None
    if not os.path.isdir(neg_dir):
        print("%s is not a directory." % (neg_dir,))
        return None

    ret = DataSet()
    pos = open(pos_filename, "r")
    pos_names = [line[line.rfind("/")+1:] for line in pos.read().split()]
    pos.close()
    for name in pos_names:
        ret.add_obj(name, WholeImage(name))

    neg = open(neg_filename, "r")
    neg_names = [line[line.rfind("/")+1:] for line in neg.read().split()]
    neg.close()
    for name in neg_names:
        ret.add_empty_image(name)
        
    return ret
Developer: fireae | Project: visiongrader | Lines: 35 | Source: inria_bool.py
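
The parser expects the INRIA-person-style layout: pos.lst and neg.lst listing files, plus pos/ and neg/ subdirectories, under the given path. A short usage sketch; the dataset root is hypothetical:

# Hypothetical dataset root containing pos.lst, neg.lst, pos/ and neg/.
ds = parse("/data/INRIAPerson/Train")
if ds is not None:
    print("data set parsed:", ds)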

Example 5: extract_data

def extract_data(raw_data_file, format_data_path, n_vectors, n_components, shift=-1, n_datums=-1, test_percentage=0):
    """
    Extracts raw data from raw_data_file and, using the given formatting parameters, builds two pre-formatted data files, train and test.
    A data item (datum) is a matrix of shape (n_vectors, n_components*20), where n_vectors is the number of vectors per data item and n_components the number of components kept per vector.

    :param file raw_data_file:    File containing the raw data
    :param str format_data_path:  Path to the directory for the pre-formatted data
    :param int n_vectors:         Number of vectors that make up one data item
    :param int n_components:      Number of components kept per vector. If 1, a vector contains only the MFCCs; if 2, the MFCCs plus their first derivatives; if 3, the second derivatives as well.
    :param int shift:             Shift/overlap: number of vectors shared between the last extracted data item and the next. Warning: leave at -1 to disable overlapping; overlapping inflates training results (see the internship report).
    :param int n_datums:          Number of data items to read from the raw data file before stopping. -1 reads the whole file.
    :param float test_percentage: Ratio of the number of test items to the number of training items (*100 = percentage of data held out for testing).
    :return:                      The train and test databases (as DataSet instances)

    """
    train = DataSet(format_data_path, "train")
    test = DataSet(format_data_path, "test") 
    data = []
    datum = []
    feature_list = []
    line_count = 0
    total_line_count = 0
    for feature in raw_data_file:
        line_count += 1
        if feature[0] == ' ':
            # New data vector
            feature_list = feature.split()
            if feature_list[-1] == ']': feature_list.pop() # remove ending "]" for the last vector of the signal
            datum.append([ float(x) for x in feature_list[:(20*n_components)] ])
            if len(datum) >= n_vectors:
                # Append the datum
                data.append(datum)
                # Shift the datum
                datum = datum[shift:] if shift > 0 else []
                if len(data) % 20000 == 0: print("extract data >>", len(data), "datums extracted for", line_count, "lines read")
        else:
            # New signal
            new_str_label = feature.split('#')[0]
            if new_str_label != DataSet.str_label:
                if data:
                    # There is data to split in train/test
                    DataSet.split_train_test(data, test_percentage, train, test)
                    # Append to files
                    train.flush_buffer()
                    test.flush_buffer()
                    data = []
                    print "SPLIT : ", "train =", len(train), " - test =", len(test)
                    print "Line count for this label : ", line_count
                print "TOTAL : ", len(train)+len(test), " datums extracted for", total_line_count + line_count, "lines read"
                if n_datums > 0 and len(train) + len(test) >= n_datums: break
                # Update current label
                DataSet.update_label(new_str_label)
                print "New LABEL : ", DataSet.str_label, "int : ", DataSet.int_label
                total_line_count += line_count
                line_count = 0
            datum = []
    print "extract data >> GRAND TOTAL : ", (len(train) + len(test)), " datums extracted for", total_line_count + line_count, "lines read"
    return train, test
Developer: pombredanne | Project: PFECaffe | Lines: 59 | Source: formatdata.py
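
Note that extract_data takes an open file handle, not a path. A minimal invocation sketch; the file name and parameter values are assumptions:

# Hypothetical raw MFCC dump: 10 vectors per datum, MFCCs plus first derivatives,
# 10 percent of the items held out for testing.
with open("mfcc_dump.txt", "r") as raw_data_file:
    train, test = extract_data(raw_data_file, "./formatted", n_vectors=10,
                               n_components=2, test_percentage=0.1)
print(len(train), len(test))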

Example 6: setField

 def setField(self, label, arr, **kwargs):
     """Set the given array `arr` as the new array of the field specfied by
     `label`."""
     DataSet.setField(self, label, arr, **kwargs)
     # refresh dimensions, in case any of these fields were modified
     if label == 'input':
         self.indim = self.getDimension('input')
     elif label == 'target':
         self.outdim = self.getDimension('target')
Developer: firestrand | Project: pybrain-gpu | Lines: 9 | Source: supervised.py

Example 7: load_training_set

 def load_training_set(self, filename, encoding='UTF-8'):
     """\
     Load the given training data set into memory and strip it if
     configured to via the train_part parameter.
     """
     log_info('Loading training data set from ' + str(filename) + '...')
     train = DataSet()
     train.load_from_arff(filename, encoding)
     if self.train_part < 1:
         train = train.subset(0, int(round(self.train_part * len(train))),
                              copy=False)
     return train
Developer: imclab | Project: flect | Lines: 12 | Source: model.py

Example 8: parse

def parse(path, crawl=False):
    if crawl:
        # StandardError no longer exists in Python 3
        raise NotImplementedError("crawl mode is not supported")
    ret = DataSet()
    filenames = os.listdir(path)
    for filename in filenames:
        #TODO : check validity
        (fname, width, height, chans, bboxes) \
            = parse_file(os.path.join(path, filename))
        fname = os.path.basename(fname)
        for bbox in bboxes:
            ret.add_obj(fname, bbox, height, width)
    return ret
Developer: fireae | Project: visiongrader | Lines: 13 | Source: inria.py

Example 9: ResultsToXY

def ResultsToXY(sets, x, y, foreach=[]):
    """ combines observables x and y to build a list of DataSet with y vs x

    this function is used to collect data from a hierarchy of DataSet objects, to prepare plots or evaluation.
    the inner-most list has to contain one DataSet with props['observable'] = x and one with props['observable'] = y;
    this pair x-y is used in the collection.

    The parameters are:
      sets:    hierarchy of datasets where the inner-most list must contain the pair x-y
      x:       the name of the observable to be used as x-value of the collected results
      y:       the name of the observable to be used as y-value of the collected results
      foreach: an optional list of properties used for grouping the results. A separate DataSet object is created for each unique set of values of the specified parameters.

    The function returns a list of DataSet objects.
    """
    
    dd = depth(sets)
    if dd < 2:
        raise Exception('The input hierarchy does not provide a unique pair x-y. The input structure has to be a list of lists as minimum. pyalps.groupSets might help you.')
    
    hgroups = flatten(sets, fdepth=-1)
    
    foreach_sets = {}
    for gg in hgroups:
        xset = None
        yset = None
        for d in gg:
            if d.props['observable'] == x:
                xset = d
            if d.props['observable'] == y:
                yset = d
        if xset is None or yset is None:
            continue
        
        common_props = dict_intersect([d.props for d in gg])
        fe_par_set = tuple((common_props[m] for m in foreach))
        
        if fe_par_set not in foreach_sets:
            foreach_sets[fe_par_set] = DataSet()
            foreach_sets[fe_par_set].props = common_props
            foreach_sets[fe_par_set].props['xlabel'] = x
            foreach_sets[fe_par_set].props['ylabel'] = y
        
        if len(xset.y) == len(yset.y):
            foreach_sets[fe_par_set].x = np.concatenate((foreach_sets[fe_par_set].x, xset.y))
            foreach_sets[fe_par_set].y = np.concatenate((foreach_sets[fe_par_set].y, yset.y))
        elif len(xset.y) == 1:
            foreach_sets[fe_par_set].x = np.concatenate((foreach_sets[fe_par_set].x, np.array( [xset.y[0]]*len(yset.y) )))
            foreach_sets[fe_par_set].y = np.concatenate((foreach_sets[fe_par_set].y, yset.y))
    
    for k, res in foreach_sets.items():
        order = np.argsort(res.x, kind = 'mergesort')
        res.x = res.x[order]
        res.y = res.y[order]
        res.props['label'] = ''
        for p in foreach:
            res.props['label'] += '%s = %s ' % (p, res.props[p])
        
    return list(foreach_sets.values())
Developer: dolfim | Project: hubbard_ladders_workflows | Lines: 59 | Source: tools.py
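
A sketch of calling ResultsToXY on the smallest valid hierarchy, assuming pyalps-style DataSet objects carrying .y arrays and a props dictionary (the observable and property names are made up):

import numpy as np
from pyalps import DataSet  # assumed home of the DataSet class used here

xset = DataSet()                                   # observable used as x
xset.props = {'observable': 'T', 'L': 10}
xset.y = np.array([1.0, 2.0, 3.0])

yset = DataSet()                                   # observable used as y
yset.props = {'observable': 'E', 'L': 10}
yset.y = np.array([0.5, 0.4, 0.3])

sets = [[xset, yset]]                              # inner-most list holds the x-y pair
for curve in ResultsToXY(sets, 'T', 'E', foreach=['L']):
    print(curve.props['label'], curve.x, curve.y)  # one curve per value of L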

Example 10: load_test_data

    def load_test_data(self, sessions_df):
        data_df = read_from_csv(self.task_core.test_data_file, self.task_core.n_seed
                                #, max_rows=50000
                                )

        cache_file = os.path.join(self.task_core.cache_dir, 'features_test_' + str(len(data_df.index)) + '.p')
        if os.path.isfile(cache_file):
            print('Loading test features from file')
            x = DataSet.load_from_file(cache_file)
        else:
            x = ds_from_df(data_df, sessions_df, True)
            print('saving test features to file')
            DataSet.save_to_file(x, cache_file)

        return x
Developer: kaluzhny | Project: airbnb | Lines: 15 | Source: tasks.py
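
The load-from-cache-or-compute pattern used here (and mirrored in load_train_data below) is easy to factor out. A generic sketch using pickle, which is presumably what DataSet.save_to_file/load_from_file wrap; this is an assumption, since their implementation is not shown:

import os
import pickle

def cached(cache_file, compute):
    """Return the pickled result if cached, otherwise compute and cache it."""
    if os.path.isfile(cache_file):
        with open(cache_file, 'rb') as f:
            return pickle.load(f)
    result = compute()
    with open(cache_file, 'wb') as f:
        pickle.dump(result, f)
    return result

# usage: x = cached(cache_file, lambda: ds_from_df(data_df, sessions_df, True))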

Example 11: __reduce__

 def __reduce__(self):
     # FIXME: This does not actually feel right: we have to use the DataSet
     # method here, although we inherit from a sequential dataset.
     _, _, state, _, _ = DataSet.__reduce__(self)
     creator = self.__class__
     args = self.statedim, self.actiondim
     return creator, args, state, iter([]), iter({})
Developer: ZachPhillipsGary | Project: CS200-NLP-ANNsProject | Lines: 7 | Source: reinforcement.py

Example 12: _trim_data

    def _trim_data(self, extension_fraction=None, max_interval=None):
        """
        Toss out data outside of the (extended) view range, and points closer than max_interval seconds apart.
        """
        if extension_fraction is None:
            start_stamp = self._start_stamp
            end_stamp   = self._end_stamp
        else:
            extension = rospy.Duration((self._end_stamp - self._start_stamp).to_sec() * extension_fraction)
            if extension.to_sec() >= self._start_stamp.to_sec():
                start_stamp = rospy.Time(0, 1)
            else:
                start_stamp = self._start_stamp - extension
            end_stamp = self._end_stamp + extension

        min_x = (start_stamp - self._timeline.start_stamp).to_sec()
        max_x = (end_stamp   - self._timeline.start_stamp).to_sec()

        for series in list(self._data.keys()):
            points     = self._data[series].points
            num_points = len(points)

            trimmed_points = []

            if num_points > 0 and points[0][0] < max_x and points[-1][0] > min_x:
                first_index = None
                last_x = None
                for i, (x, y) in enumerate(points):
                    if x >= min_x:
                        trimmed_points.append((x, y))
                        first_index = i
                        last_x = x
                        break

                if first_index is not None:
                    for i, (x, y) in enumerate(points[first_index + 1:]):
                        if x > max_x:
                            break

                        if (max_interval is None) or (x - last_x >= max_interval):
                            trimmed_points.append((x, y))
                            last_x = x

            new_data = DataSet()
            new_data.set(trimmed_points)

            self._data[series] = new_data
Developer: awesomebytes | Project: rxbag_plugins | Lines: 47 | Source: plot_data_loader.py

Example 13: ds_from_df

def ds_from_df(data_df, sessions_df, is_test):
    print('ds_from_df <<')
    data_df = add_features(data_df)
    data_df = add_sessions_features(data_df, sessions_df)
    if not is_test:
        data_df = data_df.drop(['country_destination'], axis=1)
    print('ds_from_df >>')
    return DataSet.create_from_df(data_df)
Developer: kaluzhny | Project: airbnb | Lines: 8 | Source: tasks.py

Example 14: load_train_data

    def load_train_data(self, sessions_df):
        data_df = read_from_csv(self.task_core.data_file, self.task_core.n_seed
                                #, max_rows=50000
                                )

        cache_file = os.path.join(self.task_core.cache_dir, 'features_train_' + str(len(data_df.index)) + '.p')
        if os.path.isfile(cache_file):
            print('Loading train features from file')
            x = DataSet.load_from_file(cache_file)
        else:
            x = ds_from_df(data_df, sessions_df, False)
            print('saving train features to file')
            DataSet.save_to_file(x, cache_file)

        labels = data_df['country_destination'].values
        y = le_.transform(labels)
        return x, y
Developer: kaluzhny | Project: airbnb | Lines: 17 | Source: tasks.py
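
The module-level le_ behaves like a fitted sklearn LabelEncoder mapping country codes to integers; that it is exactly sklearn's LabelEncoder is an assumption. A minimal sketch of that mapping:

from sklearn.preprocessing import LabelEncoder

labels = ['NDF', 'US', 'FR', 'US', 'NDF']  # hypothetical country_destination values
le = LabelEncoder().fit(labels)            # learns the label -> integer mapping
y = le.transform(labels)                   # array([1, 2, 0, 2, 1])
print(dict(zip(le.classes_, y)))           # FR -> 0, NDF -> 1, US -> 2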

Example 15: exp_

 def exp_(self):
     data = DataSet()
     self.quick = DataSet()
     data.dataimport(r"D:\Dropbox\St Andrews\IT\IS5189 MSc Thesis\02 Data\InnoCentive_Challenge_9933493_training_data.csv")
     data.labelencode(columns=self.configLE)
     xtest, xtrain, ytest, ytrain = data.split(quick=True)
     self.quick.import_split(xtest, xtrain, ytest, ytrain)
     self.output_str("10 percent of the original dataset loaded into train; the test set is the remaining 90 percent.")
     rows_train = len(xtrain)
     self.feedback("Challenge data loaded. self.quick init with " + str(rows_train) + " rows.")
     correlation_list, descstats = self.quick.correlation()
     self._output_last(correlation_list)
     # df is short for DataFrame, to make the Pandas DataFrame manipulation
     # easier to read.
     threshold = 0.7
     df = correlation_list[correlation_list['Correlation'] > threshold]
     df = df.sort_values(by='Correlation', ascending=False)
     # Series.append was removed in pandas 2.0; concatenate instead.
     column_a_b = pd.concat([df['Var1'], df['Var2']])
     print(df[df['Var1'] == 'C31'])
     print(column_a_b.value_counts())
     print(pd.crosstab(df['Var1'], df['Var2']))
Developer: Dismeth | Project: gui | Lines: 35 | Source: main.py


Note: The dataset.DataSet class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects, and the source code copyright remains with the original authors; consult each project's License before distributing or using the code. Do not reproduce this article without permission.