This article collects typical usage examples of the Python method pySPACE.resources.dataset_defs.base.BaseDataset.load. If you are wondering what BaseDataset.load does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples of the containing class, pySPACE.resources.dataset_defs.base.BaseDataset.
The following presents 15 code examples of BaseDataset.load, sorted by popularity by default.
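Before diving into the examples, a quick orientation: BaseDataset.load takes the path of a dataset directory (the directory holding the dataset's metadata.yaml) and returns a dataset object whose meta_data dictionary and data dictionary the examples below read and modify. The following is a minimal sketch, assuming pySPACE is already configured so that pySPACE.configuration.storage points at your storage directory; the dataset path itself is hypothetical.

import os

import pySPACE
from pySPACE.resources.dataset_defs.base import BaseDataset

# Hypothetical dataset directory below the configured storage root
dataset_dir = os.path.join(pySPACE.configuration.storage,
                           "my_summary", "my_collection")
dataset = BaseDataset.load(dataset_dir)

# meta_data is a plain dict; keys such as "runs", "splits" and
# "storage_format" show up in the examples below
print dataset.meta_data.get("storage_format")

# The samples themselves live in dataset.data, addressed by run number,
# split number and the split name ("train" or "test")
for run in dataset.get_run_numbers():
    for split in dataset.get_split_numbers():
        test_data = dataset.get_data(run, split, "test")
        print run, split, len(test_data)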
Example 1: _merge_pickle_files
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def _merge_pickle_files(self, target_collection_path, source_collection_pathes,
                        train_set_name_suffix, target_collection_params):
    """ Merge all collections in source_collection_pathes and store them \
        in the target collection"""
    # load a first collection, in which the data of all other collections
    # is assembled
    target_collection = BaseDataset.load(source_collection_pathes[0])
    try:
        author = pwd.getpwuid(os.getuid())[4]
    except:
        author = "unknown"
        self._log("Author could not be resolved.", level=logging.WARNING)
    date = time.strftime("%Y%m%d_%H_%M_%S")
    # Delete node_chain file name
    try:
        target_collection.meta_data.pop("node_chain_file_name")
    except:
        pass
    # Update meta data and store it
    k = "test" if self.reverse else "train"
    target_collection_params["__INPUT_DATASET__"][k] = \
        [s_c_p.split(os.sep)[-2] for s_c_p in source_collection_pathes]
    target_collection_params["__RESULT_DIRECTORY__"] = self.result_directory
    target_collection.meta_data.update({
        "author": author,
        "date": date,
        "dataset_directory": target_collection_path,
        "train_test": True,
        "parameter_setting": target_collection_params,
        "input_collection_name": source_collection_pathes[0][len(
            pySPACE.configuration.storage):]
    })
    # merge data of all other collections to target collection
    for source_collection_path in source_collection_pathes[1:]:
        source_collection = BaseDataset.load(source_collection_path)
        for run in source_collection.get_run_numbers():
            for split in source_collection.get_split_numbers():
                data = source_collection.get_data(run, split,
                                                  train_set_name_suffix)
                target_data = target_collection.get_data(run, split,
                                                         train_set_name_suffix)
                # actual data is stored in a list that has to be extended
                target_data.extend(data)
    # if only test data was given, the "Rest_vs" collection is stored as
    # training data
    if not self.reverse and "test" == train_set_name_suffix:
        # exchange the "test" in key tuple to "train" before storing
        for key in target_collection.data.keys():
            assert("test" == key[2])
            value = target_collection.data.pop(key)
            key = (key[0], key[1], "train")
            target_collection.data[key] = value
    target_collection.store(target_collection_path)
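Example 1 above (and examples 4, 7 and 12 further below) relies on the same convention: dataset.data is a dictionary keyed by (run, split, "train"/"test") tuples, and a dataset that only contains test data can be turned into training data by re-keying it before storing. Isolated into a minimal sketch with hypothetical paths:

from pySPACE.resources.dataset_defs.base import BaseDataset

dataset = BaseDataset.load("/storage/my_summary/my_collection/")    # hypothetical source
for key in dataset.data.keys():
    run, split, split_name = key          # keys are (run, split, "train"/"test")
    if split_name == "test":
        # move the entry under the corresponding "train" key
        dataset.data[(run, split, "train")] = dataset.data.pop(key)
dataset.store("/storage/my_summary/my_collection_as_train/")        # hypothetical target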
Example 2: _createProcesses
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def _createProcesses(cls, processes, result_directory, operation_spec,
                     parameter_settings, input_collections, command_template):
    # For each combination of classifier, input collection and
    # run number, create one WEKA_process
    for dataset_dir in input_collections:
        collection = BaseDataset.load(dataset_dir)
        # Determine the number of iterations and splits to be used
        iterations = collection.meta_data["runs"]
        splits = collection.meta_data["splits"]
        if "runs" in operation_spec:
            assert(iterations in [1, operation_spec["runs"]])
            iterations = operation_spec["runs"]
        if "cv_folds" in operation_spec:
            assert(splits in [1, operation_spec["cv_folds"]])
            splits = operation_spec["cv_folds"]
        for parametrization in parameter_settings:
            for run_number in range(iterations):
                process = WEKAClassificationProcess(dataset_dir,
                                                    command_template,
                                                    parametrization,
                                                    splits,
                                                    run_number,
                                                    result_directory)
                processes.put(process)
    # signal the executing process that creation is now finished
    processes.put(False)
Example 3: __init__
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def __init__(self, dataset_dir, command_template, parametrization,
             run_number, split_number, operation_result_dir,
             hide_parameters=[]):
    super(WEKAFilterProcess, self).__init__()
    # Determine the directory in which the process' results
    # are stored
    result_collection_name = dataset_dir.split(os.sep)[-2]
    for parameter_name, parameter_value in parametrization.iteritems():
        # If this is a parameter that should not be hidden, then we have to
        # encode it in the result collection name
        if not parameter_name in hide_parameters:
            result_collection_name += "{__%s__:%s}" % (parameter_name.upper(),
                                                       parameter_value)
    self.result_directory = os.path.join(operation_result_dir,
                                         result_collection_name)
    # Create directory for intermediate results if it does not exist yet
    create_directory(self.result_directory
                     + os.sep + "data_run%s" % run_number)
    # Create collection
    collection = BaseDataset.load(dataset_dir)
    # The parametrization that is independent of the collection type
    # and the specific weka command template that is executed
    self.params = {"dataset_name": dataset_dir.replace('/', '_'),
                   "dataset_dir": dataset_dir,
                   "run_number": run_number,
                   "split_number": split_number,
                   "weka_class_path": pySPACE.configuration.weka_class_path,
                   "temp_results": self.result_directory}
    # Load the abbreviations
    abbreviations_file = open(os.path.join(pySPACE.configuration.spec_dir,
                                           'operations/weka_templates',
                                           'abbreviations.yaml'), 'r')
    self.abbreviations = yaml.load(abbreviations_file)
    # Add custom parameters for the weka command template
    for parameter_name, parameter_value in parametrization.iteritems():
        # Auto-expand abbreviations
        if parameter_value in self.abbreviations:
            parameter_value = self.abbreviations[parameter_value]
        self.params[parameter_name] = parameter_value
    # Build the WEKA command by repeatedly replacing all placeholders in
    # the template
    while True:
        instantiated_template = command_template % self.params
        if instantiated_template == command_template:
            # All placeholders replaced
            self.weka_command = instantiated_template
            break
        else:
            # We have to continue since we have not converged yet
            command_template = instantiated_template
    self.handler_class = None
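The while loop at the end of this constructor applies %-substitution to the command template over and over until the string stops changing, so parameter values that themselves contain %(...)s placeholders (for example expanded abbreviations) are resolved transitively. A standalone sketch of this fixed-point substitution, with a made-up template and made-up parameters:

# Hypothetical template and parameters, only to illustrate the convergence loop
params = {"weka_class_path": "/opt/weka/weka.jar",
          "classifier": "weka.classifiers.functions.SMO",
          # this value contains further placeholders and needs a second pass
          "command": "java -cp %(weka_class_path)s %(classifier)s",
          "data": "train.arff"}
template = "%(command)s -t %(data)s"

while True:
    instantiated = template % params
    if instantiated == template:
        # nothing was replaced any more -> converged
        break
    template = instantiated

print template
# java -cp /opt/weka/weka.jar weka.classifiers.functions.SMO -t train.arff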
Example 4: _copy_file
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def _copy_file(self, source_collection_path, target_collection_path,
               train_set_name_suffix):
    """ Copy a dataset to a new destination

    **Parameters**

        :source_collection_path:
            The path to the dataset that has to be copied.

        :target_collection_path:
            The path to where the dataset should be copied.

        :train_set_name_suffix:
            Either 'train' or 'test'. Specifies whether the target dataset
            is handled as training or testing data.
    """
    source_collection = BaseDataset.load(source_collection_path)
    # if only test data was given, the "Rest_vs" collection is stored as
    # training data
    if self.reverse and "test" == train_set_name_suffix:
        # exchange the "test" in key tuple to "train" before storing
        for key in source_collection.data.keys():
            assert("test" == key[2])
            value = source_collection.data.pop(key)
            key = (key[0], key[1], "train")
            source_collection.data[key] = value
    # we store the data in the same format as before
    source_collection.store(target_collection_path,
                            source_collection.meta_data["storage_format"])
Example 5: create
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
    """
    A factory method that creates a WEKA operation based on the
    information given in the operation specification operation_spec
    """
    assert(operation_spec["type"] == "weka_classification")
    # Determine all parameter combinations that should be tested
    parameter_settings = cls._get_parameter_space(operation_spec)
    # Read the command template from a file
    template_file = open(os.path.join(pySPACE.configuration.spec_dir,
                                      "operations",
                                      "weka_templates",
                                      operation_spec["template"]),
                         'r')
    command_template = template_file.read()
    template_file.close()
    # number of processes
    if "runs" in operation_spec:
        number_processes = len(input_paths) * len(parameter_settings) * \
                           operation_spec["runs"]
    else:  # approximate the number of processes
        runs = []
        for dataset_dir in input_paths:
            collection = BaseDataset.load(dataset_dir)
            runs.append(collection.meta_data["runs"])
        runs = max(runs)
        number_processes = len(input_paths) * len(parameter_settings) * \
                           runs
    if debug == True:
        # To better debug the creation of processes we don't limit the queue
        # and create all processes before executing them
        processes = processing.Queue()
        cls._createProcesses(processes, result_directory, operation_spec,
                             parameter_settings, input_paths,
                             command_template)
        # create and return the weka operation object
        return cls(processes, operation_spec, result_directory,
                   number_processes)
    else:
        # Create all processes by calling a recursive helper method in
        # another thread so that already created processes can be executed in
        # parallel. Therefore a queue is used whose size is limited to
        # guarantee that not too many objects are created (because this
        # costs memory). However, the actual number of 100 is arbitrary
        # and might be reviewed.
        processes = processing.Queue(100)
        create_process = processing.Process(target=cls._createProcesses,
                                            args=(processes, result_directory,
                                                  operation_spec,
                                                  parameter_settings,
                                                  input_paths,
                                                  command_template))
        create_process.start()
        # create and return the weka operation object
        return cls(processes, operation_spec, result_directory,
                   number_processes, create_process)
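Examples 5, 8 and 15 share the same creation pattern: the process objects are produced by a helper running in a separate process and handed over through a bounded queue, so execution can start before creation has finished and at most 100 unprocessed objects are held in memory at once; a False item marks the end of the queue. A minimal sketch of that bounded producer/consumer pattern, written against the standard multiprocessing module (the pySPACE code uses the older processing package with an equivalent interface); all names below are illustrative:

import multiprocessing

def _create_items(queue):
    # Producer: fill the bounded queue, then send the end-of-queue sentinel
    for i in range(1, 501):
        queue.put(i)            # blocks while 100 items are already waiting
    queue.put(False)            # creation finished

if __name__ == "__main__":
    queue = multiprocessing.Queue(100)   # bounded queue limits memory usage
    producer = multiprocessing.Process(target=_create_items, args=(queue,))
    producer.start()
    while True:
        item = queue.get()
        if item is False:                # sentinel -> all items consumed
            break
        # ... execute the work item here ...
    producer.join()

In the debug branch above, the queue is instead created without a size limit and filled completely up front, which is easier to step through but keeps every process object in memory.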
Example 6: __call__
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def __call__(self):
    """ Executes this process on the respective modality """
    # Restore configuration
    pySPACE.configuration = self.configuration
    # reduce log_level for processing a second time and
    # set communication possibility for nodes to backend
    pySPACE.configuration.min_log_level = self.min_log_level
    pySPACE.configuration.logging_com = self.handler_args
    pySPACE.configuration.backend_com = self.backend_com
    ############## Prepare benchmarking ##############
    super(NodeChainProcess, self).pre_benchmarking()
    # Load the data and check that it can be processed
    # Note: This cannot be done in the object's constructor since in
    #       that case the whole input would need to be pickled
    #       when doing the remote call
    abs_dataset_dir = os.sep.join([self.storage,
                                   self.rel_dataset_dir])
    input_collection = BaseDataset.load(abs_dataset_dir)
    # We have to remember parameters used for generating this specific
    # input dataset
    if 'parameter_setting' in input_collection.meta_data.keys():
        # but not __INPUT_DATASET__ and __RESULT_DIRECTORY__
        for k, v in input_collection.meta_data['parameter_setting'].items():
            if k not in ["__INPUT_DATASET__", "__RESULT_DIRECTORY__"]:
                self.parameter_setting[k] = v
    NodeChainProcess._check_node_chain_dataset_consistency(self.node_chain,
                                                           input_collection)
    ############## Do the actual benchmarking ##############
    self._log("Start benchmarking run %s of node_chain %s on dataset %s"
              % (self.run,
                 self.node_chain_spec,
                 self.rel_dataset_dir))
    # Do the actual benchmarking for this collection/node_chain combination
    try:
        result_collection = \
            self.node_chain.benchmark(input_collection=input_collection,
                                      run=self.run,
                                      persistency_directory=self.persistency_dir,
                                      store_node_chain=self.store_node_chain)
    except Exception, exception:
        # Send Exception to Logger
        import traceback
        print traceback.format_exc()
        self._log(traceback.format_exc(), level=logging.ERROR)
        raise
Example 7: _copy_pickle_file
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def _copy_pickle_file(self, source_collection_path, target_collection_path,
                      train_set_name_suffix):
    source_collection = BaseDataset.load(source_collection_path)
    # if only test data was given, the "Rest_vs" collection is stored as
    # training data
    if self.reverse and "test" == train_set_name_suffix:
        # exchange the "test" in key tuple to "train" before storing
        for key in source_collection.data.keys():
            assert("test" == key[2])
            value = source_collection.data.pop(key)
            key = (key[0], key[1], "train")
            source_collection.data[key] = value
    source_collection.store(target_collection_path)
Example 8: create
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
    """ A factory method that creates an Analysis operation based on the
        information given in the operation specification operation_spec
    """
    assert(operation_spec["type"] == "analysis")
    input_path = operation_spec["input_path"]
    summary = BaseDataset.load(os.path.join(pySPACE.configuration.storage,
                                            input_path))
    data_dict = summary.data
    # Determine the parameters that should be analyzed
    parameters = operation_spec["parameters"]
    # Determine the metrics that should be plotted
    metrics = operation_spec["metrics"]
    # Determine how many processes will be created
    number_parameter_values = [len(set(data_dict[param])) for param in parameters]
    number_processes = cls._numberOfProcesses(0, number_parameter_values) + 1
    if debug == True:
        # To better debug the creation of processes we don't limit the queue
        # and create all processes before executing them
        processes = processing.Queue()
        cls._createProcesses(processes, result_directory, data_dict, parameters,
                             metrics, True)
        return cls(processes, operation_spec, result_directory, number_processes)
    else:
        # Create all plot processes by calling a recursive helper method in
        # another thread so that already created processes can be executed
        # although creation of processes is not finished yet. Therefore a
        # queue is used whose size is limited to guarantee that not too many
        # objects are created (since this costs memory). However, the actual
        # number of 100 is arbitrary and might be changed according to the
        # system at hand.
        processes = processing.Queue(100)
        create_process = processing.Process(target=cls._createProcesses,
                                            args=(processes, result_directory,
                                                  data_dict, parameters,
                                                  metrics, True))
        create_process.start()
        # create and return the operation object
        return cls(processes, operation_spec, result_directory,
                   number_processes, create_process)
Example 9: test_time_series_storing
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def test_time_series_storing(self):
    if os.path.exists('tmp') is False:
        os.makedirs('tmp')
    source = SimpleTimeSeriesSourceNode()
    sink = TimeSeriesSinkNode()
    sink.register_input_node(source)
    sink.set_run_number(0)
    sink.process_current_split()
    result_collection = sink.get_result_dataset()
    result_collection.store('tmp')
    #sink.store_results("test_time_series_storing.tmp")
    reloaded_collection = BaseDataset.load('tmp')
    reloader = TimeSeriesSourceNode()
    reloader.set_input_dataset(reloaded_collection)
    #set_permanent_attributes(time_series_file = "test_time_series_storing.tmp")
    orig_data = list(source.request_data_for_testing())
    restored_data = list(reloader.request_data_for_testing())
    # Check that the two lists have the same length
    self.assertEqual(len(orig_data), len(restored_data),
                     "Numbers of time series before storing and after reloading are not equal!")
    # Check that there is a one-to-one correspondence
    for orig_datapoint, orig_label in orig_data:
        found = False
        for restored_datapoint, restored_label in restored_data:
            found |= (orig_datapoint.view(numpy.ndarray) == restored_datapoint.view(numpy.ndarray)).all() \
                     and (orig_label == restored_label)
            if found:
                break
        self.assert_(found,
                     "One of the original time series cannot be found after reloading")
    shutil.rmtree('tmp')  # Cleaning up...
Example 10: create
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
    """
    A factory method that creates a statistic operation based on the
    information given in the operation specification operation_spec.
    If debug is TRUE the creation of the statistic processes will not
    run in a separate thread.
    """
    assert(operation_spec["type"] == "statistic")
    input_path = operation_spec["input_path"]
    tabular = BaseDataset.load(os.path.join(pySPACE.configuration.storage, input_path)).data
    if operation_spec.has_key("filter"):
        conditions = csv_analysis.empty_dict(tabular)
        for key, l in operation_spec["filter"].items():
            conditions[key].extend(l)
        tabular = csv_analysis.strip_dict(tabular, conditions)
    metric = operation_spec.get("metric", "Balanced_accuracy")
    parameter = operation_spec.get("parameter", "__Dataset__")
    rel_par = operation_spec.get("related_parameters",
                                 ["__Dataset__", "Key_Run", "Key_Fold"])
    average = operation_spec.get("average", None)
    if average in rel_par:
        rel_par.remove(average)
    if metric in rel_par:
        rel_par.remove(metric)
    if parameter in rel_par:
        rel_par.remove(parameter)
    reduced_tabular = cls.reduce_tabular(tabular, rel_par, metric, parameter, average)
    number_processes = 1
    processes = processing.Queue()
    cls._createProcesses(processes, result_directory, reduced_tabular)
    import shutil
    shutil.copy2(os.path.join(pySPACE.configuration.storage, input_path, "results.csv"),
                 os.path.join(result_directory, "results.csv"))
    shutil.copy2(os.path.join(pySPACE.configuration.storage, input_path, "metadata.yaml"),
                 os.path.join(result_directory, "metadata.yaml"))
    # create and return the statistic operation object
    return cls(processes, operation_spec, result_directory, number_processes)
Example 11: __init__
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def __init__(self,
             dataset_dir,
             command_template,
             parametrization,
             cv_folds,
             run_number,
             operation_result_dir):
    super(WEKAClassificationProcess, self).__init__()
    # Load the abbreviations
    abbreviations_file = open(os.path.join(pySPACE.configuration.spec_dir,
                                           'operations/weka_templates',
                                           'abbreviations.yaml'), 'r')
    self.abbreviations = yaml.load(abbreviations_file)
    abbreviations_file.close()
    # Determine the directory in which the process' results
    # are stored
    self.result_directory = operation_result_dir
    # Create collection
    collection = BaseDataset.load(dataset_dir)
    # The parametrization that is independent of the collection type
    # and the specific weka command template that is executed
    self.params = {"collection_name": dataset_dir.strip(os.sep).split(os.sep)[-1],
                   "run_number": run_number,
                   "cv_folds": cv_folds,
                   "weka_class_path": pySPACE.configuration.weka_class_path,
                   "temp_results": self.result_directory,
                   "unique_id": WEKAClassificationProcess.unique_id}
    # Collection dependent parameters
    if not collection.meta_data["train_test"] \
            and collection.meta_data["splits"] == 1:
        raise NotImplementedError()
    else:
        # The pattern of the train and test files generated by cross-validation
        data_pattern = os.path.join(dataset_dir,
                                    collection.meta_data["data_pattern"])
        # One example arff file in which WEKA can look up relation name etc.
        sample_dataset = data_pattern.replace("_run", "_run0")\
                                     .replace("_sp_", "_sp0_")\
                                     .replace("_tt", "_train")
        self.params.update({"sample_dataset": sample_dataset,
                            "data_pattern": data_pattern})
    # Add custom parameters for the weka command template
    for parameter_name, parameter_value in parametrization.iteritems():
        self.params[parameter_name + "_abbr"] = parameter_value
        # Auto-expand abbreviations
        if parameter_value in self.abbreviations:
            parameter_value = self.abbreviations[parameter_value]
        elif parameter_name == 'classifier':
            import warnings
            warnings.warn("Did not find classifier abbreviation %s. "
                          "Expecting full name." % parameter_value)
        self.params[parameter_name] = parameter_value
    # Build the WEKA command by repeatedly replacing all placeholders in
    # the template
    while True:
        instantiated_template = command_template % self.params
        if instantiated_template == command_template:
            # All placeholders replaced
            self.weka_command = instantiated_template
            break
        else:
            # We have to continue since we have not converged yet
            command_template = instantiated_template
    self.handler_class = None
    WEKAClassificationProcess.unique_id += 1
Example 12: __call__
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def __call__(self):
    """ Executes this process on the respective modality """
    ############## Prepare benchmarking ##############
    super(MergeProcess, self).pre_benchmarking()
    # For all input collections
    for source_test_collection_path in self.input_collections:
        # Check that the input data is not split,
        # i.e. only a single test file is in the source directory
        source_files = glob.glob(os.sep.join([source_test_collection_path,
                                              "data_run0", "*test*"]))
        splitted = len(source_files) > 1
        assert(not splitted)
        source_file_name = str(source_files[-1])
        # check if train sets are also present
        train_data_present = len(glob.glob(os.sep.join(
            [source_test_collection_path, "data_run0", "*train*"]))) > 0
        # if training data is present -> use train and test sets separately
        if train_data_present:
            train_set_name_suffix = "train"
        else:
            train_set_name_suffix = "test"
        # We create the collection Rest_vs_Collection
        source_test_collection_name = \
            source_test_collection_path.split(os.sep)[-2]
        test_base_collection_name = \
            source_test_collection_name.strip("}{").split("}{")[0]
        if self.reverse:
            target_collection_name = source_test_collection_name.replace(
                test_base_collection_name,
                test_base_collection_name + "_vs_Rest")
            key = "train"
        else:
            target_collection_name = source_test_collection_name.replace(
                test_base_collection_name,
                "Rest_vs_" + test_base_collection_name)
            key = "test"
        target_collection_path = os.sep.join([self.result_directory,
                                              target_collection_name])
        # determine the parameter_settings of the test collection
        test_collection = BaseDataset.load(source_test_collection_path)
        target_collection_params = \
            test_collection.meta_data["parameter_setting"]
        target_collection_params["__INPUT_DATASET__"] = \
            {key: source_test_collection_name}
        if source_file_name.endswith("arff"):
            file_ending = "arff"
            # Copy arff file from input collection to target collection
            source_test_file_path = os.sep.join([source_test_collection_path,
                                                 "data_run0", "features_sp0" +
                                                 train_set_name_suffix + ".arff"])
            target_test_file_path = os.sep.join([target_collection_path,
                                                 "data_run0", "features_sp0_" + key + ".arff"])
        elif source_file_name.endswith("pickle"):
            file_ending = "pickle"
            source_test_file_path = source_test_collection_path
            target_test_file_path = target_collection_path
        else:
            raise NotImplementedError("File type not supported in "
                                      "MergeOperation")
        source_train_pathes = []
        for source_train_collection_path in self.input_collections:
            source_train_collection_name = \
                source_train_collection_path.split(os.sep)[-2]
            # We must not use data originating from the same input
            # collection both in train and test files
            if source_test_collection_name == source_train_collection_name:
                continue
            # Check that all constraints are fulfilled for this pair of
            # input collections
            if not all(eval(constraint_template %
                            {'source_train_collection_name': source_train_collection_name,
                             'source_test_collection_name': source_test_collection_name})
                       for constraint_template in self.collection_constraints):
                continue
            # check if all parameters are stored in the target path
            source_collection = \
                BaseDataset.load(source_train_collection_path)
            source_collection_params = \
                source_collection.meta_data["parameter_setting"]
            remaining_params = \
                [param for param in source_collection_params.items()
                 if param not in target_collection_params.items() and
                 param[0] not in ["__INPUT_DATASET__",
                                  "__RESULT_DIRECTORY__", "__OUTPUT_BUNDLE__",
                                  "__INPUT_COLLECTION__"]]  # for old data
            if remaining_params != []:
                for k, v in remaining_params:
                    target_collection_path += "{%s#%s}" % (k, str(v))
                    target_collection_params[k] = v
#......... part of the code omitted here .........
Example 13: _merge_pickle_files
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def _merge_pickle_files(self, target_dataset_path, source_dataset_pathes):
    """ Concatenate all datasets in source_dataset_pathes and store
        them in the target dataset"""
    # sort the dataset
    source_dataset_pathes.sort()
    # load a first dataset, in which the data of all other datasets is assembled
    target_dataset = BaseDataset.load(source_dataset_pathes[0])
    # Determine author and date
    try:
        author = getpass.getuser()
    except:
        author = "Unknown"
    date = time.strftime("%Y%m%d_%H_%M_%S")
    # Delete node_chain file name
    try:
        target_dataset.meta_data.pop("node_chain_file_name")
    except:
        pass
    # Update meta data and store it
    params = target_dataset.meta_data.pop("parameter_setting")
    params["__INPUT_DATASET__"] = \
        [s_c_p.split(os.sep)[-2] for s_c_p in source_dataset_pathes]
    params["__RESULT_DIRECTORY__"] = self.result_directory
    target_dataset.meta_data.update({"author": author,
                                     "date": date,
                                     "dataset_directory": target_dataset_path,
                                     "train_test": False,
                                     "parameter_setting": params,
                                     "changed_time": self.change_time,
                                     "input_dataset_name": source_dataset_pathes[0][len(
                                         pySPACE.configuration.storage):]
                                     })
    # Concatenate data of all other datasets to target dataset
    for source_dataset_path in source_dataset_pathes[1:]:
        source_dataset = BaseDataset.load(source_dataset_path)
        for run in source_dataset.get_run_numbers():
            for split in source_dataset.get_split_numbers():
                target_data = target_dataset.get_data(run, split, "test")
                if self.change_time:
                    # ensure sorted target_data
                    # TODO: encode this in meta data?
                    target_data.sort(key=lambda t: t[0].end_time)
                    last_end_time = target_data[-1][0].end_time
                for ts, l in target_data:
                    if ts.specs == None:
                        ts.specs = {"new_set": False}
                    elif ts.specs.has_key("new_set"):
                        break
                    else:
                        ts.specs["new_set"] = False
                data = source_dataset.get_data(run, split, "test")
                if self.change_time:
                    # ensure sorted target_data
                    # TODO: encode this in meta data?
                    data.sort(key=lambda t: t[0].end_time)
                # flag the first element of the concatenated data list
                for i, (ts, l) in enumerate(data):
                    if ts.specs == None:
                        ts.specs = {"new_set": i == 0}
                    else:
                        ts.specs["new_set"] = (i == 0)
                    if self.change_time:
                        ts.start_time = last_end_time + ts.start_time
                        ts.end_time = last_end_time + ts.end_time
                # actual data is stored in a list that has to be extended
                target_data.extend(data)
    target_dataset.store(target_dataset_path)
Example 14: prepare_training
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def prepare_training(self, training_files, potentials, operation):
    """ Prepares pySPACE live for training.

    Prepares everything for training of pySPACE live,
    i.e. creates flows based on the dataflow specs
    and configures them.
    """
    online_logger.info("Preparing Training")
    self.potentials = potentials
    self.operation = operation
    online_logger.info("Creating flows..")
    for key in self.potentials.keys():
        spec_base = self.potentials[key]["configuration"].spec_dir
        if self.operation == "train":
            self.potentials[key]["node_chain"] = os.path.join(spec_base, self.potentials[key]["node_chain"])
            online_logger.info("node_chain_spec:" + self.potentials[key]["node_chain"])
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            if self.potentials[key].has_key("stream") and self.potentials[key]["stream"] == True:
                self.potentials[key]["prewindowing_flow"] = os.path.join(spec_base, self.potentials[key]["stream_prewindowing_flow"])
            else:
                self.potentials[key]["prewindowing_flow"] = os.path.join(spec_base, self.potentials[key]["prewindowing_flow"])
            online_logger.info("prewindowing_dataflow_spec: " + self.potentials[key]["prewindowing_flow"])
        elif self.operation == "prewindowed_train":
            if self.potentials[key].has_key("stream") and self.potentials[key]["stream"] == True:
                self.potentials[key]["postprocess_flow"] = os.path.join(spec_base, self.potentials[key]["stream_postprocess_flow"])
            else:
                self.potentials[key]["postprocess_flow"] = os.path.join(spec_base, self.potentials[key]["postprocess_flow"])
            online_logger.info("postprocessing_dataflow_spec: " + self.potentials[key]["postprocess_flow"])
        self.training_active_potential[key] = multiprocessing.Value("b", False)
    online_logger.info("Path variables set for NodeChains")
    # check if multiple potentials are given for training
    if isinstance(training_files, list):
        self.training_data = training_files
    else:
        self.training_data = [training_files]
    # Training is done in separate processes; we send the time series
    # windows to these processes via queues
    online_logger.info("Initializing Queues")
    for key in self.potentials.keys():
        self.queue[key] = multiprocessing.Queue()

    def flow_generator(key):
        """create a generator to yield all the abri flow windows"""
        # Yield all windows until a None item is found in the queue
        while True:
            window = self.queue[key].get(block=True, timeout=None)
            if window == None:
                break
            yield window

    # Create the actual data flows
    for key in self.potentials.keys():
        if self.operation == "train":
            self.node_chains[key] = NodeChainFactory.flow_from_yaml(Flow_Class=NodeChain,
                                                                    flow_spec=file(self.potentials[key]["node_chain"]))
            self.node_chains[key][0].set_generator(flow_generator(key))
            flow = open(self.potentials[key]["node_chain"])
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            online_logger.info("loading prewindowing flow..")
            online_logger.info("file: " + str(self.potentials[key]["prewindowing_flow"]))
            self.node_chains[key] = NodeChainFactory.flow_from_yaml(Flow_Class=NodeChain,
                                                                    flow_spec=file(self.potentials[key]["prewindowing_flow"]))
            self.node_chains[key][0].set_generator(flow_generator(key))
            flow = open(self.potentials[key]["prewindowing_flow"])
        elif self.operation == "prewindowed_train":
            if self.potentials[key].has_key("stream") and self.potentials[key]["stream"] == True:
                self.node_chains[key] = NodeChainFactory.flow_from_yaml(Flow_Class=NodeChain,
                                                                        flow_spec=file(self.potentials[key]["postprocess_flow"]))
                # create windower
                online_logger.info("Creating Windower")
                online_logger.info(self.potentials[key]["windower_spec_path_train"])
                self.node_chains[key][0].set_windower_spec_file(os.path.join(spec_base, "node_chains", "windower", self.potentials[key]["windower_spec_path_train"]))
                replace_start_and_end_markers = True
            else:
                self.node_chains[key] = NodeChainFactory.flow_from_yaml(Flow_Class=NodeChain, flow_spec=file(self.potentials[key]["postprocess_flow"]))
                replace_start_and_end_markers = False
            final_collection = TimeSeriesDataset()
            final_collection_path = os.path.join(self.prewindowed_data_directory, key, "all_train_data")
            # delete previous training collection
            if os.path.exists(final_collection_path):
                online_logger.info("deleting old training data collection for " + key)
                shutil.rmtree(final_collection_path)
            # load all prewindowed collections and
            # append data to the final collection
            prewindowed_sets = \
                glob.glob(os.path.join(self.prewindowed_data_directory, key, "*"))
            if len(prewindowed_sets) == 0:
                online_logger.error("Couldn't find data, please do prewindowing first!")
                raise Exception
#......... part of the code omitted here .........
Example 15: create
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
    """
    A factory method that creates an Analysis operation based on the
    information given in the operation specification operation_spec.
    If debug is TRUE the creation of the Analysis processes will not
    run in a separate thread.
    """
    assert(operation_spec["type"] == "comp_analysis")
    input_path = operation_spec["input_path"]
    summary = BaseDataset.load(os.path.join(pySPACE.configuration.storage,
                                            input_path))
    data_dict = summary.data
    ## Done
    # Determine the parameters that should be analyzed
    parameters = operation_spec["parameters"]
    # Determine dependent parameters, which don't get extra resolution
    try:
        dep_par = operation_spec["dep_par"]
    except KeyError:
        dep_par = []
    # Determine the metrics that should be plotted
    spec_metrics = operation_spec["metrics"]
    metrics = []
    for metric in spec_metrics:
        if data_dict.has_key(metric):
            metrics.append(metric)
        else:
            import warnings
            warnings.warn('The metric "' + metric + '" is not contained in the results csv file.')
    if len(metrics) == 0:
        warnings.warn('No metric available from spec file, defaulting to the first dict entry.')
        metrics.append(data_dict.keys()[0])
    # Determine how many processes will be created
    number_parameter_values = [len(set(data_dict[param])) for param in parameters]
    number_processes = cls._numberOfProcesses(0, number_parameter_values) + 1
    logscale = False
    if operation_spec.has_key('logscale'):
        logscale = operation_spec['logscale']
    markertype = 'x'
    if operation_spec.has_key('markertype'):
        markertype = operation_spec['markertype']
    if debug == True:
        # To better debug the creation of processes we don't limit the queue
        # and create all processes before executing them
        processes = processing.Queue()
        cls._createProcesses(processes, result_directory, data_dict, parameters,
                             dep_par, metrics, logscale, markertype, True)
        return cls(processes, operation_spec, result_directory, number_processes)
    else:
        # Create all plot processes by calling a recursive helper method in
        # another thread so that already created processes can be executed
        # although creation of processes is not finished yet. Therefore a
        # queue is used whose size is limited to guarantee that not too many
        # objects are created (since this costs memory). However, the actual
        # number of 100 is arbitrary and might be reviewed.
        processes = processing.Queue(100)
        create_process = processing.Process(target=cls._createProcesses,
                                            args=(processes, result_directory,
                                                  data_dict, parameters, dep_par,
                                                  metrics, logscale, markertype,
                                                  True))
        create_process.start()
        # create and return the comp_analysis operation object
        return cls(processes, operation_spec, result_directory,
                   number_processes, create_process)