当前位置: 首页>>代码示例>>Python>>正文


Python BaseDataset.load_meta_data方法代码示例

本文整理汇总了 Python 中 pySPACE.resources.dataset_defs.base.BaseDataset.load_meta_data 方法的典型用法代码示例。如果您正苦于以下问题:Python BaseDataset.load_meta_data 方法的具体用法是什么?该方法怎么用?有哪些使用示例?那么这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在的类 pySPACE.resources.dataset_defs.base.BaseDataset 的用法示例。


下文一共展示了 BaseDataset.load_meta_data 方法的 9 个代码示例,这些示例默认按受欢迎程度排序。您可以为喜欢或觉得有用的代码点赞,您的评价将有助于系统推荐出更好的 Python 代码示例。

示例1: _get_result_dataset_dir

# 需要导入模块: from pySPACE.resources.dataset_defs.base import BaseDataset [as 别名]
# 或者: from pySPACE.resources.dataset_defs.base.BaseDataset import load_meta_data [as 别名]
    def _get_result_dataset_dir(base_dir, input_dataset_dir,
                                parameter_setting, hide_parameters):
        """ Build, create and return the result directory for one dataset

        The directory name has the form ``{<input name>}{<key>#<value>}...``:
        the name of the original input dataset followed by one
        ``key#value`` entry for every parameter that is not hidden.

        **Parameters**

            :base_dir: Directory below which the result directory is placed.
            :input_dataset_dir: Path of the input dataset, relative to
                ``pySPACE.configuration.storage``.
            :parameter_setting: Mapping of parameter names to values. It is
                deep-copied first, so the caller's object is left untouched.
            :hide_parameters: Container of parameter names that must not
                appear in the directory name.

        Returns the path of the result directory (``create_directory`` has
        been called on it).
        """
        # The last path component names the input dataset. If the input is
        # already the result of an operation ("{name}{key#value}..."), only
        # the leading, original name is kept.
        last_component = input_dataset_dir.strip(os.sep).split(os.sep)[-1]
        input_name = last_component.strip("{}").split("}{")[0]

        # Meta data stored alongside the input dataset
        dataset_md = BaseDataset.load_meta_data(
            os.sep.join([pySPACE.configuration.storage, input_dataset_dir]))

        # Work on a private copy so that later runs are not affected by the
        # modifications below
        parameter_setting = copy.deepcopy(parameter_setting)

        # The pseudo parameter "__PREPARE_OPERATION__" is ignored
        parameter_setting.pop("__PREPARE_OPERATION__", None)

        # Merge in the parameters recorded in the input's meta data; on a
        # name clash the meta data entry replaces the given one
        if "parameter_setting" in dataset_md:
            parameter_setting.update(dataset_md["parameter_setting"])

        # String values containing at least two ' characters are evaluated
        # to strip the quotes, since Weka ignores them.
        # NOTE(review): eval() runs arbitrary expressions and raises on
        # strings that are not valid Python; only safe as long as the
        # operation specs and dataset meta data are trusted.
        for key, value in parameter_setting.iteritems():
            if not isinstance(value, basestring):
                continue
            if value.count("'") > 1:
                parameter_setting[key] = eval(value)

        # Key and value are separated by "#" instead of ":" because of
        # problems on Windows and with Windows file servers
        shown = ["%s#%s" % (key, value)
                 for key, value in parameter_setting.iteritems()
                 if key not in hide_parameters]

        # "{input}" followed by "{key#value}" for every visible parameter
        result_name = "".join("{%s}" % part for part in [input_name] + shown)

        # Determine the path where this result will be stored and create
        # the directory if necessary
        result_dir = base_dir + os.sep + result_name
        create_directory(result_dir)

        return result_dir
开发者ID:AlexanderFabisch,项目名称:pyspace,代码行数:58,代码来源:node_chain.py

示例2: consolidate

# 需要导入模块: from pySPACE.resources.dataset_defs.base import BaseDataset [as 别名]
# 或者: from pySPACE.resources.dataset_defs.base.BaseDataset import load_meta_data [as 别名]
    def consolidate(self):
        """ Consolidates the results obtained by the single processes into a consistent structure
        of collections that are stored on the file system.
        """
        # Consolidate the results
        directory_pattern = os.sep.join([self.result_directory, "{*",])
        dataset_pathes = glob.glob(directory_pattern)

        # For all collections found
        for dataset_path in dataset_pathes:
            # Load their meta_data
            meta_data = BaseDataset.load_meta_data(dataset_path)

            # Determine author and date
            try:
                author = pwd.getpwuid(os.getuid())[4]
            except:
                author = "unknown"
                self._log("Author could not be resolved.",level=logging.WARNING)
            date = time.strftime("%Y%m%d_%H_%M_%S")

            # Update meta data and store it
            meta_data.update({"author" : author, "date" : date})
            BaseDataset.store_meta_data(dataset_path, meta_data)

            # Copy the input dataset specification file to the result
            # directory in order to make later analysis of
            # the results more easy
            input_meta_path = os.sep.join([pySPACE.configuration.storage,
                                          meta_data["input_collection_name"]])
            input_meta = BaseDataset.load_meta_data(input_meta_path)
            BaseDataset.store_meta_data(dataset_path,input_meta,
                                        file_name="input_metadata.yaml")
        # Check if some results consist of several runs
        # and update the meta data in this case
        # TODO: This is not a clean solution
        for dataset_dir in glob.glob(os.sep.join([self.result_directory,
                                                     "*"])):
            if not os.path.isdir(dataset_dir): continue
            # There can be either run dirs, persistency dirs, or both of them.
            # Check of whichever there are more. If both exist, their numbers
            # are supposed to be equal.
            nr_run_dirs = len(glob.glob(os.sep.join([dataset_dir,
                                              "data_run*"])))
            nr_per_dirs = len(glob.glob(os.sep.join([dataset_dir,
                                              "persistency_run*"])))
            nr_runs = max(nr_run_dirs, nr_per_dirs)

            if nr_runs > 1:
                collection_meta = BaseDataset.load_meta_data(dataset_dir)
                collection_meta["runs"] = nr_runs
                BaseDataset.store_meta_data(dataset_dir,collection_meta)
        # If we don't create a feature vector or time series collection,
        # we evaluated our classification using a classification performance sink.
        # The resulting files should be merged to one csv tabular.
        pathlist = glob.glob(os.path.join(self.result_directory,"results_*"))
        if len(pathlist)>0:
            # Do the consolidation the same way as for WekaClassificationOperation
            self._log("Consolidating results ...")
            # We load and store the results once into a PerformanceResultSummary
            # This does the necessary consolidation...
            self._log("Reading intermediate results...")
            result_collection = PerformanceResultSummary(dataset_dir=self.result_directory)
            self._log("done")
            self._log("Storing result collection")
            result_collection.store(self.result_directory)
            self._log("done")
            PerformanceResultSummary.merge_traces(self.result_directory)

            if not(self.compression == False):
                # Since we get one result summary,
                # we don't need the numerous folders.
                # So we zip them to make the whole folder more easy visible.
                import zipfile
                cwd=os.getcwd()
                os.chdir(self.result_directory)
                # If there are too many or too large folders, problems may occur.
                # In this case we want to log it, try 64 bit mode, and then skip the zipping.
                try:
                    pathlist = glob.glob(os.path.join(self.result_directory,"{*}"))
                    
                    if not self.compression == "delete":                        
                        save_file=zipfile.ZipFile(self.result_directory+'/result_folders.zip',mode="w",compression=self.compression)
                        # we want to have the zipped file relative to the result directory
                        for path in pathlist:
                            for node in os.walk(path):
                                rel_path=os.path.relpath(node[0],self.result_directory)
                                save_file.write(rel_path)
                                for data in node[2]:
                                    save_file.write(os.path.join(rel_path,data))
                        save_file.close()
                    # To still have an easy access to the history of the processing,
                    # we keep one folder.
                    pathlist.pop()
                    for path in pathlist:
                        shutil.rmtree(path)
                except:
                    self._log("Result files could not be compressed with 32 bit mode, switching to 64 bit mode.", level=logging.CRITICAL)
                    # nearly total code copy, only difference with 64 bit mode
                    try:
#.........这里部分代码省略.........
开发者ID:AlexanderFabisch,项目名称:pyspace,代码行数:103,代码来源:node_chain.py

示例3: create

# 需要导入模块: from pySPACE.resources.dataset_defs.base import BaseDataset [as 别名]
# 或者: from pySPACE.resources.dataset_defs.base.BaseDataset import load_meta_data [as 别名]
    def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
        """ A factory method that creates the processes which form an operation
        based on the  information given in the operation specification, *operation_spec*.

        In debug mode this is done in serial. In the other default mode,
        at the moment 4 processes are created in parallel and can be immediately
        executed. So generation of processes and execution are made in parallel.
        This kind of process creation is done independently from the backend.

        For huge parameter spaces this is necessary!
        Otherwise numerous processes are created and corresponding data is loaded
        but the concept of spreading the computation to different processors
        can not really be used, because process creation is blocking only
        one processor and memory space, but nothing more is done,
        till the processes are all created.

        .. todo:: Use :class:`~pySPACE.resources.dataset_defs.dummy.DummyDataset`
                  for empty data, when no input_path is given.
        """
        assert(operation_spec["type"] == "node_chain")

        # Determine all parameter combinations that should be tested
        parameter_settings = cls._get_parameter_space(operation_spec)

        ## Use node_chain parameter if no templates are given ##
        if not operation_spec.has_key("templates"):
            if operation_spec.has_key("node_chain"):
                operation_spec["templates"]=[operation_spec.pop("node_chain")]
            else:
                warnings.warn("Specify parameter 'templates' or 'node_chain' in your operation spec!")
                operation_spec["templates"]=[operation_spec.pop("flow")]
        elif operation_spec.has_key("node_chain"):
            operation_spec.pop("node_chain")
            warnings.warn("node_chain parameter is ignored. Templates are used.")
        elif type(operation_spec["templates"][0])==str: # load files in templates
            operation_spec["template_files"]=copy.deepcopy(operation_spec["templates"])
            for i in range(len(operation_spec["templates"])):
                rel_node_chain_file = operation_spec["templates"][i]
                abs_node_chain_file = open(os.sep.join([pySPACE.configuration.spec_dir,
                                                     "node_chains",
                                                     rel_node_chain_file]), 'r')
                node_chain = yaml.load(abs_node_chain_file)
                abs_node_chain_file.close()
                operation_spec["templates"][i] = node_chain


        storage = pySPACE.configuration.storage
        if not input_paths :
            raise Exception("No input datasets found in input_path %s in %s!"
                            % (operation_spec["input_path"],storage))

        # Get relative path
        rel_input_paths = [name[len(storage):]
                                for name in  input_paths]

        # Determine approximate number of runs
        if "runs" in operation_spec:
            runs = operation_spec["runs"]
        else:
            runs = []
            for dataset_dir in rel_input_paths:
                abs_collection_path = \
                        pySPACE.configuration.storage + os.sep \
                            + dataset_dir
                collection_runs = \
                        BaseDataset.load_meta_data(abs_collection_path).get('runs',1)
                runs.append(collection_runs)
            runs = max(runs)

        # Determine splits
        dataset_dir = rel_input_paths[0]
        abs_collection_path = \
                pySPACE.configuration.storage + os.sep + dataset_dir

        splits = BaseDataset.load_meta_data(abs_collection_path).get('splits', 1)

        # Determine how many processes will be created
        number_processes = len(operation_spec["templates"]) * \
                           len(parameter_settings) * len(rel_input_paths) * \
                           runs * splits

        if debug == True:
            # To better debug creation of processes we don't limit the queue
            # and create all processes before executing them
            processes = processing.Queue()
            cls._createProcesses(processes, result_directory, operation_spec,
                                 parameter_settings, rel_input_paths)
            # create and return the operation object
            return cls(processes, operation_spec, result_directory,
                       number_processes)
        else:
            # Create all processes by calling a recursive helper method in
            # a separate process so that already created processes can be
            # executed in parallel. A queue with a bounded size is used to
            # guarantee that not too many objects are created (because this
            # costs memory). However, the actual limit of 4 is arbitrary and
            # might be changed according to the system at hand.
            processes = processing.Queue(4)
            create_process = \
                    processing.Process(target=cls._createProcesses,
#.........这里部分代码省略.........
开发者ID:AlexanderFabisch,项目名称:pyspace,代码行数:103,代码来源:node_chain.py

示例4: _createProcesses

# 需要导入模块: from pySPACE.resources.dataset_defs.base import BaseDataset [as 别名]
# 或者: from pySPACE.resources.dataset_defs.base.BaseDataset import load_meta_data [as 别名]
    def _createProcesses(cls, processes, result_directory, operation_spec,
                         parameter_settings, input_collections):

        storage_format = operation_spec["storage_format"] if "storage_format" \
                         in operation_spec else None

        # Determine whether the node_chain should be stored after data processing
        store_node_chain = operation_spec["store_node_chain"] \
                         if "store_node_chain" in operation_spec else False

        # Determine whether certain parameters should not be remembered
        hide_parameters = [] if "hide_parameters" not in operation_spec \
                                else list(operation_spec["hide_parameters"])
        hide_parameters.append("__INPUT_COLLECTION__")
        hide_parameters.append("__INPUT_DATASET__")
        hide_parameters.append("__RESULT_DIRECTORY__")
        hide_parameters.append("__OUTPUT_BUNDLE__")

        # Create all combinations of collections, runs and splits
        collection_run_split_combinations = []
        for input_dataset_dir in input_collections:
            # Determine number of runs to be conducted for this collection
            abs_collection_path = \
                pySPACE.configuration.storage + os.sep \
                    + input_dataset_dir
            collection_runs = \
                BaseDataset.load_meta_data(abs_collection_path).get('runs', 1)
                # D.get(k[,d]) -> D[k] if k in D, else d.

            if "runs" not in operation_spec:
                requested_runs  = collection_runs
            else:
                requested_runs = operation_spec["runs"]

            assert collection_runs == requested_runs \
                        or collection_runs ==  1, \
                    "Requested %s runs but input collection %s provides "\
                    "data for %s runs." % (requested_runs, input_dataset_dir,
                                           collection_runs)

            for run in range(max(requested_runs, collection_runs)):
                collection_splits = \
                    BaseDataset.load_meta_data(abs_collection_path).get('splits', 1)
                for split in range(collection_splits):
                    collection_run_split_combinations.append((input_dataset_dir, run, split))

        # Shuffle order of dataset-run-split combinations. This should help to
        # avoid that all processes work on the same data which can cause
        # problems due to locking etc.
        random.shuffle(collection_run_split_combinations)

        # For all templates
        for node_chain_spec in operation_spec["templates"]:
            # For all possible parameter instantiations of this template
            for parameter_setting in parameter_settings:
                # For all input collections-run combinations
                for input_dataset_dir, run, split in collection_run_split_combinations:
                    # We are going to change the parameter_setting and don't want to
                    # interfere with later runs so we work on a copy
                    parameter_setting_cp = copy.deepcopy(parameter_setting)

                    # Add input and output path to parameter
                    # setting
                    parameter_setting_cp["__INPUT_DATASET__"] = \
                            input_dataset_dir.split(os.sep)[-2]
                    parameter_setting_cp["__RESULT_DIRECTORY__"] = \
                            result_directory
                    if len(operation_spec["templates"])>1:
                        index = operation_spec["templates"].index(node_chain_spec)
                        parameter_setting_cp["__Template__"]=\
                            operation_spec["template_files"][index]

                    # Load the input meta data
                    dataset_dir = os.sep.join([pySPACE.configuration.storage,
                                               input_dataset_dir])
                    dataset_md = BaseDataset.load_meta_data(dataset_dir)
                    # Add the input parameters meta data to the given parameter setting
                    if "parameter_setting" in dataset_md:
                        dataset_md["parameter_setting"].update(parameter_setting_cp)
                        all_parameters = dataset_md["parameter_setting"]
                    else:
                        all_parameters = parameter_setting_cp

                    def check_constraint(constraint, parameters):
                        for key, value in parameters.iteritems():
                            constraint = constraint.replace(key, str(value))
                        return eval(constraint)

                    if not all(check_constraint(constraint_def,
                                                all_parameters) for \
                               constraint_def in \
                               operation_spec.get('old_parameter_constraints',[])):
                        continue

                    # Determine directory in which the result of this
                    # process should be written
                    result_dataset_directory = \
                        NodeChainOperation._get_result_dataset_dir(result_directory,
                                                                 input_dataset_dir,
                                                                 parameter_setting_cp,
#.........这里部分代码省略.........
开发者ID:AlexanderFabisch,项目名称:pyspace,代码行数:103,代码来源:node_chain.py

示例5: __call__

# 需要导入模块: from pySPACE.resources.dataset_defs.base import BaseDataset [as 别名]
# 或者: from pySPACE.resources.dataset_defs.base.BaseDataset import load_meta_data [as 别名]

#.........这里部分代码省略.........
             
                # Determine names of the original data sets the input 
                # datasets are based on
                base_dataset1 = dataset_name1.strip("}{").split("}{")[0]
                base_dataset2 = dataset_name2.strip("}{").split("}{")[0]
                
                # Determine target dataset name and create directory
                # for it
                mixed_base_dataset = "%s_vs_%s" % (base_dataset1, 
                                                      base_dataset2)
                target_dataset_name = dataset_name1.replace(base_dataset1,
                                                                  mixed_base_dataset)
                
                target_dataset_dir = os.sep.join([self.result_directory,
                                                     target_dataset_name])
                
                create_directory(os.sep.join([target_dataset_dir, "data_run0"]))
                
                if splitted:
                    # For each split, copy the train data from dataset 1 and
                    # the test data from dataset 2 to the target dataset
                    for source_train_file_name in glob.glob(os.sep.join([dataset_dir1,
                                                                       "data_run0",
                                                                       "*_sp*_train.*"])):
                        # TODO: We have $n$ train sets and $n$ test sets, we
                        #       could use all $n*n$ combinations
                        target_train_file_name = source_train_file_name.replace(dataset_dir1,
                                                                                target_dataset_dir)
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_train_file_name, 
                                                 target_train_file_name,
                                                 base_dataset1,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_train_file_name, 
                                       target_train_file_name)
                        
                        source_test_file_name = source_train_file_name.replace(dataset_dir1,
                                                                               dataset_dir2)
                        
                        source_test_file_name =  source_test_file_name.replace("train.",
                                                                                "test.")
                        target_test_file_name = target_train_file_name.replace("train.",
                                                                                "test.")
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_test_file_name, 
                                                 target_test_file_name,
                                                 base_dataset2,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_test_file_name,
                                       target_test_file_name)
                else:
                    # Use the data set from dataset 1 as training set and 
                    # the data set from dataset 2 as test data
                    for source_train_file_name in glob.glob(os.sep.join([dataset_dir1,
                                                                         "data_run0",
                                                                         "*_sp*_test.*"])):
                        target_train_file_name = source_train_file_name.replace("test.",
                                                                                "train.")
                        target_train_file_name = target_train_file_name.replace(dataset_dir1,
                                                                                target_dataset_dir)
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_train_file_name, 
                                                 target_train_file_name,
                                                 base_dataset1,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_train_file_name, 
                                       target_train_file_name)
                        
                        source_test_file_name = source_train_file_name.replace(dataset_dir1,
                                                                               dataset_dir2)
                        
                        target_test_file_name = target_train_file_name.replace("train.",
                                                                                "test.")
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_test_file_name, 
                                                 target_test_file_name,
                                                 base_dataset2,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_test_file_name,
                                       target_test_file_name)
                # Write metadata.yaml based on input meta data
                input_dataset1_meta = BaseDataset.load_meta_data(dataset_dir1)

                output_dataset_meta = dict(input_dataset1_meta)
                output_dataset_meta['train_test'] = True
                output_dataset_meta['date'] = time.strftime("%Y%m%d_%H_%M_%S")
                try:
                    output_dataset_meta['author'] = pwd.getpwuid(os.getuid())[4]
                except :
                    self._log("Author could not be resolved.",level=logging.WARNING)
                    output_dataset_meta['author'] = "unknown"
                BaseDataset.store_meta_data(target_dataset_dir,output_dataset_meta)
        
        ############## Clean up after benchmarking ##############
        super(ShuffleProcess, self).post_benchmarking()
开发者ID:AlexanderFabisch,项目名称:pyspace,代码行数:104,代码来源:shuffle.py

示例6: __call__

# 需要导入模块: from pySPACE.resources.dataset_defs.base import BaseDataset [as 别名]
# 或者: from pySPACE.resources.dataset_defs.base.BaseDataset import load_meta_data [as 别名]

#.........这里部分代码省略.........
                target_test_file_path = os.sep.join([target_collection_path,
                                       "data_run0","features_sp0_"+key+".arff"])
            
            elif source_file_name.endswith("pickle"):
                file_ending = "pickle"
                source_test_file_path = source_test_collection_path
                target_test_file_path = target_collection_path
            else:
                raise NotImplementedError("File type not supported in " \
                                                               "MergeOperation")
            
            source_train_pathes = []
            for source_train_collection_path in self.input_collections:
                source_train_collection_name = \
                                  source_train_collection_path.split(os.sep)[-2]
                # We must not use data originating from the same input
                # collection both in train and test files
                if source_test_collection_name == source_train_collection_name:
                    continue
                
                # Check that all constraints are fulfilled for this pair of
                # input collections
                if not all(eval(constraint_template % \
                  {'source_train_collection_name': source_train_collection_name,
                   'source_test_collection_name': source_test_collection_name})
                        for constraint_template in self.collection_constraints):
                    continue
                
                # check if all parameters are stored in the target path
                source_collection = \
                                BaseDataset.load(source_train_collection_path)
                source_collection_params = \
                            source_collection.meta_data["parameter_setting"]
                remaining_params = \
                          [param for param in source_collection_params.items() \
                            if param not in target_collection_params.items() and \
                               param[0] not in ["__INPUT_DATASET__", 
                               "__RESULT_DIRECTORY__", "__OUTPUT_BUNDLE__",
                               "__INPUT_COLLECTION__" ]] # for old data
                if remaining_params != []:
                    for k,v in remaining_params:
                         target_collection_path += "{%s#%s}" % (k,str(v))
                         target_collection_params[k]=v
                   
                if "arff" == file_ending:
                    source_train_file_path = \
                                      os.sep.join([source_train_collection_path, 
                                                "data_run0", "features_sp0_" + \
                                               train_set_name_suffix + ".arff"])
                elif "pickle" == file_ending:
                    source_train_file_path = source_train_collection_path

                else:
                    raise NotImplementedError("File type not supported in " \
                                                              "MergeOperation!")     
                    
                source_train_pathes.append(source_train_file_path)
            
            if "arff" == file_ending:
                target_train_file_path = os.sep.join([target_collection_path,
                                       "data_run0","features_sp0_"+key+".arff"])
            elif "pickle" == file_ending:
                target_train_file_path = target_collection_path
            else:
                raise NotImplementedError("File type not supported in "
                                                              "MergeOperation!")     
            
            if len(source_train_pathes) == 0:
                continue
            
            create_directory(os.sep.join([target_collection_path,
                                          "data_run0"]))
            
            if "arff" == file_ending:
                self._copy_arff_file(source_test_file_path, 
                                     target_test_file_path,
                                     source_test_collection_name, 
                                     target_collection_name)
                                
                self._merge_arff_files(target_train_file_path, 
                                       source_train_pathes,
                                       target_collection_name)
                # Copy metadata.yaml
                # TODO: Adapt to new collection
                input_meta = BaseDataset.load_meta_data(source_test_collection_path)
                BaseDataset.store_meta_data(target_collection_path,input_meta)
            elif "pickle" == file_ending:
                self._copy_pickle_file(source_test_collection_path,
                                       target_collection_path,
                                       train_set_name_suffix)

                self._merge_pickle_files(target_train_file_path, 
                                         source_train_pathes, 
                                         train_set_name_suffix,
                                         target_collection_params)
            else:
                raise NotImplementedError("File type not supported in merge_operation")
            
        ############## Clean up after benchmarking ##############
        super(MergeProcess, self).post_benchmarking()
开发者ID:BioinformaticsArchive,项目名称:pyspace,代码行数:104,代码来源:merge.py

示例7: _get_result_dataset_dir

# 需要导入模块: from pySPACE.resources.dataset_defs.base import BaseDataset [as 别名]
# 或者: from pySPACE.resources.dataset_defs.base.BaseDataset import load_meta_data [as 别名]
    def _get_result_dataset_dir(base_dir, input_dataset_dir, parameter_setting, hide_parameters):
        """ Determine and create the result directory for one parameter setting

        The directory name has the form
        ``{input_name}{key1#value1}{key2#value2}`` where *input_name* is the
        last path component of *input_dataset_dir*, without surrounding braces
        and without the parameter parts added by a previous operation.

        If the resulting path is too long, the name is replaced by an md5
        hash. If the file system still rejects the name as too long, the
        parameter values are replaced by their hashes and the creation is
        tried again.

        **Parameters**

            :base_dir: Directory in which the result directory is created.
            :input_dataset_dir: Path of the input dataset, relative to
                ``pySPACE.configuration.storage``.
            :parameter_setting: Mapping from parameter name to value. It is
                deep-copied, so the caller's object is not modified. Entries
                of the ``parameter_setting`` stored in the meta data of the
                input dataset are added and override equally named keys.
            :hide_parameters: Parameter names that are left out of the
                directory name.

        **Returns** the path of the created result directory.

        Errors raised by the second ``create_directory`` attempt propagate
        to the caller.
        """
        import errno

        # Determine the result_directory name
        # String between key and value changed from ":" to "#",
        # because of problems in Windows and with Windows file servers
        def _get_result_dir_name(parameter_setting, hide_parameters, method=None):
            """ Build the result directory path

            With ``method="hash"`` every parameter value is replaced by the
            hash of its string representation (blanks removed).
            """
            if not method:
                parameter_parts = [
                    "%s#%s" % (key, value)
                    for key, value in parameter_setting.iteritems()
                    if key not in hide_parameters
                ]
            elif method == "hash":
                parameter_parts = [
                    "%s#%s" % (key, hash(str(value).replace(" ", "")))
                    for key, value in parameter_setting.iteritems()
                    if key not in hide_parameters
                ]
            else:
                # Previously this case failed later with an UnboundLocalError
                raise ValueError("Unknown naming method: %r" % (method,))

            parameter_str = "}{".join(parameter_parts)
            # Remove characters that are not wanted in a directory name
            for unwanted in ("'", " ", "[", "]", os.sep):
                parameter_str = parameter_str.replace(unwanted, "")

            # input_name is taken from the enclosing function; it is assigned
            # before this helper is called for the first time
            result_name = "{%s}" % input_name
            if parameter_str != "":
                result_name += "{%s}" % (parameter_str)

            # Determine the path where this result will be stored
            result_dir = base_dir + os.sep + result_name

            # The file name may be too long (longer than allowed, including
            # optional offsets for pyspace result csv naming conventions).
            # In that case an md5 hash of the result name is used instead.
            import platform

            if platform.system() == "Windows":
                # the maximum length for a filename on Windows is 255
                max_name_length = 255
            else:
                max_name_length = os.pathconf(os.curdir, "PC_NAME_MAX")

            if len(result_dir) > max_name_length - 32:
                result_name = "{" + hashlib.md5(result_name).hexdigest() + "}"
                result_dir = base_dir + os.sep + result_name
            return result_dir

        input_name = input_dataset_dir.strip(os.sep).split(os.sep)[-1]
        input_name = input_name.strip("{}")
        # If the input is already the result of an operation,
        # only the name of the original input is kept
        if input_name.count("}{") > 0:
            input_name = input_name.split("}{")[0]

        # Load the input meta data
        dataset_dir = os.sep.join([pySPACE.configuration.storage, input_dataset_dir])
        dataset_md = BaseDataset.load_meta_data(dataset_dir)

        # We are going to change the parameter_setting and don't want to
        # interfere with later runs so we work on a copy
        parameter_setting = copy.deepcopy(parameter_setting)

        # Ignore pseudo parameter "__PREPARE_OPERATION__"
        if "__PREPARE_OPERATION__" in parameter_setting:
            parameter_setting.pop("__PREPARE_OPERATION__")

        # Add the input parameters meta data to the given parameter setting
        if "parameter_setting" in dataset_md:
            parameter_setting.update(dataset_md["parameter_setting"])

        # We have to remove ' characters from the parameter value since
        # Weka does ignore them
        # NOTE(review): eval executes arbitrary code; this is only safe as
        # long as parameter values and dataset meta data are trusted.
        # Only values of existing keys are replaced, so the size of the
        # dictionary does not change during the iteration.
        for key, value in parameter_setting.iteritems():
            if isinstance(value, basestring) and value.count("'") > 1:
                parameter_setting[key] = eval(value)

        result_dir = _get_result_dir_name(parameter_setting, hide_parameters)
        try:
            create_directory(result_dir)
        except OSError as e:
            # errno.ENAMETOOLONG is 36 on Linux but differs on other
            # platforms (e.g. 63 on macOS), so the symbolic name is used
            if e.errno == errno.ENAMETOOLONG:
                # filename is too long
                result_dir = _get_result_dir_name(parameter_setting, hide_parameters, "hash")
            # For every other error the creation is simply tried once more
            create_directory(result_dir)

        return result_dir
开发者ID:Hansa064,项目名称:pyspace,代码行数:100,代码来源:node_chain.py

示例8: consolidate

# 需要导入模块: from pySPACE.resources.dataset_defs.base import BaseDataset [as 别名]
# 或者: from pySPACE.resources.dataset_defs.base.BaseDataset import load_meta_data [as 别名]
    def consolidate(self):
        """
        Consolidates the results obtained by the single WEKA filter
        processes into a consistent summary of datasets that is stored on
        the file system.

        For every sub-directory of ``self.result_directory`` whose name
        starts with "{" (a collection):

        - the relation name in the first line of every ``*train.arff`` file
          below ``data_run*`` gets the parameter part of the collection name
          appended, and the corresponding ``test.arff`` file gets the same
          relation line,
        - the meta data of the input collection is stored as
          ``input_metadata.yaml``,
        - the adjusted meta data (feature names, number of features, author,
          date, input collection name) is stored for the new collection,
        - ``self.command_template`` is written to the file
          ``command_template``.

        For other sub-directories a warning is logged. Finally
        ``self.operation_spec`` is dumped to ``source_operation.yaml`` in
        the result directory.

        .. todo:: Some of the contents of this method should go into the
                  :class:`~pySPACE.resources.dataset_defs.feature_vector.FeatureVectorDataset`
        """

        # Iterate over all collections and store the collection meta data etc.
        for entries in os.listdir(self.result_directory):
            fullpath = os.path.join(self.result_directory, entries)
            if not os.path.isdir(fullpath):
                continue

            if not entries.startswith("{"):
                # training and test arff need the same relation name
                # otherwise Weka can't relate it to each other; the collection
                # name and the parameters in {}{}-optic must be the relation
                # name for further processing
                self._log("WARNING: Collection name doesn't begin with '{'. Further processing may be collapse!", level= logging.WARNING)
                continue

            # Extract the parameters from the collection name in order to
            # adjust the relation name. The last num_parameters parts belong
            # to this operation, the parts before name the input collection.
            if self.num_parameters > 0:
                name_parts = entries.strip("}{").split("}{")
                parameter_postfix = \
                    "{" + "}{".join(name_parts[-self.num_parameters:]) + "}"
                input_collection_name = \
                    "{" + "}{".join(name_parts[:-self.num_parameters]) + "}"
            else:
                parameter_postfix = ""
                input_collection_name = entries

            # Reset for every collection, so that a collection without train
            # files neither fails with a NameError nor inherits the feature
            # names of the previously processed collection
            feature_names = []

            # Postprocessing of the arff files of this collection
            train_pattern = fullpath + os.sep + "data_run*" + os.sep + "*train.arff"
            for train_arff_file in glob.glob(train_pattern):
                # Adjust the relation name of the train file
                with open(train_arff_file, 'r') as train_file:
                    content = train_file.readlines()
                # We strip everything after the last "}" (this also removes
                # the line break) and close the quoted relation name again.
                # The line break has to be restored, because writelines does
                # not add one and the header would otherwise be joined with
                # the following line.
                endindex = content[0].rfind("}")
                content[0] = content[0][:endindex+1] + parameter_postfix + "'\n"
                with open(train_arff_file, 'w') as train_file:
                    train_file.writelines(content)

                # Use relation name of train data for test data
                test_arff_file = train_arff_file.replace("train.arff", "test.arff")
                with open(test_arff_file, 'r') as test_file:
                    test_content = test_file.readlines()
                test_content[0] = content[0]
                with open(test_arff_file, 'w') as test_file:
                    test_file.writelines(test_content)

                # Check which features are contained in the arff file.
                # The class attribute is no feature; the names have to be
                # compared by value ("!="), an identity check ("is not")
                # does not reliably exclude it.
                feature_names = []
                for line in content:
                    if line.startswith("@attribute"):
                        attribute = line.split()[1]
                        if attribute != "class":
                            feature_names.append(attribute)

            # Store the collection meta data etc.
            input_collection_path = os.path.join(self.operation_spec["input_path"],
                                                 input_collection_name)

            input_collection_meta = BaseDataset.load_meta_data(
                                    pySPACE.configuration.storage
                                    + os.sep
                                    + input_collection_path)
            # Store the input collection
            BaseDataset.store_meta_data(fullpath, input_collection_meta,
                                        file_name="input_metadata.yaml")
            # Adjust collection metadata for the new collection
            input_collection_meta["feature_names"] = feature_names
            input_collection_meta["num_features"] = len(feature_names)
            input_collection_meta["author"] = get_author()
            input_collection_meta["date"] = time.strftime("%Y%m%d")
            input_collection_meta["input_collection_name"] = input_collection_name
            # Write the collection meta information into the folder
            BaseDataset.store_meta_data(fullpath, input_collection_meta)
            # Store the command_template
            with open(os.path.join(fullpath, "command_template"), 'w') \
                    as command_template_file:
                command_template_file.write(self.command_template)

        # Write the specification of this operation
        # to the result directory in order to make later
        # analysis of results easier
        with open(os.path.join(self.result_directory,
                               "source_operation.yaml"), 'w') \
                as source_operation_file:
            yaml.dump(self.operation_spec, source_operation_file)
开发者ID:Crespo911,项目名称:pyspace,代码行数:92,代码来源:weka_filter.py

示例9: consolidate

# 需要导入模块: from pySPACE.resources.dataset_defs.base import BaseDataset [as 别名]
# 或者: from pySPACE.resources.dataset_defs.base.BaseDataset import load_meta_data [as 别名]
    def consolidate(self, _=None):
        """ Consolidates the results obtained by the single processes into a consistent structure
        of collections that are stored on the file system.
        """
        # Consolidate the results
        directory_pattern = os.sep.join([self.result_directory, "{*",])
        dataset_pathes = glob.glob(directory_pattern)

        # For all collections found
        for dataset_path in dataset_pathes:
            try:
                # Load their meta_data
                meta_data = BaseDataset.load_meta_data(dataset_path)

                # Determine author and date
                author = get_author()
                date = time.strftime("%Y%m%d_%H_%M_%S")

                # Update meta data and store it
                meta_data.update({"author": author, "date": date})

                # There can be either run dirs, persistency dirs, or both of them.
                # Check of whichever there are more. If both exist, their numbers
                # are supposed to be equal.
                nr_run_dirs = len(glob.glob(os.path.join(dataset_path, "data_run*")))
                nr_per_dirs = len(glob.glob(os.path.join(dataset_path, "persistency_run*")))
                nr_runs = max(nr_run_dirs, nr_per_dirs)
                if nr_runs > 1:
                    meta_data["runs"] = nr_runs

                # Store the metadata
                BaseDataset.store_meta_data(dataset_path, meta_data)

                # Copy the input dataset specification file to the result
                # directory in order to make later analysis of
                # the results more easy
                # THA: Split the first "/" from the input collection name, because otherwise it will be treated
                # as an absolute path
                input_collection_name = meta_data["input_dataset_name"][1:] if \
                    meta_data["input_dataset_name"][0] == os.sep else meta_data["input_dataset_name"]
                input_meta_path = os.path.join(pySPACE.configuration.storage, input_collection_name)
                try:
                    input_meta = BaseDataset.load_meta_data(input_meta_path)
                    BaseDataset.store_meta_data(dataset_path, input_meta, file_name="input_metadata.yaml")
                except (IOError, OSError) as e:
                    self._log("Error copying the input_metadata.yaml: {error}".format(error=e.message),
                              level=logging.CRITICAL)
            except Exception as e:
                logging.getLogger("%s" % self).exception("Error updating the metadata: {error!s}".format(error=e))
                raise e

        # If we don't create a feature vector or time series collection,
        # we evaluated our classification using a classification performance sink.
        # The resulting files should be merged to one csv tabular.
        pathlist = glob.glob(os.path.join(self.result_directory,"results_*"))
        if len(pathlist)>0:
            # Do the consolidation the same way as for WekaClassificationOperation
            self._log("Consolidating results ...")
            # We load and store the results once into a PerformanceResultSummary
            # This does the necessary consolidation...
            self._log("Reading intermediate results...")
            try:
                result_collection = PerformanceResultSummary(dataset_dir=self.result_directory)
                self._log("done")
                self._log("Storing result collection")
                result_collection.store(self.result_directory)
                self._log("done")
                PerformanceResultSummary.merge_traces(self.result_directory)
            except Exception as e:
                logging.getLogger("%s" % self).exception("Error merging the result collection: {error!s}".format(
                    error=e))

            if self.compression:
                # Since we get one result summary,
                # we don't need the numerous folders.
                # So we zip them to make the whole folder more easy visible.
                import zipfile
                cwd = os.getcwd()
                os.chdir(self.result_directory)
                # If there are too many or too large folders, problems may occur.
                # This case we want to log, try 64 bit mode,
                # and then skip the zipping.
                try:
                    pathlist = glob.glob(os.path.join(self.result_directory,"{*}"))

                    if not self.compression == "delete":
                        save_file = zipfile.ZipFile(
                            self.result_directory+'/result_folders.zip',
                            mode="w", compression=self.compression)
                        # we want to have the zipped file relative to the
                        # result directory
                        for path in pathlist:
                            for node in os.walk(path):
                                rel_path=os.path.relpath(node[0],
                                                         self.result_directory)
                                save_file.write(rel_path)
                                for data in node[2]:
                                    save_file.write(os.path.join(rel_path,
                                                                 data))
                        save_file.close()
#.........这里部分代码省略.........
开发者ID:pyspace,项目名称:pyspace,代码行数:103,代码来源:node_chain.py


注:本文中的pySPACE.resources.dataset_defs.base.BaseDataset.load_meta_data方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。