This article collects typical usage examples of the Python method pySPACE.resources.dataset_defs.base.BaseDataset.load_meta_data. If you are struggling with questions such as: What exactly does BaseDataset.load_meta_data do? How is it used? What do real usage examples look like? Then the hand-picked code examples below may help. You can also read further about the containing class pySPACE.resources.dataset_defs.base.BaseDataset.
Below, 9 code examples of the BaseDataset.load_meta_data method are shown, sorted by popularity by default. You can vote up the examples you like or find useful; your feedback helps the system recommend better Python code examples.
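Before the examples, here is a minimal usage sketch of BaseDataset.load_meta_data. It assumes pySPACE is already configured (so that pySPACE.configuration.storage is set) and that the dataset directory contains a metadata.yaml file; the dataset name "my_dataset" and the printed keys are illustrative only:
import os
import pySPACE
from pySPACE.resources.dataset_defs.base import BaseDataset

# Hypothetical dataset directory below the configured storage location
dataset_dir = os.sep.join([pySPACE.configuration.storage, "my_dataset"])
# load_meta_data reads the dataset's meta data (metadata.yaml) as a dictionary
meta_data = BaseDataset.load_meta_data(dataset_dir)
# Keys commonly queried in the examples below; missing keys fall back to defaults
print(meta_data.get("runs", 1), meta_data.get("splits", 1))
print(meta_data.get("parameter_setting", {}))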
Example 1: _get_result_dataset_dir
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load_meta_data [as alias]
def _get_result_dataset_dir(base_dir, input_dataset_dir,
parameter_setting, hide_parameters):
""" Determines the name of the result directory
Determines the name of the result directory based on the
input_dataset_dir, the node_chain_name and the parameter setting.
"""
input_name = input_dataset_dir.strip(os.sep).split(os.sep)[-1]
input_name = input_name.strip("{}")
# If the input is already the result of an operation
if input_name.count("}{") > 0:
input_name_parts = input_name.split("}{")
input_name = input_name_parts[0]
# Load the input meta data
dataset_dir = os.sep.join([pySPACE.configuration.storage,
input_dataset_dir])
dataset_md = BaseDataset.load_meta_data(dataset_dir)
# We are going to change the parameter_setting and don't want to
# interfere with later runs so we work on a copy
parameter_setting = copy.deepcopy(parameter_setting)
# Ignore pseudo parameter "__PREPARE_OPERATION__"
if "__PREPARE_OPERATION__" in parameter_setting:
parameter_setting.pop("__PREPARE_OPERATION__")
# Add the input parameters meta data to the given parameter setting
if "parameter_setting" in dataset_md:
parameter_setting.update(dataset_md["parameter_setting"])
# We have to remove ' characters from the parameter value since
# Weka ignores them
for key, value in parameter_setting.iteritems():
if isinstance(value, basestring) and value.count("'") > 1:
parameter_setting[key] = eval(value)
# Determine the result_directory name
# The string between key and value was changed from ":" to "#"
# because of problems on Windows and with Windows file servers
parameter_str = "}{".join(("%s#%s" % (key, value))
for key, value in parameter_setting.iteritems()
if key not in hide_parameters)
result_name = "{%s}" % input_name
if parameter_str != "":
result_name += "{%s}" % (parameter_str)
# Determine the path where this result will be stored
# and create the directory if necessary
result_dir = base_dir
result_dir += os.sep + result_name
create_directory(result_dir)
return result_dir
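To make the naming scheme used above concrete, the following illustrative snippet (with made-up parameter values; it only mirrors the string handling of the function above) composes a result directory name the same way:
# Hypothetical inputs, for illustration only
input_name = "my_input_dataset"
parameter_setting = {"lower_cutoff": 0.1, "upper_cutoff": 4.0}
hide_parameters = ["__INPUT_DATASET__"]
# Each key/value pair is joined with "#" and the pairs are wrapped in "{...}" blocks;
# sorted() is used here only to make the printed order deterministic
parameter_str = "}{".join("%s#%s" % (key, value)
                          for key, value in sorted(parameter_setting.items())
                          if key not in hide_parameters)
result_name = "{%s}" % input_name
if parameter_str != "":
    result_name += "{%s}" % parameter_str
print(result_name)  # {my_input_dataset}{lower_cutoff#0.1}{upper_cutoff#4.0}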
Example 2: consolidate
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load_meta_data [as alias]
def consolidate(self):
""" Consolidates the results obtained by the single processes into a consistent structure
of collections that are stored on the file system.
"""
# Consolidate the results
directory_pattern = os.sep.join([self.result_directory, "{*",])
dataset_pathes = glob.glob(directory_pattern)
# For all collections found
for dataset_path in dataset_pathes:
# Load their meta_data
meta_data = BaseDataset.load_meta_data(dataset_path)
# Determine author and date
try:
author = pwd.getpwuid(os.getuid())[4]
except:
author = "unknown"
self._log("Author could not be resolved.",level=logging.WARNING)
date = time.strftime("%Y%m%d_%H_%M_%S")
# Update meta data and store it
meta_data.update({"author" : author, "date" : date})
BaseDataset.store_meta_data(dataset_path, meta_data)
# Copy the input dataset specification file to the result
# directory in order to make later analysis of
# the results easier
input_meta_path = os.sep.join([pySPACE.configuration.storage,
meta_data["input_collection_name"]])
input_meta = BaseDataset.load_meta_data(input_meta_path)
BaseDataset.store_meta_data(dataset_path,input_meta,
file_name="input_metadata.yaml")
# Check if some results consist of several runs
# and update the meta data in this case
# TODO: This is not a clean solution
for dataset_dir in glob.glob(os.sep.join([self.result_directory,
"*"])):
if not os.path.isdir(dataset_dir): continue
# There can be either run dirs, persistency dirs, or both of them.
# Check whichever there are more of. If both exist, their numbers
# are supposed to be equal.
nr_run_dirs = len(glob.glob(os.sep.join([dataset_dir,
"data_run*"])))
nr_per_dirs = len(glob.glob(os.sep.join([dataset_dir,
"persistency_run*"])))
nr_runs = max(nr_run_dirs, nr_per_dirs)
if nr_runs > 1:
collection_meta = BaseDataset.load_meta_data(dataset_dir)
collection_meta["runs"] = nr_runs
BaseDataset.store_meta_data(dataset_dir,collection_meta)
# If we don't create a feature vector or time series collection,
# the classification was evaluated using a classification performance sink.
# The resulting files should be merged into one CSV table.
pathlist = glob.glob(os.path.join(self.result_directory,"results_*"))
if len(pathlist)>0:
# Do the consolidation the same way as for WekaClassificationOperation
self._log("Consolidating results ...")
# We load and store the results once into a PerformanceResultSummary
# This does the necessary consolidation...
self._log("Reading intermediate results...")
result_collection = PerformanceResultSummary(dataset_dir=self.result_directory)
self._log("done")
self._log("Storing result collection")
result_collection.store(self.result_directory)
self._log("done")
PerformanceResultSummary.merge_traces(self.result_directory)
if not(self.compression == False):
# Since we get one result summary,
# we don't need the numerous folders.
# So we zip them to keep the whole folder easier to browse.
import zipfile
cwd=os.getcwd()
os.chdir(self.result_directory)
# If there are too many or too large folders, problems may occur.
# In this case we want to log it, try 64-bit mode, and then skip the zipping.
try:
pathlist = glob.glob(os.path.join(self.result_directory,"{*}"))
if not self.compression == "delete":
save_file=zipfile.ZipFile(self.result_directory+'/result_folders.zip',mode="w",compression=self.compression)
# we want to have the zipped file relative to the result directory
for path in pathlist:
for node in os.walk(path):
rel_path=os.path.relpath(node[0],self.result_directory)
save_file.write(rel_path)
for data in node[2]:
save_file.write(os.path.join(rel_path,data))
save_file.close()
# To still have an easy access to the history of the processing,
# we keep one folder.
pathlist.pop()
for path in pathlist:
shutil.rmtree(path)
except:
self._log("Result files could not be compressed with 32 bit mode, switching to 64 bit mode.", level=logging.CRITICAL)
# nearly identical code; the only difference is the 64-bit mode
try:
# ... (part of the code omitted here) ...
Example 3: create
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load_meta_data [as alias]
def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
""" A factory method that creates the processes which form an operation
based on the information given in the operation specification, *operation_spec*.
In debug mode this is done serially. In the default mode,
currently 4 processes are created in parallel and can be immediately
executed, so that generation of processes and their execution run in parallel.
This kind of process creation is done independently of the backend.
For huge parameter spaces this is necessary!
Otherwise numerous processes would be created and the corresponding data loaded,
but the computation could not really be spread across different processors,
because process creation would block only one processor and its memory
while nothing more is done until all processes have been created.
.. todo:: Use :class:`~pySPACE.resources.dataset_defs.dummy.DummyDataset`
for empty data, when no input_path is given.
"""
assert(operation_spec["type"] == "node_chain")
# Determine all parameter combinations that should be tested
parameter_settings = cls._get_parameter_space(operation_spec)
## Use node_chain parameter if no templates are given ##
if not operation_spec.has_key("templates"):
if operation_spec.has_key("node_chain"):
operation_spec["templates"]=[operation_spec.pop("node_chain")]
else:
warnings.warn("Specify parameter 'templates' or 'node_chain' in your operation spec!")
operation_spec["templates"]=[operation_spec.pop("flow")]
elif operation_spec.has_key("node_chain"):
operation_spec.pop("node_chain")
warnings.warn("node_chain parameter is ignored. Templates are used.")
elif type(operation_spec["templates"][0])==str: # load files in templates
operation_spec["template_files"]=copy.deepcopy(operation_spec["templates"])
for i in range(len(operation_spec["templates"])):
rel_node_chain_file = operation_spec["templates"][i]
abs_node_chain_file = open(os.sep.join([pySPACE.configuration.spec_dir,
"node_chains",
rel_node_chain_file]), 'r')
node_chain = yaml.load(abs_node_chain_file)
abs_node_chain_file.close()
operation_spec["templates"][i] = node_chain
storage = pySPACE.configuration.storage
if not input_paths:
raise Exception("No input datasets found in input_path %s in %s!"
% (operation_spec["input_path"],storage))
# Get relative path
rel_input_paths = [name[len(storage):]
for name in input_paths]
# Determine approximate number of runs
if "runs" in operation_spec:
runs = operation_spec["runs"]
else:
runs = []
for dataset_dir in rel_input_paths:
abs_collection_path = \
pySPACE.configuration.storage + os.sep \
+ dataset_dir
collection_runs = \
BaseDataset.load_meta_data(abs_collection_path).get('runs',1)
runs.append(collection_runs)
runs = max(runs)
# Determine splits
dataset_dir = rel_input_paths[0]
abs_collection_path = \
pySPACE.configuration.storage + os.sep + dataset_dir
splits = BaseDataset.load_meta_data(abs_collection_path).get('splits', 1)
# Determine how many processes will be created
number_processes = len(operation_spec["templates"]) * \
len(parameter_settings) * len(rel_input_paths) * \
runs * splits
if debug == True:
# To better debug creation of processes we don't limit the queue
# and create all processes before executing them
processes = processing.Queue()
cls._createProcesses(processes, result_directory, operation_spec,
parameter_settings, rel_input_paths)
# create and return the operation object
return cls(processes, operation_spec, result_directory,
number_processes)
else:
# Create all processes by calling a recursive helper method in
# another thread so that already created processes can be executed in
# parallel. Therefore a queue is used whose size is limited to
# guarantee that not too many objects are created (because this costs
# memory). However, the actual number of 4 is arbitrary and might
# be changed according to the system at hand.
processes = processing.Queue(4)
create_process = \
processing.Process(target=cls._createProcesses,
# ... (part of the code omitted here) ...
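The docstring of create above describes producing processes through a bounded queue so that process creation and execution overlap without exhausting memory. The sketch below illustrates that pattern with the standard multiprocessing module (the original code uses the older processing package); all names and the number of work items are made up:
import multiprocessing

def create_work_items(queue):
    # Producer: blocks as soon as 4 items are waiting, so creation
    # never runs far ahead of execution (and of memory consumption).
    for i in range(10):
        queue.put("process_%d" % i)
    queue.put(None)  # sentinel: nothing more will be created

if __name__ == "__main__":
    queue = multiprocessing.Queue(4)  # bounded queue, as in the operation above
    producer = multiprocessing.Process(target=create_work_items, args=(queue,))
    producer.start()
    while True:  # consumer: execute items as soon as they arrive
        item = queue.get()
        if item is None:
            break
        print("executing", item)
    producer.join()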
Example 4: _createProcesses
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load_meta_data [as alias]
def _createProcesses(cls, processes, result_directory, operation_spec,
parameter_settings, input_collections):
storage_format = operation_spec["storage_format"] if "storage_format" \
in operation_spec else None
# Determine whether the node_chain should be stored after data processing
store_node_chain = operation_spec["store_node_chain"] \
if "store_node_chain" in operation_spec else False
# Determine whether certain parameters should not be remembered
hide_parameters = [] if "hide_parameters" not in operation_spec \
else list(operation_spec["hide_parameters"])
hide_parameters.append("__INPUT_COLLECTION__")
hide_parameters.append("__INPUT_DATASET__")
hide_parameters.append("__RESULT_DIRECTORY__")
hide_parameters.append("__OUTPUT_BUNDLE__")
# Create all combinations of collections, runs and splits
collection_run_split_combinations = []
for input_dataset_dir in input_collections:
# Determine number of runs to be conducted for this collection
abs_collection_path = \
pySPACE.configuration.storage + os.sep \
+ input_dataset_dir
collection_runs = \
BaseDataset.load_meta_data(abs_collection_path).get('runs', 1)
# D.get(k[,d]) -> D[k] if k in D, else d.
if "runs" not in operation_spec:
requested_runs = collection_runs
else:
requested_runs = operation_spec["runs"]
assert collection_runs == requested_runs \
or collection_runs == 1, \
"Requested %s runs but input collection %s provides "\
"data for %s runs." % (requested_runs, input_dataset_dir,
collection_runs)
for run in range(max(requested_runs, collection_runs)):
collection_splits = \
BaseDataset.load_meta_data(abs_collection_path).get('splits', 1)
for split in range(collection_splits):
collection_run_split_combinations.append((input_dataset_dir, run, split))
# Shuffle the order of dataset-run-split combinations. This should help
# avoid all processes working on the same data, which can cause
# problems due to locking etc.
random.shuffle(collection_run_split_combinations)
# For all templates
for node_chain_spec in operation_spec["templates"]:
# For all possible parameter instantiations of this template
for parameter_setting in parameter_settings:
# For all input collections-run combinations
for input_dataset_dir, run, split in collection_run_split_combinations:
# We are going to change the parameter_setting and don't want to
# interfere with later runs so we work on a copy
parameter_setting_cp = copy.deepcopy(parameter_setting)
# Add input and output path to parameter
# setting
parameter_setting_cp["__INPUT_DATASET__"] = \
input_dataset_dir.split(os.sep)[-2]
parameter_setting_cp["__RESULT_DIRECTORY__"] = \
result_directory
if len(operation_spec["templates"])>1:
index = operation_spec["templates"].index(node_chain_spec)
parameter_setting_cp["__Template__"]=\
operation_spec["template_files"][index]
# Load the input meta data
dataset_dir = os.sep.join([pySPACE.configuration.storage,
input_dataset_dir])
dataset_md = BaseDataset.load_meta_data(dataset_dir)
# Add the input parameters meta data to the given parameter setting
if "parameter_setting" in dataset_md:
dataset_md["parameter_setting"].update(parameter_setting_cp)
all_parameters = dataset_md["parameter_setting"]
else:
all_parameters = parameter_setting_cp
def check_constraint(constraint, parameters):
for key, value in parameters.iteritems():
constraint = constraint.replace(key, str(value))
return eval(constraint)
if not all(check_constraint(constraint_def,
all_parameters) for \
constraint_def in \
operation_spec.get('old_parameter_constraints',[])):
continue
# Determine directory in which the result of this
# process should be written
result_dataset_directory = \
NodeChainOperation._get_result_dataset_dir(result_directory,
input_dataset_dir,
parameter_setting_cp,
# ... (part of the code omitted here) ...
Example 5: __call__
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load_meta_data [as alias]
# ... (part of the code omitted here) ...
# Determine names of the original data sets the input
# datasets are based on
base_dataset1 = dataset_name1.strip("}{").split("}{")[0]
base_dataset2 = dataset_name2.strip("}{").split("}{")[0]
# Determine target dataset name and create directory
# for it
mixed_base_dataset = "%s_vs_%s" % (base_dataset1,
base_dataset2)
target_dataset_name = dataset_name1.replace(base_dataset1,
mixed_base_dataset)
target_dataset_dir = os.sep.join([self.result_directory,
target_dataset_name])
create_directory(os.sep.join([target_dataset_dir, "data_run0"]))
if splitted:
# For each split, copy the train data from dataset 1 and
# the test data from dataset 2 to the target dataset
for source_train_file_name in glob.glob(os.sep.join([dataset_dir1,
"data_run0",
"*_sp*_train.*"])):
# TODO: We have $n$ train sets and $n$ test sets, we
# could use all $n*n$ combinations
target_train_file_name = source_train_file_name.replace(dataset_dir1,
target_dataset_dir)
if source_train_file_name.endswith("arff"):
self._copy_arff_file(source_train_file_name,
target_train_file_name,
base_dataset1,
mixed_base_dataset)
else:
os.symlink(source_train_file_name,
target_train_file_name)
source_test_file_name = source_train_file_name.replace(dataset_dir1,
dataset_dir2)
source_test_file_name = source_test_file_name.replace("train.",
"test.")
target_test_file_name = target_train_file_name.replace("train.",
"test.")
if source_train_file_name.endswith("arff"):
self._copy_arff_file(source_test_file_name,
target_test_file_name,
base_dataset2,
mixed_base_dataset)
else:
os.symlink(source_test_file_name,
target_test_file_name)
else:
# Use the data set from dataset 1 as training set and
# the data set from dataset 2 as test data
for source_train_file_name in glob.glob(os.sep.join([dataset_dir1,
"data_run0",
"*_sp*_test.*"])):
target_train_file_name = source_train_file_name.replace("test.",
"train.")
target_train_file_name = target_train_file_name.replace(dataset_dir1,
target_dataset_dir)
if source_train_file_name.endswith("arff"):
self._copy_arff_file(source_train_file_name,
target_train_file_name,
base_dataset1,
mixed_base_dataset)
else:
os.symlink(source_train_file_name,
target_train_file_name)
source_test_file_name = source_train_file_name.replace(dataset_dir1,
dataset_dir2)
target_test_file_name = target_train_file_name.replace("train.",
"test.")
if source_train_file_name.endswith("arff"):
self._copy_arff_file(source_test_file_name,
target_test_file_name,
base_dataset2,
mixed_base_dataset)
else:
os.symlink(source_test_file_name,
target_test_file_name)
# Write metadata.yaml based on input meta data
input_dataset1_meta = BaseDataset.load_meta_data(dataset_dir1)
output_dataset_meta = dict(input_dataset1_meta)
output_dataset_meta['train_test'] = True
output_dataset_meta['date'] = time.strftime("%Y%m%d_%H_%M_%S")
try:
output_dataset_meta['author'] = pwd.getpwuid(os.getuid())[4]
except:
self._log("Author could not be resolved.", level=logging.WARNING)
output_dataset_meta['author'] = "unknown"
BaseDataset.store_meta_data(target_dataset_dir,output_dataset_meta)
############## Clean up after benchmarking ##############
super(ShuffleProcess, self).post_benchmarking()
Example 6: __call__
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load_meta_data [as alias]
# ... (part of the code omitted here) ...
target_test_file_path = os.sep.join([target_collection_path,
"data_run0","features_sp0_"+key+".arff"])
elif source_file_name.endswith("pickle"):
file_ending = "pickle"
source_test_file_path = source_test_collection_path
target_test_file_path = target_collection_path
else:
raise NotImplementedError("File type not supported in " \
"MergeOperation")
source_train_pathes = []
for source_train_collection_path in self.input_collections:
source_train_collection_name = \
source_train_collection_path.split(os.sep)[-2]
# We must not use data originating from the same input
# collection both in train and test files
if source_test_collection_name == source_train_collection_name:
continue
# Check that all constraints are fulfilled for this pair of
# input collections
if not all(eval(constraint_template % \
{'source_train_collection_name': source_train_collection_name,
'source_test_collection_name': source_test_collection_name})
for constraint_template in self.collection_constraints):
continue
# check if all parameters are stored in the target path
source_collection = \
BaseDataset.load(source_train_collection_path)
source_collection_params = \
source_collection.meta_data["parameter_setting"]
remaining_params = \
[param for param in source_collection_params.items() \
if param not in target_collection_params.items() and \
param[0] not in ["__INPUT_DATASET__",
"__RESULT_DIRECTORY__", "__OUTPUT_BUNDLE__",
"__INPUT_COLLECTION__" ]] # for old data
if remaining_params != []:
for k,v in remaining_params:
target_collection_path += "{%s#%s}" % (k,str(v))
target_collection_params[k]=v
if "arff" == file_ending:
source_train_file_path = \
os.sep.join([source_train_collection_path,
"data_run0", "features_sp0_" + \
train_set_name_suffix + ".arff"])
elif "pickle" == file_ending:
source_train_file_path = source_train_collection_path
else:
raise NotImplementedError("File type not supported in " \
"MergeOperation!")
source_train_pathes.append(source_train_file_path)
if "arff" == file_ending:
target_train_file_path = os.sep.join([target_collection_path,
"data_run0","features_sp0_"+key+".arff"])
elif "pickle" == file_ending:
target_train_file_path = target_collection_path
else:
raise NotImplementedError("File type not supported in "
"MergeOperation!")
if len(source_train_pathes) == 0:
continue
create_directory(os.sep.join([target_collection_path,
"data_run0"]))
if "arff" == file_ending:
self._copy_arff_file(source_test_file_path,
target_test_file_path,
source_test_collection_name,
target_collection_name)
self._merge_arff_files(target_train_file_path,
source_train_pathes,
target_collection_name)
# Copy metadata.yaml
# TODO: Adapt to new collection
input_meta = BaseDataset.load_meta_data(source_test_collection_path)
BaseDataset.store_meta_data(target_collection_path,input_meta)
elif "pickle" == file_ending:
self._copy_pickle_file(source_test_collection_path,
target_collection_path,
train_set_name_suffix)
self._merge_pickle_files(target_train_file_path,
source_train_pathes,
train_set_name_suffix,
target_collection_params)
else:
raise NotImplementedError("File type not supported in merge_operation")
############## Clean up after benchmarking ##############
super(MergeProcess, self).post_benchmarking()
Example 7: _get_result_dataset_dir
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load_meta_data [as alias]
def _get_result_dataset_dir(base_dir, input_dataset_dir, parameter_setting, hide_parameters):
""" Determines the name of the result directory
Determines the name of the result directory based on the
input_dataset_dir, the node_chain_name and the parameter setting.
"""
# Determine the result_directory name
# The string between key and value was changed from ":" to "#"
# because of problems on Windows and with Windows file servers
def _get_result_dir_name(parameter_setting, hide_parameters, method=None):
""" internal function to create result dir name in different ways"""
if not method:
parameter_str = "}{".join(
("%s#%s" % (key, value))
for key, value in parameter_setting.iteritems()
if key not in hide_parameters
)
elif method == "hash":
parameter_str = "}{".join(
("%s#%s" % (key, hash(str(value).replace(" ", ""))))
for key, value in parameter_setting.iteritems()
if key not in hide_parameters
)
parameter_str = parameter_str.replace("'", "")
parameter_str = parameter_str.replace(" ", "")
parameter_str = parameter_str.replace("[", "")
parameter_str = parameter_str.replace("]", "")
parameter_str = parameter_str.replace(os.sep, "")
result_name = "{%s}" % input_name
if parameter_str != "":
result_name += "{%s}" % (parameter_str)
# Determine the path where this result will be stored
# and create the directory if necessary
result_dir = base_dir
result_dir += os.sep + result_name
# if the filename is too long
# (longer than allowed, including optional offsets for pySPACE
# result csv naming conventions),
# create an md5 hash of the result name and use that one
import platform
CURRENTOS = platform.system()
if CURRENTOS == "Windows":
# the maximum length for a filename on Windows is 255
if len(result_dir) > 255 - 32:
result_name = "{" + hashlib.md5(result_name).hexdigest() + "}"
result_dir = base_dir
result_dir += os.sep + result_name
return result_dir
else:
if len(result_dir) > os.pathconf(os.curdir, "PC_NAME_MAX") - 32:
result_name = "{" + hashlib.md5(result_name).hexdigest() + "}"
result_dir = base_dir
result_dir += os.sep + result_name
return result_dir
input_name = input_dataset_dir.strip(os.sep).split(os.sep)[-1]
input_name = input_name.strip("{}")
# If the input is already the result of an operation
if input_name.count("}{") > 0:
input_name_parts = input_name.split("}{")
input_name = input_name_parts[0]
# Load the input meta data
dataset_dir = os.sep.join([pySPACE.configuration.storage, input_dataset_dir])
dataset_md = BaseDataset.load_meta_data(dataset_dir)
# We are going to change the parameter_setting and don't want to
# interfere with later runs so we work on a copy
parameter_setting = copy.deepcopy(parameter_setting)
# Ignore pseudo parameter "__PREPARE_OPERATION__"
if "__PREPARE_OPERATION__" in parameter_setting:
parameter_setting.pop("__PREPARE_OPERATION__")
# Add the input parameters meta data to the given parameter setting
if "parameter_setting" in dataset_md:
parameter_setting.update(dataset_md["parameter_setting"])
# We have to remove ' characters from the parameter value since
# Weka does ignore them
for key, value in parameter_setting.iteritems():
if isinstance(value, basestring) and value.count("'") > 1:
parameter_setting[key] = eval(value)
result_dir = _get_result_dir_name(parameter_setting, hide_parameters)
try:
create_directory(result_dir)
except OSError as e:
if e.errno == 36:
# filename is too long
result_dir = _get_result_dir_name(parameter_setting, hide_parameters, "hash")
create_directory(result_dir)
return result_dir
Example 8: consolidate
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load_meta_data [as alias]
def consolidate(self):
"""
Consolidates the results obtained by the single WEKA filter
processes into a consistent summary of datasets that is stored on
the file system.
.. todo:: Some of the contents of this method should go into the
:class:`~pySPACE.resources.dataset_defs.feature_vector.FeatureVectorDataset`
"""
# Iterate over all collections and store the collection meta data etc.
for entries in os.listdir(self.result_directory):
fullpath = os.path.join(self.result_directory, entries)
# For each collection
if os.path.isdir(fullpath):
if entries.startswith("{"):
# Extract the parameters from the collection name in order to
# adjust the relation name
if self.num_parameters > 0:
parameter_strings = entries.strip("}{").split("}{")[-self.num_parameters:]
parameter_postfix = "{" + "}{".join(parameter_strings) + "}"
else:
parameter_strings = ""
parameter_postfix = ""
# Postprocessing of the arff files of this collection
for train_arff_file in glob.glob(fullpath + os.sep + "data_run*"
+ os.sep + "*train.arff"):
# Adjust the relation name of the train file
content = open(train_arff_file, 'r').readlines()
# We strip everything after the last "}"
endindex = content[0].rfind("}")
content[0] = content[0][:endindex+1]
content[0] += parameter_postfix + "'"
open(train_arff_file, 'w').writelines(content)
# Use relation name of train data for test data
test_arff_file = train_arff_file.replace("train.arff", "test.arff")
test_content = open(test_arff_file, 'r').readlines()
test_content[0] = content[0] + "\n"
open(test_arff_file, 'w').writelines(test_content)
# Check which features are contained in the arff file
feature_names = []
for line in content:
if line.startswith("@attribute"):
attribute = line.split()[1]
if attribute is not "class":
feature_names.append(attribute)
# Store the collection meta data etc.
if self.num_parameters > 0:
input_collection_name = \
"{" + "}{".join(entries.strip("}{").split("}{")[:-self.num_parameters]) + "}"
else:
input_collection_name = entries
input_collection_path = os.path.join(self.operation_spec["input_path"],
input_collection_name)
input_collection_meta = BaseDataset.load_meta_data(
pySPACE.configuration.storage
+ os.sep
+ input_collection_path)
# Store the input collection
BaseDataset.store_meta_data(fullpath, input_collection_meta,
file_name="input_metadata.yaml")
# Adjust collection metadata for the new collection
input_collection_meta["feature_names"] = feature_names
input_collection_meta["num_features"] = len(feature_names)
input_collection_meta["author"] = get_author()
input_collection_meta["date"] = time.strftime("%Y%m%d")
input_collection_meta["input_collection_name"] = input_collection_name
# Write the collection meta information into the folder
BaseDataset.store_meta_data(fullpath,input_collection_meta)
# Store the command_template
command_template_file = open(os.path.join(fullpath,
"command_template"), 'w')
command_template_file.write(self.command_template)
command_template_file.close()
else:
# training and test arff need the same relation name,
# otherwise Weka can't relate them to each other; the collection
# name and the parameters in {}{} notation must be the relation
# name for further processing
self._log("WARNING: Collection name doesn't begin with '{'. Further processing may fail!", level=logging.WARNING)
# Write the specification of this operation
# to the result directory in order to make later
# analysis of results easier
source_operation_file = open(os.path.join(self.result_directory,
"source_operation.yaml"), 'w')
yaml.dump(self.operation_spec, source_operation_file)
source_operation_file.close()
Example 9: consolidate
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load_meta_data [as alias]
def consolidate(self, _=None):
""" Consolidates the results obtained by the single processes into a consistent structure
of collections that are stored on the file system.
"""
# Consolidate the results
directory_pattern = os.sep.join([self.result_directory, "{*",])
dataset_pathes = glob.glob(directory_pattern)
# For all collections found
for dataset_path in dataset_pathes:
try:
# Load their meta_data
meta_data = BaseDataset.load_meta_data(dataset_path)
# Determine author and date
author = get_author()
date = time.strftime("%Y%m%d_%H_%M_%S")
# Update meta data and store it
meta_data.update({"author": author, "date": date})
# There can be either run dirs, persistency dirs, or both of them.
# Check whichever there are more of. If both exist, their numbers
# are supposed to be equal.
nr_run_dirs = len(glob.glob(os.path.join(dataset_path, "data_run*")))
nr_per_dirs = len(glob.glob(os.path.join(dataset_path, "persistency_run*")))
nr_runs = max(nr_run_dirs, nr_per_dirs)
if nr_runs > 1:
meta_data["runs"] = nr_runs
# Store the metadata
BaseDataset.store_meta_data(dataset_path, meta_data)
# Copy the input dataset specification file to the result
# directory in order to make later analysis of
# the results easier
# THA: Split the first "/" from the input collection name, because otherwise it will be treated
# as an absolute path
input_collection_name = meta_data["input_dataset_name"][1:] if \
meta_data["input_dataset_name"][0] == os.sep else meta_data["input_dataset_name"]
input_meta_path = os.path.join(pySPACE.configuration.storage, input_collection_name)
try:
input_meta = BaseDataset.load_meta_data(input_meta_path)
BaseDataset.store_meta_data(dataset_path, input_meta, file_name="input_metadata.yaml")
except (IOError, OSError) as e:
self._log("Error copying the input_metadata.yaml: {error}".format(error=e.message),
level=logging.CRITICAL)
except Exception as e:
logging.getLogger("%s" % self).exception("Error updating the metadata: {error!s}".format(error=e))
raise e
# If we don't create a feature vector or time series collection,
# the classification was evaluated using a classification performance sink.
# The resulting files should be merged into one CSV table.
pathlist = glob.glob(os.path.join(self.result_directory,"results_*"))
if len(pathlist)>0:
# Do the consolidation the same way as for WekaClassificationOperation
self._log("Consolidating results ...")
# We load and store the results once into a PerformanceResultSummary
# This does the necessary consolidation...
self._log("Reading intermediate results...")
try:
result_collection = PerformanceResultSummary(dataset_dir=self.result_directory)
self._log("done")
self._log("Storing result collection")
result_collection.store(self.result_directory)
self._log("done")
PerformanceResultSummary.merge_traces(self.result_directory)
except Exception as e:
logging.getLogger("%s" % self).exception("Error merging the result collection: {error!s}".format(
error=e))
if self.compression:
# Since we get one result summary,
# we don't need the numerous folders.
# So we zip them to keep the whole folder easier to browse.
import zipfile
cwd = os.getcwd()
os.chdir(self.result_directory)
# If there are too many or too large folders, problems may occur.
# In this case we want to log it, try 64-bit mode,
# and then skip the zipping.
try:
pathlist = glob.glob(os.path.join(self.result_directory,"{*}"))
if not self.compression == "delete":
save_file = zipfile.ZipFile(
self.result_directory+'/result_folders.zip',
mode="w", compression=self.compression)
# we want to have the zipped file relative to the
# result directory
for path in pathlist:
for node in os.walk(path):
rel_path=os.path.relpath(node[0],
self.result_directory)
save_file.write(rel_path)
for data in node[2]:
save_file.write(os.path.join(rel_path,
data))
save_file.close()
# ... (part of the code omitted here) ...