This article collects typical usage examples of the Python method pySPACE.resources.dataset_defs.base.BaseDataset.load. If you are wondering what BaseDataset.load does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples of the containing class, pySPACE.resources.dataset_defs.base.BaseDataset.
The following presents 15 code examples of BaseDataset.load, sorted by popularity by default.
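Before diving into the examples, a quick orientation: BaseDataset.load takes the path of a dataset directory (the directory holding the dataset's metadata.yaml) and returns a dataset object whose meta_data dictionary and data dictionary the examples below read and modify. The following is a minimal sketch, assuming pySPACE is already configured so that pySPACE.configuration.storage points at your storage directory; the dataset path itself is hypothetical.

import os

import pySPACE
from pySPACE.resources.dataset_defs.base import BaseDataset

# Hypothetical dataset directory below the configured storage root
dataset_dir = os.path.join(pySPACE.configuration.storage,
                           "my_summary", "my_collection")
dataset = BaseDataset.load(dataset_dir)

# meta_data is a plain dict; keys such as "runs", "splits" and
# "storage_format" show up in the examples below
print dataset.meta_data.get("storage_format")

# The samples themselves live in dataset.data, addressed by run number,
# split number and the split name ("train" or "test")
for run in dataset.get_run_numbers():
    for split in dataset.get_split_numbers():
        test_data = dataset.get_data(run, split, "test")
        print run, split, len(test_data)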
Example 1: _merge_pickle_files
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def _merge_pickle_files(self, target_collection_path, source_collection_pathes,
                        train_set_name_suffix, target_collection_params):
    """ Merge all collections in source_collection_pathes and store them \
        in the target collection"""
    # load a first collection, in which the data of all other collections
    # is assembled
    target_collection = BaseDataset.load(source_collection_pathes[0])
    try:
        author = pwd.getpwuid(os.getuid())[4]
    except:
        author = "unknown"
        self._log("Author could not be resolved.", level=logging.WARNING)
    date = time.strftime("%Y%m%d_%H_%M_%S")
    # Delete node_chain file name
    try:
        target_collection.meta_data.pop("node_chain_file_name")
    except:
        pass
    # Update meta data and store it
    k = "test" if self.reverse else "train"
    target_collection_params["__INPUT_DATASET__"][k] = \
        [s_c_p.split(os.sep)[-2] for s_c_p in source_collection_pathes]
    target_collection_params["__RESULT_DIRECTORY__"] = self.result_directory
    target_collection.meta_data.update({
        "author": author,
        "date": date,
        "dataset_directory": target_collection_path,
        "train_test": True,
        "parameter_setting": target_collection_params,
        "input_collection_name": source_collection_pathes[0][len(
            pySPACE.configuration.storage):]
    })
    # merge data of all other collections to target collection
    for source_collection_path in source_collection_pathes[1:]:
        source_collection = BaseDataset.load(source_collection_path)
        for run in source_collection.get_run_numbers():
            for split in source_collection.get_split_numbers():
                data = source_collection.get_data(run, split,
                                                  train_set_name_suffix)
                target_data = target_collection.get_data(run, split,
                                                         train_set_name_suffix)
                # actual data is stored in a list that has to be extended
                target_data.extend(data)
    # if only test data was given, the "Rest_vs" collection is stored as
    # training data
    if not self.reverse and "test" == train_set_name_suffix:
        # exchange the "test" in key tuple to "train" before storing
        for key in target_collection.data.keys():
            assert("test" == key[2])
            value = target_collection.data.pop(key)
            key = (key[0], key[1], "train")
            target_collection.data[key] = value
    target_collection.store(target_collection_path)
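Example 1 above (and examples 4, 7 and 12 further below) relies on the same convention: dataset.data is a dictionary keyed by (run, split, "train"/"test") tuples, and a dataset that only contains test data can be turned into training data by re-keying it before storing. Isolated into a minimal sketch with hypothetical paths:

from pySPACE.resources.dataset_defs.base import BaseDataset

dataset = BaseDataset.load("/storage/my_summary/my_collection/")    # hypothetical source
for key in dataset.data.keys():
    run, split, split_name = key          # keys are (run, split, "train"/"test")
    if split_name == "test":
        # move the entry under the corresponding "train" key
        dataset.data[(run, split, "train")] = dataset.data.pop(key)
dataset.store("/storage/my_summary/my_collection_as_train/")        # hypothetical target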
Example 2: _createProcesses
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def _createProcesses(cls, processes, result_directory, operation_spec,
                     parameter_settings, input_collections, command_template):
    # For each combination of classifier, input collection and
    # run number, create one WEKA_process
    for dataset_dir in input_collections:
        collection = BaseDataset.load(dataset_dir)
        # Determine the number of iterations and splits to be used
        iterations = collection.meta_data["runs"]
        splits = collection.meta_data["splits"]
        if "runs" in operation_spec:
            assert(iterations in [1, operation_spec["runs"]])
            iterations = operation_spec["runs"]
        if "cv_folds" in operation_spec:
            assert(splits in [1, operation_spec["cv_folds"]])
            splits = operation_spec["cv_folds"]
        for parametrization in parameter_settings:
            for run_number in range(iterations):
                process = WEKAClassificationProcess(dataset_dir,
                                                    command_template,
                                                    parametrization,
                                                    splits,
                                                    run_number,
                                                    result_directory)
                processes.put(process)
    # signal the executing process that creation is now finished
    processes.put(False)
Example 3: __init__
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def __init__(self, dataset_dir, command_template, parametrization,
             run_number, split_number, operation_result_dir,
             hide_parameters=[]):
    super(WEKAFilterProcess, self).__init__()
    # Determine the directory in which the process' results
    # are stored
    result_collection_name = dataset_dir.split(os.sep)[-2]
    for parameter_name, parameter_value in parametrization.iteritems():
        # If this is a parameter that should not be hidden, then we have to
        # encode it in the result collection name
        if not parameter_name in hide_parameters:
            result_collection_name += "{__%s__:%s}" % (parameter_name.upper(),
                                                       parameter_value)
    self.result_directory = os.path.join(operation_result_dir,
                                         result_collection_name)
    # Create directory for intermediate results if it does not exist yet
    create_directory(self.result_directory
                     + os.sep + "data_run%s" % run_number)
    # Create collection
    collection = BaseDataset.load(dataset_dir)
    # The parametrization that is independent of the collection type
    # and the specific weka command template that is executed
    self.params = {"dataset_name": dataset_dir.replace('/', '_'),
                   "dataset_dir": dataset_dir,
                   "run_number": run_number,
                   "split_number": split_number,
                   "weka_class_path": pySPACE.configuration.weka_class_path,
                   "temp_results": self.result_directory}
    # Load the abbreviations
    abbreviations_file = open(os.path.join(pySPACE.configuration.spec_dir,
                                           'operations/weka_templates',
                                           'abbreviations.yaml'), 'r')
    self.abbreviations = yaml.load(abbreviations_file)
    # Add custom parameters for the weka command template
    for parameter_name, parameter_value in parametrization.iteritems():
        # Auto-expand abbreviations
        if parameter_value in self.abbreviations:
            parameter_value = self.abbreviations[parameter_value]
        self.params[parameter_name] = parameter_value
    # Build the WEKA command by repeatedly replacing all placeholders in
    # the template
    while True:
        instantiated_template = command_template % self.params
        if instantiated_template == command_template:
            # All placeholders replaced
            self.weka_command = instantiated_template
            break
        else:
            # We have to continue since we have not converged yet
            command_template = instantiated_template
    self.handler_class = None
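The while loop at the end of this constructor applies %-substitution to the command template over and over until the string stops changing, so parameter values that themselves contain %(...)s placeholders (for example expanded abbreviations) are resolved transitively. A standalone sketch of this fixed-point substitution, with a made-up template and made-up parameters:

# Hypothetical template and parameters, only to illustrate the convergence loop
params = {"weka_class_path": "/opt/weka/weka.jar",
          "classifier": "weka.classifiers.functions.SMO",
          # this value contains further placeholders and needs a second pass
          "command": "java -cp %(weka_class_path)s %(classifier)s",
          "data": "train.arff"}
template = "%(command)s -t %(data)s"

while True:
    instantiated = template % params
    if instantiated == template:
        # nothing was replaced any more -> converged
        break
    template = instantiated

print template
# java -cp /opt/weka/weka.jar weka.classifiers.functions.SMO -t train.arff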
Example 4: _copy_file
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def _copy_file(self, source_collection_path, target_collection_path,
               train_set_name_suffix):
    """ Copy a dataset to a new destination

    **Parameters**

        :source_collection_path:
            The path to the dataset that has to be copied.

        :target_collection_path:
            The path to where the dataset should be copied.

        :train_set_name_suffix:
            Either 'train' or 'test'. Specifies whether the target dataset
            is handled as training or testing data.
    """
    source_collection = BaseDataset.load(source_collection_path)
    # if only test data was given, the "Rest_vs" collection is stored as
    # training data
    if self.reverse and "test" == train_set_name_suffix:
        # exchange the "test" in key tuple to "train" before storing
        for key in source_collection.data.keys():
            assert("test" == key[2])
            value = source_collection.data.pop(key)
            key = (key[0], key[1], "train")
            source_collection.data[key] = value
    # we store the data in the same format as before
    source_collection.store(target_collection_path,
                            source_collection.meta_data["storage_format"])
Example 5: create
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
    """
    A factory method that creates a WEKA operation based on the
    information given in the operation specification operation_spec
    """
    assert(operation_spec["type"] == "weka_classification")
    # Determine all parameter combinations that should be tested
    parameter_settings = cls._get_parameter_space(operation_spec)
    # Read the command template from a file
    template_file = open(os.path.join(pySPACE.configuration.spec_dir,
                                      "operations",
                                      "weka_templates",
                                      operation_spec["template"]),
                         'r')
    command_template = template_file.read()
    template_file.close()
    # number of processes
    if "runs" in operation_spec:
        number_processes = len(input_paths) * len(parameter_settings) * \
                           operation_spec["runs"]
    else:  # approximate the number of processes
        runs = []
        for dataset_dir in input_paths:
            collection = BaseDataset.load(dataset_dir)
            runs.append(collection.meta_data["runs"])
        runs = max(runs)
        number_processes = len(input_paths) * len(parameter_settings) * \
                           runs
    if debug == True:
        # To better debug the creation of processes we don't limit the queue
        # and create all processes before executing them
        processes = processing.Queue()
        cls._createProcesses(processes, result_directory, operation_spec,
                             parameter_settings, input_paths,
                             command_template)
        # create and return the weka operation object
        return cls(processes, operation_spec, result_directory,
                   number_processes)
    else:
        # Create all processes by calling a recursive helper method in
        # another thread so that already created processes can be executed in
        # parallel. Therefore a queue is used whose size is limited to
        # guarantee that not too many objects are created (because this
        # costs memory). However, the actual number of 100 is arbitrary
        # and might be reviewed.
        processes = processing.Queue(100)
        create_process = processing.Process(target=cls._createProcesses,
                                            args=(processes, result_directory,
                                                  operation_spec,
                                                  parameter_settings,
                                                  input_paths,
                                                  command_template))
        create_process.start()
        # create and return the weka operation object
        return cls(processes, operation_spec, result_directory,
                   number_processes, create_process)
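Examples 5, 8 and 15 share the same creation pattern: the process objects are produced by a helper running in a separate process and handed over through a bounded queue, so execution can start before creation has finished and at most 100 unprocessed objects are held in memory at once; a False item marks the end of the queue. A minimal sketch of that bounded producer/consumer pattern, written against the standard multiprocessing module (the pySPACE code uses the older processing package with an equivalent interface); all names below are illustrative:

import multiprocessing

def _create_items(queue):
    # Producer: fill the bounded queue, then send the end-of-queue sentinel
    for i in range(1, 501):
        queue.put(i)            # blocks while 100 items are already waiting
    queue.put(False)            # creation finished

if __name__ == "__main__":
    queue = multiprocessing.Queue(100)   # bounded queue limits memory usage
    producer = multiprocessing.Process(target=_create_items, args=(queue,))
    producer.start()
    while True:
        item = queue.get()
        if item is False:                # sentinel -> all items consumed
            break
        # ... execute the work item here ...
    producer.join()

In the debug branch above, the queue is instead created without a size limit and filled completely up front, which is easier to step through but keeps every process object in memory.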
Example 6: __call__
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def __call__(self):
    """ Executes this process on the respective modality """
    # Restore configuration
    pySPACE.configuration = self.configuration
    # reduce log_level for processing a second time and
    # set communication possibility for nodes to backend
    pySPACE.configuration.min_log_level = self.min_log_level
    pySPACE.configuration.logging_com = self.handler_args
    pySPACE.configuration.backend_com = self.backend_com
    ############## Prepare benchmarking ##############
    super(NodeChainProcess, self).pre_benchmarking()
    # Load the data and check that it can be processed
    # Note: This cannot be done in the object's constructor since in
    #       that case the whole input would need to be pickled
    #       when doing the remote call
    abs_dataset_dir = os.sep.join([self.storage,
                                   self.rel_dataset_dir])
    input_collection = BaseDataset.load(abs_dataset_dir)
    # We have to remember parameters used for generating this specific
    # input dataset
    if 'parameter_setting' in input_collection.meta_data.keys():
        # but not __INPUT_DATASET__ and __RESULT_DIRECTORY__
        for k, v in input_collection.meta_data['parameter_setting'].items():
            if k not in ["__INPUT_DATASET__", "__RESULT_DIRECTORY__"]:
                self.parameter_setting[k] = v
    NodeChainProcess._check_node_chain_dataset_consistency(self.node_chain,
                                                           input_collection)
    ############## Do the actual benchmarking ##############
    self._log("Start benchmarking run %s of node_chain %s on dataset %s"
              % (self.run,
                 self.node_chain_spec,
                 self.rel_dataset_dir))
    # Do the actual benchmarking for this collection/node_chain combination
    try:
        result_collection = \
            self.node_chain.benchmark(input_collection=input_collection,
                                      run=self.run,
                                      persistency_directory=self.persistency_dir,
                                      store_node_chain=self.store_node_chain)
    except Exception, exception:
        # Send Exception to Logger
        import traceback
        print traceback.format_exc()
        self._log(traceback.format_exc(), level=logging.ERROR)
        raise
Example 7: _copy_pickle_file
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def _copy_pickle_file(self, source_collection_path, target_collection_path,
                      train_set_name_suffix):
    source_collection = BaseDataset.load(source_collection_path)
    # if only test data was given, the "Rest_vs" collection is stored as
    # training data
    if self.reverse and "test" == train_set_name_suffix:
        # exchange the "test" in key tuple to "train" before storing
        for key in source_collection.data.keys():
            assert("test" == key[2])
            value = source_collection.data.pop(key)
            key = (key[0], key[1], "train")
            source_collection.data[key] = value
    source_collection.store(target_collection_path)
Example 8: create
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
    """ A factory method that creates an Analysis operation based on the
        information given in the operation specification operation_spec
    """
    assert(operation_spec["type"] == "analysis")
    input_path = operation_spec["input_path"]
    summary = BaseDataset.load(os.path.join(pySPACE.configuration.storage,
                                            input_path))
    data_dict = summary.data
    # Determine the parameters that should be analyzed
    parameters = operation_spec["parameters"]
    # Determine the metrics that should be plotted
    metrics = operation_spec["metrics"]
    # Determine how many processes will be created
    number_parameter_values = [len(set(data_dict[param])) for param in parameters]
    number_processes = cls._numberOfProcesses(0, number_parameter_values) + 1
    if debug == True:
        # To better debug the creation of processes we don't limit the queue
        # and create all processes before executing them
        processes = processing.Queue()
        cls._createProcesses(processes, result_directory, data_dict, parameters,
                             metrics, True)
        return cls(processes, operation_spec, result_directory, number_processes)
    else:
        # Create all plot processes by calling a recursive helper method in
        # another thread so that already created processes can be executed
        # although creation of processes is not finished yet. Therefore a
        # queue is used whose size is limited to guarantee that not too many
        # objects are created (since this costs memory). However, the actual
        # number of 100 is arbitrary and might be changed according to the
        # system at hand.
        processes = processing.Queue(100)
        create_process = processing.Process(target=cls._createProcesses,
                                            args=(processes, result_directory,
                                                  data_dict, parameters,
                                                  metrics, True))
        create_process.start()
        # create and return the operation object
        return cls(processes, operation_spec, result_directory,
                   number_processes, create_process)
Example 9: test_time_series_storing
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def test_time_series_storing(self):
    if os.path.exists('tmp') is False:
        os.makedirs('tmp')
    source = SimpleTimeSeriesSourceNode()
    sink = TimeSeriesSinkNode()
    sink.register_input_node(source)
    sink.set_run_number(0)
    sink.process_current_split()
    result_collection = sink.get_result_dataset()
    result_collection.store('tmp')
    #sink.store_results("test_time_series_storing.tmp")
    reloaded_collection = BaseDataset.load('tmp')
    reloader = TimeSeriesSourceNode()
    reloader.set_input_dataset(reloaded_collection)
    #set_permanent_attributes(time_series_file = "test_time_series_storing.tmp")
    orig_data = list(source.request_data_for_testing())
    restored_data = list(reloader.request_data_for_testing())
    # Check that the two lists have the same length
    self.assertEqual(len(orig_data), len(restored_data),
                     "Numbers of time series before storing and after reloading are not equal!")
    # Check that there is a one-to-one correspondence
    for orig_datapoint, orig_label in orig_data:
        found = False
        for restored_datapoint, restored_label in restored_data:
            found |= (orig_datapoint.view(numpy.ndarray) == restored_datapoint.view(numpy.ndarray)).all() \
                     and (orig_label == restored_label)
            if found:
                break
        self.assert_(found,
                     "One of the original time series cannot be found after reloading")
    shutil.rmtree('tmp')  # Cleaning up...
Example 10: create
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
    """
    A factory method that creates a statistic operation based on the
    information given in the operation specification operation_spec.
    If debug is TRUE the creation of the statistic processes will not
    run in a separate thread.
    """
    assert(operation_spec["type"] == "statistic")
    input_path = operation_spec["input_path"]
    tabular = BaseDataset.load(os.path.join(pySPACE.configuration.storage, input_path)).data
    if operation_spec.has_key("filter"):
        conditions = csv_analysis.empty_dict(tabular)
        for key, l in operation_spec["filter"].items():
            conditions[key].extend(l)
        tabular = csv_analysis.strip_dict(tabular, conditions)
    metric = operation_spec.get("metric", "Balanced_accuracy")
    parameter = operation_spec.get("parameter", "__Dataset__")
    rel_par = operation_spec.get("related_parameters",
                                 ["__Dataset__", "Key_Run", "Key_Fold"])
    average = operation_spec.get("average", None)
    if average in rel_par:
        rel_par.remove(average)
    if metric in rel_par:
        rel_par.remove(metric)
    if parameter in rel_par:
        rel_par.remove(parameter)
    reduced_tabular = cls.reduce_tabular(tabular, rel_par, metric, parameter, average)
    number_processes = 1
    processes = processing.Queue()
    cls._createProcesses(processes, result_directory, reduced_tabular)
    import shutil
    shutil.copy2(os.path.join(pySPACE.configuration.storage, input_path, "results.csv"),
                 os.path.join(result_directory, "results.csv"))
    shutil.copy2(os.path.join(pySPACE.configuration.storage, input_path, "metadata.yaml"),
                 os.path.join(result_directory, "metadata.yaml"))
    # create and return the statistic operation object
    return cls(processes, operation_spec, result_directory, number_processes)
Example 11: __init__
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def __init__(self,
             dataset_dir,
             command_template,
             parametrization,
             cv_folds,
             run_number,
             operation_result_dir):
    super(WEKAClassificationProcess, self).__init__()
    # Load the abbreviations
    abbreviations_file = open(os.path.join(pySPACE.configuration.spec_dir,
                                           'operations/weka_templates',
                                           'abbreviations.yaml'), 'r')
    self.abbreviations = yaml.load(abbreviations_file)
    abbreviations_file.close()
    # Determine the directory in which the process' results
    # are stored
    self.result_directory = operation_result_dir
    # Create collection
    collection = BaseDataset.load(dataset_dir)
    # The parametrization that is independent of the collection type
    # and the specific weka command template that is executed
    self.params = {"collection_name": dataset_dir.strip(os.sep).split(os.sep)[-1],
                   "run_number": run_number,
                   "cv_folds": cv_folds,
                   "weka_class_path": pySPACE.configuration.weka_class_path,
                   "temp_results": self.result_directory,
                   "unique_id": WEKAClassificationProcess.unique_id}
    # Collection dependent parameters
    if not collection.meta_data["train_test"] \
            and collection.meta_data["splits"] == 1:
        raise NotImplementedError()
    else:
        # The pattern of the train and test files generated by cross-validation
        data_pattern = os.path.join(dataset_dir,
                                    collection.meta_data["data_pattern"])
        # One example arff file in which WEKA can look up relation name etc.
        sample_dataset = data_pattern.replace("_run", "_run0")\
                                     .replace("_sp_", "_sp0_")\
                                     .replace("_tt", "_train")
        self.params.update({"sample_dataset": sample_dataset,
                            "data_pattern": data_pattern})
    # Add custom parameters for the weka command template
    for parameter_name, parameter_value in parametrization.iteritems():
        self.params[parameter_name + "_abbr"] = parameter_value
        # Auto-expand abbreviations
        if parameter_value in self.abbreviations:
            parameter_value = self.abbreviations[parameter_value]
        elif parameter_name == 'classifier':
            import warnings
            warnings.warn("Did not find classifier abbreviation %s. "
                          "Expecting full name." % parameter_value)
        self.params[parameter_name] = parameter_value
    # Build the WEKA command by repeatedly replacing all placeholders in
    # the template
    while True:
        instantiated_template = command_template % self.params
        if instantiated_template == command_template:
            # All placeholders replaced
            self.weka_command = instantiated_template
            break
        else:
            # We have to continue since we have not converged yet
            command_template = instantiated_template
    self.handler_class = None
    WEKAClassificationProcess.unique_id += 1
Example 12: __call__
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def __call__(self):
    """ Executes this process on the respective modality """
    ############## Prepare benchmarking ##############
    super(MergeProcess, self).pre_benchmarking()
    # For all input collections
    for source_test_collection_path in self.input_collections:
        # Check that the input data is not split,
        # i.e. only a single test file is in the source directory
        source_files = glob.glob(os.sep.join([source_test_collection_path,
                                              "data_run0", "*test*"]))
        splitted = len(source_files) > 1
        assert(not splitted)
        source_file_name = str(source_files[-1])
        # check if train sets are also present
        train_data_present = len(glob.glob(os.sep.join(
            [source_test_collection_path, "data_run0", "*train*"]))) > 0
        # if training data is present -> use train and test sets separately
        if train_data_present:
            train_set_name_suffix = "train"
        else:
            train_set_name_suffix = "test"
        # We create the collection Rest_vs_Collection
        source_test_collection_name = \
            source_test_collection_path.split(os.sep)[-2]
        test_base_collection_name = \
            source_test_collection_name.strip("}{").split("}{")[0]
        if self.reverse:
            target_collection_name = source_test_collection_name.replace(
                test_base_collection_name,
                test_base_collection_name + "_vs_Rest")
            key = "train"
        else:
            target_collection_name = source_test_collection_name.replace(
                test_base_collection_name,
                "Rest_vs_" + test_base_collection_name)
            key = "test"
        target_collection_path = os.sep.join([self.result_directory,
                                              target_collection_name])
        # determine the parameter_settings of the test collection
        test_collection = BaseDataset.load(source_test_collection_path)
        target_collection_params = \
            test_collection.meta_data["parameter_setting"]
        target_collection_params["__INPUT_DATASET__"] = \
            {key: source_test_collection_name}
        if source_file_name.endswith("arff"):
            file_ending = "arff"
            # Copy arff file from input collection to target collection
            source_test_file_path = os.sep.join([source_test_collection_path,
                                                 "data_run0", "features_sp0" +
                                                 train_set_name_suffix + ".arff"])
            target_test_file_path = os.sep.join([target_collection_path,
                                                 "data_run0", "features_sp0_" + key + ".arff"])
        elif source_file_name.endswith("pickle"):
            file_ending = "pickle"
            source_test_file_path = source_test_collection_path
            target_test_file_path = target_collection_path
        else:
            raise NotImplementedError("File type not supported in "
                                      "MergeOperation")
        source_train_pathes = []
        for source_train_collection_path in self.input_collections:
            source_train_collection_name = \
                source_train_collection_path.split(os.sep)[-2]
            # We must not use data originating from the same input
            # collection both in train and test files
            if source_test_collection_name == source_train_collection_name:
                continue
            # Check that all constraints are fulfilled for this pair of
            # input collections
            if not all(eval(constraint_template %
                            {'source_train_collection_name': source_train_collection_name,
                             'source_test_collection_name': source_test_collection_name})
                       for constraint_template in self.collection_constraints):
                continue
            # check if all parameters are stored in the target path
            source_collection = \
                BaseDataset.load(source_train_collection_path)
            source_collection_params = \
                source_collection.meta_data["parameter_setting"]
            remaining_params = \
                [param for param in source_collection_params.items()
                 if param not in target_collection_params.items() and
                 param[0] not in ["__INPUT_DATASET__",
                                  "__RESULT_DIRECTORY__", "__OUTPUT_BUNDLE__",
                                  "__INPUT_COLLECTION__"]]  # for old data
            if remaining_params != []:
                for k, v in remaining_params:
                    target_collection_path += "{%s#%s}" % (k, str(v))
                    target_collection_params[k] = v
#......... part of the code omitted here .........
Example 13: _merge_pickle_files
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def _merge_pickle_files(self, target_dataset_path, source_dataset_pathes):
    """ Concatenate all datasets in source_dataset_pathes and store
        them in the target dataset"""
    # sort the dataset
    source_dataset_pathes.sort()
    # load a first dataset, in which the data of all other datasets is assembled
    target_dataset = BaseDataset.load(source_dataset_pathes[0])
    # Determine author and date
    try:
        author = getpass.getuser()
    except:
        author = "Unknown"
    date = time.strftime("%Y%m%d_%H_%M_%S")
    # Delete node_chain file name
    try:
        target_dataset.meta_data.pop("node_chain_file_name")
    except:
        pass
    # Update meta data and store it
    params = target_dataset.meta_data.pop("parameter_setting")
    params["__INPUT_DATASET__"] = \
        [s_c_p.split(os.sep)[-2] for s_c_p in source_dataset_pathes]
    params["__RESULT_DIRECTORY__"] = self.result_directory
    target_dataset.meta_data.update({"author": author,
                                     "date": date,
                                     "dataset_directory": target_dataset_path,
                                     "train_test": False,
                                     "parameter_setting": params,
                                     "changed_time": self.change_time,
                                     "input_dataset_name": source_dataset_pathes[0][len(
                                         pySPACE.configuration.storage):]
                                     })
    # Concatenate data of all other datasets to target dataset
    for source_dataset_path in source_dataset_pathes[1:]:
        source_dataset = BaseDataset.load(source_dataset_path)
        for run in source_dataset.get_run_numbers():
            for split in source_dataset.get_split_numbers():
                target_data = target_dataset.get_data(run, split, "test")
                if self.change_time:
                    # ensure sorted target_data
                    # TODO: encode this in meta data?
                    target_data.sort(key=lambda t: t[0].end_time)
                    last_end_time = target_data[-1][0].end_time
                for ts, l in target_data:
                    if ts.specs == None:
                        ts.specs = {"new_set": False}
                    elif ts.specs.has_key("new_set"):
                        break
                    else:
                        ts.specs["new_set"] = False
                data = source_dataset.get_data(run, split, "test")
                if self.change_time:
                    # ensure sorted target_data
                    # TODO: encode this in meta data?
                    data.sort(key=lambda t: t[0].end_time)
                # flag the first element of the concatenated data list
                for i, (ts, l) in enumerate(data):
                    if ts.specs == None:
                        ts.specs = {"new_set": i == 0}
                    else:
                        ts.specs["new_set"] = (i == 0)
                    if self.change_time:
                        ts.start_time = last_end_time + ts.start_time
                        ts.end_time = last_end_time + ts.end_time
                # actual data is stored in a list that has to be extended
                target_data.extend(data)
    target_dataset.store(target_dataset_path)
Example 14: prepare_training
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def prepare_training(self, training_files, potentials, operation):
    """ Prepares pySPACE live for training.

    Prepares everything for training of pySPACE live,
    i.e. creates flows based on the dataflow specs
    and configures them.
    """
    online_logger.info("Preparing Training")
    self.potentials = potentials
    self.operation = operation
    online_logger.info("Creating flows..")
    for key in self.potentials.keys():
        spec_base = self.potentials[key]["configuration"].spec_dir
        if self.operation == "train":
            self.potentials[key]["node_chain"] = os.path.join(spec_base, self.potentials[key]["node_chain"])
            online_logger.info("node_chain_spec:" + self.potentials[key]["node_chain"])
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            if self.potentials[key].has_key("stream") and self.potentials[key]["stream"] == True:
                self.potentials[key]["prewindowing_flow"] = os.path.join(spec_base, self.potentials[key]["stream_prewindowing_flow"])
            else:
                self.potentials[key]["prewindowing_flow"] = os.path.join(spec_base, self.potentials[key]["prewindowing_flow"])
            online_logger.info("prewindowing_dataflow_spec: " + self.potentials[key]["prewindowing_flow"])
        elif self.operation == "prewindowed_train":
            if self.potentials[key].has_key("stream") and self.potentials[key]["stream"] == True:
                self.potentials[key]["postprocess_flow"] = os.path.join(spec_base, self.potentials[key]["stream_postprocess_flow"])
            else:
                self.potentials[key]["postprocess_flow"] = os.path.join(spec_base, self.potentials[key]["postprocess_flow"])
            online_logger.info("postprocessing_dataflow_spec: " + self.potentials[key]["postprocess_flow"])
        self.training_active_potential[key] = multiprocessing.Value("b", False)
    online_logger.info("Path variables set for NodeChains")
    # check if multiple potentials are given for training
    if isinstance(training_files, list):
        self.training_data = training_files
    else:
        self.training_data = [training_files]
    # Training is done in separate processes; we send the time series
    # windows to these processes via queues
    online_logger.info("Initializing Queues")
    for key in self.potentials.keys():
        self.queue[key] = multiprocessing.Queue()

    def flow_generator(key):
        """create a generator to yield all the abri flow windows"""
        # Yield all windows until a None item is found in the queue
        while True:
            window = self.queue[key].get(block=True, timeout=None)
            if window == None:
                break
            yield window

    # Create the actual data flows
    for key in self.potentials.keys():
        if self.operation == "train":
            self.node_chains[key] = NodeChainFactory.flow_from_yaml(Flow_Class=NodeChain,
                                                                    flow_spec=file(self.potentials[key]["node_chain"]))
            self.node_chains[key][0].set_generator(flow_generator(key))
            flow = open(self.potentials[key]["node_chain"])
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            online_logger.info("loading prewindowing flow..")
            online_logger.info("file: " + str(self.potentials[key]["prewindowing_flow"]))
            self.node_chains[key] = NodeChainFactory.flow_from_yaml(Flow_Class=NodeChain,
                                                                    flow_spec=file(self.potentials[key]["prewindowing_flow"]))
            self.node_chains[key][0].set_generator(flow_generator(key))
            flow = open(self.potentials[key]["prewindowing_flow"])
        elif self.operation == "prewindowed_train":
            if self.potentials[key].has_key("stream") and self.potentials[key]["stream"] == True:
                self.node_chains[key] = NodeChainFactory.flow_from_yaml(Flow_Class=NodeChain,
                                                                        flow_spec=file(self.potentials[key]["postprocess_flow"]))
                # create windower
                online_logger.info("Creating Windower")
                online_logger.info(self.potentials[key]["windower_spec_path_train"])
                self.node_chains[key][0].set_windower_spec_file(os.path.join(spec_base, "node_chains", "windower", self.potentials[key]["windower_spec_path_train"]))
                replace_start_and_end_markers = True
            else:
                self.node_chains[key] = NodeChainFactory.flow_from_yaml(Flow_Class=NodeChain, flow_spec=file(self.potentials[key]["postprocess_flow"]))
                replace_start_and_end_markers = False
            final_collection = TimeSeriesDataset()
            final_collection_path = os.path.join(self.prewindowed_data_directory, key, "all_train_data")
            # delete previous training collection
            if os.path.exists(final_collection_path):
                online_logger.info("deleting old training data collection for " + key)
                shutil.rmtree(final_collection_path)
            # load all prewindowed collections and
            # append data to the final collection
            prewindowed_sets = \
                glob.glob(os.path.join(self.prewindowed_data_directory, key, "*"))
            if len(prewindowed_sets) == 0:
                online_logger.error("Couldn't find data, please do prewindowing first!")
                raise Exception
#......... part of the code omitted here .........
Example 15: create
# Required import: from pySPACE.resources.dataset_defs.base import BaseDataset [as alias]
# Or: from pySPACE.resources.dataset_defs.base.BaseDataset import load [as alias]
def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
    """
    A factory method that creates an Analysis operation based on the
    information given in the operation specification operation_spec.
    If debug is TRUE the creation of the Analysis processes will not
    run in a separate thread.
    """
    assert(operation_spec["type"] == "comp_analysis")
    input_path = operation_spec["input_path"]
    summary = BaseDataset.load(os.path.join(pySPACE.configuration.storage,
                                            input_path))
    data_dict = summary.data
    ## Done
    # Determine the parameters that should be analyzed
    parameters = operation_spec["parameters"]
    # Determine dependent parameters, which don't get extra resolution
    try:
        dep_par = operation_spec["dep_par"]
    except KeyError:
        dep_par = []
    # Determine the metrics that should be plotted
    spec_metrics = operation_spec["metrics"]
    metrics = []
    for metric in spec_metrics:
        if data_dict.has_key(metric):
            metrics.append(metric)
        else:
            import warnings
            warnings.warn('The metric "' + metric + '" is not contained in the results csv file.')
    if len(metrics) == 0:
        warnings.warn('No metric available from spec file, defaulting to the first dict entry.')
        metrics.append(data_dict.keys()[0])
    # Determine how many processes will be created
    number_parameter_values = [len(set(data_dict[param])) for param in parameters]
    number_processes = cls._numberOfProcesses(0, number_parameter_values) + 1
    logscale = False
    if operation_spec.has_key('logscale'):
        logscale = operation_spec['logscale']
    markertype = 'x'
    if operation_spec.has_key('markertype'):
        markertype = operation_spec['markertype']
    if debug == True:
        # To better debug the creation of processes we don't limit the queue
        # and create all processes before executing them
        processes = processing.Queue()
        cls._createProcesses(processes, result_directory, data_dict, parameters,
                             dep_par, metrics, logscale, markertype, True)
        return cls(processes, operation_spec, result_directory, number_processes)
    else:
        # Create all plot processes by calling a recursive helper method in
        # another thread so that already created processes can be executed
        # although creation of processes is not finished yet. Therefore a
        # queue is used whose size is limited to guarantee that not too many
        # objects are created (since this costs memory). However, the actual
        # number of 100 is arbitrary and might be reviewed.
        processes = processing.Queue(100)
        create_process = processing.Process(target=cls._createProcesses,
                                            args=(processes, result_directory,
                                                  data_dict, parameters, dep_par,
                                                  metrics, logscale, markertype,
                                                  True))
        create_process.start()
        # create and return the comp_analysis operation object
        return cls(processes, operation_spec, result_directory,
                   number_processes, create_process)