本文整理汇总了Python中pySPACE.tools.memoize_generator.MemoizeGenerator.fresh方法的典型用法代码示例。如果您正苦于以下问题：Python MemoizeGenerator.fresh方法的具体用法？Python MemoizeGenerator.fresh怎么用？Python MemoizeGenerator.fresh使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pySPACE.tools.memoize_generator.MemoizeGenerator的用法示例。
在下文中一共展示了MemoizeGenerator.fresh方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: SimpleTimeSeriesSourceNode
# 需要导入模块: from pySPACE.tools.memoize_generator import MemoizeGenerator [as 别名]
# 或者: from pySPACE.tools.memoize_generator.MemoizeGenerator import fresh [as 别名]
class SimpleTimeSeriesSourceNode(TimeSeriesSourceNode):
    """ A simple test class for unit tests

    Generates the same data for test and training.

    The data consists of 23 constant-valued 2x2 time series (channels
    "X" and "Y"), each paired with a label drawn at random from
    "A" and "B".
    """

    def __init__(self, *args, **kwargs):
        super(SimpleTimeSeriesSourceNode, self).__init__(*args, **kwargs)

        # We have to create a dummy dataset: a bare object carrying only
        # the ``meta_data`` and ``data`` attributes set below.
        class DummyObject(object):
            pass

        dataset = DummyObject()
        dataset.meta_data = {'runs': 1}
        dataset.data = {}

        self.set_permanent_attributes(dataset=dataset,
                                      run_number=0)

    def request_data_for_testing(self):
        """ Returns the data that can be used for testing of subsequent nodes

        The samples are built on the first call only and wrapped in a
        caching ``MemoizeGenerator``; every call returns the result of
        its ``fresh()`` method.
        """
        # If we haven't created the data for testing yet
        if self.data_for_testing is None:
            self.time_series = [
                (TimeSeries(input_array=numpy.ones((2, 2)) * i,
                            channel_names=["X", "Y"],
                            sampling_frequency=2),
                 random.choice(["A", "B"]))
                for i in range(23)]

            # Create a generator that emits the (sample, label) pairs
            test_data_generator = ((sample, label)
                                   for (sample, label) in self.time_series)
            self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                     caching=True)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
示例2: TrainTestSplitterNode
# 需要导入模块: from pySPACE.tools.memoize_generator import MemoizeGenerator [as 别名]
# 或者: from pySPACE.tools.memoize_generator.MemoizeGenerator import fresh [as 别名]
#.........这里部分代码省略.........
def is_split_node(self):
    """ Returns whether this is a split node (always ``True`` here). """
    return True
def use_next_split(self):
    """ Use the next split of the data into training and test data.

    Returns True if more splits are available, otherwise False.
    This implementation never advances to another split and therefore
    always returns False.

    This method is useful for benchmarking
    """
    return False
def train_sweep(self, use_test_data):
    """ Performs the actual training of the node.

    .. note:: Split nodes cannot be trained

    :raises Exception: always; *use_test_data* is ignored.
    """
    raise Exception("Split nodes cannot be trained")
def request_data_for_training(self, use_test_data):
    """ Returns the data for training of subsequent nodes

    The split is created on first use. A new ``MemoizeGenerator`` over
    ``self.train_data`` is built on every call and stored in
    ``self.data_for_training``.

    *use_test_data* is accepted for interface compatibility but is not
    read by this method.
    """
    # Create split lazily when required
    if self.train_data is None:
        self._create_split()

    # Create training data generator
    self.data_for_training = MemoizeGenerator(
        instance for instance in self.train_data)

    return self.data_for_training.fresh()
def request_data_for_testing(self):
    """ Returns the data for testing of subsequent nodes

    The split is created on first use. A new ``MemoizeGenerator`` over
    ``self.test_data`` is built on every call and stored in
    ``self.data_for_testing``.
    """
    # Create split lazily when required
    if self.test_data is None:
        self._create_split()

    # Create test data generator
    self.data_for_testing = MemoizeGenerator(
        instance for instance in self.test_data)

    return self.data_for_testing.fresh()
def _create_split(self):
""" Create the split of the data into training and test data. """
self._log("Splitting data into train and test data")
train_data = list(self.input_node.request_data_for_training(use_test_data=False))
# If there is already a non-empty training set,
# it means that we are not the first split node in the node chain.
if len(train_data) > 0:
raise Exception("No iterated splitting of data sets allowed\n "
"(Calling a splitter on a data set that is already "
"split)")
# Create generator instead of loading all data
if self.num_train_instances and not (self.random):
self.train_data = []
input_generator=self.input_node.request_data_for_testing
示例3: SimpleSourceTemplateNode
# 需要导入模块: from pySPACE.tools.memoize_generator import MemoizeGenerator [as 别名]
# 或者: from pySPACE.tools.memoize_generator.MemoizeGenerator import fresh [as 别名]
class SimpleSourceTemplateNode(BaseNode):
""" A simple template that illustrates the basic principles of a source node
In `pySPACE`, source nodes are used at the beginning of the node chain.
The source nodes are responsible for the input of data, be it from a
static source or from a live stream.
It is very important to note that these nodes just serve the purpose of
providing the node chain with an input dataset and do not perform any
changes on the data itself. That being said, these nodes are **do not**
have an **input node** and are **not trainable**!
In the following we will discuss the general strategy for building a new
source node for a static input data set which has been saved to disk.
In the case of more complicated inputs, please consult the documentation of
:mod:`~pySPACE.missions.nodes.source.external_generator_source.ExternalGeneratorSourceNode`
and :mod:`~pySPACE.missions.nodes.source.time_series_source.Stream2TimeSeriesSourceNode`
"""
def __init__(self, **kwargs):
    """ Initialize some values to 0 or `None`

    The initialization routine of the source node is basically completely
    empty. Should you feel the need to do something in this part of the
    code, you can initialize the ``input_dataset`` to ``None``. This
    attribute will then later be changed when the ``set_input_dataset``
    method is called.

    If the user wants to generate the dataset inside the SourceNode,
    this should be done in the ``__init__`` method though. A good example
    of this practice can be found in the
    :mod:`~pySPACE.missions.nodes.source.random_time_series_source.RandomTimeSeriesSourceNode`
    """
    super(SimpleSourceTemplateNode, self).__init__(**kwargs)

    # No dataset is known yet; it is provided via ``set_input_dataset``.
    self.set_permanent_attributes(dataset=None)
def set_input_dataset(self, dataset):
    """ Sets the dataset from which this node reads the data

    This method is the beginning of the node. Put simply, this method
    starts the feeding process of your node chain by telling the node chain
    where to get the data from.

    The *dataset* is stored as the permanent attribute ``dataset``.
    """
    self.set_permanent_attributes(dataset=dataset)
def request_data_for_training(self, use_test_data):
    """ Returns the data that can be used for training of subsequent nodes

    This method streams training data and sends it to the subsequent nodes.
    If one looks at the tutorial related to building new nodes (available in
    the tutorial section), one can see exactly where the ``request_data``
    methods are put to use.

    The following example is one that was extracted from the
    :mod:`~pySPACE.missions.nodes.source.feature_vector_source.FeatureVectorSourceNode`
    which should (in theory at least) be implementable for all types of data.

    If *use_test_data* is true, the result of
    ``request_data_for_testing`` is returned instead.
    """
    if use_test_data:
        # Return the test data as there is no additional data that
        # was dedicated for training
        return self.request_data_for_testing()

    # If the input dataset consists only of one single run,
    # we use this as input for all runs to be conducted (i.e. we
    # rely on later randomization of the order). Otherwise
    # we use the data for this run number
    if self.dataset.meta_data["runs"] > 1:
        key = (self.run_number, self.current_split, "train")
    else:
        key = (0, self.current_split, "train")

    # Check if there is training data for the current split and run
    if key in self.dataset.data.keys():
        self._log("Accessing input dataset's training feature vector windows.")
        self.data_for_training = MemoizeGenerator(
            self.dataset.get_data(*key).__iter__(),
            caching=self.caching)
    else:
        # Wrap a generator that is immediately exhausted, since
        # this node does not provide any data that is explicitly
        # dedicated for training
        self._log("No training data available.")
        self.data_for_training = MemoizeGenerator((x for x in []),
                                                  caching=self.caching)

    # Return a fresh copy of the generator
    return self.data_for_training.fresh()
def request_data_for_testing(self):
""" Returns the data that can be used for testing of subsequent nodes
The principle of obtaining the testing data are the same as the principles
used in obtaining the training data set. The only difference here is that,
in the case in which there is no testing data available, we allow for the
training data to be used as testing data.
"""
# If we haven't read the data for testing yet
if self.data_for_testing == None:
self._log("Accessing input dataset's test feature vector windows.")
#.........这里部分代码省略.........
示例4: StreamWindowingNode
# 需要导入模块: from pySPACE.tools.memoize_generator import MemoizeGenerator [as 别名]
# 或者: from pySPACE.tools.memoize_generator.MemoizeGenerator import fresh [as 别名]
class StreamWindowingNode(BaseNode):
"""Get a stream of time series objects and window them inside a flow.
Node that interprets a stream of incoming time series objects as
a raw data stream.
The markers stored in marker_name attribute are used as the markers
for a :class:`~pySPACE.missions.support.windower.MarkerWindower`.
This should done *before* any splitter, since all incoming windows
are regarded as parts of a consecutive data stream.
**Parameters**
:windower_spec_file:
The window specification file for the
:class:`~pySPACE.missions.support.windower.MarkerWindower`.
Used for testing and training, if windower_spec_file_train
is not specified.
:windower_spec_file_train:
A separate window file for training only.
If not specified, windower_spec_file is used for training
and testing.
**Parameters**
**Exemplary Call**
.. code-block:: yaml
-
node : Stream_Windowing
parameters :
windower_spec_file : "example_lrp_window_spec.yaml"
:Authors: Hendrik Woehrle ([email protected])
:Created: 2012/07/09
"""
def __init__(self,
             windower_spec_file,
             windower_spec_file_train=None,
             local_window_conf=False,
             nullmarker_stride_ms=None,
             *args,
             **kwargs):
    """ Store the windowing configuration as permanent attributes

    If *windower_spec_file_train* is not given, *windower_spec_file* is
    used for training as well.
    """
    super(StreamWindowingNode, self).__init__(*args, **kwargs)

    if windower_spec_file_train is None:
        windower_spec_file_train = windower_spec_file

    # ``client``, ``marker_windower`` and ``window_definition`` start
    # out unset and are filled in elsewhere.
    self.set_permanent_attributes(
        client=None,
        marker_windower=None,
        window_definition=None,
        local_window_conf=local_window_conf,
        windower_spec_file=windower_spec_file,
        windower_spec_file_train=windower_spec_file_train,
        nullmarker_stride_ms=nullmarker_stride_ms)
def request_data_for_training(self, use_test_data):
    """ Returns the data that can be used for training of subsequent nodes

    The window definition is (re)loaded from the training window
    specification file on every call. The training data itself is
    produced only once and cached in ``self.data_for_training``.

    If *use_test_data* is true, the result of
    ``request_data_for_testing`` is used as training data.
    """
    # set window definition for train phase windower file
    self.window_definition = Windower._load_window_spec(
        self.windower_spec_file_train, self.local_window_conf)

    self._log("Requesting train data...")

    # Training data has already been produced by an earlier call
    if self.data_for_training is not None:
        return self.data_for_training.fresh()

    if use_test_data:
        # Return the test data as there is no additional data that
        # was dedicated for training
        # NOTE(review): ``fresh()`` is called on whatever
        # ``request_data_for_testing`` returns; confirm that it
        # returns a MemoizeGenerator rather than a plain generator.
        self.data_for_training = self.request_data_for_testing()
        return self.data_for_training.fresh()

    # Get training data (with labels) from the preceding node
    train_data = list(self.input_node.request_data_for_training(
        use_test_data=use_test_data))

    # Nothing to window: hand out an immediately exhausted generator
    if not train_data:
        self.data_for_training = MemoizeGenerator((x for x in []),
                                                  caching=True)
        return self.data_for_training.fresh()

    # Feed the collected training data into ``window_stream``; the
    # windows are then read from ``self.marker_windower`` below.
    self.window_stream(train_data)

    # Create a generator that emits the windows
    train_data_generator = ((sample, label)
                            for (sample, label) in self.marker_windower)
    self.data_for_training = MemoizeGenerator(train_data_generator,
                                              caching=True)
    return self.data_for_training.fresh()
def request_data_for_testing(self):
#.........这里部分代码省略.........
示例5: InstanceSelectionNode
# 需要导入模块: from pySPACE.tools.memoize_generator import MemoizeGenerator [as 别名]
# 或者: from pySPACE.tools.memoize_generator.MemoizeGenerator import fresh [as 别名]
#.........这里部分代码省略.........
r = random.Random(self.run_number)
# Retain only *percentage_selected* percent of the data
retained_instances = []
for label, instances in all_instances.iteritems():
# enable random choice of samples
r.shuffle(instances)
if not self.reduce_class or \
self.train_percentage_selected == 100:
end_index = int(round(len(instances) *
self.train_percentage_selected / 100))
elif not (self.reduce_class == label):
end_index = len(instances)
else: # self.reduce_class==label--> reduction needed
end_index = int(round(len(instances) *
self.train_percentage_selected / 100))
retained_instances.extend(zip(instances[0:end_index],
[label]*end_index))
# mix up samples between the different labels
r.shuffle(retained_instances)
# Compute a generator the yields the train data and
# encapsulate it in an object that memoizes its outputs and
# provides a "fresh" method that returns a new generator that will
# yield the same sequence
train_data_generator = ((self.execute(data), label)
for (data, label) in retained_instances)
self.data_for_training = MemoizeGenerator(train_data_generator,
caching=self.caching)
self._log("Data for training finished", level=logging.DEBUG)
# Return a fresh copy of the generator
return self.data_for_training.fresh()
def request_data_for_testing(self):
""" Returns data for testing of subsequent nodes
.. todo:: to document
"""
assert(self.input_node is not None)
if self.test_percentage_selected > 100:
self._log("Test percentage of %f reduced to 100." %
self.test_percentage_selected,
level=logging.ERROR)
self.test_percentage_selected = 100
self._log("Data for testing is requested.", level=logging.DEBUG)
if self.test_percentage_selected == 100:
return super(InstanceSelectionNode, self).request_data_for_testing()
# If we haven't computed the data for testing yet
if self.data_for_testing is None:
# Assert that this node has already been trained
assert(not self.is_trainable() or
self.get_remaining_train_phase() == 0)
# Divide available instances according to label
all_instances = defaultdict(list)
for instance, label in self.input_node.request_data_for_testing():
all_instances[label].append(instance)
self._log("Keeping only %s percent of test data" %
self.test_percentage_selected,
level=logging.DEBUG)
r = random.Random(self.run_number)
示例6: ReduceOverrepresentedClassNode
# 需要导入模块: from pySPACE.tools.memoize_generator import MemoizeGenerator [as 别名]
# 或者: from pySPACE.tools.memoize_generator.MemoizeGenerator import fresh [as 别名]
class ReduceOverrepresentedClassNode(BaseNode):
""" Reject instances to balance categories for classification
The node forwards only a reduced number
of the training and test instances of the bigger class
to get a balanced ratio of the
classes. The forwarded instances are selected randomly.
All data of the underrepresented class is
forwarded.
**Parameters**
**Exemplary call**
.. code-block:: yaml
-
node : Reduce_Overrepresented_Class
:Author: Hendrik Woehrle ([email protected])
:Created: 2010/09/22
"""
def __init__(self, **kwargs):
    """ Forward all keyword arguments to the base class constructor """
    super(ReduceOverrepresentedClassNode, self).__init__(**kwargs)
def request_data_for_training(self, use_test_data):
    """ Returns data for training of subsequent nodes

    On the first call the node is trained, the training instances of the
    input node are grouped by label, reduced with ``balance_instances``
    and wrapped in a ``MemoizeGenerator``. Later calls reuse that object.
    """
    assert(self.input_node is not None)

    # Fixed: this message used to say "testing" (copy-paste slip)
    self._log("Data for training is requested.", level=logging.DEBUG)

    # If we haven't computed the data for training yet
    if self.data_for_training is None:
        self._log("Producing data for training.", level=logging.DEBUG)
        # Train this node
        self.train_sweep(use_test_data)

        # Divide available instances according to label
        all_instances = defaultdict(list)
        for instance, label in self.input_node.request_data_for_training(
                use_test_data):
            all_instances[label].append(instance)

        retained_instances = self.balance_instances(all_instances)

        # Compute a generator that yields the train data and
        # encapsulate it in an object that memoizes its outputs and
        # provides a "fresh" method that returns a new generator that will
        # yield the same sequence
        # Fixed: this message used to say "testing" as well
        self._log("Producing data for training.", level=logging.DEBUG)
        train_data_generator = ((self.execute(data), label)
                                for (data, label) in retained_instances)
        self.data_for_training = MemoizeGenerator(train_data_generator,
                                                  caching=self.caching)

    self._log("Data for training finished", level=logging.DEBUG)
    # Return a fresh copy of the generator
    return self.data_for_training.fresh()
def request_data_for_testing(self):
    """ Returns data for testing of subsequent nodes

    On the first call the test instances of the input node are grouped
    by label, reduced with ``balance_instances`` and wrapped in a
    ``MemoizeGenerator``. Later calls reuse that object.
    """
    assert(self.input_node is not None)

    self._log("Data for testing is requested.", level=logging.DEBUG)

    # If we haven't computed the data for testing yet
    if self.data_for_testing is None:
        # Assert that this node has already been trained
        assert(not self.is_trainable() or
               self.get_remaining_train_phase() == 0)

        # Divide available instances according to label
        all_instances = defaultdict(list)
        for instance, label in self.input_node.request_data_for_testing():
            all_instances[label].append(instance)

        retained_instances = self.balance_instances(all_instances)

        # Compute a generator that yields the test data and
        # encapsulate it in an object that memoizes its outputs and
        # provides a "fresh" method that returns a new generator that will
        # yield the same sequence
        self._log("Producing data for testing.", level=logging.DEBUG)
        test_data_generator = ((self.execute(data), label)
                               for (data, label) in retained_instances)
        self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                 caching=self.caching)

    self._log("Data for testing finished", level=logging.DEBUG)
    # Return a fresh copy of the generator
    return self.data_for_testing.fresh()
#.........这里部分代码省略.........
示例7: CrossValidationSplitterNode
# 需要导入模块: from pySPACE.tools.memoize_generator import MemoizeGenerator [as 别名]
# 或者: from pySPACE.tools.memoize_generator.MemoizeGenerator import fresh [as 别名]
#.........这里部分代码省略.........
This method is useful for benchmarking
"""
if self.current_split + 1 < self.splits:
self.current_split = self.current_split + 1
self._log("Benchmarking with split %s/%s" % (self.current_split + 1,
self.splits))
return True
else:
return False
def train_sweep(self, use_test_data):
    """ Performs the actual training of the node.

    .. note:: Split nodes cannot be trained

    :raises Exception: always; *use_test_data* is ignored.
    """
    raise Exception("Split nodes cannot be trained")
def request_data_for_training(self, use_test_data):
    """ Returns the data for training of subsequent nodes

    The cv-splits are created on first use. A new ``MemoizeGenerator``
    is built on every call; it yields every element of ``self.data``
    whose index is not listed in the current split's index collection.

    *use_test_data* is accepted for interface compatibility but is not
    read by this method.
    """
    # Create cv-splits lazily when required
    if self.split_indices is None:
        self._create_splits()

    # Bind the test indices of the current split now, so that a later
    # change of ``self.current_split`` cannot alter a generator that is
    # only partially consumed (the test-data counterpart also binds its
    # indices at creation time).
    test_indices = self.split_indices[self.current_split]

    # All data can be used for training which is not explicitly
    # specified for testing by the current cv-split
    self.data_for_training = MemoizeGenerator(
        instance for index, instance in enumerate(self.data)
        if index not in test_indices)

    return self.data_for_training.fresh()
def request_data_for_testing(self):
    """ Returns the data for testing of subsequent nodes

    The cv-splits are created on first use. A new ``MemoizeGenerator``
    is built on every call; it yields ``self.data[i]`` for every index
    ``i`` listed for the current split.
    """
    # Create cv-splits lazily when required
    if self.split_indices is None:
        self._create_splits()

    # Only that data can be used for testing which is explicitly
    # specified for this purpose by the current cv-split
    self.data_for_testing = MemoizeGenerator(
        self.data[i] for i in self.split_indices[self.current_split])

    return self.data_for_testing.fresh()
def _create_splits(self):
""" Create the split of the data for n-fold cross-validation """
self._log("Creating %s splits for cross validation" % self.splits)
# Get training and test data (with labels)
train_data = \
list(self.input_node.request_data_for_training(use_test_data=False))
test_data = list(self.input_node.request_data_for_testing())
# If there is already a non-empty training set,
# it means that we are not the first split node in the node chain
if len(train_data) > 0:
raise Exception("No iterated splitting of data sets allowed\n "
"(Calling a splitter on a data set that is "
"already split)")
示例8: PrintDataNode
# 需要导入模块: from pySPACE.tools.memoize_generator import MemoizeGenerator [as 别名]
# 或者: from pySPACE.tools.memoize_generator.MemoizeGenerator import fresh [as 别名]
#.........这里部分代码省略.........
return data_generator
def request_data_for_training(self, use_test_data):
    """ Returns data for training of subsequent nodes of the node chain

    A call to this method might involve training of the node chain up this
    node. If use_test_data is true, all available data is used for
    training, otherwise only the data that is explicitly for training.
    """
    assert(self.input_node is not None)

    self._log("Data for training is requested.", level=logging.DEBUG)

    # If we haven't computed the data for training yet
    if self.data_for_training is None:
        self._log("Producing data for training.", level=logging.DEBUG)
        # Train this node
        self.train_sweep(use_test_data)

        # Compute a generator that yields the train data and
        # encapsulate it in an object that memoizes its outputs and
        # provides a "fresh" method that returns a new generator that'll
        # yield the same sequence.
        # A generator expression replaces the former
        # ``itertools.imap(lambda (data, label): ...)``, whose tuple
        # parameter is Python-2-only syntax. It is equally lazy.
        # NOTE(review): the original line was annotated "crashes without
        # the NodeMetaclass bug fix"; confirm whether that still applies.
        train_data_generator = (
            self.print_data(data, label)
            for (data, label)
            in self.input_node.request_data_for_training(use_test_data))
        self.data_for_training = MemoizeGenerator(train_data_generator,
                                                  caching=self.caching)

    self._log("Data for training finished", level=logging.DEBUG)
    # Return a fresh copy of the generator
    return self.data_for_training.fresh()
def request_data_for_testing(self):
    """ Returns data for testing of subsequent nodes of the node chain

    A call to this node might involve evaluating the whole node chain
    up to this node.
    """
    assert(self.input_node is not None)

    self._log("Data for testing is requested.", level=logging.DEBUG)

    # If we haven't computed the data for testing yet
    if self.data_for_testing is None:
        # Assert that this node has already been trained
        assert(not self.is_trainable() or
               self.get_remaining_train_phase() == 0)

        # Compute a generator that yields the test data and
        # encapsulate it in an object that memoizes its outputs and
        # provides a "fresh" method that returns a new generator that'll
        # yield the same sequence.
        # A generator expression replaces the former
        # ``itertools.imap(lambda (data, label): ...)``, whose tuple
        # parameter is Python-2-only syntax. It is equally lazy.
        self._log("Producing data for testing.", level=logging.DEBUG)
        test_data_generator = (
            self.print_data(data, label)
            for (data, label) in self.input_node.request_data_for_testing())
        self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                 caching=self.caching)

    self._log("Data for testing finished", level=logging.DEBUG)
    # Return a fresh copy of the generator
    return self.data_for_testing.fresh()
def print_data(self, data, label):
示例9: Stream2TimeSeriesSourceNode
# 需要导入模块: from pySPACE.tools.memoize_generator import MemoizeGenerator [as 别名]
# 或者: from pySPACE.tools.memoize_generator.MemoizeGenerator import fresh [as 别名]
#.........这里部分代码省略.........
This is helpful, when using this source node in online application,
since for most other source nodes, :func:`request_data_for_testing`
is used instead.
..todo:: check code
"""
# self._log("Processing data.", level = logging.DEBUG)
#
# # Create a generator that emits the windows
# data_generator = ((sample, label) for (sample, label) in \
# self.marker_windower)
# return data_generator
return self.request_data_for_testing()
def request_data_for_training(self, use_test_data):
    """
    Returns the data that can be used for training of subsequent nodes

    If *use_test_data* is true, the result of
    ``request_data_for_testing`` is returned. Otherwise the training
    windows are requested from the dataset on the first call, wrapped in
    a ``MemoizeGenerator`` and reused by later calls.
    """
    self._log("Requesting train data...")

    if use_test_data:
        # Return the test data as there is no additional data that
        # was dedicated for training
        return self.request_data_for_testing()

    # If we haven't read the data for training yet
    if self.data_for_training is None:
        self._log("Start streaming.")

        self.dataset.set_window_defs(
            window_definition=self.window_definition,
            nullmarker_stride_ms=self.nullmarker_stride_ms,
            no_overlap=self.no_overlap,
            data_consistency_check=self.data_consistency_check)

        # With a single run in the dataset, run 0 is used regardless of
        # this node's run number.
        if self.dataset.meta_data["runs"] > 1:
            key = (self.run_number, self.current_split, "train")
        else:
            key = (0, self.current_split, "train")

        # Create a generator that emits the windows
        train_data_generator = (
            (sample, label)
            for (sample, label) in self.dataset.get_data(*key))

        self.data_for_training = MemoizeGenerator(train_data_generator,
                                                  caching=self.caching)

    # Return a fresh copy of the generator
    return self.data_for_training.fresh()
def request_data_for_testing(self):
    """
    Returns the data that can be used for testing of subsequent nodes

    The test windows are requested from the dataset on the first call,
    wrapped in a ``MemoizeGenerator`` and reused by later calls.
    """
    self._log("Requesting test data...")

    # If we haven't read the data for testing yet
    if self.data_for_testing is None:
        self._log("Start streaming.")

        self.dataset.set_window_defs(
            window_definition=self.window_definition,
            nullmarker_stride_ms=self.nullmarker_stride_ms,
            no_overlap=self.no_overlap,
            data_consistency_check=self.data_consistency_check)

        # With a single run in the dataset, run 0 is used regardless of
        # this node's run number.
        if self.dataset.meta_data["runs"] > 1:
            key = (self.run_number, self.current_split, "test")
        else:
            key = (0, self.current_split, "test")

        # Create a generator that emits the windows
        test_data_generator = (
            (sample, label)
            for (sample, label) in self.dataset.get_data(*key))

        self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                 caching=self.caching)

    # Return a fresh copy of the generator
    return self.data_for_testing.fresh()
def store_state(self, result_dir, index=None):
    """ Stores this node in the given directory *result_dir*

    A sub-directory named after the node's class is created inside
    *result_dir* and the string form of every window definition is
    written to ``window_definitions.txt`` in it, without separators.

    *index* is accepted for interface compatibility but is not read.
    """
    from pySPACE.tools.filesystem import create_directory

    node_dir = os.path.join(result_dir, self.__class__.__name__)
    create_directory(node_dir)

    # ``with`` guarantees the file is closed even if a write fails
    with open(os.path.join(node_dir, "window_definitions.txt"),
              "w") as result_file:
        for window_def in self.window_definition:
            result_file.write(str(window_def))
示例10: TimeSeriesSourceNode
# 需要导入模块: from pySPACE.tools.memoize_generator import MemoizeGenerator [as 别名]
# 或者: from pySPACE.tools.memoize_generator.MemoizeGenerator import fresh [as 别名]
class TimeSeriesSourceNode(BaseNode):
""" Source for windowed :class:`~pySPACE.resources.data_types.time_series.TimeSeries` saved in pickle format via :class:`~pySPACE.missions.nodes.sink.time_series_sink.TimeSeriesSinkNode`
**Parameters**
**Exemplary Call**
.. code-block:: yaml
-
node : TimeSeriesSource
:Author: Jan Hendrik Metzen ([email protected])
:Created: 2008/11/25
"""
input_types = ["TimeSeries"]
def __init__(self, **kwargs):
    """ Initialize the base class and start without a dataset """
    super(TimeSeriesSourceNode, self).__init__(**kwargs)

    # The dataset is provided later via ``set_input_dataset``
    self.set_permanent_attributes(dataset=None)
def set_input_dataset(self, dataset):
    """ Sets the dataset from which this node reads the data

    The *dataset* is stored as the permanent attribute ``dataset``.
    """
    self.set_permanent_attributes(dataset=dataset)
def register_input_node(self, node):
    """ Register the given node as input

    :raises Exception: always, since source nodes accept no input node.
    """
    raise Exception("No nodes can be registered as inputs for source nodes")
def use_next_split(self):
    """
    Use the next split of the data into training and test data.

    Returns True if more splits are available, otherwise False.
    This implementation always returns False.

    This method is useful for benchmarking
    """
    # if the input dataset has more than one split/run we will compute
    # the splits in parallel, i.e. we don't return any further splits
    return False
def train_sweep(self, use_test_data):
    """
    Performs the actual training of the node.

    .. note:: Source nodes cannot be trained

    :raises Exception: always; *use_test_data* is ignored.
    """
    raise Exception("Source nodes cannot be trained")
def request_data_for_training(self, use_test_data):
    """
    Returns the time windows that can be used for training of subsequent nodes

    If *use_test_data* is true, the result of
    ``request_data_for_testing`` is returned. Otherwise a new
    ``MemoizeGenerator`` is built on every call, over the dataset's
    training data if the dataset has an entry for the current run and
    split, and over an empty sequence if it has none.
    """
    if use_test_data:
        # Return the test data as there is no additional data that
        # was dedicated for training
        return self.request_data_for_testing()

    # If the input dataset consists only of one single run,
    # we use this as input for all runs to be conducted (i.e. we
    # rely on later randomization of the order). Otherwise
    # we use the data for this run number
    if self.dataset.meta_data["runs"] > 1:
        key = (self.run_number, self.current_split, "train")
        self._log("Run %s." % self.run_number)
    else:
        key = (0, self.current_split, "train")
        self._log("Run %s. Using input data of run 0." % self.run_number)

    # Check if there is training data for the current split and run
    if key in self.dataset.data.keys():
        self._log("Accessing input dataset's training time series windows.")
        self.data_for_training = MemoizeGenerator(
            self.dataset.get_data(*key).__iter__(),
            caching=self.caching)
    else:
        # Wrap a generator that is immediately exhausted, since
        # this node does not provide any data that is explicitly
        # dedicated for training
        self._log("No training data available.")
        self.data_for_training = MemoizeGenerator((x for x in []),
                                                  caching=self.caching)

    # Return a fresh copy of the generator
    return self.data_for_training.fresh()
def request_data_for_testing(self):
"""
Returns the data that can be used for testing of subsequent nodes
.. todo:: to document
"""
# If we haven't read the data for testing yet
if self.data_for_testing is None:
self._log("Accessing input dataset's test time series windows.")
# If the input dataset consists only of one single run,
# we use this as input for all runs to be conducted (i.e. we
# rely on later randomization of the order). Otherwise
#.........这里部分代码省略.........
示例11: FeatureVectorSourceNode
# 需要导入模块: from pySPACE.tools.memoize_generator import MemoizeGenerator [as 别名]
# 或者: from pySPACE.tools.memoize_generator.MemoizeGenerator import fresh [as 别名]
class FeatureVectorSourceNode(BaseNode):
""" Source for samples of type :class:`~pySPACE.resources.data_types.feature_vector.FeatureVector`
This node reads :class:`~pySPACE.resources.data_types.feature_vector.FeatureVector`
elements
accumulated in a :mod:`~pySPACE.resources.dataset_defs.feature_vector` and
passes them into the :mod:`~pySPACE.environments.chains.node_chain`.
As described in :mod:`~pySPACE.resources.dataset_defs.feature_vector` it is important,
that the storage format is correct specified in the metadata.yaml.
If the dataset has been constructed by pySPACE, this is done automatically.
**Parameters**
**Exemplary Call**
.. code-block:: yaml
-
node : Feature_Vector_Source
:Author: Jan Hendrik Metzen ([email protected])
:Created: 2008/11/25
"""
input_types = ["FeatureVector"]
def __init__(self, **kwargs):
    """ Forward all keyword arguments to the base class constructor """
    super(FeatureVectorSourceNode, self).__init__(**kwargs)
def set_input_dataset(self, dataset):
    """ Sets the dataset from which this node reads the data

    The *dataset* is stored as the permanent attribute ``dataset``.
    """
    self.set_permanent_attributes(dataset=dataset)
def register_input_node(self, node):
    """ Register the given node as input

    :raises Exception: always, since source nodes accept no input node.
    """
    raise Exception("No nodes can be registered as inputs for source nodes")
def use_next_split(self):
    """
    Use the next split of the data into training and test data.

    Returns True if more splits are available, otherwise False.
    This implementation always returns False.

    This method is useful for benchmarking
    """
    return False
def train_sweep(self, use_test_data):
    """
    Performs the actual training of the node.

    .. note:: Source nodes cannot be trained

    :raises Exception: always; *use_test_data* is ignored.
    """
    raise Exception("Source nodes cannot be trained")
def request_data_for_training(self, use_test_data):
    """
    Returns the feature vectors that can be used for training of subsequent nodes

    If *use_test_data* is true, the result of
    ``request_data_for_testing`` is returned. Otherwise a new
    ``MemoizeGenerator`` is built on every call, over the dataset's
    training data if the dataset has an entry for the current run and
    split, and over an empty sequence if it has none.
    """
    if use_test_data:
        # Return the test data as there is no additional data that
        # was dedicated for training
        return self.request_data_for_testing()

    # If the input dataset consists only of one single run,
    # we use this as input for all runs to be conducted (i.e. we
    # rely on later randomization of the order). Otherwise
    # we use the data for this run number
    if self.dataset.meta_data["runs"] > 1:
        key = (self.run_number, self.current_split, "train")
    else:
        key = (0, self.current_split, "train")

    # Check if there is training data for the current split and run
    if key in self.dataset.data.keys():
        self._log("Accessing input dataset's training feature vector windows.")
        self.data_for_training = MemoizeGenerator(
            self.dataset.get_data(*key).__iter__(),
            caching=self.caching)
    else:
        # Wrap a generator that is immediately exhausted, since
        # this node does not provide any data that is explicitly
        # dedicated for training
        self._log("No training data available.")
        self.data_for_training = MemoizeGenerator((x for x in []),
                                                  caching=self.caching)

    # Return a fresh copy of the generator
    return self.data_for_training.fresh()
def request_data_for_testing(self):
"""
Returns the data that can be used for testing of subsequent nodes
.. todo:: to document
"""
# If we haven't read the data for testing yet
if self.data_for_testing == None:
self._log("Accessing input dataset's test feature vector windows.")
# If the input dataset consists only of one single run,
# we use this as input for all runs to be conducted (i.e. we
# rely on later randomization of the order). Otherwise
#.........这里部分代码省略.........
示例12: ConsumeTrainingDataNode
# 需要导入模块: from pySPACE.tools.memoize_generator import MemoizeGenerator [as 别名]
# 或者: from pySPACE.tools.memoize_generator.MemoizeGenerator import fresh [as 别名]
#.........这里部分代码省略.........
self.set_permanent_attributes(wrapped_node = wrapped_node,
consumption_rate = consumption_rate,
internal_training_set = [],
external_training_set = [],
r = random.Random(random_seed))
@staticmethod
def node_from_yaml(node_spec):
    """ Create a ConsumeTrainingDataNode from the specification *node_spec*.

    *node_spec* must contain a "parameters" entry that in turn contains a
    "wrapped_node" specification. The wrapped node is built first via
    ``BaseNode.node_from_yaml``; its specification is then removed from
    ``node_spec["parameters"]`` (the caller's dict is modified) and the
    remaining parameters are passed on to the constructor.
    """
    # This node requires one parameter, namely the wrapped node
    assert ("parameters" in node_spec
            and "wrapped_node" in node_spec["parameters"]), \
        "ConsumeTrainingDataNode requires specification of a wrapped node!"
    parameters = node_spec["parameters"]
    # Create the node that is wrapped by this node
    wrapped_node = BaseNode.node_from_yaml(parameters["wrapped_node"])
    del parameters["wrapped_node"]
    # Create the node object from the wrapped node and the other parameters
    return ConsumeTrainingDataNode(wrapped_node=wrapped_node, **parameters)
def is_trainable(self):
    """ Return whether this node is trainable.

    The answer is delegated to the wrapped node.
    """
    wrapped = self.wrapped_node
    return wrapped.is_trainable()
def is_supervised(self):
    """ Return whether this node requires supervised training.

    The answer is delegated to the wrapped node.
    """
    wrapped = self.wrapped_node
    return wrapped.is_supervised()
def _get_train_set(self, use_test_data = False):
    """ Return the part of the training data that is consumed by this node.

    The training data of the input node is grouped by label. Each group is
    shuffled with ``self.r`` and cut at ``consumption_rate``: the leading
    part goes to ``internal_training_set`` (returned), the remainder goes
    to ``external_training_set``, which is available for successor nodes.
    """
    # We take data that is provided by the input node for training
    # NOTE: This might involve training of the preceding nodes
    by_label = defaultdict(list)
    for sample, sample_label in \
            self.input_node.request_data_for_training(use_test_data):
        by_label[sample_label].append(sample)

    # Both sets are rebuilt from scratch on every call
    self.internal_training_set = consumed = []
    self.external_training_set = passed_on = []
    for sample_label, samples in by_label.items():
        self.r.shuffle(samples)
        cut = int(round(len(samples) * self.consumption_rate))
        consumed.extend([(sample, sample_label) for sample in samples[:cut]])
        passed_on.extend([(sample, sample_label) for sample in samples[cut:]])
    return self.internal_training_set
def request_data_for_training(self, use_test_data):
    """ Return data for training of subsequent nodes.

    On the first call the node is trained via ``train_sweep`` and the
    instances in ``external_training_set`` are lazily passed through
    ``execute``. The resulting generator is memoized, so later calls only
    return a fresh generator over the same sequence.

    Raises ``AssertionError`` if no input node has been set.
    """
    assert self.input_node is not None

    self._log("Data for training is requested.", level=logging.DEBUG)

    # Identity check: an equality check could be fooled by an object whose
    # ``__eq__`` claims to equal None, which would trigger a retraining.
    if self.data_for_training is None:
        self._log("Producing data for training.", level=logging.DEBUG)
        # Train this node
        self.train_sweep(use_test_data)

        # Compute a generator that yields the train data and
        # encapsulate it in an object that memoizes its outputs and
        # provides a "fresh" method that returns a new generator that will
        # yield the same sequence
        train_data_generator = ((self.execute(data), label)
                                for data, label in self.external_training_set)
        self.data_for_training = MemoizeGenerator(train_data_generator,
                                                  caching=self.caching)
        self._log("Data for training finished", level=logging.DEBUG)

    # Return a fresh copy of the generator
    return self.data_for_training.fresh()
def _train(self, data, label):
    """ Train the wrapped node on the data vector *data* with *label*.

    The call is forwarded unchanged to the wrapped node's ``train``.
    """
    wrapped = self.wrapped_node
    wrapped.train(data, label)
def _stop_training(self):
    """ Finish the training of the node.

    The call is forwarded to the wrapped node's ``stop_training``.
    """
    wrapped = self.wrapped_node
    wrapped.stop_training()
def _execute(self, data):
    """ Execute the node on the data vector *data*.

    Returns whatever the wrapped node's ``execute`` returns for *data*.
    """
    result = self.wrapped_node.execute(data)
    return result
def store_state(self, result_dir, index=None):
    """ Store this node in the given directory *result_dir*.

    Storing is delegated to the wrapped node. *index* is passed through
    unchanged, so the wrapped node sees the index the caller supplied
    instead of always receiving None.
    """
    self.wrapped_node.store_state(result_dir, index=index)
示例13: TransferSplitterNode
# 需要导入模块: from pySPACE.tools.memoize_generator import MemoizeGenerator [as 别名]
# 或者: from pySPACE.tools.memoize_generator.MemoizeGenerator import fresh [as 别名]
#.........这里部分代码省略.........
def use_next_split(self):
    """ Switch to the next split of the data into training and test data.

    Returns True if more splits are available, otherwise False. When a
    further split exists, ``current_split`` is advanced by one and the new
    split is logged. This method is useful for benchmarking.
    """
    upcoming = self.current_split + 1
    if not upcoming < self.splits:
        # All splits have been used already
        return False
    self.current_split = upcoming
    self._log("Benchmarking with split %s/%s" % (self.current_split + 1,
                                                 self.splits))
    return True
def train_sweep(self, use_test_data):
    """ Perform the actual training of the node.

    .. note:: Split nodes cannot be trained, so this always raises
              ``Exception`` regardless of *use_test_data*.
    """
    message = "Split nodes cannot be trained"
    raise Exception(message)
def request_data_for_training(self, use_test_data):
    """ Return a generator over the training data of the current split.

    The split is created lazily on first use. *use_test_data* is not
    evaluated by this method. A new ``MemoizeGenerator`` over the
    instances selected by ``split_indices_train[current_split]`` is
    created on every call.
    """
    # Create split lazily when required; identity check, since None is
    # the "not yet created" marker
    if self.split_indices_train is None:
        self._create_split()

    # Create training data generator
    self.data_for_training = MemoizeGenerator(
        self.data[i] for i in self.split_indices_train[self.current_split])

    return self.data_for_training.fresh()
def request_data_for_testing(self):
    """ Return a generator over the test data of the current split.

    The split is created lazily on first use. A new ``MemoizeGenerator``
    over the instances selected by ``split_indices_test[current_split]``
    is created on every call.
    """
    # Create split lazily when required; identity check, since None is
    # the "not yet created" marker
    if self.split_indices_test is None:
        self._create_split()

    # Create test data generator
    self.data_for_testing = MemoizeGenerator(
        self.data[i] for i in self.split_indices_test[self.current_split])

    return self.data_for_testing.fresh()
def _create_split(self):
""" Create the split of the data into training and test data. """
self._log("Splitting data into train and test data")
# Get training and test data
# note: return the data in a list can double the memory requirements!
train_data = list(self.input_node.request_data_for_training(
use_test_data = False))
test_data = list(self.input_node.request_data_for_testing())
# If there is already a non-empty training set,
# it means that we are not the first split node in the node chain.
if len(train_data) > 0:
if len(test_data)==0:
# If there was an All_Train_Splitter before, filter according
# to wdef_train and return all training data
self.split_indices_train = \
[[ind for ind, (win, lab) in enumerate(train_data) \
if win.specs['wdef_name'] in self.wdefs_train]]
self.split_indices_test = [[]]
示例14: InstanceSelectionNode
# 需要导入模块: from pySPACE.tools.memoize_generator import MemoizeGenerator [as 别名]
# 或者: from pySPACE.tools.memoize_generator.MemoizeGenerator import fresh [as 别名]
#.........这里部分代码省略.........
self.train_percentage_selected,
level=logging.DEBUG)
# Retain only *percentage_selected* percent of the data
for label, instances in all_instances.iteritems():
# enable random choice of samples
r.shuffle(instances)
if not self.reduce_class or \
self.train_percentage_selected == 100:
end_index = int(round(len(instances) *
self.train_percentage_selected / 100))
elif not (self.reduce_class == label):
end_index = len(instances)
else: # self.reduce_class==label--> reduction needed
end_index = int(round(len(instances) *
self.train_percentage_selected / 100))
retained_instances.extend(zip(instances[0:end_index],
[label]*end_index))
if self.random:
# mix up samples between the different labels
r.shuffle(retained_instances)
# Compute a generator the yields the train data and
# encapsulate it in an object that memoizes its outputs and
# provides a "fresh" method that returns a new generator that will
# yield the same sequence
train_data_generator = ((self.execute(data), label)
for (data, label) in retained_instances)
self.data_for_training = MemoizeGenerator(train_data_generator,
caching=self.caching)
self._log("Data for training finished", level=logging.DEBUG)
# Return a fresh copy of the generator
return self.data_for_training.fresh()
def request_data_for_testing(self):
""" Returns data for testing of subsequent nodes
.. todo:: to document
"""
assert(self.input_node is not None)
if self.test_percentage_selected > 100:
self._log("Test percentage of %f reduced to 100." %
self.test_percentage_selected,
level=logging.ERROR)
self.test_percentage_selected = 100
self._log("Data for testing is requested.", level=logging.DEBUG)
if self.test_percentage_selected == 100:
return super(InstanceSelectionNode, self).request_data_for_testing()
# If we haven't computed the data for testing yet
if self.data_for_testing is None:
# Assert that this node has already been trained
assert(not self.is_trainable() or
self.get_remaining_train_phase() == 0)
# Divide available instances according to label
all_instances = defaultdict(list)
for instance, label in self.input_node.request_data_for_testing():
all_instances[label].append(instance)
self._log("Keeping only %s percent of test data" %
self.test_percentage_selected,
level=logging.DEBUG)
r = random.Random(self.run_number)
示例15: RandomTimeSeriesSourceNode
# 需要导入模块: from pySPACE.tools.memoize_generator import MemoizeGenerator [as 别名]
# 或者: from pySPACE.tools.memoize_generator.MemoizeGenerator import fresh [as 别名]
#.........这里部分代码省略.........
channel_names = ["X", "Y"],
class_labels = ['A','B'],
class_choice_function = random.random,
choice_threshold = 0.33,
sampling_frequency = 2,
**kwargs):
super(RandomTimeSeriesSourceNode, self).__init__(**kwargs)
# We have to create a dummy collection
class DummyObject(object): pass
collection = DummyObject()
collection.meta_data = {'runs' : 1}
collection.data = {}
# only binary classification supported by now
assert( len(class_labels) == 2)
self.set_permanent_attributes(collection = collection,
num_instances = num_instances,
generating_function_class_0 = generating_function_class_0,
generating_function_class_1 = generating_function_class_1,
channel_names = channel_names,
class_labels = class_labels,
class_choice_function = class_choice_function,
choice_threshold = choice_threshold,
sampling_frequency = sampling_frequency)
def generate_random_data(self):
    """ Generate ``num_instances`` labeled random time series.

    Invoked by the train and test data generation methods. For every
    instance ``class_choice_function`` is called; a value below
    ``choice_threshold`` selects the first class (first generating function
    and first class label), anything else selects the second class.
    Returns a list of ``(TimeSeries, label)`` tuples.
    """
    samples = []
    for index in range(self.num_instances):
        # Decide which class this instance belongs to
        if self.class_choice_function() < self.choice_threshold:
            values = self.generating_function_class_0(index)
            chosen_label = self.class_labels[0]
        else:
            values = self.generating_function_class_1(index)
            chosen_label = self.class_labels[1]
        series = TimeSeries(input_array = values,
                            channel_names = self.channel_names,
                            sampling_frequency = self.sampling_frequency)
        samples.append((series, chosen_label))
    return samples
def request_data_for_testing(self):
    """ Return the data that can be used for testing of subsequent nodes.

    On the first call random data is generated and wrapped in a caching
    ``MemoizeGenerator``. Every call returns a fresh generator that yields
    the same ``(sample, label)`` sequence.
    """
    # If we haven't generated the data for testing yet. Identity check:
    # an equality check could be fooled by an object whose ``__eq__``
    # claims to equal None, which would regenerate the data.
    if self.data_for_testing is None:
        generated_data = self.generate_random_data()

        # Create a generator that emits the windows
        test_data_generator = ((sample, label)
                               for sample, label in generated_data)
        self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                 caching = True)

    # Return a fresh copy of the generator
    return self.data_for_testing.fresh()
def request_data_for_training(self, use_test_data):
    """ Return the data that can be used for training of subsequent nodes.

    If *use_test_data* is true, the result of ``request_data_for_testing``
    is returned. Otherwise random data is generated on the first call and
    wrapped in a caching ``MemoizeGenerator``; every call returns a fresh
    generator that yields the same ``(sample, label)`` sequence.
    """
    if use_test_data:
        return self.request_data_for_testing()

    # If we haven't generated the data for training yet. Identity check:
    # an equality check could be fooled by an object whose ``__eq__``
    # claims to equal None, which would regenerate the data.
    if self.data_for_training is None:
        generated_data = self.generate_random_data()

        # Create a generator that emits the windows
        train_data_generator = ((sample, label)
                                for sample, label in generated_data)
        self.data_for_training = MemoizeGenerator(train_data_generator,
                                                  caching = True)

    # Return a fresh copy of the generator
    return self.data_for_training.fresh()
def get_metadata(self, key):
    """ Return None for every *key*.

    This source node does not contain collection meta data.
    """
    no_metadata = None
    return no_metadata