This article collects typical usage examples of the Python method thunder.rdds.fileio.seriesloader.SeriesLoader.fromBinary. If you are wondering what SeriesLoader.fromBinary does, how to call it, or what real code that uses it looks like, the curated examples below should help. You can also browse further usage examples of the containing class, thunder.rdds.fileio.seriesloader.SeriesLoader.

The following shows 6 code examples of SeriesLoader.fromBinary, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
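Before diving into the examples, here is a minimal sketch of the call pattern they all share. It is an illustration, not part of the examples below: `sc` is assumed to be an existing SparkContext, and the directory name 'data/series' is a placeholder. The keyword casing (keyType/valueType) follows Example 1; older Thunder versions, as in Example 3, spell these keytype/valuetype.

from thunder.rdds.fileio.seriesloader import SeriesLoader

loader = SeriesLoader(sc)  # wrap the active SparkContext
# read binary series records; nkeys/nvalues/types may instead come from a conf.json
series = loader.fromBinary('data/series', nkeys=3, nvalues=100,
                           keyType='int16', valueType='float32')
print series.rdd.first()  # ((k1, k2, k3), array([...], dtype=float32))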
Example 1: _run_tst_fromBinary
# Required import: from thunder.rdds.fileio.seriesloader import SeriesLoader [as alias]
# Alternatively: from thunder.rdds.fileio.seriesloader.SeriesLoader import fromBinary [as alias]
def _run_tst_fromBinary(self, useConfJson=False):
    # run this as a single big test so as to avoid repeated setUp and tearDown of the Spark context
    # DATA is a sequence of test data items; all keys and all values in a test data item
    # must be of the same length, and keys get converted to ints regardless of raw input format
    DATA = [
        SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]], 'int16', 'int16'),
        SeriesBinaryTestData.fromArrays([[1, 2, 3], [5, 6, 7]], [[11], [12]], 'int16', 'int16'),
        SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]], 'int16', 'int32'),
        SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11, 12, 13]], 'int32', 'int16'),
        SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11.0, 12.0, 13.0]], 'int16', 'float32'),
        SeriesBinaryTestData.fromArrays([[1, 2, 3]], [[11.0, 12.0, 13.0]], 'float32', 'float32'),
        SeriesBinaryTestData.fromArrays([[2, 3, 4]], [[11.0, 12.0, 13.0]], 'float32', 'float32'),
    ]

    for itemidx, item in enumerate(DATA):
        outSubdir = os.path.join(self.outputdir, 'input%d' % itemidx)
        os.mkdir(outSubdir)

        fname = os.path.join(outSubdir, 'inputfile%d.bin' % itemidx)
        with open(fname, 'wb') as f:
            item.writeToFile(f)

        loader = SeriesLoader(self.sc)
        if not useConfJson:
            series = loader.fromBinary(outSubdir, nkeys=item.nkeys, nvalues=item.nvals,
                                       keyType=str(item.keyDtype), valueType=str(item.valDtype))
        else:
            # write a configuration file describing the binary record layout
            conf = {'input': outSubdir,
                    'nkeys': item.nkeys, 'nvalues': item.nvals,
                    'valuetype': str(item.valDtype), 'keytype': str(item.keyDtype)}
            with open(os.path.join(outSubdir, "conf.json"), 'wb') as f:
                json.dump(conf, f, indent=2)
            series = loader.fromBinary(outSubdir)
        seriesData = series.rdd.collect()

        expectedData = item.data
        assert_equals(len(expectedData), len(seriesData),
                      "Differing numbers of k/v pairs in item %d; expected %d, got %d" %
                      (itemidx, len(expectedData), len(seriesData)))

        for expected, actual in zip(expectedData, seriesData):
            expectedKeys = tuple(expected[0])
            expectedType = smallestFloatType(item.valDtype)
            expectedVals = array(expected[1], dtype=expectedType)
            assert_equals(expectedKeys, actual[0],
                          "Key mismatch in item %d; expected %s, got %s" %
                          (itemidx, str(expectedKeys), str(actual[0])))
            assert_true(allclose(expectedVals, actual[1]),
                        "Value mismatch in item %d; expected %s, got %s" %
                        (itemidx, str(expectedVals), str(actual[1])))
            assert_equals(expectedType, str(actual[1].dtype),
                          "Value type mismatch in item %d; expected %s, got %s" %
                          (itemidx, expectedType, str(actual[1].dtype)))
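As Example 1 shows, fromBinary can pick up its parameters from a conf.json file in the input directory instead of keyword arguments. A minimal sketch of writing one follows, using the same field names as the conf dict in the test above; the 'data/series' path is a placeholder, and the 'wb' mode matches the Python 2 style used throughout these examples.

import json

conf = {'input': 'data/series',        # where the .bin files live
        'nkeys': 3, 'nvalues': 100,    # record layout: 3 keys, 100 values per record
        'keytype': 'int16', 'valuetype': 'float32'}
with open('data/series/conf.json', 'wb') as f:
    json.dump(conf, f, indent=2)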
Example 2: _run_roundtrip_tst
# Required import: from thunder.rdds.fileio.seriesloader import SeriesLoader [as alias]
# Alternatively: from thunder.rdds.fileio.seriesloader.SeriesLoader import fromBinary [as alias]
def _run_roundtrip_tst(self, testCount, arrays, blockSize):
    print "Running TestSeriesBinaryWriteFromStack roundtrip test #%d" % testCount
    insubdir = os.path.join(self.outputdir, 'input%d' % testCount)
    os.mkdir(insubdir)
    # no os.mkdir(outsubdir) here; saveFromStack creates the output directory itself
    outsubdir = os.path.join(self.outputdir, 'output%d' % testCount)

    for aryCount, array in enumerate(arrays):
        # array.tofile always writes in column-major order...
        array.tofile(os.path.join(insubdir, "img%02d.stack" % aryCount))

    # ... but we will read and interpret these as though they are in row-major order
    dims = list(arrays[0].shape)
    dims.reverse()

    underTest = SeriesLoader(self.sc)
    underTest.saveFromStack(insubdir, outsubdir, dims, blockSize=blockSize, datatype=str(arrays[0].dtype))
    roundtripped = underTest.fromBinary(outsubdir).collect()

    for serieskeys, seriesvalues in roundtripped:
        for seriesidx, seriesval in enumerate(seriesvalues):
            # flip indices again for the row- vs. column-major mismatch
            arykeys = list(serieskeys)
            arykeys.reverse()
            msg = "Failure on test #%d, time point %d, indices %s" % (testCount, seriesidx, str(tuple(arykeys)))
            try:
                assert_almost_equal(arrays[seriesidx][tuple(arykeys)], seriesval, places=4)
            except AssertionError, e:
                raise AssertionError(msg, e)
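Example 2 (and its extended variant in Example 4) exercises a stack-to-binary roundtrip. Reduced to its core, the workflow looks like the sketch below; paths, dims, and blockSize are placeholders, and `sc` is an existing SparkContext. Note that, as Example 4's assertions show, the value dtype read back by fromBinary is promoted to the smallest adequate float type.

from thunder.rdds.fileio.seriesloader import SeriesLoader

loader = SeriesLoader(sc)
# convert flat binary image stacks on disk into binary series files
loader.saveFromStack('in/stacks', 'out/series', [64, 64, 4],
                     blockSize=100, datatype='int16')
# read the series files back; values come back as floats
series = loader.fromBinary('out/series')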
Example 3: loadSeries
# Required import: from thunder.rdds.fileio.seriesloader import SeriesLoader [as alias]
# Alternatively: from thunder.rdds.fileio.seriesloader.SeriesLoader import fromBinary [as alias]
def loadSeries(self, datapath, nkeys=None, nvalues=None, inputformat='binary', minPartitions=None,
               conffile='conf.json', keytype=None, valuetype=None):
    """
    Loads a Series object from data stored as text or binary files.

    Supports single files or multiple files stored on a local file system, a networked file
    system (mounted and available on all cluster nodes), Amazon S3, or HDFS.

    Parameters
    ----------
    datapath: string
        Path to data files or directory, specified as either a local filesystem path or in a
        URI-like format, including scheme. A datapath argument may include a single '*' wildcard
        character in the filename. Examples of valid datapaths include
        'a/local/relative/directory/*.stack', 's3n:///my-s3-bucket/data/mydatafile.tif',
        '/mnt/my/absolute/data/directory/', or 'file:///mnt/another/data/directory/'.

    nkeys: int, optional (but required if `inputformat` is 'text')
        Dimensionality of data keys. (For instance, (x,y,z) keyed data for 3-dimensional image
        timeseries data.) For text data, the number of keys must be specified in this parameter;
        for binary data, it must be specified either here or in a configuration file named by
        the `conffile` argument.

    nvalues: int, optional (but required if `inputformat` is 'text')
        Number of values expected to be read. For binary data, nvalues must be specified either
        in this parameter or in a configuration file named by the `conffile` argument.

    inputformat: {'text', 'binary'}, optional, default 'binary'
        Format of data to be read.

    minPartitions: int, optional
        Explicitly specify the minimum number of Spark partitions to be generated from this
        data. Used only for text data. Default is to use the minParallelism attribute of the
        Spark context object.

    conffile: string, optional, default 'conf.json'
        Path to JSON file with configuration options including 'nkeys', 'nvalues', 'keytype',
        and 'valuetype'. If a file is not found at the given path, then the base directory given
        in `datapath` will also be checked. Parameters `nkeys` or `nvalues` that are specified
        as explicit arguments to this method take priority over those found in `conffile` if
        both are present.

    Returns
    -------
    data: thunder.rdds.Series
        A newly-created Series object, wrapping an RDD of series data. This RDD will have as
        keys an n-tuple of int, with n given by `nkeys` or the configuration passed in
        `conffile`. RDD values will be a numpy array of length `nvalues` (or as specified in
        the passed configuration file).
    """
    checkparams(inputformat, ['text', 'binary'])

    from thunder.rdds.fileio.seriesloader import SeriesLoader
    loader = SeriesLoader(self._sc, minPartitions=minPartitions)

    if inputformat.lower() == 'text':
        data = loader.fromText(datapath, nkeys=nkeys)
    else:
        # inputformat is 'binary' here; checkparams has already rejected anything else
        data = loader.fromBinary(datapath, conffilename=conffile, nkeys=nkeys, nvalues=nvalues,
                                 keytype=keytype, valuetype=valuetype)

    return data
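For context, a short usage sketch of the loadSeries method above: a minimal illustration, assuming `tsc` is the object exposing this method (a ThunderContext in Thunder's API) and the paths are placeholders. Note the lowercase keyword names of this older signature.

# binary input: record layout is read from conf.json unless overridden here
series = tsc.loadSeries('data/series', inputformat='binary')

# text input: nkeys must be given explicitly, one record per line
series = tsc.loadSeries('data/series.txt', inputformat='text', nkeys=3)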
Example 4: _run_roundtrip_tst
# Required import: from thunder.rdds.fileio.seriesloader import SeriesLoader [as alias]
# Alternatively: from thunder.rdds.fileio.seriesloader.SeriesLoader import fromBinary [as alias]
def _run_roundtrip_tst(self, testCount, arrays, blockSize):
    print "Running TestSeriesBinaryWriteFromStack roundtrip test #%d" % testCount
    insubdir = os.path.join(self.outputdir, 'input%d' % testCount)
    os.mkdir(insubdir)
    # no os.mkdir(outsubdir) here; saveFromStack creates the output directory itself
    outsubdir = os.path.join(self.outputdir, 'output%d' % testCount)

    for aryCount, array in enumerate(arrays):
        # array.tofile always writes in column-major order...
        array.tofile(os.path.join(insubdir, "img%02d.stack" % aryCount))

    # ... but we will read and interpret these as though they are in row-major order
    dims = list(arrays[0].shape)
    dims.reverse()

    underTest = SeriesLoader(self.sc)
    underTest.saveFromStack(insubdir, outsubdir, dims, blockSize=blockSize, datatype=str(arrays[0].dtype))
    series = underTest.fromStack(insubdir, dims, datatype=str(arrays[0].dtype))

    roundtripped_series = underTest.fromBinary(outsubdir)
    roundtripped = roundtripped_series.collect()
    direct = series.collect()

    expecteddtype = str(smallest_float_type(arrays[0].dtype))
    assert_equals(expecteddtype, roundtripped_series.dtype)
    assert_equals(expecteddtype, series.dtype)
    assert_equals(expecteddtype, str(roundtripped[0][1].dtype))
    assert_equals(expecteddtype, str(direct[0][1].dtype))

    with open(os.path.join(outsubdir, "conf.json"), 'r') as fp:
        # check that the binary series file data type matches the input stack data type
        # (not yet converted to float), at least according to conf.json
        conf = json.load(fp)
        assert_equals(str(arrays[0].dtype), conf["valuetype"])

    for ((serieskeys, seriesvalues), (directkeys, directvalues)) in zip(roundtripped, direct):
        assert_equals(directkeys, serieskeys)
        assert_equals(directvalues, seriesvalues)
        for seriesidx, seriesval in enumerate(seriesvalues):
            # flip indices again for the row- vs. column-major mismatch
            arykeys = list(serieskeys)
            arykeys.reverse()
            msg = "Failure on test #%d, time point %d, indices %s" % (testCount, seriesidx, str(tuple(arykeys)))
            try:
                assert_almost_equal(arrays[seriesidx][tuple(arykeys)], seriesval, places=4)
            except AssertionError, e:
                raise AssertionError(msg, e)
Example 5: _run_roundtrip_tst
# Required import: from thunder.rdds.fileio.seriesloader import SeriesLoader [as alias]
# Alternatively: from thunder.rdds.fileio.seriesloader.SeriesLoader import fromBinary [as alias]
def _run_roundtrip_tst(self, testIdx, nimages, aryShape, dtypeSpec, npartitions):
    testArrays = TestSeriesBinaryWriteFromStack.generateTestImages(nimages, aryShape, dtypeSpec)
    loader = SeriesLoader(self.sc)
    series = loader.fromArrays(testArrays)

    saveDirPath = os.path.join(self.outputdir, 'save%d' % testIdx)
    series.repartition(npartitions)  # note: this does an elementwise shuffle! won't be in sorted order
    series.saveAsBinarySeries(saveDirPath)

    nnonemptyPartitions = 0
    for partitionList in series.rdd.glom().collect():
        if partitionList:
            nnonemptyPartitions += 1
    del partitionList
    nsaveFiles = len(glob.glob(saveDirPath + os.sep + "*.bin"))

    roundtrippedSeries = loader.fromBinary(saveDirPath)

    with open(os.path.join(saveDirPath, "conf.json"), 'r') as fp:
        conf = json.load(fp)

    # sorting is required here b/c of the randomization induced by the repartition.
    # orig and roundtripped will in general be different from each other, since roundtripped
    # will have the (0, 0, 0) index as its first element (it will be the lexicographically first
    # file), while orig has only a 1-in-npartitions chance of starting with (0, 0, 0) after repartition.
    expectedPackedAry = series.pack(sorting=True)
    actualPackedAry = roundtrippedSeries.pack(sorting=True)

    assert_true(array_equal(expectedPackedAry, actualPackedAry))
    assert_equals(nnonemptyPartitions, nsaveFiles)
    assert_equals(len(aryShape), conf["nkeys"])
    assert_equals(nimages, conf["nvalues"])
    assert_equals("int16", conf["keytype"])
    assert_equals(str(series.dtype), conf["valuetype"])
    # check that we have converted ourselves to an appropriate float after reloading
    assert_equals(str(smallestFloatType(series.dtype)), str(roundtrippedSeries.dtype))
Example 6: loadSeries
# Required import: from thunder.rdds.fileio.seriesloader import SeriesLoader [as alias]
# Alternatively: from thunder.rdds.fileio.seriesloader.SeriesLoader import fromBinary [as alias]
def loadSeries(self, dataPath, nkeys=None, nvalues=None, inputFormat='binary', minPartitions=None,
               confFilename='conf.json', keyType=None, valueType=None, keyPath=None, varName=None):
    """
    Loads a Series object from data stored as binary, text, npy, or mat.

    For binary and text, supports single files or multiple files stored on a local file system,
    a networked file system (mounted and available on all cluster nodes), Amazon S3, or HDFS.
    For the local formats (npy and mat), only local file systems are currently supported.

    Parameters
    ----------
    dataPath: string
        Path to data files or directory, as either a local filesystem path or a URI.
        May include a single '*' wildcard in the filename. Examples of valid dataPaths include
        'local/directory/*.stack', 's3n:///my-s3-bucket/data/', or 'file:///mnt/another/directory/'.

    nkeys: int, optional (required if `inputFormat` is 'text'), default = None
        Number of keys per record (e.g. 3 for (x, y, z) coordinate keys). Must be specified for
        text data; can be specified here or in a configuration file for binary data.

    nvalues: int, optional (required if `inputFormat` is 'text')
        Number of values per record. Must be specified here or in a configuration file for
        binary data.

    inputFormat: {'text', 'binary', 'npy', 'mat'}, optional, default = 'binary'
        Format of data to be read.

    minPartitions: int, optional, default = SparkContext.minParallelism
        Minimum number of Spark partitions to use; applies only to text data.

    confFilename: string, optional, default 'conf.json'
        Path to JSON file with configuration options including 'nkeys', 'nvalues', 'keyType',
        and 'valueType'. If a file is not found at the given path, then the base directory in
        `dataPath` will be checked. Explicit parameters override the conf file.

    keyType: string or numpy dtype, optional, default = None
        Numerical type of keys; overrides the conf file.

    valueType: string or numpy dtype, optional, default = None
        Numerical type of values; overrides the conf file.

    keyPath: string, optional, default = None
        Path to a file with keys, used when loading from npy or mat.

    varName : str, optional, default = None
        Variable name to load (for MAT files only).

    Returns
    -------
    data: thunder.rdds.Series
        A Series object, wrapping an RDD of (n-tuple of int) : (numpy array) pairs.
    """
    checkParams(inputFormat, ['text', 'binary', 'npy', 'mat'])

    from thunder.rdds.fileio.seriesloader import SeriesLoader
    loader = SeriesLoader(self._sc, minPartitions=minPartitions)

    if inputFormat.lower() == 'binary':
        data = loader.fromBinary(dataPath, confFilename=confFilename, nkeys=nkeys, nvalues=nvalues,
                                 keyType=keyType, valueType=valueType)
    elif inputFormat.lower() == 'text':
        if nkeys is None:
            raise Exception('Must provide number of keys per record for loading from text')
        data = loader.fromText(dataPath, nkeys=nkeys)
    elif inputFormat.lower() == 'npy':
        data = loader.fromNpyLocal(dataPath, keyPath)
    else:
        # inputFormat is 'mat' here; checkParams has already rejected anything else
        if varName is None:
            raise Exception('Must provide variable name for loading MAT files')
        data = loader.fromMatLocal(dataPath, varName, keyPath)

    return data
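A usage sketch for this newer camelCase signature, again assuming `tsc` exposes the method (a ThunderContext) and that the file paths and variable name are placeholders. The MAT branch requires varName and raises an exception otherwise, as the code above shows.

# binary, with explicit overrides taking priority over conf.json
series = tsc.loadSeries('data/series', inputFormat='binary',
                        keyType='int16', valueType='float32')

# local MAT file: varName selects the variable, keyPath optionally supplies keys
series = tsc.loadSeries('data/series.mat', inputFormat='mat',
                        varName='ts', keyPath='data/keys.mat')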