This article collects typical usage examples of the Python method nervanagpu.NervanaGPU.dot. If you have been wondering what exactly NervanaGPU.dot does, how to call it, or what it looks like in real code, the curated examples below should help. You can also read further about its enclosing class, nervanagpu.NervanaGPU.
The following shows 7 code examples of the NervanaGPU.dot method, sorted by popularity by default.
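Before diving in, here is a minimal sketch of the call pattern the examples share, assembled from the calls that appear below; the NervanaGPU() construction and the matrix sizes are illustrative assumptions rather than part of any one example:

import numpy as np
from nervanagpu import NervanaGPU

ng = NervanaGPU()  # backend instance; the examples below refer to it as `ng`
m, n, k = 128, 128, 128  # illustrative GEMM dimensions
devA = ng.array(np.random.uniform(-1.0, 1.0, (m, k)).astype(np.float32))
devB = ng.array(np.random.uniform(-1.0, 1.0, (k, n)).astype(np.float32))
devC = ng.empty((m, n), dtype=np.float32)
ng.dot(devA, devB, devC)  # computes C = A . B on the GPU
print(devC.get())  # copy the result back to a host numpy array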
Example 1: max
# Required import: from nervanagpu import NervanaGPU [as alias]
# or: from nervanagpu.NervanaGPU import dot [as alias]
devA2 = ng.empty(dimA, dtype=np.float32)
devB2 = ng.empty(dimB, dtype=np.float32)
devA2[:] = devA1
devB2[:] = devB1

devC2 = ng.empty(dimC, dtype=np.float32)

if op[0] == 't': devA1, devA2 = devA1.T, devA2.T
if op[1] == 't': devB1, devB2 = devB1.T, devB2.T

glops16 = 0
glops32 = 0
glops64 = 0

if op == "tn" and dtype is np.float16:
    # Experimental 128x16 gemm kernel
    glops16 = ng.dot(devA1, devB1, devC1, repeat=repeat, size=16)

if op != 'nt':
    glops32 = ng.dot(devA1, devB1, devC1, repeat=repeat, size=32)
    glops64 = ng.dot(devA1, devB1, devC1, repeat=repeat, size=64)

glops128 = ng.dot(devA1, devB1, devC1, repeat=repeat, size=128)

# pick the fastest measured tile size
glops = max(glops16, glops32, glops64, glops128)
if glops16 == glops:
    fastest = 16
elif glops32 == glops:
    fastest = 32
elif glops64 == glops:
    fastest = 64
else:
    fastest = 128
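Presumably the winning tile size recorded in fastest is then fed back into later calls; a hypothetical follow-up along these lines (not part of the original snippet):

# hypothetical: benchmark once more with the fastest tile size found above
glops = ng.dot(devA1, devB1, devC1, repeat=repeat, size=fastest)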
Example 2: GPU
# Required import: from nervanagpu import NervanaGPU [as alias]
# or: from nervanagpu.NervanaGPU import dot [as alias]
# ......... part of the code omitted here .........
    def uniform(self, low=0.0, high=1.0, shape=1, dtype=default_dtype,
                persist_values=True, name=None, allocator=drv.mem_alloc):
        """
        Generate a uniform random numpy array and convert it to a GPUTensor.
        If called with dtype=None it will probably explode.
        """
        ary = np.random.uniform(low, high, shape)
        return GPUTensor(ary.shape, dtype, allocator=allocator, name=name,
                         rounding=self.ng.round_mode).set(ary)

    def normal(self, loc=0.0, scale=1.0, size=1, dtype=default_dtype,
               persist_values=True, name=None, allocator=drv.mem_alloc):
        """
        Gaussian/Normal random number sample generation.
        """
        ary = np.random.normal(loc, scale, size)
        return GPUTensor(ary.shape, dtype, allocator=allocator, name=name,
                         rounding=self.ng.round_mode).set(ary)

    def fprop_fc(self, out, inputs, weights, layer=None):
        """
        Forward propagate the inputs of a fully connected network layer to
        produce output pre-activations (ready for transformation by an
        activation function).

        Arguments:
            out (GPUTensor): Where to store the forward propagated results.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer) or the outputs from the previous layer.
            weights (GPUTensor): The weight coefficient values for this layer.
            layer (Layer): The layer object.
        """
        self.ng.dot(weights, inputs, out)

    def bprop_fc(self, out, weights, deltas, layer=None):
        """
        Backward propagate the error through a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer.
            layer (Layer): The layer object.
        """
        self.ng.dot(weights.T, deltas, out)

    def update_fc(self, out, inputs, deltas, layer=None):
        """
        Compute the updated gradient for a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the updated gradient value.
            inputs (GPUTensor): Will be either the dataset input values (first
                                layer) or the outputs from the previous layer.
            deltas (GPUTensor): The error values for this layer.
            layer (Layer): The layer object.
        """
        self.ng.dot(deltas, inputs.T, out)

    def fprop_conv(self, out, inputs, weights, ofmshape, ofmsize, ofmlocs,
                   ifmshape, links, nifm, padding, stride, ngroups, fpropbuf,
                   local=False):
        """
        Forward propagate the inputs of a convolutional network layer to
        produce output pre-activations (ready for transformation by an
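Taken together, fprop_fc, bprop_fc and update_fc are the three matrix products of a fully connected layer, all expressed through NervanaGPU.dot. A shape sketch under assumed dimensions (nin inputs, nout outputs, batch size N; the numbers are illustrative, and ng stands for a NervanaGPU instance as in the other examples):

# assumed shapes: weights (nout, nin), inputs (nin, N), deltas (nout, N)
nout, nin, N = 128, 784, 64
weights = ng.empty((nout, nin), dtype=np.float32)
inputs = ng.empty((nin, N), dtype=np.float32)
deltas = ng.empty((nout, N), dtype=np.float32)

out_fprop = ng.empty((nout, N), dtype=np.float32)
ng.dot(weights, inputs, out_fprop)  # fprop_fc: pre-activations

out_bprop = ng.empty((nin, N), dtype=np.float32)
ng.dot(weights.T, deltas, out_bprop)  # bprop_fc: errors for the layer below

out_update = ng.empty((nout, nin), dtype=np.float32)
ng.dot(deltas, inputs.T, out_update)  # update_fc: weight gradient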
Example 3: in
# Required import: from nervanagpu import NervanaGPU [as alias]
# or: from nervanagpu.NervanaGPU import dot [as alias]
devA2[:] = devA1
devB2[:] = devB1

devC2 = ng.empty(dimC, dtype=np.float32)
# devC2 = devC2s.share(dimC, dtype=np.float32)
devC2[:] = devC1

if op[0] == 't': devA1, devA2 = devA1.T, devA2.T
if op[1] == 't': devB1, devB2 = devB1.T, devB2.T

for tile in (32, 64, 128):
    if op == 'nt' and tile != 128:
        continue
    try:
        ng.dot(devA1, devB1, devC1, alpha=alpha, beta=beta, size=tile)
        context.synchronize()
        cublas_dot(devA2, devB2, devC2, alpha=alpha, beta=beta)

        partial1 = ng.empty((devC1.shape[0], 1), dtype=np.float32)
        partial2 = partial1[0:1, 0:1]

        # bail out if the nervanagpu result contains any non-finite values
        if ng.min(ng.finite(devC1), partial=partial1, out=partial2).get()[0,0] == 0.0:
            print("Error: NaN KCN: (%d,%d,%d) ab: (%f,%f) dtype: %d" %
                  (K, C, N, alpha, beta, itemsize))
            exit()

        # relative error of the nervanagpu result against the cuBLAS result
        diff = ng.max(abs(devC2 - devC1), partial=partial1, out=partial2).get()[0,0]
        mean = ng.mean(abs(devC2), partial=partial1, out=partial2).get()[0,0]
        pctErr = 100 * diff / mean
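Two details worth noting: the reductions (ng.min, ng.max, ng.mean) run on the GPU into the small partial1/partial2 buffers, so only a single scalar crosses back to the host via .get(); and the snippet is cut off before pctErr is acted on. A hypothetical continuation might gate on a tolerance, for example:

        # hypothetical continuation: flag tiles whose relative error is too large
        if pctErr > 0.5:  # tolerance in percent; the value is illustrative
            print("Error: %.3f%% for tile %d" % (pctErr, tile))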
Example 4: exit
# Required import: from nervanagpu import NervanaGPU [as alias]
# or: from nervanagpu.NervanaGPU import dot [as alias]
if data_type == "All Ones":
    cpuA = np.ones(dimA, dtype=dtype).astype(np.float32)
    cpuB = np.ones(dimB, dtype=dtype).astype(np.float32)
    # cpuB = np.identity(n, dtype=np.float32)
else:
    cpuA = np.random.uniform(-1.0, 1.0, dimA).astype(np.float32)
    cpuB = np.random.uniform(-1.0, 1.0, dimB).astype(np.float32)

devA = ng.array(cpuA, dtype=dtype)
devB = ng.array(cpuB, dtype=dtype)
devC = ng.empty(dimC, dtype=dtype)

if op[0] == 't': cpuA, devA = cpuA.T, devA.T
if op[1] == 't': cpuB, devB = cpuB.T, devB.T

ng.dot(devA, devB, devC, repeat=repeat)

if cpu:
    # compare against a numpy reference result on the host
    cpuC = np.dot(cpuA, cpuB)
    cpuD = devC.get()
    diff = np.absolute(cpuC - cpuD)
    print(diff.max())
    print(cpuD[::max(m//4,1), ::max(n//4,1)])
    print(cpuC[::max(m//4,1), ::max(n//4,1)])
    print(diff[::max(m//4,1), ::max(n//4,1)])
    # print(cpuD)
    # exit()
Example 5: cublas_dot
# Required import: from nervanagpu import NervanaGPU [as alias]
# or: from nervanagpu.NervanaGPU import dot [as alias]
    devA2 = devA1
    devB2 = devB1
# otherwise copy
else:
    devA2 = ng.empty(dimA, dtype=np.float32)
    devB2 = ng.empty(dimB, dtype=np.float32)
    devA2[:] = devA1
    devB2[:] = devB1

devC2 = ng.empty(dimC, dtype=np.float32)
devC2[:] = devC1

if op[0] == 't': devA1, devA2 = devA1.T, devA2.T
if op[1] == 't': devB1, devB2 = devB1.T, devB2.T

ng.dot(devA1, devB1, devC1, alpha=alpha, beta=beta, repeat=repeat)
cublas_dot(devA2, devB2, devC2, alpha=alpha, beta=beta, repeat=repeat)

partial1 = ng.empty((devC1.shape[0], 1), dtype=np.float32)
partial2 = partial1[0:1, 0:1]

diff = ng.max(abs(devC2 - devC1), partial=partial1, out=partial2).get()[0,0]
mean = ng.mean(abs(devC2), partial=partial1, out=partial2).get()[0,0]

# if diff > .1:
print("Error: %.3f%%" % (100 * diff / mean))
print("--------------------------------------------------------------------------------")

cublas.cublasDestroy(handle)
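The alpha and beta arguments follow the standard GEMM convention, C = alpha * A.B + beta * C. That is why devC2 is initialized from devC1 before the two calls: with a nonzero beta, both libraries must start from identical accumulator contents for the comparison to be meaningful.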
Example 6: min
# Required import: from nervanagpu import NervanaGPU [as alias]
# or: from nervanagpu.NervanaGPU import dot [as alias]
cpuA = np.random.uniform(-1.0, 1.0, dimA).astype(np.float32)
cpuB = np.random.uniform(-1.0, 1.0, dimB).astype(np.float32)
# cpuB = np.identity(n, dtype=dtype)

devA = ng.array(cpuA, dtype=dtype)
devB = ng.array(cpuB, dtype=dtype)
devC = ng.empty(dimC, dtype=dtype)

# repeat = min(int(50.0 * 4096**3 / (m * n * k)), 1000)

if op[0] == "t":
    cpuA, devA = cpuA.T, devA.T
if op[1] == "t":
    cpuB, devB = cpuB.T, devB.T

ng.dot(devA, devB, devC, repeat=1)
# context.synchronize()

cpuC = np.dot(cpuA, cpuB)
cpuD = devC.get()
diff = np.absolute(cpuC - cpuD)
max_diff = diff.max()
print(max_diff, cpuD.max())

# max_diff != max_diff is True only when max_diff is NaN
if max_diff > 0.1 or max_diff != max_diff:
    # print(m, n, k, max_diff)
    print(cpuD[:: max(m // 16, 1), :: max(n // 16, 1)])
    print(cpuC[:: max(m // 16, 1), :: max(n // 16, 1)])
    print(diff[:: max(m // 16, 1), :: max(n // 16, 1)])
    exit()
Example 7: MGPU
# Required import: from nervanagpu import NervanaGPU [as alias]
# or: from nervanagpu.NervanaGPU import dot [as alias]
# ......... part of the code omitted here .........
        assert hbuf.size == dbuf.size * dbuf.num_dev
        assert isinstance(dbuf, MGPUTensor)
        assert hbuf.dtype == dbuf.dtype

        ndata = dbuf.size
        starts = [i * ndata for i in range(self.num_dev)]

        # scatter contiguous slices of the host buffer to each device
        for dest, strm, ctx, doff in zip(dbuf.tlist, self.strms, self.ctxs,
                                         starts):
            src = hbuf.reshape((hbuf.size))[doff:(doff + ndata)]
            ctx.push()
            drv.memcpy_htod_async(dest.ptr, src, strm)
            ctx.pop()
        self.synchronize()

    def fprop_fc(self, out, inputs, weights, layer=None):
        """
        In this case, the weights are shards and the activations are
        replicas; ubuf should be of size nout/num_dev x mbsz.
        """
        ubuf = layer.mempool[0]
        assert ubuf.shape == (weights.shape[0], inputs.shape[1])

        if layer.use_biases:
            biases = layer.biases.tlist
        else:
            biases = [None for i in range(self.num_dev)]

        for dbuf, ibuf, wt, bs, strm, ctx in zip(ubuf.tlist, inputs.tlist,
                                                 weights.tlist, biases,
                                                 self.strms, self.ctxs):
            ctx.push()
            self.ng.stream = strm
            self.ng.dot(wt, ibuf, dbuf)
            if layer.use_biases:
                self.ng.add(dbuf, bs, out=dbuf)
            ctx.pop()

        # Note: it should be safe not to sync here because each fragment is
        # computed on the same stream that originates the copy.
        # self.synchronize()
        self.fragment_to_replica(ubuf, out)

    def bprop_fc(self, out, weights, deltas, layer=None):
        """
        Backward propagate the error through a fully connected network layer.

        Arguments:
            out (GPUTensor): Where to store the backward propagated errors.
            weights (GPUTensor): The weight coefficient values for this layer.
            deltas (GPUTensor): The error values for this layer.
            layer (Layer): The layer object.
        """
        ubuf = layer.mempool[1]
        wtsz = weights.shape[0]
        starts = [i * wtsz for i in range(self.num_dev)]
        assert out.shape == (weights.shape[1], deltas.shape[1])
        assert ubuf.shape == out.shape

        for dbuf, ibuf, wt, strm, ctx, off in zip(out.tlist, deltas.tlist,
                                                  weights.tlist, self.strms,
                                                  self.ctxs, starts):
            ctx.push()
            self.ng.stream = strm
            self.ng.dot(wt.T, ibuf[off:(off + wtsz)], dbuf)
            ctx.pop()
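To make the sharding concrete with illustrative numbers: with nout = 512, a minibatch of N = 64 and num_dev = 4, each device would hold a (128, nin) weight shard. In fprop_fc each device computes a (128, 64) fragment and fragment_to_replica reassembles the full (512, 64) activations on every device; in bprop_fc each device multiplies its shard's transpose against the matching 128-row slice of the deltas, and the resulting per-device partials presumably get combined in code beyond this excerpt.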