本文整理汇总了Python中pycuda.driver.Context.synchronize方法的典型用法代码示例。如果您正苦于以下问题：Python Context.synchronize方法的具体用法？Python Context.synchronize怎么用？Python Context.synchronize使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pycuda.driver.Context的用法示例。
在下文中一共展示了Context.synchronize方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: calcV
# 需要导入模块: from pycuda.driver import Context [as 别名]
# 或者: from pycuda.driver.Context import synchronize [as 别名]
def calcV(I_shape, I_cu, V_cu):
    """Launch the ``k_calcV`` kernel and wait for it to finish.

    Derives the launch grid and the kernel's scalar arguments from
    ``I_shape``, prints a few of the derived values, launches the kernel
    through ``call_cu_kernel``, then calls ``Context.synchronize()`` and
    records a ``timecheck`` mark.

    Args:
        I_shape: Shape of the input image, indexed as ``(Ci, iH, iW, N)``.
        I_cu: Device-side input buffer (second kernel argument).
        V_cu: Device-side output buffer (first kernel argument).
    """
    Ci = I_shape[0]
    iH = I_shape[1]
    iW = I_shape[2]
    N = I_shape[3]

    # Output spatial size equals the input size; padding is fixed at 1.
    oH, oW = iH, iW
    padH = padW = 1

    # Adapted from winograd_conv.py. The upstream code picks shlN from N
    # (0 when N == 1, len(bin(N - 1)) - 2 when N < 32, otherwise 5); this
    # version pins it to 5.
    shlN = 5
    tile_params = {
        0: (4, 5, 0x18, 3, 0x07, 0, 0x00, 0x203, 0x300),  # 4x8  yyxxx
        1: (4, 4, 0x18, 3, 0x06, 1, 0x01, 0x203, 0x201),  # 4x4  yyxxn
        2: (3, 4, 0x10, 4, 0x0c, 2, 0x03, 0x104, 0x202),  # 2x4  yxxnn
        3: (2, 4, 0x00, 0, 0x18, 3, 0x07, 0x000, 0x203),  # 1x4  xxnnn
        4: (2, 3, 0x00, 0, 0x10, 4, 0x0f, 0x000, 0x104),  # 1x2  xnnnn
        5: (2, 2, 0x00, 0, 0x00, 0, 0x1f, 0x000, 0x000),  # 1x1  nnnnn
    }
    # Direct indexing so an unknown shlN fails with a clear KeyError rather
    # than an unpacking error on None. The last two entries (supY, supX)
    # are not passed to this kernel.
    (shlY, shlX, maskY, shrY, maskX, shrX, maskN,
     _supY, _supX) = tile_params[shlN]

    GYS = ceil_div(oH, 1 << shlY)
    GXS = ceil_div(oW, 1 << shlX)
    GN = ceil_div(N, 1 << shlN)
    GYS2 = GYS // 2
    GXS2 = GXS * 2

    # Each result is indexed [0] and [1] below when passed to the kernel.
    div_GXS2 = get_div_mul_shift_32(GXS * GYS, GXS2)
    div_GXS = get_div_mul_shift_32(GXS * GYS, GXS)

    print('div_GXS', div_GXS)
    print('GYS', GYS, 'GXS', GXS, 'GN', GN, 'Ci', Ci, 'GY_GX', GXS * GYS)

    grid = (GN, GYS * GXS, Ci)
    block = (32, 1, 1)

    # NOTE(review): 1152 presumably equals 6 * 6 * 32 (one transformed tile
    # times 32 images) -- confirm against the kernel source.
    call_cu_kernel(
        k_calcV,
        grid, block,
        V_cu, I_cu,
        iH, iW, N, padH, padW,
        GXS, GYS2, GXS2, div_GXS2[0], div_GXS2[1], div_GXS[0], div_GXS[1],
        shlY, shlX, maskY, shrY, maskX, shrX, shlN, maskN,
        iH * iW * N, iW * N, GYS * GXS * Ci * 1152, GXS * Ci * 1152, Ci * 1152,
        GXS, GXS * GYS, GN, Ci)
    Context.synchronize()
    timecheck('calced V_cu')
示例2: calcO
# 需要导入模块: from pycuda.driver import Context [as 别名]
# 或者: from pycuda.driver.Context import synchronize [as 别名]
def calcO(O_cu, M_shape, M_cu):
    """Launch the ``k_calcO`` kernel over every element counted from ``M_shape``.

    Args:
        O_cu: Device-side output buffer (first kernel argument).
        M_shape: Shape of ``M``; entries 0, 2 and 4 are read as GN, GK and
            the per-axis tile count.
        M_cu: Device-side input buffer (second kernel argument).
    """
    group_k, group_n, tile_count = M_shape[2], M_shape[0], M_shape[4]

    # Total work items: 32 per GK group, 32 per GN group, tiles squared.
    work_items = group_k * 32 * group_n * 32 * tile_count * tile_count

    threads = (32, 1, 1)
    blocks = (ceil_div(work_items, 32), 1, 1)

    call_cu_kernel(k_calcO, blocks, threads, O_cu, M_cu, work_items)

    # Wait for the launch before recording the timing mark.
    Context.synchronize()
    timecheck('calced O_cu')
示例3: calcM
# 需要导入模块: from pycuda.driver import Context [as 别名]
# 或者: from pycuda.driver.Context import synchronize [as 别名]
def calcM(N, Co, M_cu, U_shape, U_cu, V_shape, V_cu):
    """Launch the ``k_calcM`` kernel on the ``U`` and ``V`` buffers.

    Args:
        N: Batch size; only printed, never passed to the kernel.
        Co: Ignored -- the value is recomputed from ``U_shape`` before use.
        M_cu: Device-side output buffer (first kernel argument).
        U_shape: Shape of ``U``; entries 2, 3 and 4 are read.
        U_cu: Device-side buffer holding ``U``.
        V_shape: Shape of ``V``; entries 2 and 4 are read.
        V_cu: Device-side buffer holding ``V``.
    """
    # Output-channel count as implied by the U buffer layout; this replaces
    # whatever the caller passed as ``Co``.
    co_from_u = (U_shape[2] - 1) * 32 + U_shape[4]
    in_channels = U_shape[3]
    group_k = ceil_div(co_from_u, 32)

    tile_count = V_shape[4]
    group_n = V_shape[2]

    print('GK', group_k, 'GN', group_n, 'tiles', tile_count,
          'Co', co_from_u, 'Ci', in_channels, 'N', N)

    launch_grid = (tile_count * tile_count, 1, 1)
    launch_block = (32, 16, 1)  # 16 for intel...

    # An earlier variant also passed two 32*32*4-byte cl.LocalMemory
    # buffers; this launch does not.
    call_cu_kernel(
        k_calcM,
        launch_grid, launch_block,
        M_cu, U_cu, V_cu,
        in_channels, 1, tile_count, group_n, group_k)
    Context.synchronize()
    timecheck('calced M_cu')
示例4: calcU
# 需要导入模块: from pycuda.driver import Context [as 别名]
# 或者: from pycuda.driver.Context import synchronize [as 别名]
def calcU(W_shape, W_cu, U_cu):
    """Launch the ``k_calcU`` kernel and wait for it to finish.

    Args:
        W_shape: Shape of the filter tensor, indexed as ``(Ci, kH, kW, Co)``.
        W_cu: Device-side filter buffer (second kernel argument).
        U_cu: Device-side output buffer (first kernel argument).
    """
    Ci = W_shape[0]
    kH = W_shape[1]
    kW = W_shape[2]
    Co = W_shape[3]

    # This is adapted from neon's winograd_conv.py: output channels are
    # processed in groups of 32.
    GK = ceil_div(Co, 32)

    grid = (GK, Ci, 1)
    block = (32, 1, 1)

    # NOTE(review): the scalar arguments look like element strides derived
    # from the (Ci, kH, kW, Co) layout, and 1152 is presumably 6 * 6 * 32 --
    # confirm against the kernel source.
    call_cu_kernel(
        k_calcU,
        grid, block,
        U_cu, W_cu,
        kH * kW * Co, kW * Co, kW * Co * 2, Co, Ci * 1152,
        Ci, GK)
    Context.synchronize()
    timecheck('calced U_cu')
示例5: process
# 需要导入模块: from pycuda.driver import Context [as 别名]
# 或者: from pycuda.driver.Context import synchronize [as 别名]
def process(iH, iW, N, Ci, Co, kH=3, kW=3):
inittime()
np.random.seed(123)
oH = iH
oW = iW
tiles = iW // 4
shlN = 5
shlY, shlX, maskY, shrY, maskX, shrX, maskN, supY, supX = {
0 : (4, 5, 0x18, 3, 0x07, 0, 0x00, 0x203, 0x300), # 4x8 yyxxx
1 : (4, 4, 0x18, 3, 0x06, 1, 0x01, 0x203, 0x201), # 4x4 yyxxn
2 : (3, 4, 0x10, 4, 0x0c, 2, 0x03, 0x104, 0x202), # 2x4 yxxnn
3 : (2, 4, 0x00, 0, 0x18, 3, 0x07, 0x000, 0x203), # 1x4 xxnnn
4 : (2, 3, 0x00, 0, 0x10, 4, 0x0f, 0x000, 0x104), # 1x2 xnnnn
5 : (2, 2, 0x00, 0, 0x00, 0, 0x1f, 0x000, 0x000), # 1x1 nnnnn
}.get(shlN)
GYS = ceil_div(oH, 1 << shlY)
GXS = ceil_div(oW, 1 << shlX)
GN = ceil_div(N, 1 << shlN)
# GK = ceil_div(Co, 32)
GYS2 = GYS // 2
GXS2 = GXS * 2
GK = ceil_div(Co, 32)
W = np.random.randn(Ci,kH,kW,Co).astype(np.float32)
I = np.zeros((Ci,iH, iW,N), dtype=np.float32)
I[:] = np.random.randn(*I.shape)
print('Co', Co, 'iH', iH, 'iW', iW, 'N', N, 'tiles', tiles)
W_cu = gpuarray.to_gpu(W)
I_cu = gpuarray.to_gpu(I)
U = np.zeros((6, 6, GK, Ci, 32,), dtype=np.float32)
U_cu = gpuarray.to_gpu(U)
V = np.zeros((6, 6, GN,GXS, GYS, Ci, 32), dtype=np.float32)
V_cu = gpuarray.to_gpu(V)
M = np.zeros((GN, 32, GK, 32, tiles, tiles, 6, 6,), dtype=np.float32)
M_cu = gpuarray.to_gpu(M)
O = np.zeros((GN, 32, GK, 32, tiles, tiles, 4, 4,), dtype=np.float32)
O_cu = gpuarray.to_gpu(O)
Context.synchronize()
print('allocated buffers')
start = time.time()
for it in range(3):
calcU(U_cu=U_cu, W_shape=W.shape, W_cu=W_cu)
calcV(V_cu=V_cu, I_shape=I.shape, I_cu=I_cu)
calcM(N=N, Co=Co, M_cu=M_cu, U_shape=U.shape, U_cu=U_cu, V_shape=V.shape, V_cu=V_cu)
calcO(O_cu=O_cu, M_shape=M.shape, M_cu=M_cu)
Context.synchronize()
end = time.time()
print('calcs done')
print('time for all calcs:', end - start)
start = time.time()
O = O_cu.get()
# cl.enqueue_copy(q, O, O_cu)
O = O.transpose(2,3, 4,6, 5,7, 0,1).reshape(
GK * 32, tiles * 4, tiles * 4, GN * 32)
print('O.shape', O.shape)
W_from_cu = np.zeros((Ci, 3, 3, Co), dtype=np.float32)
W_from_cu = W_cu.get()
U_from_cpu = winograd_cpu.calcU(W=W)
U_from_cu = np.zeros((6, 6, GK, Ci, 32), dtype=np.float32)
U_from_cu = U_cu.get()
U_from_cu_ = U_from_cu.transpose(
0, 1, 2, 4, 3).reshape(6, 6, GK * 32, Ci)[:, :, :Co]
assert np.allclose(U_from_cu_, U_from_cpu, atol=1e-4)
V_from_cpu = winograd_cpu.calcV(I=I)
V_from_cu = np.copy(V)
V_from_cu = V_cu.get()
print('tiles', tiles)
# 0 1 2 3 4 5 6
# 6, 6, GN,GXS, GYS, Ci, 32
V_from_cu_ = V_from_cu.transpose(
2,6,0,1,5,3,4).reshape(
GN * 32, 6, 6, Ci, tiles, tiles)[:N]
assert np.allclose(V_from_cu_, V_from_cpu, atol=1e-3)
# 0 1 2 3 4 5 6 7
# [n//32][n % 32][co // 32][co % 32][th][tw][xi][nu]
M_from_cpu = winograd_cpu.calcM(U=U_from_cu, V=V_from_cu, N=N, Co=Co)
#.........这里部分代码省略.........