本文整理汇总了Python中pycuda.driver.to_device函数的典型用法代码示例。如果您正苦于以下问题：Python to_device函数的具体用法？Python to_device怎么用？Python to_device使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了to_device函数的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_compare_order
def test_compare_order():
    '''
    Compare what the 'compare_order' kernel produces for the same data
    laid out as C (row-major) and as F (column-major).

    A 0..nx*ny-1 sequence is reshaped both ways, each 2D array is uploaded
    and run through the kernel with a single block of nx*ny threads, and the
    1D output buffer is copied back and printed.
    '''
    kernel = mod_cu.get_function('compare_order')

    nx, ny = 3, 4
    f_1d = np.arange(nx*ny, dtype='f8')
    f_2d_C = f_1d.reshape((nx,ny), order='C')
    f_2d_F = f_1d.reshape((nx,ny), order='F')

    # Each print below is written as a single-argument call so that it emits
    # exactly what the Python 2 print statement did (no separator is inserted
    # after a string that ends in a newline).
    print('')
    print('f_1d_C\n\n' + str(f_1d))
    print('f_2d_C\n' + str(f_2d_C))
    print('f_2d_F\n' + str(f_2d_F))

    print('')
    print('after cuda')
    host_out = np.zeros_like(f_1d)
    dev_out = cuda.mem_alloc_like(f_1d)
    launch_cfg = dict(block=(nx*ny,1,1), grid=(1,1))

    # Same upload / launch / read-back sequence for both memory layouts.
    for caption, host_2d in (('f_1d from f_2d_C\n', f_2d_C),
                             ('f_1d from f_2d_F\n', f_2d_F)):
        dev_in = cuda.to_device(host_2d)
        kernel(dev_in, dev_out, **launch_cfg)
        cuda.memcpy_dtoh(host_out, dev_out)
        print(caption + str(host_out))
示例2: multiply_csr
def multiply_csr(matrix, vector, block_size, repeat=1):
    '''
    Multiply a matrix by a vector with the CUDA kernel for the CSR format.
    The calculation is executed on an nVidia GPU.

    Parameters
    ==========
    matrix : Scipy matrix or numpy array
        Matrix to multiply. It is converted with
        ``mf.convert_to_scipy_csr`` before use.
    vector : numpy array
        Vector to multiply. Its length must equal the number of columns of
        the matrix.
    block_size : int (recommended 128 or 256)
        Size of the CUDA block.
    repeat : int > 0
        Number of times the multiplication is repeated. It has no effect on
        the result; it sets the length of the returned list of execution
        times.

    Returns
    =======
    Tuple of the multiplication result (float32 numpy array with one entry
    per matrix row) and the list of execution times, one per repetition.

    Raises
    ======
    ArithmeticError
        If the vector length differs from the number of matrix columns.
    '''
    if len(vector) != matrix.shape[1]:
        # The trailing space matters: adjacent literals are concatenated.
        raise ArithmeticError('Length of the vector is not equal to the '
                              'number of columns of the matrix.')

    matrix = mf.convert_to_scipy_csr(matrix)
    num_rows = matrix.shape[0]

    # Upload the three CSR arrays with the element types used by the kernel call.
    data = cuda.to_device(numpy.array(matrix.data, dtype=numpy.float32))
    indices = cuda.to_device(numpy.array(matrix.indices, dtype=numpy.int32))
    indptr = cuda.to_device(numpy.array(matrix.indptr, dtype=numpy.int32))

    # NOTE(review): the vector is uploaded as-is while the matrix values are
    # cast to float32 -- presumably callers pass a float32 vector; confirm.
    g_vector = cuda.to_device(vector)

    result = numpy.zeros(num_rows, dtype=numpy.float32)
    time_list = []

    # One thread per row, rounded up to a whole number of blocks.
    grid_size = int(numpy.ceil((num_rows+0.0)/block_size))
    block = (block_size, 1, 1)
    grid = (grid_size, 1)

    kernel, texref = cudacodes.get_cuda_csr(block_size=block_size)
    # The kernel reads the vector through the texture reference.
    texref.set_address(g_vector, vector.nbytes)
    tex = [texref]
    num_rows = numpy.int32(num_rows)

    # NOTE(review): `start` and `end` are not defined in this function;
    # presumably they are module-level CUDA event objects -- confirm.
    for _ in range(repeat):
        start.record()
        kernel(data,
               indices,
               indptr,
               cuda.Out(result),
               num_rows,
               block=block,
               grid=grid,
               texrefs=tex)
        end.record()
        end.synchronize()
        time_list.append(start.time_till(end))
    return (result, time_list)
示例3: allocation
def allocation(self):
    """Run the parent class allocation, then create the device buffers.

    Four device buffers are each initialised with a copy of ``self.ul``;
    ``el_sum_gpu`` is initialised with ``self.ne`` zeros.
    """
    super(DGModalGpu, self).allocation()

    # Every one of these is an independent device copy of the same host array.
    for attr_name in ('ul_gpu', 'ul_prev_gpu', 'ul_tmp_gpu', 'kl_gpu'):
        setattr(self, attr_name, cuda.to_device(self.ul))

    zeros_per_element = np.zeros(self.ne)
    self.el_sum_gpu = cuda.to_device(zeros_per_element)
示例4: test_simple_kernel_2
def test_simple_kernel_2(self):
    """Check an element-wise multiply kernel fed with pre-uploaded inputs.

    The kernel is launched twice: once on the full 400-element inputs, and
    once with the first operand's device pointer advanced by one element, so
    that element i of the output equals a[i+1] * b[i].
    """
    module = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
const int i = threadIdx.x;
dest[i] = a[i] * b[i];
}
""")
    kernel = module.get_function("multiply_them")

    lhs = np.random.randn(400).astype(np.float32)
    rhs = np.random.randn(400).astype(np.float32)
    lhs_dev = drv.to_device(lhs)
    rhs_dev = drv.to_device(rhs)

    product = np.zeros_like(lhs)
    kernel(drv.Out(product), lhs_dev, rhs_dev, block=(400, 1, 1))
    assert la.norm(product-lhs*rhs) == 0

    drv.Context.synchronize()

    # Second launch: pass the address of lhs[1] as the first operand and
    # use one thread fewer, leaving the last output element untouched.
    shifted_product = np.zeros_like(lhs)
    lhs_dev_from_second = np.intp(lhs_dev)+lhs.itemsize
    kernel(drv.Out(shifted_product), lhs_dev_from_second, rhs_dev,
           block=(399, 1, 1))
    assert la.norm((shifted_product[:-1]-lhs[1:]*rhs[:-1])) == 0
示例5: train_gpu
def train_gpu(self, num_iter, model_file_path):
    """Launch the "train_sg" kernel num_iter times and dump the result.

    When ``self.batch == 0`` the ``self.syn1`` array is uploaded and one
    ``WordRep`` is built per dictionary entry inside a single device
    allocation. For every iteration one word index is drawn at random from
    each sentence, the indices are uploaded, and the kernel is launched with
    one block per sentence and ``self.size`` threads per block. Afterwards
    ``self.sent_reps`` is read back from the device and handed to
    ``pickle_dump`` together with ``model_file_path``.

    NOTE(review): this block was recovered from text without indentation;
    the nesting of the ``if`` body and of the loops below is the most
    plausible reading -- confirm against the original file.
    """
    if self.batch == 0:
        # Prepare to send the numpy array to gpu
        self.syn1_gpu = cuda.to_device(self.syn1)
        # Create word idx and related data-structure.
        # One WordRep.memsize-sized slot per dictionary entry.
        self.base_word_rep = cuda.mem_alloc(len(self.dictionary)*WordRep.memsize)
        word_rep_ptr = int(self.base_word_rep)
        self.word_reps = {}
        for w_idx, word in sorted(self.dictionary.items()):
            word_code = 1-2*self.words_rep[word][0].astype(dtype=np.int32)
            word_point = self.words_rep[word][1].astype(dtype=np.int32)
            self.word_reps[w_idx] = WordRep(word_code, word_point, word_rep_ptr)
            # Advance to the next slot inside the base_word_rep allocation.
            word_rep_ptr += WordRep.memsize
        print "GPU transfers done."
    self.sent_reps_gpu = cuda.to_device(self.sent_reps)
    # Prepare sentences for GPU transfer.
    # Words missing from the dictionary are dropped from each sentence.
    idx_sentences = [[self.dictionary.token2id[word] for word in sentence if word in self.dictionary]
                     for sentence in self.sentences]
    # Prepare the kernel function
    kernel = self.kernel_str.get_function("train_sg")
    words = np.empty(self.num_sents, dtype=np.int32)
    # sent_reps = np.copy(self.sent_reps)
    for iter in range(num_iter):
        # Sample words for each sentence and transfer to GPU
        # NOTE(review): random.choice raises IndexError on an empty
        # sequence, so a sentence with no dictionary words would fail here.
        for s_idx in range(self.num_sents):
            words[s_idx] = random.choice(idx_sentences[s_idx])
        words_gpu = cuda.to_device(words)
        kernel(self.sent_reps_gpu, np.float32(self.alpha), words_gpu, self.base_word_rep, self.syn1_gpu,
               block=(self.size, 1, 1), grid=(self.num_sents, 1, 1))
        # autoinit.context.synchronize()
    # Copy the sentence representations back to the host and persist them.
    self.sent_reps = cuda.from_device(self.sent_reps_gpu, self.sent_reps.shape, self.sent_reps.dtype)
    pickle_dump(self.sent_reps, model_file_path)
示例6: get_phir_gpu
def get_phir_gpu(XK, XV, surface, field, par_reac, kernel):
    """Launch the "get_phir" kernel and return its output on the host.

    Parameters
    ----------
    XK, XV : sequences with at least len(XK) entries, indexed per element.
    surface : object providing Area, normal, sortSource, xk, wk, xj and the
        device-side attributes passed to the kernel (xjDev, yjDev, zjDev,
        AreaDev, kDev, vertexDev, XskDev, WskDev).
    field : object providing xq and the device arrays xq_gpu, yq_gpu, zq_gpu.
    par_reac : parameters object providing REAL (the scalar type used for
        uploads), K, BSZ (threads per block) and threshold.
    kernel : compiled module exposing get_function("get_phir").

    Returns
    -------
    (phir_cpu, AI_int) : the len(field.xq) kernel output values as a REAL
        array, and the sum of the per-point int32 counters the kernel wrote.
    """
    REAL = par_reac.REAL
    Nq = len(field.xq)
    N = len(XK)
    K = par_reac.K

    w = numpy.asarray(getWeights(K))[:K]

    def per_entry(values):
        # Repeat each of the first N values K times: entry i maps to
        # values[i // K], exactly as the former element-by-element loop did.
        return numpy.repeat(numpy.asarray(values)[:N], K)

    # Entry i uses weight w[i % K].
    w_tiled = numpy.tile(w, N)
    area = per_entry(surface.Area)
    normal = numpy.asarray(surface.normal)

    X_Kc = per_entry(XK)
    X_Vc = per_entry(XV)
    # Products are formed left to right in the same order as before
    # (value * weight * area [* normal component]) to keep results identical.
    X_V = X_Vc * w_tiled * area
    X_K_scaled = X_Kc * w_tiled * area
    X_Kx = X_K_scaled * per_entry(normal[:, 0])
    X_Ky = X_K_scaled * per_entry(normal[:, 1])
    X_Kz = X_K_scaled * per_entry(normal[:, 2])

    sort = surface.sortSource
    phir = cuda.to_device(numpy.zeros(Nq, dtype=REAL))
    m_gpu = cuda.to_device(X_V[sort].astype(REAL))
    mx_gpu = cuda.to_device(X_Kx[sort].astype(REAL))
    my_gpu = cuda.to_device(X_Ky[sort].astype(REAL))
    mz_gpu = cuda.to_device(X_Kz[sort].astype(REAL))
    mKc_gpu = cuda.to_device(X_Kc[sort].astype(REAL))
    mVc_gpu = cuda.to_device(X_Vc[sort].astype(REAL))
    AI_int_gpu = cuda.to_device(numpy.zeros(Nq, dtype=numpy.int32))
    xkDev = cuda.to_device(surface.xk.astype(REAL))
    wkDev = cuda.to_device(surface.wk.astype(REAL))

    get_phir = kernel.get_function("get_phir")
    # Enough blocks of BSZ threads to cover all Nq points.
    GSZ = int(numpy.ceil(float(Nq)/par_reac.BSZ))

    get_phir(phir, field.xq_gpu, field.yq_gpu, field.zq_gpu, m_gpu, mx_gpu, my_gpu, mz_gpu, mKc_gpu, mVc_gpu,
             surface.xjDev, surface.yjDev, surface.zjDev, surface.AreaDev, surface.kDev, surface.vertexDev,
             numpy.int32(len(surface.xj)), numpy.int32(Nq), numpy.int32(par_reac.K), xkDev, wkDev, REAL(par_reac.threshold),
             AI_int_gpu, numpy.int32(len(surface.xk)), surface.XskDev, surface.WskDev, block=(par_reac.BSZ,1,1), grid=(GSZ,1))

    # from_device allocates the host arrays itself; no pre-allocation needed.
    AI_aux = cuda.from_device(AI_int_gpu, Nq, dtype=numpy.int32)
    AI_int = numpy.sum(AI_aux)

    phir_cpu = cuda.from_device(phir, Nq, dtype=REAL)

    return phir_cpu, AI_int
示例7: __init__
def __init__(self, code, point, struct_ptr):
    """Upload ``code`` and ``point`` and describe them at ``struct_ptr``.

    Three values are written to device memory starting at ``struct_ptr``:
    ``code.size`` as an int32 at offset 0, the device address of the
    uploaded ``code`` at offset 8, and the device address of the uploaded
    ``point`` one pointer width after that.
    """
    self.code = cuda.to_device(code)
    self.point = cuda.to_device(point)

    # Remember shape and dtype of both host arrays.
    self.code_shape = code.shape
    self.code_dtype = code.dtype
    self.point_shape = point.shape
    self.point_dtype = point.dtype

    pointer_width = np.intp(0).nbytes
    size_field = int(struct_ptr)
    code_field = int(struct_ptr) + 8
    point_field = int(struct_ptr) + 8 + pointer_width

    cuda.memcpy_htod(size_field, np.int32(code.size))
    cuda.memcpy_htod(code_field, np.intp(int(self.code)))
    cuda.memcpy_htod(point_field, np.intp(int(self.point)))
示例8: sync_to_device
def sync_to_device(self):
    """Upload the object list and its count; return a device pointer pair.

    The objects are packed into one array via their ``as_array()`` method
    and uploaded, the object count is uploaded as a one-element int32 array,
    and finally a two-entry ``np.intp`` array holding both device
    allocations is uploaded. That last allocation is stored as
    ``self.device_ptr`` and returned.
    """
    packed_objects = [obj.as_array() for obj in self.object_list]
    self.object_array = np.array(packed_objects)
    self.d_object_array = cuda.to_device(self.object_array)

    count_host = np.array([self.object_count], dtype=np.int32)
    self.d_object_count = cuda.to_device(count_host)

    pointer_pair = np.array(
        [self.d_object_array, self.d_object_count],
        dtype=np.intp)
    self.device_ptr = cuda.to_device(pointer_pair)
    return self.device_ptr
示例9: nlargest
def nlargest(self, n):
    """Returns the per-individual threshold above which there are n outputs.

    The search for n + 1 outputs is split into passes of at most
    self.maxHeapFloats values each; the kernel is invoked once per pass.

    @param n: number of outputs which should be above the threshold
    @type n: int
    @return: float32 array with one threshold per individual (self.popSize
        entries), in order of individuals, which delimit the top n output
        values; also stored as self.thresholdsMat
    @raise ValueError: if at least one pass is needed but
        self.maxHeapFloats is not positive
    """
    # Without this guard the pass-splitting loop below would never finish,
    # because a non-positive pass size cannot reduce the remaining count.
    if n + 1 > 0 and self.maxHeapFloats <= 0:
        raise ValueError("maxHeapFloats must be positive, got %r"
                         % (self.maxHeapFloats,))

    log.debug("enter nlargest with n=%d", n)

    # Find one more output so that we can use strictly-less-than when counting
    # and underestimate lift rather than overestimating it.
    n = n + 1

    passSizes = []
    while n > 0:
        nextSize = min(self.maxHeapFloats, n)
        passSizes.append(nextSize)
        n -= nextSize
    log.debug("pass sizes: %r", passSizes)

    # Thresholds start at +inf and counts at zero before the first pass.
    thresholdsMat = np.ones(shape=(self.popSize,),
                            dtype=np.float32) * np.inf
    self.thresholds = driver.to_device(thresholdsMat)

    thresholdCounts = np.zeros(shape=(self.popSize,),
                               dtype=np.uint32)
    self.thresholdCounts = driver.to_device(thresholdCounts)

    for passSize in passSizes:
        log.debug("begin pass size %d", passSize)
        self.nlargestKernel.prepared_call(self.nlargestGridDim,
                                          self.outputs,
                                          self.trainSet.size,
                                          self.popSize,
                                          passSize,
                                          self.thresholds,
                                          self.thresholdCounts)

        driver.Context.synchronize()

        # Device-to-host copies are only made when debug logging is active.
        if log.isEnabledFor(logging.DEBUG):
            thresholdsMat = driver.from_device_like(self.thresholds, thresholdsMat)
            log.debug("thresholds: %s", str(thresholdsMat))
            thresholdCounts = driver.from_device_like(self.thresholdCounts, thresholdCounts)
            log.debug("thresholdCounts: %s", str(thresholdCounts))

    self.thresholdsMat = driver.from_device_like(self.thresholds, thresholdsMat)
    return self.thresholdsMat
示例10: P2PKt_gpu
def P2PKt_gpu(surfSrc, surfTar, m, mKtc, Ktx_gpu, Kty_gpu, Ktz_gpu,
              surf, LorY, w, param, timing, kernel):
    """Launch the "P2PKt" kernel and account for the time spent.

    Parameters
    ----------
    surfSrc, surfTar : source and target surface objects providing the
        device arrays and index lists passed to the kernel.
    m, mKtc : host arrays; uploaded after conversion to param.REAL.
    Ktx_gpu, Kty_gpu, Ktz_gpu : device arrays handed to the kernel and
        returned to the caller.
    surf : index used to compute the offsets into the flattened
        offsetTwigs and P2P_list device arrays.
    LorY : integer flag forwarded to the kernel as int32.
    w : not used by this function.
    param : parameters object (GPU, REAL, Nround, NCRIT, BSZ,
        BlocksPerTwig, kappa, threshold).
    timing : accumulator object; time_trans, time_P2P and AI_int are
        incremented.
    kernel : compiled module exposing get_function("P2PKt").

    Returns
    -------
    (Ktx_gpu, Kty_gpu, Ktz_gpu) : the same device arrays that were passed in.
    """
    # Use CUDA events when running on the GPU, the fallback Event otherwise.
    if param.GPU==1:
        tic = cuda.Event()
        toc = cuda.Event()
    else:
        tic = Event()
        toc = Event()

    # --- host-to-device transfer (counted in time_trans) ---
    tic.record()
    REAL = param.REAL
    mDev = cuda.to_device(m.astype(REAL))
    mKtcDev = cuda.to_device(mKtc.astype(REAL))
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3

    # --- kernel launch (counted in time_P2P) ---
    tic.record()
    GSZ = int(numpy.ceil(float(param.Nround)/param.NCRIT)) # CUDA grid size
    directKt_gpu = kernel.get_function("P2PKt")
    AI_int = cuda.to_device(numpy.zeros(param.Nround, dtype=numpy.int32))

    # GPU arrays are flattened, need to point to first element
    ptr_offset = surf*len(surfTar.offsetTwigs[surf]) # Pointer to first element of offset arrays
    ptr_list = surf*len(surfTar.P2P_list[surf]) # Pointer to first element in lists arrays

    directKt_gpu(Ktx_gpu, Kty_gpu, Ktz_gpu,
                 surfSrc.offSrcDev, surfTar.offTwgDev, surfTar.P2P_lstDev, surfTar.sizeTarDev,
                 surfSrc.kDev, surfSrc.xjDev, surfSrc.yjDev, surfSrc.zjDev, mDev, mKtcDev,
                 surfTar.xiDev, surfTar.yiDev, surfTar.ziDev, surfSrc.AreaDev,
                 surfSrc.vertexDev, numpy.int32(ptr_offset), numpy.int32(ptr_list),
                 numpy.int32(LorY), REAL(param.kappa), REAL(param.threshold),
                 numpy.int32(param.BlocksPerTwig), numpy.int32(param.NCRIT), AI_int,
                 surfSrc.XskDev, surfSrc.WskDev, block=(param.BSZ,1,1), grid=(GSZ,1))
    toc.record()
    toc.synchronize()
    timing.time_P2P += tic.time_till(toc)*1e-3

    # --- device-to-host transfer of the counters (counted in time_trans) ---
    tic.record()
    # from_device returns a freshly allocated host array, so nothing is
    # pre-allocated here; that would only add to the measured transfer time.
    AI_aux = cuda.from_device(AI_int, param.Nround, dtype=numpy.int32)
    timing.AI_int += sum(AI_aux[surfTar.unsort])
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3

    return Ktx_gpu, Kty_gpu, Ktz_gpu
示例11: K
def K(self, Q, P, angles, quadratures):
    """Evaluate the kernel on the GPU and return the result scaled by 1/L.

    The cosines and sines of ``angles`` are copied as float32 into the
    module globals "cos_phi" and "sin_phi". ``self.K_gpu`` then fills an
    intermediate device buffer, which ``self.reduction_gpu`` reduces into a
    float32 host array with ``Q.shape[0]`` entries.
    """
    # Refresh the trigonometric tables stored in the module's globals.
    for symbol, trig in (("cos_phi", cos), ("sin_phi", sin)):
        device_symbol = self.mod_K.get_global(symbol)[0]
        drv.memcpy_htod(device_symbol, trig(angles).astype(scipy.float32))

    Nx = Q.shape[0]
    Ny = int(floor(quadratures.size / 1024.))

    result = scipy.empty((Nx,), dtype=scipy.float32)
    partial_dev = drv.mem_alloc(4*Ny*Nx)
    Q_dev = drv.to_device(Q)
    P_dev = drv.to_device(P)

    self.K_gpu(drv.In(quadratures), Q_dev, P_dev, partial_dev,
               block=(1, 1024, 1), grid=(Nx, Ny), shared=1024*4)
    self.reduction_gpu(partial_dev, drv.Out(result),
                       block=(1, Ny, 1), grid=(Nx, 1), shared=Ny*4)

    return result/self.L
示例12: __init__
def __init__(self):
    """Create the stream, the page-locked memory pool and shared buffers."""
    self.stream = cuda.Stream()
    self.pool = pycuda.tools.PageLockedMemoryPool()
    self._clear()

    # The resources below are shared through the slots/ringbuffer mechanism,
    # so any number of launches, genomes and render kernels can use them.
    # Seeds are self-synchronizing and therefore are not attached to either
    # stream object.
    slot_count = util.DEFAULT_RB_SIZE * 256

    ring_state = np.array([0, 0], dtype=u32)
    self.d_rb = cuda.to_device(ring_state)

    self.d_seeds = cuda.to_device(mwc.make_seeds(slot_count))

    self._len_d_points = slot_count * 16
    self.d_points = cuda.mem_alloc(self._len_d_points)
示例13: M2P_gpu
def M2P_gpu(surfSrc, surfTar, K_gpu, V_gpu, surf, ind0, param, LorY, timing, kernel):
    """Gather multipoles, launch the "M2P" kernel and record the time.

    The M and Md arrays of every source-tree cell listed in
    surfTar.M2P_list for this surface are packed into two flat host arrays
    of param.Nm values per cell, uploaded as param.REAL and passed to the
    kernel. The elapsed time (packing, upload and launch) is added to
    timing.time_M2P. K_gpu and V_gpu are returned as passed in.
    """
    use_cuda_events = param.GPU==1
    if use_cuda_events:
        tic = cuda.Event()
        toc = cuda.Event()
    else:
        tic = Event()
        toc = Event()

    REAL = param.REAL
    Nm = param.Nm

    tic.record()
    M2P_size = surfTar.offsetMlt[surf,len(surfTar.twig)]
    MSort = numpy.zeros(Nm*M2P_size)
    MdSort = numpy.zeros(Nm*M2P_size)

    # Cell number `slot` in the list occupies entries [slot*Nm, slot*Nm+Nm).
    for slot, C in enumerate(surfTar.M2P_list[surf,0:M2P_size]):
        first = slot*Nm
        last = first+Nm
        cell = surfSrc.tree[C]
        MSort[first:last] = cell.M
        MdSort[first:last] = cell.Md

    MDev = cuda.to_device(MSort.astype(REAL))
    MdDev = cuda.to_device(MdSort.astype(REAL))

    # GPU arrays are flattened, need to point to first element
    ptr_offset = surf*len(surfTar.offsetTwigs[surf]) # Pointer to first element of offset arrays
    ptr_list = surf*len(surfTar.P2P_list[surf]) # Pointer to first element in lists arrays

    GSZ = int(numpy.ceil(float(param.Nround)/param.NCRIT)) # CUDA grid size
    multipole_gpu = kernel.get_function("M2P")

    multipole_gpu(K_gpu, V_gpu, surfTar.offMltDev, surfTar.sizeTarDev,
                  surfTar.xcDev, surfTar.ycDev, surfTar.zcDev,
                  MDev, MdDev, surfTar.xiDev, surfTar.yiDev, surfTar.ziDev,
                  ind0.indexDev, numpy.int32(ptr_offset), numpy.int32(ptr_list), REAL(param.kappa),
                  numpy.int32(param.BlocksPerTwig), numpy.int32(param.NCRIT), numpy.int32(LorY),
                  block=(param.BSZ,1,1), grid=(GSZ,1))

    toc.record()
    toc.synchronize()
    timing.time_M2P += tic.time_till(toc)*1e-3

    return K_gpu, V_gpu
示例14: batch_indexing
def batch_indexing(self, planes, data_points):
    """Run the indexing kernel over ``data_points`` and return the output.

    Every plane is uploaded separately; the device addresses of the planes
    are then uploaded as a uint64 array so that the kernel receives a
    pointer to a table of plane pointers. The returned uint64 array has
    ``data_points.shape[0] / 128`` entries. The preparation and the kernel
    launch are each bracketed by benchmark_begin / benchmark_end.
    """
    data_size = data_points.shape[0] / 128

    self.benchmark_begin('preparing')

    # Holding the allocation objects in a local list keeps the per-plane
    # device buffers referenced until this method returns.
    gpu_alloc_objs = [drv.to_device(plane) for plane in planes]
    planes_addresses = [int(alloc) for alloc in gpu_alloc_objs]
    planes_np_addresses = numpy.array(planes_addresses).astype(numpy.uint64)

    # 64 bit addressing space. each point costs 8 bytes
    table_bytes = planes_np_addresses.shape[0] * 8
    planes_arrays_gpu = drv.mem_alloc(table_bytes)
    drv.memcpy_htod(planes_arrays_gpu, planes_np_addresses)

    # Output buffer and element count, both as uint64.
    projections = numpy.zeros(data_size).astype(numpy.uint64)
    length = numpy.array([data_size]).astype(numpy.uint64)

    print("total: " + str(data_size) + " data points to indexing.")

    self.benchmark_end('preparing')

    self.benchmark_begin('cudaing')
    self.indexing_kernel(
        planes_arrays_gpu,
        drv.In(data_points),
        drv.Out(projections),
        drv.In(length),
        block=self.block,
        grid=self.grid)
    self.benchmark_end('cudaing')

    return projections
示例15: index_list_backend
def index_list_backend(self, ilists):
    """Pack index lists into the smallest entry type and upload them.

    All lists must have the same length, equal to
    ``self.plan.dofs_per_face``. Entries are stored as uint8 unless the
    largest index is 256 or more, in which case uint16 is used; an assertion
    verifies that the conversion preserved every value. Returns a
    ``GPUIndexLists`` carrying the entry type, its typedef, the device
    allocation and the size in bytes.
    """
    from pytools import single_valued
    common_length = single_valued(len(il) for il in ilists)
    assert common_length == self.plan.dofs_per_face

    from cgen import Typedef, POD
    from pytools import flatten

    raw_entries = numpy.array(list(flatten(ilists)))

    # One byte per entry is enough unless an index exceeds 255.
    entry_type = numpy.uint16 if numpy.max(raw_entries) >= 256 else numpy.uint8

    packed_entries = numpy.asarray(raw_entries, dtype=entry_type)
    # Guard against values that do not fit the chosen entry type.
    assert (packed_entries == raw_entries).all()

    typedef = Typedef(POD(entry_type, "index_list_entry_t"))
    return GPUIndexLists(
        type=entry_type,
        code=[typedef],
        device_memory=cuda.to_device(packed_entries),
        bytes=packed_entries.size * packed_entries.itemsize,
    )