This article collects typical usage examples of the memcpy_htod function from Python's pycuda.driver module. If you have been wondering what memcpy_htod actually does, how to call it, or what real code that uses it looks like, the curated examples here may help you.
Fifteen code examples of memcpy_htod are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
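Before the project examples, here is a minimal, self-contained sketch of the pattern they all share: allocate device memory, push a NumPy array to it with memcpy_htod, and read it back with memcpy_dtoh. The array contents and the absence of a kernel launch are illustrative assumptions for this sketch, not taken from any example below.

import numpy as np
import pycuda.autoinit          # creates a context on the first available GPU
import pycuda.driver as cuda

# Host data to transfer; shape and dtype are arbitrary choices for this sketch.
host_array = np.arange(16, dtype=np.float32)

# Allocate a device buffer of the same size and copy host -> device.
dev_buf = cuda.mem_alloc(host_array.nbytes)
cuda.memcpy_htod(dev_buf, host_array)

# ... a kernel would normally read or write dev_buf here ...

# Copy the data back device -> host and check the round trip.
result = np.empty_like(host_array)
cuda.memcpy_dtoh(result, dev_buf)
assert (result == host_array).all()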
Example 1: __init__
def __init__(self, view_tile, size, sigma, debug=False):
    self.debug = debug

    if size[0] < 2 or size[1] < 2:
        raise ValueError("Split needs to be at least 2x2")

    self.data_sets = view_tile.get_Data()
    for dset in self.data_sets:
        data = dset.getDataSet()
        if not data.flags['C_CONTIGUOUS']:
            print("NOT CONTIGUOUS, trying to reformat the points")
            data = np.require(data, dtype=data.dtype, requirements=['C'])
            if not data.flags['C_CONTIGUOUS']:
                raise Exception("Points are not contiguous")
            dset.setDataSet(data)

    self.view_tile = view_tile
    self.sigma = sigma
    self.pts_gpu = None

    # Initialise all of the CUDA state: the density grid lives on the device.
    self.grid = np.zeros(size).astype(np.float32)
    self.grid_gpu = cuda.mem_alloc_like(self.grid)
    cuda.memcpy_htod(self.grid_gpu, self.grid)

    kernel = SourceModule(self.__cuda_code)
    self.gpu_gaussian = kernel.get_function("gpu_gaussian")

    self.view = self.view_tile.get_View()
    self.grid_size, self.block_size = self.__setup_cuda_sizes(size)

    self.dx = 1 / float(size[1] - 1)
    self.dy = 1 / float(size[0] - 1)
Example 2: test_constant_memory
def test_constant_memory(self):
    # contributed by Andrew Wagner
    module = SourceModule("""
        __constant__ float const_array[32];

        __global__ void copy_constant_into_global(float* global_result_array)
        {
            global_result_array[threadIdx.x] = const_array[threadIdx.x];
        }
        """)

    copy_constant_into_global = module.get_function("copy_constant_into_global")
    const_array, _ = module.get_global('const_array')

    host_array = np.random.randint(0, 255, (32,)).astype(np.float32)

    global_result_array = drv.mem_alloc_like(host_array)
    # Constant memory is written with memcpy_htod on the symbol's device address.
    drv.memcpy_htod(const_array, host_array)

    copy_constant_into_global(
        global_result_array,
        grid=(1, 1), block=(32, 1, 1))

    host_result_array = np.zeros_like(host_array)
    drv.memcpy_dtoh(host_result_array, global_result_array)

    assert (host_result_array == host_array).all()
Example 3: edgetaper_gpu
def edgetaper_gpu(y_gpu, sf, win='barthann'):
    shape = np.array(y_gpu.shape).astype(np.uint32)
    dtype = y_gpu.dtype
    block_size = (16, 16, 1)
    grid_size = (int(np.ceil(float(shape[1]) / block_size[0])),
                 int(np.ceil(float(shape[0]) / block_size[1])))

    # Ensure that sf is odd
    sf = sf + (1 - np.mod(sf, 2))
    wx = scipy.signal.get_window(win, sf[1])
    wy = scipy.signal.get_window(win, sf[0])
    maxw = wx.max() * wy.max()
    hsf = np.floor(sf / 2).astype(np.int32)   # integer half-sizes, usable as slice bounds
    wx = (wx[0:hsf[1]] / maxw).astype(dtype)
    wy = (wy[0:hsf[0]] / maxw).astype(dtype)

    preproc = _generate_preproc(dtype, shape)
    preproc += '#define wx_size %d\n' % wx.size
    preproc += '#define wy_size %d\n' % wy.size
    mod = SourceModule(preproc + edgetaper_code, keep=True)

    edgetaper_gpu = mod.get_function("edgetaper")
    wx_gpu, wx_size = mod.get_global('wx')
    wy_gpu, wy_size = mod.get_global('wy')

    # Upload the window halves to the module-level device symbols 'wx' and 'wy'.
    cu.memcpy_htod(wx_gpu, wx)
    cu.memcpy_htod(wy_gpu, wy)

    edgetaper_gpu(y_gpu, np.int32(hsf[1]), np.int32(hsf[0]),
                  block=block_size, grid=grid_size)
Example 4: prepare_device_arrays
def prepare_device_arrays(self):
    self.maxLayers = self.grid_prop.GetMaxLayers()
    nczbins_fine = len(self.czcen_fine)

    numLayers = np.zeros(nczbins_fine, dtype=np.int32)
    densityInLayer = np.zeros((nczbins_fine * self.maxLayers), dtype=self.FTYPE)
    distanceInLayer = np.zeros((nczbins_fine * self.maxLayers), dtype=self.FTYPE)

    self.grid_prop.GetNumberOfLayers(numLayers)
    self.grid_prop.GetDensityInLayer(densityInLayer)
    self.grid_prop.GetDistanceInLayer(distanceInLayer)

    # Copy all these earth info arrays to device:
    self.d_numLayers = cuda.mem_alloc(numLayers.nbytes)
    self.d_densityInLayer = cuda.mem_alloc(densityInLayer.nbytes)
    self.d_distanceInLayer = cuda.mem_alloc(distanceInLayer.nbytes)
    cuda.memcpy_htod(self.d_numLayers, numLayers)
    cuda.memcpy_htod(self.d_densityInLayer, densityInLayer)
    cuda.memcpy_htod(self.d_distanceInLayer, distanceInLayer)

    self.d_ecen_fine = cuda.mem_alloc(self.ecen_fine.nbytes)
    self.d_czcen_fine = cuda.mem_alloc(self.czcen_fine.nbytes)
    cuda.memcpy_htod(self.d_ecen_fine, self.ecen_fine)
    cuda.memcpy_htod(self.d_czcen_fine, self.czcen_fine)

    return
Example 5: _set
def _set(self, ary):
    # Allocate a new buffer with suitable padding and pack it
    buf = np.zeros((self.nrow, self.leaddim), dtype=self.dtype)
    buf[:, :self.ncol] = self._pack(ary)

    # Copy
    cuda.memcpy_htod(self.data, buf)
Example 6: from_np
def from_np(np_data):
    # Allocate device memory and copy the host array into it
    cudabuf = cuda.mem_alloc(np_data.nbytes)
    cuda.memcpy_htod(cudabuf, np_data)

    tensor = MyTensor(cudabuf, shape=np_data.shape, size=np_data.size)
    tensor.cpudata = np_data
    return tensor
Example 7: cuda_crossOver
def cuda_crossOver(sola, solb):
    """Cross over two solution vectors on the GPU and return the new pair."""
    sol_len = len(sola)

    a_gpu = cuda.mem_alloc(sola.nbytes)
    b_gpu = cuda.mem_alloc(solb.nbytes)
    cuda.memcpy_htod(a_gpu, sola)
    cuda.memcpy_htod(b_gpu, solb)

    func = mod.get_function("crossOver")
    func(a_gpu, b_gpu, block=(sol_len, 1, 1))

    a_new = numpy.empty_like(sola)
    b_new = numpy.empty_like(solb)
    cuda.memcpy_dtoh(a_new, a_gpu)
    cuda.memcpy_dtoh(b_new, b_gpu)

    if debug:
        print("a:", sola)
        print("b:", solb)
        print("new a:", a_new)
        print("new b:", b_new)

    return a_new, b_new
Example 8: _to_device
def _to_device(self, module):
    ptr, size = module.get_global(self.name)
    if size != self.data.nbytes:
        raise RuntimeError("Const %s needs %d bytes, but only space for %d"
                           % (self, self.data.nbytes, size))
    if self.state is DeviceDataMixin.HOST:
        driver.memcpy_htod(ptr, self._data)
        self.state = DeviceDataMixin.BOTH
Example 9: __init__
def __init__(self, n_dict, V, dt, debug=False):
    self.num_neurons = len(n_dict['id'])
    self.dt = np.double(dt)
    self.steps = max(int(round(dt / 1e-5)), 1)
    self.debug = debug
    self.ddt = dt / self.steps

    self.V = V

    self.n = garray.to_gpu(np.asarray(n_dict['initn'], dtype=np.float64))
    self.V_1 = garray.to_gpu(np.asarray(n_dict['V1'], dtype=np.float64))
    self.V_2 = garray.to_gpu(np.asarray(n_dict['V2'], dtype=np.float64))
    self.V_3 = garray.to_gpu(np.asarray(n_dict['V3'], dtype=np.float64))
    self.V_4 = garray.to_gpu(np.asarray(n_dict['V4'], dtype=np.float64))
    self.V_l = garray.to_gpu(np.asarray(n_dict['V_l'], dtype=np.float64))
    self.V_ca = garray.to_gpu(np.asarray(n_dict['V_ca'], dtype=np.float64))
    self.V_k = garray.to_gpu(np.asarray(n_dict['V_k'], dtype=np.float64))
    self.G_l = garray.to_gpu(np.asarray(n_dict['G_l'], dtype=np.float64))
    self.G_ca = garray.to_gpu(np.asarray(n_dict['G_ca'], dtype=np.float64))
    self.G_k = garray.to_gpu(np.asarray(n_dict['G_k'], dtype=np.float64))
    self.Tphi = garray.to_gpu(np.asarray(n_dict['phi'], dtype=np.float64))
    self.offset = garray.to_gpu(np.asarray(n_dict['offset'], dtype=np.float64))

    # self.V holds a device address, so the initial membrane voltages are
    # copied into it directly with memcpy_htod.
    cuda.memcpy_htod(int(self.V), np.asarray(n_dict['initV'], dtype=np.double))

    self.update = self.get_euler_kernel()
Example 10: evaluate
def evaluate(self, params, returnOutputs=False):
    """Evaluate several networks (with given params) on the training set.

    @param params: network params
    @type params: list of Parameters
    @param returnOutputs: return network output values (debug)
    @type returnOutputs: bool, default False
    @return: output matrix if returnOutputs=True, else None
    """
    if self.popSize != len(params):
        raise ValueError("Need %d Parameter structures (provided %d)" % (
            self.popSize, len(params)))

    # Upload all parameter structures to the device buffer in one copy.
    paramArrayType = Parameters * len(params)
    driver.memcpy_htod(self.params, paramArrayType(*params))

    # TODO: remove
    driver.memset_d8(self.outputs, 0, self.popSize * self.trainSet.size * 4)

    self.evaluateKernel.prepared_call(self.evaluateGridDim,
                                      self.trainSetDev,
                                      self.trainSet.size,
                                      self.params,
                                      self.popSize,
                                      self.outputs)
    driver.Context.synchronize()

    self.outputsMat = driver.from_device(self.outputs,
                                         shape=(self.popSize, self.trainSet.size),
                                         dtype=np.float32)
    if returnOutputs:
        return self.outputsMat
Example 11: __init__
def __init__(self, n_dict, V, dt, debug=False, cuda_verbose=False):
    if cuda_verbose:
        self.compile_options = ["--ptxas-options=-v"]
    else:
        self.compile_options = []

    self.num_neurons = len(n_dict["id"])
    self.dt = np.double(dt)
    self.steps = max(int(round(dt / 1e-5)), 1)
    self.debug = debug
    self.ddt = dt / self.steps

    self.V = V

    self.n = garray.to_gpu(np.asarray(n_dict["initn"], dtype=np.float64))
    self.V_1 = garray.to_gpu(np.asarray(n_dict["V1"], dtype=np.float64))
    self.V_2 = garray.to_gpu(np.asarray(n_dict["V2"], dtype=np.float64))
    self.V_3 = garray.to_gpu(np.asarray(n_dict["V3"], dtype=np.float64))
    self.V_4 = garray.to_gpu(np.asarray(n_dict["V4"], dtype=np.float64))
    self.V_l = garray.to_gpu(np.asarray(n_dict["V_l"], dtype=np.float64))
    self.V_ca = garray.to_gpu(np.asarray(n_dict["V_ca"], dtype=np.float64))
    self.V_k = garray.to_gpu(np.asarray(n_dict["V_k"], dtype=np.float64))
    self.G_l = garray.to_gpu(np.asarray(n_dict["G_l"], dtype=np.float64))
    self.G_ca = garray.to_gpu(np.asarray(n_dict["G_ca"], dtype=np.float64))
    self.G_k = garray.to_gpu(np.asarray(n_dict["G_k"], dtype=np.float64))
    self.Tphi = garray.to_gpu(np.asarray(n_dict["phi"], dtype=np.float64))
    self.offset = garray.to_gpu(np.asarray(n_dict["offset"], dtype=np.float64))

    cuda.memcpy_htod(int(self.V), np.asarray(n_dict["initV"], dtype=np.double))

    self.update = self.get_euler_kernel()
Example 12: __compile_kernels
def __compile_kernels(self):
    # DFS module
    f = self.forest
    self.find_min_kernel = f.find_min_kernel
    self.fill_kernel = f.fill_kernel
    self.scan_reshuffle_tex = f.scan_reshuffle_tex
    self.comput_total_2d = f.comput_total_2d
    self.reduce_2d = f.reduce_2d
    self.scan_total_2d = f.scan_total_2d
    self.scan_reduce = f.scan_reduce

    # BFS module
    self.scan_total_bfs = f.scan_total_bfs
    self.comput_bfs_2d = f.comput_bfs_2d
    self.fill_bfs = f.fill_bfs
    self.reshuffle_bfs = f.reshuffle_bfs
    self.reduce_bfs_2d = f.reduce_bfs_2d
    self.get_thresholds = f.get_thresholds

    # Other
    self.predict_kernel = f.predict_kernel
    self.mark_table = f.mark_table

    # Write the device addresses of the sorted-index buffers into the
    # BFS module's global symbols.
    const_sorted_indices = f.bfs_module.get_global("sorted_indices_1")[0]
    const_sorted_indices_ = f.bfs_module.get_global("sorted_indices_2")[0]
    cuda.memcpy_htod(const_sorted_indices, np.uint64(self.sorted_indices_gpu.ptr))
    cuda.memcpy_htod(const_sorted_indices_, np.uint64(self.sorted_indices_gpu_.ptr))
Example 13: calc_bandwidth_h2d
def calc_bandwidth_h2d(s):
    # Time a single host-to-device copy and return nbytes / elapsed / gbytes.
    t1 = datetime.now()
    cuda.memcpy_htod(s.dev_a, s.a)
    dt = datetime.now() - t1

    dt_float = dt.seconds + dt.microseconds * 1e-6
    return s.nbytes / dt_float / gbytes
Example 14: __compute_guassian_on_pts
def __compute_guassian_on_pts(self):
    view = self.view_tile.get_View()

    for dset in self.data_sets:
        # Normalise the point coordinates to the [0, 1] range of the view.
        _data = np.array(dset.getDataSet(), copy=True)
        _data[:, 0] = (_data[:, 0] - view.left) / view.width()
        _data[:, 1] = (_data[:, 1] - view.bottom) / view.height()

        for row in range(self.grid_size[0]):
            for col in range(self.grid_size[1]):
                # Pad the cell bounds by 3 * sigma so that points whose
                # kernels overlap the cell are still included.
                left = 1 / float(self.grid_size[1]) * col - (3 * self.sigma)
                right = 1 / float(self.grid_size[1]) * (col + 1) + (3 * self.sigma)
                bottom = 1 / float(self.grid_size[0]) * row - (3 * self.sigma)
                top = 1 / float(self.grid_size[0]) * (row + 1) + (3 * self.sigma)

                pts = getFilteredDataSet(_data, (left, right, bottom, top))
                if len(pts) > 0:
                    self.pts_gpu = cuda.mem_alloc_like(pts)
                    cuda.memcpy_htod(self.pts_gpu, pts)

                    self.gpu_gaussian(self.grid_gpu,                # Grid
                                      self.pts_gpu,                 # Points
                                      np.int32(col),                # Block index x
                                      np.int32(row),                # Block index y
                                      np.int32(self.grid_size[1]),  # Grid dimension x
                                      np.int32(self.grid_size[0]),  # Grid dimension y
                                      np.int32(pts.shape[0]),       # Number of points
                                      np.float32(self.dx),          # dx
                                      np.float32(self.dy),          # dy
                                      np.float32(self.sigma),       # Sigma
                                      block=self.block_size)

                    self.pts_gpu.free()
Example 15: interior_buffer
def interior_buffer(source_im, dest_im, b_size, g_size, RGB, neighbors):
    # Create the Cheetah template and fill in the variables for the mask kernel
    mask_template = Template(mask_source)
    mask_template.BLOCK_DIM_X = b_size[0]
    mask_template.BLOCK_DIM_Y = b_size[1]
    mask_template.WIDTH = dest_im.shape[1]
    mask_template.HEIGHT = dest_im.shape[0]
    mask_template.RGB = RGB
    mask_template.NEIGHBORS = neighbors

    # Compile the CUDA kernel
    mask_kernel = cuda_compile(mask_template, "mask_kernel")

    # Allocate GPU memory and copy the source image to the device
    d_source = cu.mem_alloc(source_im.nbytes)
    cu.memcpy_htod(d_source, source_im)

    # Run the kernel on the GPU to filter out interior points in the mask
    mask_kernel(d_source, block=b_size, grid=g_size)

    # Retrieve the interior-point buffer from the GPU
    inner_buffer = np.array(dest_im, dtype=np.uint8)
    cu.memcpy_dtoh(inner_buffer, d_source)

    # Return the interior buffer
    return inner_buffer