本文整理匯總了 Python 中 pycuda.gpuarray.to_gpu 方法的典型用法代碼示例。如果您正苦於以下問題:Python gpuarray.to_gpu 方法的具體用法?Python gpuarray.to_gpu 怎麼用?Python gpuarray.to_gpu 使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類 pycuda.gpuarray 的用法示例。
在下文中一共展示了 gpuarray.to_gpu 方法的 15 個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的 Python 代碼示例。
示例1: _cached_gpuarray
# 需要導入模塊: from pycuda import gpuarray [as 別名]
# 或者: from pycuda.gpuarray import to_gpu [as 別名]
def _cached_gpuarray(self, array):
"""
Given a numpy array,
calculate the python hash of its bytes;
If it is not found in the cache, upload to gpu
and store in cache, otherwise return cached allocation.
"""
array = np.ascontiguousarray(array)
key = hash(array.tobytes())
try:
array_gpu = self._gpuarray_cache[key]
except KeyError:
array_gpu = ga.to_gpu(array)
self._gpuarray_cache[key] = array_gpu
# for testing: read_back_and_check!
return array_gpu
示例2: gpu_initialise_rx_arrays
# 需要導入模塊: from pycuda import gpuarray [as 別名]
# 或者: from pycuda.gpuarray import to_gpu [as 別名]
def gpu_initialise_rx_arrays(G):
    """Create the GPU arrays that hold receiver coordinates and receiver output.

    Args:
        G (class): Grid class instance - holds essential parameters describing the model.

    Returns:
        tuple: ``(rxcoords_gpu, rxs_gpu)`` - the receiver coordinate array and
        the zero-filled receiver output array, both copied to the GPU.
    """
    import pycuda.gpuarray as gpuarray

    receiver_count = len(G.rxs)

    # One row per receiver: (x, y, z) cell coordinates stored as int32.
    rxcoords = np.zeros((receiver_count, 3), dtype=np.int32)
    for row, rx in enumerate(G.rxs):
        rxcoords[row, :] = (rx.xcoord, rx.ycoord, rx.zcoord)

    # Output storage - rows are field components; columns are iterations;
    # pages are receivers.
    output_shape = (len(Rx.gpu_allowableoutputs), G.iterations, receiver_count)
    rxs = np.zeros(output_shape, dtype=floattype)

    # Copy both host arrays to the GPU.
    return gpuarray.to_gpu(rxcoords), gpuarray.to_gpu(rxs)
示例3: compute_vertical_bitvector_data
# 需要導入模塊: from pycuda import gpuarray [as 別名]
# 或者: from pycuda.gpuarray import to_gpu [as 別名]
def compute_vertical_bitvector_data(data, use_CUDA):
    """Convert a list of transactions into a vertical 0/1 item-by-transaction table.

    Args:
        data: sequence of transactions, each an iterable of items. Items must
            be hashable and convertible with ``int()``.
        use_CUDA: when true, the table is cast to ``uint16`` and copied to the
            GPU with ``gpuarray.to_gpu``.

    Returns:
        tuple: ``(vb_data, idx2item)``. ``vb_data[r, t]`` is 1 when the item
        with row index ``r`` occurs in transaction ``t``; ``idx2item`` maps a
        row index to ``str(int(item))``.
    """
    # Give every distinct item a row index, numbered by first appearance.
    item2idx = {}
    for transaction in data:
        for item in transaction:
            item2idx.setdefault(item, len(item2idx))
    idx2item = {row: str(int(item)) for item, row in item2idx.items()}

    # Fill the table: one row per item, one column per transaction.
    vb_data = np.zeros((len(item2idx), len(data)), dtype=int)
    for column, transaction in enumerate(data):
        for item in transaction:
            vb_data[item2idx[item], column] = 1

    if use_CUDA:
        vb_data = gpuarray.to_gpu(vb_data.astype(np.uint16))
    print('Data transformed into vertical bitvector representation with shape: ', np.shape(vb_data))
    return vb_data, idx2item
##############
# COMPUTE L1 #
##############
示例4: gpuReduce
# 需要導入模塊: from pycuda import gpuarray [as 別名]
# 或者: from pycuda.gpuarray import to_gpu [as 別名]
def gpuReduce(self, outtype, mapper, reduceop, arglist):
    """Performs a map or reduce task on GPU by taking C code embedded in strings

    The collected data set is uploaded once and passed to the kernel twice
    (as both kernel arguments). The reduce expression and the result are
    printed, and the result is also returned.

    Args:
        outtype: output dtype handed to ``reduction.ReductionKernel``.
        mapper: C expression used as the kernel's ``map_expr``.
        reduceop: ``"+"`` or ``"*"``; selects the reduce expression and its
            neutral element (``"0"`` or ``"1"``).
        arglist: C argument list used as the kernel's ``arguments``.

    Returns:
        The reduction result fetched back from the GPU with ``.get()``.

    Raises:
        ValueError: if ``reduceop`` is neither ``"+"`` nor ``"*"``. Raised
            before anything is collected or uploaded.

    >>> rdd = sc.parallelize(range(1,10000))
    >>> rdd.gpuReduce(long, "x[i] * y[i]", "+" ,"long *x, long *y")
    """
    # Validate first: previously an unsupported operator only printed a
    # message and then failed later on the undefined neutral element.
    neutral_by_op = {"*": "1", "+": "0"}
    if reduceop not in neutral_by_op:
        raise ValueError("Currently only \"+\" and \"*\" operations are supported by GPU reduction")
    initval = neutral_by_op[reduceop]

    cpudataset = np.asarray(self.collect())
    a = gpuarray.to_gpu(cpudataset)
    b = a
    reduceexpr = "a" + reduceop + "b"
    print(reduceexpr)
    krnl = reduction.ReductionKernel(outtype, neutral=initval, map_expr=mapper, reduce_expr=reduceexpr, arguments=arglist)
    results = krnl(a, b).get()
    print(results)
    return results
示例5: gpuWordCount
# 需要導入模塊: from pycuda import gpuarray [as 別名]
# 或者: from pycuda.gpuarray import to_gpu [as 別名]
def gpuWordCount(self):
    """Count words in the collected text on the GPU.

    The collected strings are joined with single spaces and converted to
    their character codes. The kernel compares every character with its
    successor and counts the positions where a space (ASCII code 32) is
    followed by a non-space character; one is added for the first word.

    Returns:
        0 when there is no text at all (no kernel is launched), otherwise
        ``1 +`` the count fetched back from the GPU.

    >>> rdd = sc.textFile("README.md")
    >>> rdd.gpuWordCount()
    """
    cpudataset = " ".join(self.collect())
    if not cpudataset:
        # Nothing to count: avoid reporting one word for empty input and
        # avoid launching a reduction over empty slices.
        return 0

    asciidata = np.asarray([ord(x) for x in cpudataset], dtype=np.uint8)
    gpudataset = gpuarray.to_gpu(asciidata)
    # NOTE(review): `long` only exists on Python 2; a Python 3 port needs an
    # explicit 64-bit dtype here - confirm the target interpreter.
    countkrnl = reduction.ReductionKernel(long, neutral = "0",
                                          map_expr = "(a[i] == 32)*(b[i] != 32)",
                                          reduce_expr = "a + b", arguments = "char *a, char *b")
    # Pair each character with the next one by passing two shifted views.
    results = 1 + countkrnl(gpudataset[:-1],gpudataset[1:]).get()
    return results
示例6: up_sweep
# 需要導入模塊: from pycuda import gpuarray [as 別名]
# 或者: from pycuda.gpuarray import to_gpu [as 別名]
def up_sweep(x):
    """Run the ``up_gpu`` kernel once per level over a float64 copy of ``x``.

    For level ``k`` the kernel is launched with ``ceil(x.size / 2**(k+1))``
    threads, split into blocks of 32 once more than one block is needed.
    After every launch the "old" buffer is refreshed from the working buffer.

    Args:
        x: input values; converted with ``np.float64``.

    Returns:
        The working buffer copied back to the host after the last level.
    """
    # let's typecast to be safe.
    x = np.float64(x)
    x_gpu = gpuarray.to_gpu(np.float64(x))
    x_old_gpu = x_gpu.copy()

    level_count = int(np.log2(x.size))
    for k in range(level_count):
        num_threads = int(np.ceil(x.size / 2**(k+1)))
        grid_size = int(np.ceil(num_threads / 32))
        # A single block is sized to the exact thread count; otherwise use
        # full blocks of 32 threads.
        block_size = 32 if grid_size > 1 else num_threads

        up_gpu(x_gpu, x_old_gpu, np.int32(k), block=(block_size, 1, 1), grid=(grid_size, 1, 1))
        x_old_gpu[:] = x_gpu[:]

    return x_gpu.get()
# kernel for down-sweep phase
開發者ID:PacktPublishing,項目名稱:Hands-On-GPU-Programming-with-Python-and-CUDA,代碼行數:23,代碼來源:work-efficient_prefix.py
示例7: down_sweep
# 需要導入模塊: from pycuda import gpuarray [as 別名]
# 或者: from pycuda.gpuarray import to_gpu [as 別名]
def down_sweep(y):
    """Run the ``down_gpu`` kernel once per level, from the highest level down.

    The input is converted with ``np.float64`` and its last element is set to
    zero before the upload. For level ``k`` the kernel is launched with
    ``ceil(y.size / 2**(k+1))`` threads, split into blocks of 32 once more
    than one block is needed. After every launch the "old" buffer is
    refreshed from the working buffer.

    Args:
        y: input values; converted with ``np.float64``.

    Returns:
        The working buffer copied back to the host after the last level.
    """
    y = np.float64(y)
    y[-1] = 0
    y_gpu = gpuarray.to_gpu(y)
    y_old_gpu = y_gpu.copy()

    level_count = int(np.log2(y.size))
    for k in reversed(range(level_count)):
        num_threads = int(np.ceil(y.size / 2**(k+1)))
        grid_size = int(np.ceil(num_threads / 32))
        # A single block is sized to the exact thread count; otherwise use
        # full blocks of 32 threads.
        block_size = 32 if grid_size > 1 else num_threads

        down_gpu(y_gpu, y_old_gpu, np.int32(k), block=(block_size, 1, 1), grid=(grid_size, 1, 1))
        y_old_gpu[:] = y_gpu[:]

    return y_gpu.get()
# full implementation of work-efficient parallel prefix sum
開發者ID:PacktPublishing,項目名稱:Hands-On-GPU-Programming-with-Python-and-CUDA,代碼行數:23,代碼來源:work-efficient_prefix.py
示例8: gpu_mandelbrot
# 需要導入模塊: from pycuda import gpuarray [as 別名]
# 或者: from pycuda.gpuarray import to_gpu [as 別名]
def gpu_mandelbrot(width, height, real_low, real_high, imag_low, imag_high, max_iters, upper_bound):
    """Evaluate ``mandel_ker`` on a ``height`` x ``width`` lattice of complex points.

    Args:
        width: number of lattice points along the real axis (columns).
        height: number of lattice points along the imaginary axis (rows).
        real_low, real_high: range of the real parts, left to right.
        imag_low, imag_high: range of the imaginary parts; rows run from
            ``imag_high`` (first row) down to ``imag_low`` (last row).
        max_iters: passed to the kernel as ``int32``.
        upper_bound: passed to the kernel as ``float32``.

    Returns:
        The ``float32`` output array filled by the kernel, copied back to the
        host; it has the same shape as the lattice.
    """
    # Build the lattice with plain ndarray broadcasting (np.matrix is no
    # longer recommended by NumPy): a 1 x width row of real parts plus a
    # height x 1 column of imaginary parts.
    real_row = np.linspace(real_low, real_high, width).astype(np.complex64).reshape(1, -1)
    imag_col = (np.linspace(imag_high, imag_low, height).astype(np.complex64) * 1j).reshape(-1, 1)
    mandelbrot_lattice = np.array(real_row + imag_col, dtype=np.complex64)

    # copy complex lattice to the GPU
    mandelbrot_lattice_gpu = gpuarray.to_gpu(mandelbrot_lattice)

    # allocate an empty array on the GPU
    mandelbrot_graph_gpu = gpuarray.empty(shape=mandelbrot_lattice.shape, dtype=np.float32)

    mandel_ker(mandelbrot_lattice_gpu, mandelbrot_graph_gpu, np.int32(max_iters), np.float32(upper_bound))

    mandelbrot_graph = mandelbrot_graph_gpu.get()
    return mandelbrot_graph
開發者ID:PacktPublishing,項目名稱:Hands-On-GPU-Programming-with-Python-and-CUDA,代碼行數:20,代碼來源:gpu_mandelbrot0.py
示例9: scikit_gpu_fft_pipeline
# 需要導入模塊: from pycuda import gpuarray [as 別名]
# 或者: from pycuda.gpuarray import to_gpu [as 別名]
def scikit_gpu_fft_pipeline(filename):
    """Time a chain of GPU FFT/IFFT round trips over gulps read from ``filename``.

    For every gulp, ``GULP_SIZE * GULP_FRAME_FFT`` complex64 samples are read,
    reshaped to ``(GULP_FRAME_FFT, GULP_SIZE)``, uploaded, and pushed through
    ``NUMBER_FFT`` forward/inverse FFT pairs. Fresh output buffers are
    allocated for every transform.

    Args:
        filename: path of the file holding raw complex64 samples.

    Returns:
        The difference between the ``timer()`` readings taken before the file
        is opened and after all gulps have been processed.
    """
    start = timer()
    gulp_count = ((32768*1024*SIZE_MULTIPLIER//GULP_SIZE)//COMPLEX_MULTIPLIER)//GULP_FRAME_FFT
    # Binary mode: np.fromfile reads raw sample bytes, so the file must not
    # go through text-mode decoding or newline translation.
    with open(filename, 'rb') as file_obj:
        for _ in range(gulp_count):
            data = np.fromfile(file_obj, dtype=np.complex64, count=GULP_SIZE*GULP_FRAME_FFT).reshape((GULP_FRAME_FFT, GULP_SIZE))
            g_data = gpuarray.to_gpu(data)
            plan = Plan(data.shape[1], np.complex64, np.complex64, batch=GULP_FRAME_FFT)
            plan_inverse = Plan(data.shape[1], np.complex64, np.complex64, batch=GULP_FRAME_FFT)
            tmp1 = gpuarray.empty(data.shape, dtype=np.complex64)
            tmp2 = gpuarray.empty(data.shape, dtype=np.complex64)
            fft(g_data, tmp1, plan)
            ifft(tmp1, tmp2, plan_inverse)
            for _ in range(NUMBER_FFT-1):
                # Can't do FFT in place for fairness (emulating full pipeline)
                tmp1 = gpuarray.empty(data.shape, dtype=np.complex64)
                fft(tmp2, tmp1, plan)
                tmp2 = gpuarray.empty(data.shape, dtype=np.complex64)
                ifft(tmp1, tmp2, plan_inverse)
    end = timer()
    return end-start
示例10: interpolate
# 需要導入模塊: from pycuda import gpuarray [as 別名]
# 或者: from pycuda.gpuarray import to_gpu [as 別名]
def interpolate(self, flow, freqs, amps, phases):
    """Upload the frequency, amplitude and phase tables and run both prepared kernels.

    Each table is copied to the GPU and bound to its texture reference
    (``self.freq_tex``, ``self.amp_tex``, ``self.phase_tex``), then
    ``self.fn1`` and ``self.fn2`` are launched and the CUDA context is
    synchronised. Nothing is returned.

    Args:
        flow: lower frequency bound; passed to the kernels as ``float32``.
        freqs: frequency table; its length and last entry are also passed to
            the kernels.
        amps: amplitude table.
        phases: phase table.
    """
    flow = numpy.float32(flow)
    texlen = numpy.int32(len(freqs))
    fmax = numpy.float32(freqs[texlen-1])

    # Upload and bind in the fixed order freqs -> amps -> phases. The device
    # arrays are kept in a list so they stay referenced until the function
    # returns (presumably so the texture-backed memory outlives the kernels).
    bound_arrays = []
    for host_values, texref_name in ((freqs, "freq_tex"), (amps, "amp_tex"), (phases, "phase_tex")):
        device_values = gpuarray.to_gpu(host_values)
        device_values.bind_to_texref_ext(getattr(self, texref_name), allow_offset=False)
        bound_arrays.append(device_values)

    launch_first = self.fn1.prepared_call
    launch_second = self.fn2.prepared_call
    launch_first((1, 1), (self.nb, 1, 1), self.lower, self.upper, texlen, self.df, flow, fmax)
    launch_second((self.nb, 1), (self.nt, 1, 1), self.output, self.df, self.hlen, flow, fmax, texlen, self.lower, self.upper)

    pycbc.scheme.mgr.state.context.synchronize()
示例11: gpu_initialise_arrays
# 需要導入模塊: from pycuda import gpuarray [as 別名]
# 或者: from pycuda.gpuarray import to_gpu [as 別名]
def gpu_initialise_arrays(self):
    """Initialise standard field arrays on GPU.

    Copies ``self.ID`` to ``self.ID_gpu`` and creates six zero-filled arrays
    of shape ``(nx + 1, ny + 1, nz + 1)`` on the GPU, stored as
    ``self.Ex_gpu`` ... ``self.Hz_gpu``.
    """
    import pycuda.gpuarray as gpuarray

    self.ID_gpu = gpuarray.to_gpu(self.ID)

    # Every field component gets its own freshly allocated zero array.
    for component in ('Ex', 'Ey', 'Ez', 'Hx', 'Hy', 'Hz'):
        host_zeros = np.zeros((self.nx + 1, self.ny + 1, self.nz + 1), dtype=floattype)
        setattr(self, component + '_gpu', gpuarray.to_gpu(host_zeros))
示例12: gpu_initialise_dispersive_arrays
# 需要導入模塊: from pycuda import gpuarray [as 別名]
# 或者: from pycuda.gpuarray import to_gpu [as 別名]
def gpu_initialise_dispersive_arrays(self):
    """Initialise dispersive material coefficient arrays on GPU.

    Copies ``self.Tx``, ``self.Ty``, ``self.Tz`` and
    ``self.updatecoeffsdispersive`` to the GPU and stores each copy under the
    same attribute name with a ``_gpu`` suffix.
    """
    import pycuda.gpuarray as gpuarray

    for attribute in ('Tx', 'Ty', 'Tz', 'updatecoeffsdispersive'):
        host_array = getattr(self, attribute)
        setattr(self, attribute + '_gpu', gpuarray.to_gpu(host_array))
示例13: from_np
# 需要導入模塊: from pycuda import gpuarray [as 別名]
# 或者: from pycuda.gpuarray import to_gpu [as 別名]
def from_np(self, matrix):
    """Cast ``matrix`` to ``self.floattype`` and return a GPU copy of it."""
    host_matrix = matrix.astype(self.floattype)
    return gpuarray.to_gpu(host_matrix)
示例14: test_performance
# 需要導入模塊: from pycuda import gpuarray [as 別名]
# 或者: from pycuda.gpuarray import to_gpu [as 別名]
def test_performance(self):
    """Check that the CUDA forward and adjoint passes run faster than the CPU ones.

    A computation graph is built on a 2000 x 2000 variable, then ten forward
    passes and ten adjoint passes are timed on the CPU and on the GPU. The
    timings are logged in milliseconds and the test asserts that each GPU
    total is smaller than the matching CPU total.
    """
    c = random.rand(2000,2000)
    x = Variable([2000,2000])
    K = np.abs(random.rand(9,9))
    G = CompGraph(vstack([ subsample((conv_nofft(K, x) -c)*5, [2,4]), x*10 ]))
    xtest1 = random.rand(2000*2000).astype(np.float32)
    ytest1 = np.zeros(G.output_size, dtype=np.float32)

    # --- forward pass ---
    t1_cpu = time.time()
    for i in range(10):
        ytest1 = G.forward(xtest1, ytest1)
    t2_cpu = time.time()

    xtest = gpuarray.to_gpu(xtest1.astype(np.float32))
    ytest = gpuarray.to_gpu(ytest1.astype(np.float32))
    # NOTE(review): if forward_cuda/adjoint_cuda return before the GPU work
    # has finished, these wall-clock timings under-report - confirm.
    t1_gpu = time.time()
    for i in range(10):
        ytest = G.forward_cuda(xtest, ytest)
    t2_gpu = time.time()

    t_cpu = t2_cpu - t1_cpu
    t_gpu = t2_gpu - t1_gpu
    # time.time() differences are seconds; scale to match the "ms" label.
    logging.info("Forward timing: cpu=%.2f ms gpu=%.2f ms factor=%.3f" % (t_cpu * 1000, t_gpu * 1000, t_gpu/t_cpu))
    self.assertTrue(t_gpu < t_cpu)

    # --- adjoint pass ---
    t1_cpu = time.time()
    for i in range(10):
        xtest1 = G.adjoint(ytest1, xtest1)
    t2_cpu = time.time()

    t1_gpu = time.time()
    for i in range(10):
        xtest = G.adjoint_cuda(ytest, xtest)
    t2_gpu = time.time()

    t_cpu = t2_cpu - t1_cpu
    t_gpu = t2_gpu - t1_gpu
    # time.time() differences are seconds; scale to match the "ms" label.
    logging.info("Adjoint timing: cpu=%.2f ms gpu=%.2f ms factor=%.3f" % (t_cpu * 1000, t_gpu * 1000, t_gpu/t_cpu))
    self.assertTrue(t_gpu < t_cpu)
#print( G.start.adjoint_cuda(G, 0, "i", None)[0] )
示例15: gpuWordCount
# 需要導入模塊: from pycuda import gpuarray [as 別名]
# 或者: from pycuda.gpuarray import to_gpu [as 別名]
def gpuWordCount(self):
    """Count space-to-word transitions in every partition of ``self.rdd`` on the GPU.

    Each partition is joined with single spaces and converted to character
    codes. The kernel counts the positions where a space (ASCII code 32) is
    immediately followed by a non-space character, and the partition yields
    that single count.

    Returns:
        Whatever ``self.rdd.mapPartitions`` returns for the per-partition
        generator.
    """
    def gpuFunc(iterator):
        # 1. Data preparation
        cpu_data = list(iter(iterator))
        cpu_dataset = " ".join(cpu_data)
        ascii_data = np.asarray([ord(x) for x in cpu_dataset], dtype=np.uint8)

        if len(ascii_data) < 2:
            # There is no adjacent character pair to compare, so the
            # reduction could only return its neutral element "0". Skip
            # creating a CUDA context for nothing.
            yield 0
            return

        # 2. Driver initialization and data transfer
        cuda.init()
        dev = cuda.Device(0)
        contx = dev.make_context()
        gpu_dataset = None
        try:
            gpu_dataset = gpuarray.to_gpu(ascii_data)

            # 3. GPU kernel.
            # The kernel's algorithm counts the words by keeping
            # track of the space between them
            # NOTE(review): `long` only exists on Python 2; a Python 3 port
            # needs an explicit 64-bit dtype here - confirm the interpreter.
            countkrnl = reduction.ReductionKernel(long, neutral = "0",
                                                  map_expr = "(a[i] == 32)*(b[i] != 32)",
                                                  reduce_expr = "a + b", arguments = "char *a, char *b")
            results = countkrnl(gpu_dataset[:-1],gpu_dataset[1:]).get()
        finally:
            # Release GPU context resources even when the upload or the
            # kernel fails, and before handing control back to the consumer
            # (cleanup placed after a yield only runs if the generator is
            # resumed).
            contx.pop()
            del gpu_dataset
            del contx
            gc.collect()

        yield results

    vals = self.rdd.mapPartitions(gpuFunc)
    return vals