This page collects typical usage examples of the C++ function cuMemAlloc. If you have been wondering what cuMemAlloc does, how to call it, or what real-world uses look like, the hand-picked code examples below should help.
A total of 15 cuMemAlloc code examples are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better C++ examples.
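Before the individual examples, here is a minimal, self-contained sketch of the usual cuMemAlloc call sequence with the driver API: initialize the driver, create a context, allocate device memory, copy data in and out, and release everything. It is illustrative only and is not taken from any of the examples below.

// Minimal cuMemAlloc workflow (illustrative sketch, not from the examples below).
#include <cuda.h>
#include <cstdio>
#include <vector>

int main() {
    CUdevice dev;
    CUcontext ctx;
    if (cuInit(0) != CUDA_SUCCESS) return 1;
    if (cuDeviceGet(&dev, 0) != CUDA_SUCCESS) return 1;
    if (cuCtxCreate(&ctx, 0, dev) != CUDA_SUCCESS) return 1;

    const size_t n = 1024;
    std::vector<float> host(n, 1.0f);

    CUdeviceptr dptr = 0;
    CUresult rc = cuMemAlloc(&dptr, n * sizeof(float));   // device allocation
    if (rc != CUDA_SUCCESS) {
        printf("cuMemAlloc failed: %d\n", (int)rc);
        cuCtxDestroy(ctx);
        return 1;
    }

    cuMemcpyHtoD(dptr, host.data(), n * sizeof(float));   // host -> device
    cuMemcpyDtoH(host.data(), dptr, n * sizeof(float));   // device -> host

    cuMemFree(dptr);                                      // release device memory
    cuCtxDestroy(ctx);
    return 0;
}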
Example 1: DeallocateResources
const bool CUDARunner::AllocateResources(const int numb, const int numt)
{
    bool allocated=true;
    CUresult rval;

    DeallocateResources();

    m_in=(cuda_in *)malloc(sizeof(cuda_in));
    m_out=(cuda_out *)malloc(numb*numt*sizeof(cuda_out));

    rval=cuMemAlloc(&m_devin,sizeof(cuda_in));
    if(rval!=CUDA_SUCCESS)
    {
        printf("Error %d allocating CUDA memory\n",rval);
        m_devin=0;
        allocated=false;
    }
    rval=cuMemAlloc(&m_devout,numb*numt*sizeof(cuda_out));
    if(rval!=CUDA_SUCCESS)
    {
        printf("Error %d allocating CUDA memory\n",rval);
        m_devout=0;
        allocated=false;
    }

    printf("Done allocating CUDA resources for (%d,%d)\n",numb,numt);
    return allocated;
}
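AllocateResources() starts by calling DeallocateResources(), which is not part of the snippet. Assuming m_in, m_out, m_devin and m_devout are the only resources the class owns, a counterpart might look like the hypothetical sketch below; it is a guess at the intent, not the project's actual code.

// Hypothetical counterpart to AllocateResources(); assumes m_in, m_out,
// m_devin and m_devout are the only resources owned by CUDARunner.
void CUDARunner::DeallocateResources()
{
    if(m_in)     { free(m_in);          m_in=0; }
    if(m_out)    { free(m_out);         m_out=0; }
    if(m_devin)  { cuMemFree(m_devin);  m_devin=0; }
    if(m_devout) { cuMemFree(m_devout); m_devout=0; }
}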
Example 2: gpu_transpose_naive
void gpu_transpose_naive(int *dest, const int *src, int height, int width) {
    // Only power-of-two dimensions are handled for now.
    assert((width & (width - 1)) == 0); // TODO
    assert((height & (height - 1)) == 0);

    cuda->set_default_module("transpose.ptx");
    CUfunction transpose_kernel = cuda->get_kernel("transpose_naive");

    int grid_dim_x = width / BLOCK_DIM_X;
    int grid_dim_y = height / BLOCK_DIM_Y;

    CUdeviceptr device_src;
    CUdeviceptr device_dest;
    cuMemAlloc(&device_src, width*height*sizeof(int));
    cuMemAlloc(&device_dest, width*height*sizeof(int));
    cuMemcpyHtoD(device_src, src, width*height*sizeof(int));

    void *args[] = {&device_dest, &device_src, &height, &width};
    cuda->launch_kernel_2d_sync(transpose_kernel,
                                grid_dim_x, grid_dim_y,
                                BLOCK_DIM_X, BLOCK_DIM_Y,
                                args);

    cuMemcpyDtoH(dest, device_dest, width*height*sizeof(int));
    cuMemFree(device_src);
    cuMemFree(device_dest);
    cuda->ctx_synchronize();
}
Example 3: get_dev_mem
/*
 * get device memory
 */
void
get_dev_mem(void){
    res = cuMemAlloc(&x_dev, N * sizeof(double));
    if(res != CUDA_SUCCESS){
        printf("cuMemAlloc(x) failed: res = %s\n", conv(res));
        exit(1);
    }

    res = cuMemAlloc(&v_dev, N * sizeof(double));
    if(res != CUDA_SUCCESS){
        printf("cuMemAlloc(v) failed: res = %s\n", conv(res));
        exit(1);
    }

    res = cuMemAlloc(&error_dev, sizeof(int));
    if(res != CUDA_SUCCESS){
        printf("cuMemAlloc(error) failed: res = %s\n", conv(res));
        exit(1);
    }

    res = cuMemAlloc(&s_time_dev, sizeof(double));
    if(res != CUDA_SUCCESS){
        printf("cuMemAlloc(s_time) failed: res = %s\n", conv(res));
        exit(1);
    }
}
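The error messages above go through a conv() helper that turns a CUresult into a string; it is not shown in the snippet. A minimal stand-in could wrap cuGetErrorName (available since CUDA 6.0); treat the sketch below as an assumption about what conv() does, not the original helper.

// Hypothetical stand-in for the conv() helper used above: map a CUresult
// to a printable name via cuGetErrorName (CUDA 6.0+).
static const char *conv(CUresult res) {
    const char *name = NULL;
    if (cuGetErrorName(res, &name) != CUDA_SUCCESS || name == NULL)
        return "UNKNOWN_CUDA_ERROR";
    return name;
}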
Example 4: gpu_transpose_with_shared_mem
void gpu_transpose_with_shared_mem(int *dest, const int *src, int height, int width) {
    assert((width & (width - 1)) == 0); // TODO
    assert((height & (height - 1)) == 0);

    cuda->set_default_module(CUDA_PTX_PREFIX"transpose.cu.ptx");
    CUfunction transpose_kernel = cuda->get_kernel("transpose_with_shared_mem");

    int grid_dim_x = width / TILE_DIM;
    int grid_dim_y = height / TILE_DIM;

    CUdeviceptr device_src;
    CUdeviceptr device_dest;
    cuMemAlloc(&device_src, width*height*sizeof(int));
    cuMemAlloc(&device_dest, width*height*sizeof(int));
    cuMemcpyHtoD(device_src, src, width*height*sizeof(int));

    void *args[] = {&device_dest, &device_src};
    cuda->launch_kernel_2d_sync(transpose_kernel,
                                grid_dim_x, grid_dim_y,
                                TILE_DIM, 2,
                                args);

    cuMemcpyDtoH(dest, device_dest, width*height*sizeof(int));
    cuMemFree(device_src);
    cuMemFree(device_dest);
    cuda->ctx_synchronize();
}
Example 5: cuda_over_map
Object cuda_over_map(Object self, int nparts, int *argcv,
        Object *argv, int flags) {
    CUresult error;
    cuInit(0);
    int deviceCount = 0;
    error = cuDeviceGetCount(&deviceCount);
    if (deviceCount == 0) {
        raiseError("No CUDA devices found");
    }
    CUdevice cuDevice;
    CUcontext cuContext;
    CUmodule cuModule;
    CUfunction cuFunc;
    error = cuDeviceGet(&cuDevice, 0);
    error = cuCtxCreate(&cuContext, 0, cuDevice);
    CUdeviceptr d_A;
    CUdeviceptr d_B;
    CUdeviceptr d_res;
    errcheck(cuModuleLoad(&cuModule, grcstring(argv[argcv[0]])));
    CUdeviceptr dps[argcv[0]];
    void *args[argcv[0]+2];
    int size = INT_MAX;
    // Copy each input float array to the device; the shortest array
    // determines how many elements the kernel will process.
    for (int i=0; i<argcv[0]; i++) {
        struct CudaFloatArray *a = (struct CudaFloatArray *)argv[i];
        if (a->size < size)
            size = a->size;
        errcheck(cuMemAlloc(&dps[i], size * sizeof(float)));
        errcheck(cuMemcpyHtoD(dps[i], &a->data, size * sizeof(float)));
        args[i+1] = &dps[i];
    }
    struct CudaFloatArray *r =
        (struct CudaFloatArray *)(alloc_CudaFloatArray(size));
    int fsize = sizeof(float) * size;
    errcheck(cuMemAlloc(&d_res, fsize));
    errcheck(cuMemcpyHtoD(d_res, &r->data, fsize));
    args[0] = &d_res;
    args[argcv[0]+1] = &size;
    int threadsPerBlock = 256;
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
    // Derive the kernel name ("block...") from the module path.
    char name[256];
    strcpy(name, "block");
    strcat(name, grcstring(argv[argcv[0]]) + strlen("_cuda/"));
    for (int i=0; name[i] != 0; i++)
        if (name[i] == '.') {
            name[i] = 0;
            break;
        }
    errcheck(cuModuleGetFunction(&cuFunc, cuModule, name));
    errcheck(cuLaunchKernel(cuFunc, blocksPerGrid, 1, 1,
                            threadsPerBlock, 1, 1,
                            0,
                            NULL, args, NULL));
    errcheck(cuMemcpyDtoH(&r->data, d_res, fsize));
    cuMemFree(d_res);
    for (int i=0; i<argcv[0]; i++)
        cuMemFree(dps[i]);
    return (Object)r;
}
Example 6: main
/**
* This measures the overhead in launching a kernel function on each GPU in the
* system.
*
* It does this by executing a small kernel (copying 1 value in global memory) a
* very large number of times and taking the average execution time. This
* program uses the CUDA driver API.
*/
int main() {
    CU_ERROR_CHECK(cuInit(0));

    int count;
    CU_ERROR_CHECK(cuDeviceGetCount(&count));

    float x = 5.0f;
    for (int d = 0; d < count; d++) {
        CUdevice device;
        CU_ERROR_CHECK(cuDeviceGet(&device, d));

        CUcontext context;
        CU_ERROR_CHECK(cuCtxCreate(&context, 0, device));

        CUdeviceptr in, out;
        CU_ERROR_CHECK(cuMemAlloc(&in, sizeof(float)));
        CU_ERROR_CHECK(cuMemAlloc(&out, sizeof(float)));
        CU_ERROR_CHECK(cuMemcpyHtoD(in, &x, sizeof(float)));

        CUmodule module;
        CU_ERROR_CHECK(cuModuleLoadData(&module, imageBytes));

        CUfunction function;
        CU_ERROR_CHECK(cuModuleGetFunction(&function, module, "kernel"));

        void * params[] = { &in, &out };

        CUevent start, stop;
        CU_ERROR_CHECK(cuEventCreate(&start, 0));
        CU_ERROR_CHECK(cuEventCreate(&stop, 0));

        CU_ERROR_CHECK(cuEventRecord(start, 0));
        for (int i = 0; i < ITERATIONS; i++)
            CU_ERROR_CHECK(cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL));
        CU_ERROR_CHECK(cuEventRecord(stop, 0));
        CU_ERROR_CHECK(cuEventSynchronize(stop));

        float time;
        CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));

        CU_ERROR_CHECK(cuEventDestroy(start));
        CU_ERROR_CHECK(cuEventDestroy(stop));
        CU_ERROR_CHECK(cuMemFree(in));
        CU_ERROR_CHECK(cuMemFree(out));

        fprintf(stdout, "Device %d: %fms\n", d, (time / (double)ITERATIONS));

        CU_ERROR_CHECK(cuModuleUnload(module));
        CU_ERROR_CHECK(cuCtxDestroy(context));
    }

    return 0;
}
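The CU_ERROR_CHECK macro used throughout Example 6 is not part of the listing; a typical definition simply aborts with the failing location on any non-success result. The version below is a guess at its intent, not the original macro.

// Hypothetical CU_ERROR_CHECK: abort with location info on any driver API error.
#include <cstdio>
#include <cstdlib>
#define CU_ERROR_CHECK(call)                              \
    do {                                                  \
        CUresult _e = (call);                             \
        if (_e != CUDA_SUCCESS) {                         \
            fprintf(stderr, "%s:%d: CUDA error %d\n",     \
                    __FILE__, __LINE__, (int)_e);         \
            exit(EXIT_FAILURE);                           \
        }                                                 \
    } while (0)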
Example 7: TestSAXPY
CUresult
TestSAXPY( chCUDADevice *chDevice, size_t N, float alpha )
{
    CUresult status;
    CUdeviceptr dptrOut = 0;
    CUdeviceptr dptrIn = 0;
    float *hostOut = 0;
    float *hostIn = 0;

    CUDA_CHECK( cuCtxPushCurrent( chDevice->context() ) );

    CUDA_CHECK( cuMemAlloc( &dptrOut, N*sizeof(float) ) );
    CUDA_CHECK( cuMemsetD32( dptrOut, 0, N ) );
    CUDA_CHECK( cuMemAlloc( &dptrIn, N*sizeof(float) ) );
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostOut, N*sizeof(float), 0 ) );
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostIn, N*sizeof(float), 0 ) );

    for ( size_t i = 0; i < N; i++ ) {
        hostIn[i] = (float) rand() / (float) RAND_MAX;
    }
    CUDA_CHECK( cuMemcpyHtoDAsync( dptrIn, hostIn, N*sizeof(float), NULL ) );

    {
        CUmodule moduleSAXPY;
        CUfunction kernelSAXPY;
        void *params[] = { &dptrOut, &dptrIn, &N, &alpha };

        moduleSAXPY = chDevice->module( "saxpy.ptx" );
        if ( ! moduleSAXPY ) {
            status = CUDA_ERROR_NOT_FOUND;
            goto Error;
        }
        CUDA_CHECK( cuModuleGetFunction( &kernelSAXPY, moduleSAXPY, "saxpy" ) );
        CUDA_CHECK( cuLaunchKernel( kernelSAXPY, 1500, 1, 1, 512, 1, 1, 0, NULL, params, NULL ) );
    }

    CUDA_CHECK( cuMemcpyDtoHAsync( hostOut, dptrOut, N*sizeof(float), NULL ) );
    CUDA_CHECK( cuCtxSynchronize() );

    for ( size_t i = 0; i < N; i++ ) {
        if ( fabsf( hostOut[i] - alpha*hostIn[i] ) > 1e-5f ) {
            status = CUDA_ERROR_UNKNOWN;
            goto Error;
        }
    }
    status = CUDA_SUCCESS;
    printf( "Well it worked!\n" );

Error:
    cuCtxPopCurrent( NULL );
    cuMemFreeHost( hostOut );
    cuMemFreeHost( hostIn );
    cuMemFree( dptrOut );
    cuMemFree( dptrIn );
    return status;
}
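Unlike the abort-style macro in Example 6, the CUDA_CHECK used here records the status and jumps to the shared Error label, which is why all cleanup lives at the bottom of the function. The real macro lives in the chCUDADevice helper headers; the sketch below only shows the assumed shape of that pattern.

// Assumed shape of a goto-based CUDA_CHECK: store the result in the local
// 'status' variable and branch to the Error: cleanup label on failure.
#define CUDA_CHECK(call)                 \
    do {                                 \
        status = (call);                 \
        if (status != CUDA_SUCCESS)      \
            goto Error;                  \
    } while (0)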
Example 8: try_init
void *swanMalloc( size_t len ) {
    void *ptr;
    CUdeviceptr dptr;
    CUresult err;
    try_init();

    if( len == 0 ) {
        // printf("SWAN: WARNING - swanMalloc() called with 0\n");
        return NULL;
    }

    err = cuMemAlloc( (CUdeviceptr*) &dptr, len );
    ptr = (void*)dptr;
    if ( err != CUDA_SUCCESS ) {
        printf("Attempted to allocate %lu bytes (%lu already allocated)\n", len, state.bytes_allocated );
        abort();
        error("swanMalloc failed\n" );
    }
    state.bytes_allocated += len;

    // MJH likes his memory clean
    swanMemset( ptr, 0, len );
    return ptr;
}
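swanMalloc zeroes each new allocation through swanMemset, which is also not shown. A minimal version could forward to cuMemsetD8; this is an assumption about the helper, not the actual SWAN source.

// Hypothetical swanMemset: byte-wise fill of a device allocation via cuMemsetD8.
void swanMemset( void *ptr, int value, size_t len ) {
    CUresult err = cuMemsetD8( (CUdeviceptr) ptr, (unsigned char) value, len );
    if ( err != CUDA_SUCCESS ) {
        printf("swanMemset failed (%d)\n", (int) err);
        abort();
    }
}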
Example 9: halide_dev_malloc
WEAK void halide_dev_malloc(void *user_context, buffer_t* buf) {
    if (buf->dev) {
        // This buffer already has a device allocation
        return;
    }

    size_t size = __buf_size(user_context, buf);

    #ifdef DEBUG
    halide_printf(user_context, "dev_malloc allocating buffer of %zd bytes, "
                  "extents: %zdx%zdx%zdx%zd strides: %zdx%zdx%zdx%zd (%d bytes per element)\n",
                  size, buf->extent[0], buf->extent[1], buf->extent[2], buf->extent[3],
                  buf->stride[0], buf->stride[1], buf->stride[2], buf->stride[3],
                  buf->elem_size);
    #endif

    CUdeviceptr p;
    TIME_CALL( cuMemAlloc(&p, size), "dev_malloc");
    buf->dev = (uint64_t)p;
    halide_assert(user_context, buf->dev);

    #ifdef DEBUG
    halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));
    #endif
}
Example 10: get_read_ptr_cuda
CUdeviceptr get_read_ptr_cuda(ComputeEnv *env, int devid, size_t read_byte_size) {
    if (cuda_valid_list[devid]) {
        return cuda_ptr_list[devid];
    }

    if (host_valid == false) {
        /* xx */
        abort();
        return 0;
    }

    CUDADev *dev = &env->cuda_dev_list[devid];
    cuCtxPushCurrent(dev->context);

    if (cuda_ptr_list[devid] == 0) {
        CUresult err;
        err = cuMemAlloc(&cuda_ptr_list[devid], byte_size);
        if (err != CUDA_SUCCESS) {
            abort();
        }
    }

    //double t0 = getsec();
    cuMemcpyHtoD(cuda_ptr_list[devid], host_ptr, read_byte_size);
    //double t1 = getsec();
    //env->transfer_wait = t1-t0;

    cuda_valid_list[devid] = true;

    CUcontext old;
    cuCtxPopCurrent(&old);

    return cuda_ptr_list[devid];
}
Example 11: main
int main(){
    init_test();

    const std::string source =
        ".version 4.2\n"
        ".target sm_20\n"
        ".address_size 64\n"
        ".visible .entry kernel(.param .u64 kernel_param_0) {\n"
        ".reg .s32 %r<2>;\n"
        ".reg .s64 %rd<3>;\n"
        "bra BB1_2;\n"                  // unconditional branch skips the store below
        "ld.param.u64 %rd1, [kernel_param_0];\n"
        "cvta.to.global.u64 %rd2, %rd1;\n"
        "mov.u32 %r1, 5;\n"
        "st.global.u32 [%rd2], %r1;\n"
        "BB1_2: ret;\n"
        "}\n";

    CUmodule modId = 0;
    CUfunction funcHandle = 0;
    cu_assert(cuModuleLoadData(&modId, source.c_str()));
    cu_assert(cuModuleGetFunction(&funcHandle, modId, "kernel"));

    CUdeviceptr devValue;
    int hostValue = 10;
    cu_assert(cuMemAlloc(&devValue, sizeof(int)));
    cu_assert(cuMemcpyHtoD(devValue, &hostValue, sizeof(hostValue)));

    void * params[] = {&devValue};
    cu_assert(cuLaunchKernel(funcHandle, 1,1,1, 1,1,1, 0,0, params, nullptr));

    cu_assert(cuMemcpyDtoH(&hostValue, devValue, sizeof(hostValue)));
    // The kernel never writes to the buffer (the branch jumps over the store),
    // so the value read back is still 10.
    assert(hostValue == 10);
    std::cout << hostValue << "\n";

    cu_assert(cuMemFree(devValue));
    cu_assert(cuModuleUnload(modId));
    return 0;
}
Example 12: sararfftnd_one_complex_to_real
void sararfftnd_one_complex_to_real(
    sararfftnd_plan plan, sarafft_complex *h_data
) {
    CUdeviceptr d_data;
    size_t planSize = getPlanSize( plan );

    if ( CUDA_SUCCESS != cuMemAlloc( &d_data, planSize ) ) {
        printf( "cuMemAlloc failed for plansize %li!\n", planSize );
        fflush ( stdout );
        exit( 90 );
    }
    if ( CUDA_SUCCESS != cuMemcpyHtoD( d_data, h_data, planSize ) ) {
        printf( "cuMemcpyHtoD failed!\n" );
        fflush ( stdout );
        exit( 91 );
    }
    if ( CUFFT_SUCCESS != cufftExecC2R( plan, ( cufftComplex* )d_data, ( cufftReal* )d_data ) ) {
        printf( "cufftExecC2R failed!\n" );
        fflush ( stdout );
        exit( 92 );
    }
    if ( CUDA_SUCCESS != cuMemcpyDtoH( h_data, d_data, planSize ) ) {
        printf( "cuMemcpyDtoH failed!\n" );
        fflush ( stdout );
        exit( 93 );
    }
    if ( CUDA_SUCCESS != cuMemFree( d_data ) ) {
        printf( "cuMemFree failed!\n" );
        fflush ( stdout );
        exit( 94 );
    }
}
Example 13: prealloc
bool prealloc(ComputeEnv *env) {
    int devid;

    if (host_ptr == nullptr) {
        host_ptr = _mm_malloc(byte_size, 64);
        if (host_ptr == nullptr) {
            return false;
        }
    }

    switch (env->target_processor.type) {
    case W2XCONV_PROC_HOST:
        break;

    case W2XCONV_PROC_OPENCL:
        devid = env->target_processor.devid;
        if (cl_ptr_list[devid] == nullptr) {
            cl_int err;
            OpenCLDev *dev = &env->cl_dev_list[devid];
            cl_ptr_list[devid] = clCreateBuffer(dev->context,
                                                CL_MEM_READ_WRITE,
                                                byte_size, nullptr, &err);
            if (cl_ptr_list[devid] == nullptr) {
                return false;
            }

            /* touch memory to force allocation */
            char data = 0;
            err = clEnqueueWriteBuffer(dev->queue, cl_ptr_list[devid],
                                       CL_TRUE, 0, 1, &data, 0, nullptr, nullptr);
            if (err != CL_SUCCESS) {
                clReleaseMemObject(cl_ptr_list[devid]);
                cl_ptr_list[devid] = nullptr;
                return false;
            }
        }
        break;

    case W2XCONV_PROC_CUDA:
        devid = env->target_processor.devid;
        if (cuda_ptr_list[devid] == 0) {
            CUresult err;
            CUDADev *dev = &env->cuda_dev_list[devid];
            cuCtxPushCurrent(dev->context);
            err = cuMemAlloc(&cuda_ptr_list[devid], byte_size);
            CUcontext old;
            cuCtxPopCurrent(&old);

            if (err != CUDA_SUCCESS) {
                return false;
            }
        }
        break;
    }

    return true;
}
Example 14: mem_alloc
void mem_alloc(device_memory& mem, MemoryType type)
{
    cuda_push_context();

    CUdeviceptr device_pointer;
    cuda_assert(cuMemAlloc(&device_pointer, mem.memory_size()))
    mem.device_pointer = (device_ptr)device_pointer;

    cuda_pop_context();
}
Example 15: mem
CUresult CuContext::ByteAlloc(size_t size, DeviceMemPtr* ppMem) {
    DeviceMemPtr mem(new CuDeviceMem);
    CUresult result = cuMemAlloc(&mem->_deviceptr, size);
    HANDLE_RESULT();

    mem->_size = size;
    mem->_context = this;
    ppMem->swap(mem);
    return CUDA_SUCCESS;
}
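The HANDLE_RESULT() macro in Example 15 is not included in the snippet; in a wrapper like this it usually just propagates a failing CUresult to the caller so that ppMem is only updated on success. The definition below is a guess at that behaviour, not the library's actual macro.

// Assumed behaviour of HANDLE_RESULT(): return the failing CUresult from the
// current function, leaving the caller's DeviceMemPtr untouched.
#define HANDLE_RESULT()                  \
    do {                                 \
        if (result != CUDA_SUCCESS)      \
            return result;               \
    } while (0)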