This article collects typical usage examples of the C++ function THCudaCheck. If you are wondering what exactly THCudaCheck does, how to call it, or what it looks like in real code, the curated examples below should help.
Fifteen THCudaCheck code examples are shown, sorted by popularity by default. Upvote the examples you find useful; your feedback helps the system recommend better C++ code samples.
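For readers new to the function: THCudaCheck is cutorch's error-checking wrapper around CUDA runtime calls; it inspects the returned cudaError_t and, roughly speaking, raises a Torch error with source-location context when a call fails. The snippet below is only a minimal, self-contained sketch of that pattern with hypothetical names (MY_CUDA_CHECK, my_cuda_check), not the actual cutorch macro.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
/* Hypothetical stand-in for THCudaCheck: validate a CUDA runtime call and
   report file/line context if it did not return cudaSuccess. */
#define MY_CUDA_CHECK(expr) my_cuda_check((expr), __FILE__, __LINE__)
static void my_cuda_check(cudaError_t err, const char *file, int line)
{
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error at %s:%d: %s\n", file, line, cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
}
int main(void)
{
  int device = -1;
  MY_CUDA_CHECK(cudaGetDevice(&device)); /* wrap every runtime call this way */
  printf("current CUDA device: %d\n", device);
  return 0;
}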
Example 1: THTensor_
void THTensor_(copyAsyncCuda)(THCState *state, THTensor *self, struct THCTensor *src)
{
THArgCheck(THTensor_(nElement)(self) == THCTensor_(nElement)(state, src), 2, "sizes do not match");
THArgCheck(THTensor_(isContiguous)(self), 2, "Target tensor must be contiguous");
THArgCheck(THCTensor_(isContiguous)(state, src), 3, "Source tensor must be contiguous");
if (THTensor_(nElement)(self) == 0) return;
// Perform the copy wrt the current stream on the CudaTensor's device.
int tensorDevice = THCTensor_(getDevice)(state, src);
int currentDevice;
THCudaCheck(cudaGetDevice(&currentDevice));
if (currentDevice != tensorDevice) {
THCudaCheck(cudaSetDevice(tensorDevice));
}
THCudaCheck(cudaMemcpyAsync(THTensor_(data)(self),
THCTensor_(data)(state, src),
THCTensor_(nElement)(state, src) * sizeof(real),
cudaMemcpyDeviceToHost,
THCState_getDeviceStream(state, tensorDevice,
THCState_getCurrentStreamIndex(state))));
if (currentDevice != tensorDevice) {
THCudaCheck(cudaSetDevice(currentDevice));
}
}
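This example, like several that follow, saves the current device, switches to the tensor's device, and restores the previous device afterwards. The sketch below wraps that recurring pattern in a small RAII guard; it assumes the THC headers declaring THCudaCheck are available, and the class name is hypothetical rather than part of cutorch.
/* Hypothetical RAII guard capturing the save/switch/restore-device pattern above. */
struct ScopedDevice {
  int prevDevice;
  explicit ScopedDevice(int device) {
    THCudaCheck(cudaGetDevice(&prevDevice));
    if (device != prevDevice) {
      THCudaCheck(cudaSetDevice(device));
    }
  }
  ~ScopedDevice() {
    cudaSetDevice(prevDevice); /* best effort; do not raise errors from a destructor */
  }
};
/* Usage sketch: the previous device is restored automatically on scope exit. */
void runOnDevice(int device) {
  ScopedDevice guard(device);
  /* ... queue work on `device` here ... */
}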
Example 2: cutorch_streamWaitFor
/*
Usage:
cutorch.streamWaitFor(waiterStream, {waitForStream1, ..., waitForStreamN})
for streams on the current device. Creates a one-way barrier where
waiterStream waits for waitForStream1-N to reach the current point.
*/
static int cutorch_streamWaitFor(lua_State *L)
{
THCState *state = cutorch_getstate(L);
int curDev = -1;
THCudaCheck(cudaGetDevice(&curDev));
/* Check that the waiting stream is in bounds; this will error out if not */
int waitingId = (int) luaL_checknumber(L, 1);
cudaStream_t streamWaiting =
THCState_getDeviceStream(state, curDev, waitingId);
/* Validate the streams that we are waiting on */
int streams = checkAndCountListOfStreams(L, state, 2, curDev);
if (streams < 1) {
/* nothing to synchronize */
return 0;
}
/* One-way dependency; streamWaiting will wait for the list of streams to
wait on to complete execution of pending scheduled kernels/events */
cudaEvent_t * events = (cudaEvent_t*)malloc(sizeof(cudaEvent_t) * streams);
createSingleDeviceEvents(L, state, 2, curDev, events);
/* Then, wait on them */
for (int i = 0; i < streams; i++) {
THCudaCheck(cudaStreamWaitEvent(streamWaiting, events[i], 0));
THCudaCheck(cudaEventDestroy(events[i]));
}
free(events);
return 0;
}
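Stripped of the Lua plumbing, the underlying CUDA pattern is simple: record an event on each stream to be waited on, then make the waiting stream wait on those events. Below is a minimal sketch of just that pattern; the helper name is hypothetical and THCudaCheck is assumed to be available.
/* Hypothetical helper: make `waiter` wait until each producer stream reaches
   its current point, without blocking the host. */
void streamWaitForSketch(cudaStream_t waiter, cudaStream_t *producers, int n)
{
  for (int i = 0; i < n; ++i) {
    cudaEvent_t ev;
    THCudaCheck(cudaEventCreateWithFlags(&ev, cudaEventDisableTiming));
    THCudaCheck(cudaEventRecord(ev, producers[i]));  /* mark the current point */
    THCudaCheck(cudaStreamWaitEvent(waiter, ev, 0)); /* enqueue the dependency */
    THCudaCheck(cudaEventDestroy(ev));               /* safe once the wait is enqueued */
  }
}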
Example 3: THCTensor_
void THCTensor_(copyAsyncCPU)(THCState *state, THCTensor *self, struct THTensor *src)
{
THArgCheck(THCTensor_(nElement)(state, self) == THTensor_(nElement)(src), 2, "sizes do not match");
THArgCheck(THCTensor_(isContiguous)(state, self), 2, "Target tensor must be contiguous");
THArgCheck(THTensor_(isContiguous)(src), 3, "Source tensor must be contiguous");
if (THCTensor_(nElement)(state, self) == 0) return;
// Perform the copy wrt the current stream on the CudaTensor's device.
int tensorDevice = THCTensor_(getDevice)(state, self);
int currentDevice;
THCudaCheck(cudaGetDevice(&currentDevice));
if (currentDevice != tensorDevice) {
THCudaCheck(cudaSetDevice(tensorDevice));
}
THCStream *stream = THCState_getStream(state);
THCudaCheck(cudaMemcpyAsync(THCTensor_(data)(state, self),
THTensor_(data)(src),
THTensor_(nElement)(src) * sizeof(real),
cudaMemcpyHostToDevice,
stream->stream));
THCudaCheck(THCCachingHostAllocator_recordEvent(THStorage_(data)(src->storage), stream));
if (currentDevice != tensorDevice) {
THCudaCheck(cudaSetDevice(currentDevice));
}
}
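The call to THCCachingHostAllocator_recordEvent marks when the pinned host buffer becomes safe to reuse. Outside of cutorch, the same idea looks roughly like the sketch below: allocate pinned memory, issue the asynchronous copy, and synchronize (or record an event) before touching or freeing the host buffer. The function and variable names here are hypothetical.
/* Hypothetical sketch of an asynchronous host-to-device copy with pinned memory. */
void asyncHostToDeviceSketch(float *deviceDst, size_t n, cudaStream_t stream)
{
  float *hostSrc = NULL;
  THCudaCheck(cudaMallocHost((void**)&hostSrc, n * sizeof(float))); /* pinned allocation */
  /* ... fill hostSrc ... */
  THCudaCheck(cudaMemcpyAsync(deviceDst, hostSrc, n * sizeof(float),
                              cudaMemcpyHostToDevice, stream));
  /* The copy may still be in flight here; wait (or record an event) before
     reusing or freeing the host buffer. */
  THCudaCheck(cudaStreamSynchronize(stream));
  THCudaCheck(cudaFreeHost(hostSrc));
}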
Example 4: THCudaShutdown
void THCudaShutdown(THCState* state)
{
THCRandom_shutdown(state);
THCudaBlas_shutdown(state);
free(state->blasState);
free(state->rngState);
free(state->deviceProperties);
int prevDev = -1;
THCudaCheck(cudaGetDevice(&prevDev));
for (int dev = 0; dev < state->numDevices; ++dev) {
THCudaCheck(cudaSetDevice(dev));
/* Free Torch-defined streams (0 is the default stream) */
for (int stream = 1; stream <= state->numUserStreams; ++stream) {
THCudaCheck(cudaStreamDestroy(state->streamsPerDevice[dev][stream]));
}
free(state->streamsPerDevice[dev]);
}
free(state->streamsPerDevice);
THCudaCheck(cudaSetDevice(prevDev));
}
Example 5: THCState_getPeerToPeerAccess
int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess)
{
if (dev < 0 || dev >= state->numDevices) {
THError("%d is not a device", dev);
}
if (devToAccess < 0 || devToAccess >= state->numDevices) {
THError("%d is not a device", devToAccess);
}
if (state->p2pAccessEnabled[dev][devToAccess] == -1) {
int prevDev = 0;
THCudaCheck(cudaGetDevice(&prevDev));
THCudaCheck(cudaSetDevice(dev));
int access = 0;
THCudaCheck(cudaDeviceCanAccessPeer(&access, dev, devToAccess));
if (access) {
cudaError_t err = cudaDeviceEnablePeerAccess(devToAccess, 0);
if (err == cudaErrorPeerAccessAlreadyEnabled) {
// ignore and clear the error if access was already enabled
cudaGetLastError();
} else {
THCudaCheck(err);
}
state->p2pAccessEnabled[dev][devToAccess] = 1;
} else {
state->p2pAccessEnabled[dev][devToAccess] = 0;
}
THCudaCheck(cudaSetDevice(prevDev));
}
return state->p2pAccessEnabled[dev][devToAccess];
}
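Once peer access has been enabled, a device-to-device copy between the two GPUs can take the peer path instead of staging through host memory. A minimal illustration, with hypothetical function and argument names:
/* Hypothetical sketch: copy `bytes` from memory on srcDev to memory on dstDev. */
void peerCopySketch(void *dst, int dstDev, const void *src, int srcDev,
                    size_t bytes, cudaStream_t stream)
{
  /* cudaMemcpyPeerAsync works either way, but with peer access enabled it
     avoids a bounce through host memory. */
  THCudaCheck(cudaMemcpyPeerAsync(dst, dstDev, src, srcDev, bytes, stream));
}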
Example 6: cutorch_streamBarrier
/*
Usage:
cutorch.streamBarrier({stream1, stream2, ..., streamN})
applies to streams for the current device. Creates a N-way barrier
to synchronize all of the streams given
*/
static int cutorch_streamBarrier(lua_State *L)
{
THCState *state = cutorch_getstate(L);
int curDev = -1;
THCudaCheck(cudaGetDevice(&curDev));
int streams = checkAndCountListOfStreams(L, state, 1, curDev);
if (streams < 2) {
/* nothing to synchronize together */
return 0;
}
/* Multi-way dependency (barrier); all streams must complete execution
of pending scheduled kernels/events */
cudaEvent_t * events = (cudaEvent_t*)malloc(sizeof(cudaEvent_t) * streams);
/* First, create an event and record them for all streams */
int eventsCreated = createSingleDeviceEvents(L, state, 1, curDev, events);
/* Then, wait on the event. Each stream is actually waiting on itself here
too, but that's harmless and isn't worth weeding out. */
waitSingleDeviceEvents(L, state, 1, curDev, events, eventsCreated);
for (int i = 0; i < eventsCreated; i++)
THCudaCheck(cudaEventDestroy(events[i]));
free(events);
return 0;
}
Example 7: THCStream_newWithPriority
THCStream* THCStream_newWithPriority(int flags, int priority)
{
THCStream* self = (THCStream*) malloc(sizeof(THCStream));
self->refcount = 1;
THCudaCheck(cudaGetDevice(&self->device));
THCudaCheck(cudaStreamCreateWithPriority(&self->stream, flags, priority));
return self;
}
Example 8: THCStream_new
THCStream* THCStream_new(int flags)
{
THCStream* self = (THCStream*) malloc(sizeof(THCStream));
self->refcount = 1;
THCudaCheck(cudaGetDevice(&self->device));
THCudaCheck(cudaStreamCreateWithFlags(&self->stream, flags));
return self;
}
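The flags argument is forwarded directly to cudaStreamCreateWithFlags, so the usual values are cudaStreamDefault and cudaStreamNonBlocking (a non-blocking stream does not implicitly synchronize with the legacy default stream). A brief, hypothetical usage sketch:
/* Hypothetical usage sketch: create a non-blocking stream and queue work on it. */
void useNonBlockingStream(void *devicePtr, size_t bytes)
{
  THCStream *s = THCStream_new(cudaStreamNonBlocking);
  THCudaCheck(cudaMemsetAsync(devicePtr, 0, bytes, s->stream)); /* runs on the new stream */
  THCudaCheck(cudaStreamSynchronize(s->stream));
  /* release the stream afterwards via the THCStream refcount API (not shown here) */
}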
Example 9: THCStorage_
void THCStorage_(set)(THCState *state, THCStorage *self, ptrdiff_t index, scalar_t value)
{
THArgCheck((index >= 0) && (index < self->numel()), 2, "index out of bounds");
cudaStream_t stream = THCState_getCurrentStream(state);
THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self) + index, &value, sizeof(scalar_t),
cudaMemcpyHostToDevice,
stream));
THCudaCheck(cudaStreamSynchronize(stream));
}
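The read direction mirrors this pattern: a one-element device-to-host copy on the current stream followed by a stream synchronize so the value is valid on return. Below is a sketch of that counterpart; the helper name is ours and not necessarily cutorch's exact getter.
scalar_t THCStorage_(getSketch)(THCState *state, THCStorage *self, ptrdiff_t index)
{
  THArgCheck((index >= 0) && (index < self->numel()), 2, "index out of bounds");
  scalar_t value;
  cudaStream_t stream = THCState_getCurrentStream(state);
  THCudaCheck(cudaMemcpyAsync(&value, THCStorage_(data)(state, self) + index,
                              sizeof(scalar_t), cudaMemcpyDeviceToHost, stream));
  THCudaCheck(cudaStreamSynchronize(stream)); /* ensure the value has landed on the host */
  return value;
}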
Example 10: cutorch_Event_new
static int cutorch_Event_new(lua_State *L)
{
cudaEvent_t *event = luaT_alloc(L, sizeof(cudaEvent_t));
THCudaCheck(cudaEventCreate(event));
THCState *state = cutorch_getstate(L);
THCudaCheck(cudaEventRecord(*event, THCState_getCurrentStream(state)));
luaT_pushudata(L, event, "cutorch.Event");
return 1;
}
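Recording only captures the work queued so far; something must later wait on the event. Typical consumers look like the hypothetical sketch below: either make another stream depend on the event, or block the host until it completes, then destroy it.
/* Hypothetical sketch of consuming a recorded event. */
void consumeEventSketch(cudaEvent_t event, cudaStream_t otherStream)
{
  THCudaCheck(cudaStreamWaitEvent(otherStream, event, 0)); /* asynchronous dependency */
  THCudaCheck(cudaEventSynchronize(event));                /* or block the host until done */
  THCudaCheck(cudaEventDestroy(event));                    /* release the event */
}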
Example 11: THCState_reserveStreams
void THCState_reserveStreams(THCState* state, int numStreams, int nonBlocking)
{
if (numStreams <= state->numUserStreams)
{
return;
}
int prevDev = -1;
THCudaCheck(cudaGetDevice(&prevDev));
/* Otherwise, we have to allocate a new set of streams and stream data */
for (int dev = 0; dev < state->numDevices; ++dev) {
THCudaCheck(cudaSetDevice(dev));
/* +1 for the default stream as well */
cudaStream_t* newStreams =
(cudaStream_t*) malloc((numStreams + 1) * sizeof(cudaStream_t));
void** newScratchSpace =
(void**) malloc((numStreams + 1) * sizeof(void*));
/* Copy over old stream data
(0 is default stream, 1 ... numUserStreams are rest) */
for (int stream = 0; stream <= state->numUserStreams; ++stream) {
newStreams[stream] =
THCState_getDeviceStream(state, dev, stream);
newScratchSpace[stream] =
THCState_getDeviceScratchSpace(state, dev, stream);
}
/* Allocate new stream resources */
size_t scratchSpaceSize = THCState_getDeviceScratchSpaceSize(state, dev);
unsigned int flags =
nonBlocking ? cudaStreamNonBlocking : cudaStreamDefault;
for (int stream = state->numUserStreams + 1; stream <= numStreams; ++stream) {
newStreams[stream] = NULL;
THCudaCheck(cudaStreamCreateWithFlags(newStreams + stream, flags));
newScratchSpace[stream] = NULL;
THCudaCheck(THCudaMalloc(state, &newScratchSpace[stream], scratchSpaceSize));
}
THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, dev);
free(res->streams);
res->streams = newStreams;
free(res->devScratchSpacePerStream);
res->devScratchSpacePerStream = newScratchSpace;
}
state->numUserStreams = numStreams;
THCudaCheck(cudaSetDevice(prevDev));
}
Example 12: THCState_setDevice
void THCState_setDevice(THCState *state, int device)
{
int curDev;
THCudaCheck(cudaGetDevice(&curDev));
if (device != curDev) {
THCudaCheck(cudaSetDevice(device));
THCRandom_setGenerator(state, device);
THCudaBlas_setHandle(state, device);
/* The stream is per device, so update the stream as well */
THCState_setStream(state, device, THCState_getCurrentStreamIndex(state));
}
}
Example 13: Stream
/*
Usage:
cutorch.streamWaitForMultiDevice(gpuWaiter, streamWaiter,
{[gpu1]={stream1_1, ..., stream1_N},
[gpuK]={streamK_1, ..., streamK_M}})
with a specified GPU per each list of streams.
Stream (gpuWaiter, streamWaiter) will wait on all of the other streams
(gpu1, stream1_1), ..., (gpu1, stream1_N), ...,
(gpuK, streamK_1), ..., (gpuK, streamK_M) to complete fully, as a one-way
barrier only (only streamWaiter is blocked).
The streams to wait on are bucketed per device. Equivalent to
streamWaitFor() if only one GPU's streams are listed.
*/
static int cutorch_streamWaitForMultiDevice(lua_State *L)
{
THCState *state = cutorch_getstate(L);
int prevDev = -1;
THCudaCheck(cudaGetDevice(&prevDev));
/* Validate waiting (gpu, stream); this will error out if not */
int gpuWaiter = (int) luaL_checknumber(L, 1) - 1;
int streamWaiter = (int) luaL_checknumber(L, 2);
cudaStream_t streamWaiting =
THCState_getDeviceStream(state, gpuWaiter, streamWaiter);
/* Validate and count set of {gpu={streams...}} we are waiting on */
int gpus = 0;
int streams = 0;
checkAndCountListOfGPUStreamPairs(L, state, 3, &gpus, &streams);
if (streams < 1) {
/* nothing to synchronize together */
return 0;
}
/*
Events can only be recorded on the same device on which they are created.
-For each GPU, create and record event per each stream given
for that GPU.
-For (gpuWaiter, streamWaiter), wait on all of the above events.
*/
cudaEvent_t* events = (cudaEvent_t*) malloc(sizeof(cudaEvent_t) * streams);
/* First, create an event per GPU and record events for the specified stream
on that GPU */
createMultiDeviceEvents(L, state, 3, events);
/* Then, wait on the events */
THCudaCheck(cudaSetDevice(gpuWaiter));
for (int i = 0; i < streams; ++i) {
THCudaCheck(cudaStreamWaitEvent(streamWaiting, events[i], 0));
}
/* Clean up events */
for (int i = 0; i < streams; ++i) {
THCudaCheck(cudaEventDestroy(events[i]));
}
free(events);
THCudaCheck(cudaSetDevice(prevDev));
return 0;
}
Example 14: cutorch_streamSynchronize
/*
Usage:
cutorch.streamSynchronize(n)
For the current device, synchronizes with the given stream only
(cudaStreamSynchronize).
0 is the default stream on the device.
*/
static int cutorch_streamSynchronize(lua_State *L)
{
THCState *state = cutorch_getstate(L);
int streamId = (int) luaL_checknumber(L, 1);
int curDev = -1;
THCudaCheck(cudaGetDevice(&curDev));
/* This also validates the stream */
cudaStream_t stream = THCState_getDeviceStream(state, curDev, streamId);
THCudaCheck(cudaStreamSynchronize(stream));
return 0;
}
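For contrast, waiting on all work queued on the device at once is done with cudaDeviceSynchronize; the binding above deliberately scopes the wait to a single stream. A short, hypothetical side-by-side sketch:
/* Hypothetical sketch contrasting the two host-side waits. */
void syncScopesSketch(cudaStream_t stream)
{
  THCudaCheck(cudaStreamSynchronize(stream)); /* wait for this stream only */
  THCudaCheck(cudaDeviceSynchronize());       /* wait for all work on the current device */
}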
Example 15: cutorch_getDeviceProperties
static int cutorch_getDeviceProperties(lua_State *L)
{
int device = (int)luaL_checknumber(L, 1)-1;
// switch context to given device so the call to cudaMemGetInfo is for the correct device
int oldDevice;
THCudaCheck(cudaGetDevice(&oldDevice));
THCudaCheck(cudaSetDevice(device));
struct cudaDeviceProp prop;
THCudaCheck(cudaGetDeviceProperties(&prop, device));
lua_newtable(L);
SET_DEVN_PROP(canMapHostMemory);
SET_DEVN_PROP(clockRate);
SET_DEVN_PROP(computeMode);
SET_DEVN_PROP(deviceOverlap);
SET_DEVN_PROP(integrated);
SET_DEVN_PROP(kernelExecTimeoutEnabled);
SET_DEVN_PROP(major);
SET_DEVN_PROP(maxThreadsPerBlock);
SET_DEVN_PROP(memPitch);
SET_DEVN_PROP(minor);
SET_DEVN_PROP(multiProcessorCount);
SET_DEVN_PROP(regsPerBlock);
SET_DEVN_PROP(sharedMemPerBlock);
SET_DEVN_PROP(textureAlignment);
SET_DEVN_PROP(totalConstMem);
SET_DEVN_PROP(totalGlobalMem);
SET_DEVN_PROP(warpSize);
SET_DEVN_PROP(pciBusID);
SET_DEVN_PROP(pciDeviceID);
SET_DEVN_PROP(pciDomainID);
SET_DEVN_PROP(maxTexture1D);
SET_DEVN_PROP(maxTexture1DLinear);
size_t freeMem;
THCudaCheck(cudaMemGetInfo (&freeMem, NULL));
lua_pushnumber(L, freeMem);
lua_setfield(L, -2, "freeGlobalMem");
lua_pushstring(L, prop.name);
lua_setfield(L, -2, "name");
// restore context
THCudaCheck(cudaSetDevice(oldDevice));
return 1;
}