This article collects typical usage examples of the CudaDeviceVariable.Memset method in C#. If you are asking yourself how exactly CudaDeviceVariable.Memset is used, or are looking for concrete examples of calling it, the hand-picked code samples below may help. You can also browse further usage examples of the containing class, CudaDeviceVariable.
The following presents 6 code examples of the CudaDeviceVariable.Memset method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better C# code examples.
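Before the examples, here is a minimal sketch of the method itself, assuming the ManagedCuda library that all of the samples below use (the buffer size and fill value are arbitrary illustrations). Memset writes a raw 32-bit pattern into every element, which is why a float fill value is first reinterpreted as a uint:

using System;
using ManagedCuda;

class MemsetSketch
{
    static void Main()
    {
        using (CudaContext ctx = new CudaContext())              // primary context on device 0
        using (var data = new CudaDeviceVariable<float>(1024))   // 1024 floats on the device
        {
            // Reinterpret the float fill value as the 32-bit pattern Memset expects.
            float fill = 1.0f;
            data.Memset(BitConverter.ToUInt32(BitConverter.GetBytes(fill), 0));

            float[] host = new float[1024];
            data.CopyToHost(host);                               // every element is now 1.0f
        }
    }
}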
Example 1: Run
public void Run(MatOperation operation, CudaDeviceVariable<float> A, int ACount, int AColumnHint, CudaDeviceVariable<float> B, int BCount, int BColumnHint, CudaDeviceVariable<float> Result, int ResultCount, int ResultColumnHint, float beta = 1.0f)
{
    Result.Memset(BitConverter.ToUInt32(BitConverter.GetBytes(0.0f), 0));

    switch (operation)
    {
        case MatOperation.Multiplication: // vectors/matrices always have to be in the correct dimensions!
            if (BCount > 1 && ACount > 1 && BColumnHint == 1 && ACount / AColumnHint > 1 && BCount / BColumnHint == AColumnHint) // A*vecB
            {
                MyCublasFactory.Instance.Gemv(Operation.Transpose, // transpose because it does Ax row-wise if x is a row vector :D
                    AColumnHint, ACount / AColumnHint, 1.0f,
                    A, AColumnHint,
                    B, 1,
                    beta, Result, 1);
            }
            else if (ACount > 1 && BCount > 1 && ACount / AColumnHint == 1 && BColumnHint > 1 && BCount / BColumnHint == AColumnHint) // vecA*B
            {
                MyCublasFactory.Instance.Gemv(Operation.NonTranspose, // transpose because it does Ax row-wise if x is a row vector :D
                    BColumnHint, BCount / BColumnHint, 1.0f,
                    B, BColumnHint,
                    A, 1,
                    beta, Result, 1);
            }
            else if (ACount / AColumnHint == 1 && BColumnHint == 1 && ACount > 1 && BCount > 1) // trans(vecA) * vecB
            {
                Run(MatOperation.DotProd, A, ACount, AColumnHint, B, BCount, BColumnHint, Result, ResultCount, ResultColumnHint, beta);
            }
            else if (ACount != 1 || BCount != 1) // A*B matrix multiplication
            {
                MyCublasFactory.Instance.Gemm(Operation.NonTranspose, Operation.NonTranspose,
                    ACount / AColumnHint, BColumnHint, AColumnHint, 1.0f,
                    A, ACount / AColumnHint,
                    B, BCount / BColumnHint,
                    beta, Result, ResultColumnHint);
            }
            break;
        case MatOperation.DotProd:
            MyCublasFactory.Instance.Gemv(Operation.Transpose, // transpose because it does Ax row-wise if x is a row vector :D
                ACount, 1, 1.0f,
                A, ACount,
                B, 1,
                beta, Result, 1);
            break;
        default:
            MyLog.Writer.WriteLine(MyLogLevel.ERROR, "Trying to run cublas for undefined MatOperation");
            break;
    }
}
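The first line of the example is the common idiom for clearing a float buffer: Memset takes an unsigned 32-bit pattern, so the float fill value is reinterpreted via BitConverter (for 0.0f the pattern is simply 0, so Memset(0) would have the same effect). A hypothetical helper, not part of the original sample, could wrap the reinterpretation:

// Hypothetical helper (not in the original code): convert a float fill value
// into the raw 32-bit pattern expected by CudaDeviceVariable<float>.Memset.
static uint ToBitPattern(float value)
{
    return BitConverter.ToUInt32(BitConverter.GetBytes(value), 0);
}

// Usage: Result.Memset(ToBitPattern(0.0f));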
Example 2: Run
public void Run(MatOperation operation, CudaDeviceVariable<float> A, int ACount, int AColumnHint, CudaDeviceVariable<float> B, int BCount, int BColumnHint, CudaDeviceVariable<float> Result, int ResultCount, int ResultColumnHint, float beta = 1.0f)
{
    Result.Memset(BitConverter.ToUInt32(BitConverter.GetBytes(0.0f), 0));

    switch (operation)
    {
        case MatOperation.Multiplication: // vectors/matrices always have to be in the correct dimensions!
            if (BCount > 1 && ACount >= 1 && BColumnHint == 1 && ACount / AColumnHint > 1 && BCount / BColumnHint == AColumnHint) // A*vecB
            {
                MyCublasFactory.Instance.Gemv(Operation.Transpose, // transpose because it does Ax row-wise if x is a row vector :D
                    AColumnHint, ACount / AColumnHint, 1.0f,
                    A, AColumnHint,
                    B, 1,
                    beta, Result, 1);
            }
            else if (ACount >= 1 && BCount > 1 && ACount / AColumnHint == 1 && BColumnHint > 1 && BCount / BColumnHint == AColumnHint) // vecA*B
            {
                MyCublasFactory.Instance.Gemv(Operation.NonTranspose, // transpose because it does Ax row-wise if x is a row vector :D
                    BColumnHint, BCount / BColumnHint, 1.0f,
                    B, BColumnHint,
                    A, 1,
                    beta, Result, 1);
            }
            else if (ACount / AColumnHint == 1 && BColumnHint == 1 && ACount > 1 && BCount > 1) // trans(vecA) * vecB
            {
                Run(MatOperation.DotProd, A, ACount, AColumnHint, B, BCount, BColumnHint, Result, ResultCount, ResultColumnHint, beta);
            }
            else if (ACount != 1 || BCount != 1) // A*B matrix multiplication
            {
                // cuBLAS uses Fortran (column-major) matrices, so the operands have to be swapped,
                // as described in: http://peterwittek.com/cublas-matrix-c-style.html
                int m = BColumnHint;
                int n = ACount / AColumnHint;
                int k = AColumnHint;
                int lda = BColumnHint;
                int ldb = AColumnHint;
                int ldc = ResultColumnHint;

                MyCublasFactory.Instance.Gemm(Operation.NonTranspose, Operation.NonTranspose,
                    m, n, k, 1.0f,
                    B, lda,
                    A, ldb,
                    beta, Result, ldc);
            }
            break;
        case MatOperation.DotProd:
            if (ACount != BCount || ResultCount != 1)
            {
                MyLog.Writer.WriteLine(MyLogLevel.ERROR, callee.Name + ": Inconsistent vector dimensions for MyMatrixCublasOps.");
                break;
            }
            MyCublasFactory.Instance.Gemv(Operation.Transpose, // transpose because it does Ax row-wise if x is a row vector :D
                ACount, 1, 1.0f,
                A, ACount,
                B, 1,
                beta, Result, 1);
            break;
        default:
            MyLog.Writer.WriteLine(MyLogLevel.ERROR, "Trying to run cublas for undefined MatOperation");
            break;
    }
}
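Compared with Example 1, the matrix-multiplication branch here accounts for cuBLAS expecting column-major (Fortran-style) storage while these buffers are row-major: following the linked article, it computes C = A*B by passing the operands in swapped order (B first, then A) with leading dimensions taken from the row widths, so the result lands in Result in row-major layout without an explicit transpose.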
Example 3: Reset
protected override void Reset()
{
    base.Reset();

    isScreenClear = false;

    // Allocate the history
    m_HistoryDeviceBuffer = new CudaDeviceVariable<float>(m_Rows * m_Cols);
    m_HistoryDeviceBuffer.Memset(0);

    m_History = new List<string>();
    m_History.Add("");
}
Example 4: Generate
private void Generate(CudaKernel kernelPositionWeight, int width, int height, int depth)
{
    int count = width * height * depth;
    int widthD = width - 1;
    int heightD = height - 1;
    int depthD = depth - 1;
    int countDecremented = widthD * heightD * depthD;

    dim3 blockDimensions = new dim3(8, 8, 8);
    dim3 gridDimensions = new dim3((int)Math.Ceiling(width / 8.0), (int)Math.Ceiling(height / 8.0), (int)Math.Ceiling(depth / 8.0));
    dim3 gridDimensionsDecremented = new dim3((int)Math.Ceiling(widthD / 8.0), (int)Math.Ceiling(heightD / 8.0), (int)Math.Ceiling(depthD / 8.0));

    // Noise texture sampled by the voxel kernels.
    CUDANoiseCube noiseCube = new CUDANoiseCube();
    CudaArray3D noiseArray = noiseCube.GenerateUniformArray(16, 16, 16);
    CudaTextureArray3D noiseTexture = new CudaTextureArray3D(kernelPositionWeight, "noiseTexture", CUAddressMode.Wrap, CUFilterMode.Linear, CUTexRefSetFlags.NormalizedCoordinates, noiseArray);

    CudaDeviceVariable<Voxel> voxelsDev = new CudaDeviceVariable<Voxel>(count);

    kernelPositionWeight.BlockDimensions = blockDimensions;
    typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelPositionWeight, gridDimensions);
    kernelPositionWeight.Run(voxelsDev.DevicePointer, width, height, depth);

    kernelNormalAmbient.BlockDimensions = blockDimensions;
    typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelNormalAmbient, gridDimensions);
    kernelNormalAmbient.Run(voxelsDev.DevicePointer, width, height, depth, container.Settings.AmbientRayWidth, container.Settings.AmbientSamplesCount);

    int nearestW = NearestPowerOfTwo(widthD);
    int nearestH = NearestPowerOfTwo(heightD);
    int nearestD = NearestPowerOfTwo(depthD);
    int nearestCount = nearestW * nearestH * nearestD;

    // Per-cell triangle counts, cleared before the marching-cubes case kernel writes into them.
    CudaDeviceVariable<int> trisCountDevice = new CudaDeviceVariable<int>(nearestCount);
    trisCountDevice.Memset(0);
    CudaDeviceVariable<int> offsetsDev = new CudaDeviceVariable<int>(countDecremented);

    kernelMarchingCubesCases.BlockDimensions = blockDimensions;
    typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelMarchingCubesCases, gridDimensionsDecremented);
    kernelMarchingCubesCases.Run(voxelsDev.DevicePointer, width, height, depth, offsetsDev.DevicePointer, trisCountDevice.DevicePointer, nearestW, nearestH, nearestD);

    CudaDeviceVariable<int> prefixSumsDev = prefixScan.PrefixSumArray(trisCountDevice, nearestCount);

    int lastTrisCount = 0;
    trisCountDevice.CopyToHost(ref lastTrisCount, (nearestCount - 1) * sizeof(int));

    int lastPrefixSum = 0;
    prefixSumsDev.CopyToHost(ref lastPrefixSum, (nearestCount - 1) * sizeof(int));

    int totalVerticesCount = (lastTrisCount + lastPrefixSum) * 3;

    if (totalVerticesCount > 0)
    {
        if (container.Geometry != null)
            container.Geometry.Dispose();

        container.VertexCount = totalVerticesCount;

        container.Geometry = new Buffer(graphicsDevice, new BufferDescription()
        {
            BindFlags = BindFlags.VertexBuffer,
            CpuAccessFlags = CpuAccessFlags.None,
            OptionFlags = ResourceOptionFlags.None,
            SizeInBytes = Marshal.SizeOf(typeof(VoxelMeshVertex)) * totalVerticesCount,
            Usage = ResourceUsage.Default
        });

        // Write the triangle vertices directly into the mapped Direct3D vertex buffer.
        CudaDirectXInteropResource directResource = new CudaDirectXInteropResource(container.Geometry.ComPointer, CUGraphicsRegisterFlags.None, CudaContext.DirectXVersion.D3D11, CUGraphicsMapResourceFlags.None);

        kernelMarchingCubesVertices.BlockDimensions = blockDimensions;
        typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelMarchingCubesVertices, gridDimensionsDecremented);

        directResource.Map();
        kernelMarchingCubesVertices.Run(directResource.GetMappedPointer(), voxelsDev.DevicePointer, prefixSumsDev.DevicePointer, offsetsDev.DevicePointer, width, height, depth, nearestW, nearestH, nearestD);
        directResource.UnMap();
        directResource.Dispose();
    }
    else
    {
        container.VertexCount = 0;

        if (container.Geometry != null)
            container.Geometry.Dispose();
    }

    noiseCube.Dispose();
    prefixSumsDev.Dispose();
    trisCountDevice.Dispose();
    offsetsDev.Dispose();
    noiseArray.Dispose();
    noiseTexture.Dispose();
    voxelsDev.Dispose();
}
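Two details worth noting about the Memset call here: trisCountDevice is sized for the power-of-two padded volume (nearestW * nearestH * nearestD), while the marching-cubes case kernel is launched only over the decremented grid, so zeroing the buffer first presumably keeps the untouched padding entries from contributing garbage to the prefix sum. The host then reconstructs the total vertex count from the last per-cell triangle count plus the last prefix-sum entry (apparently an exclusive scan, since the final count is added back in), times three vertices per triangle.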
Example 5: Main
//......... part of the code is omitted here .........
    // create CUDA event handles
    // use blocking sync
    CudaEvent start_event, stop_event;
    CUEventFlags eventflags = ((device_sync_method == CUCtxFlags.BlockingSync) ? CUEventFlags.BlockingSync : CUEventFlags.Default);
    start_event = new CudaEvent(eventflags);
    stop_event = new CudaEvent(eventflags);

    // time memcopy from device
    start_event.Record();       // record in stream-0, to ensure that all previous CUDA calls have completed
    hAligned_a.AsyncCopyToDevice(d_a, streams[0].Stream);
    stop_event.Record();
    stop_event.Synchronize();   // block until the event is actually recorded
    time_memcpy = CudaEvent.ElapsedTime(start_event, stop_event);
    Console.Write("memcopy:\t{0:0.00}\n", time_memcpy);

    // time kernel
    threads = new dim3(512, 1);
    blocks = new dim3(n / (int)threads.x, 1);
    start_event.Record();
    init_array.BlockDimensions = threads;
    init_array.GridDimensions = blocks;
    init_array.RunAsync(streams[0].Stream, d_a.DevicePointer, d_c.DevicePointer, niterations);
    stop_event.Record();
    stop_event.Synchronize();
    time_kernel = CudaEvent.ElapsedTime(start_event, stop_event);
    Console.Write("kernel:\t\t{0:0.00}\n", time_kernel);

    //////////////////////////////////////////////////////////////////////
    // time non-streamed execution for reference
    threads = new dim3(512, 1);
    blocks = new dim3(n / (int)threads.x, 1);
    start_event.Record();
    for (int k = 0; k < nreps; k++)
    {
        init_array.BlockDimensions = threads;
        init_array.GridDimensions = blocks;
        init_array.Run(d_a.DevicePointer, d_c.DevicePointer, niterations);
        hAligned_a.SynchronCopyToHost(d_a);
    }
    stop_event.Record();
    stop_event.Synchronize();
    elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event);
    Console.Write("non-streamed:\t{0:0.00} ({1:00} expected)\n", elapsed_time / nreps, time_kernel + time_memcpy);

    //////////////////////////////////////////////////////////////////////
    // time execution with nstreams streams
    threads = new dim3(512, 1);
    blocks = new dim3(n / (int)(nstreams * threads.x), 1);
    byte[] memset = new byte[nbytes];   // set host memory bits to all 1s, for testing correctness
    for (int i = 0; i < nbytes; i++)
    {
        memset[i] = 255;
    }
    System.Runtime.InteropServices.Marshal.Copy(memset, 0, hAligned_a.PinnedHostPointer, nbytes);
    d_a.Memset(0);                      // set device memory to all 0s, for testing correctness
    start_event.Record();
    for (int k = 0; k < nreps; k++)
    {
        init_array.BlockDimensions = threads;
        init_array.GridDimensions = blocks;
        // asynchronously launch nstreams kernels, each operating on its own portion of data
        for (int i = 0; i < nstreams; i++)
            init_array.RunAsync(streams[i].Stream, d_a.DevicePointer + i * n / nstreams * sizeof(int), d_c.DevicePointer, niterations);
        // asynchronously launch nstreams memcopies. Note that memcopy in stream x will only
        // commence executing when all previous CUDA calls in stream x have completed
        for (int i = 0; i < nstreams; i++)
            hAligned_a.AsyncCopyFromDevice(d_a, i * n / nstreams * sizeof(int), i * n / nstreams * sizeof(int), nbytes / nstreams, streams[i].Stream);
    }
    stop_event.Record();
    stop_event.Synchronize();
    elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event);
    Console.Write("{0} streams:\t{1:0.00} ({2:0.00} expected with compute capability 1.1 or later)\n", nstreams, elapsed_time / nreps, time_kernel + time_memcpy / nstreams);

    // check whether the output is correct
    Console.Write("-------------------------------\n");
    // We can directly access the data in hAligned_a using the [] operator, but copying
    // it to h_a first is faster.
    System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, nbytes / sizeof(int));
    bool bResults = correct_data(h_a, n, c * nreps * niterations);

    // release resources
    for (int i = 0; i < nstreams; i++)
    {
        streams[i].Dispose();
    }
    start_event.Dispose();
    stop_event.Dispose();
    hAligned_a.Dispose();
    d_a.Dispose();
    d_c.Dispose();
    CudaContext.ProfilerStop();
    ctx.Dispose();

    ShrQATest.shrQAFinishExit(args, bResults ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED);
}
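Here Memset(0) clears the device buffer while the pinned host buffer is filled with 0xFF bytes, so the later correct_data check can tell whether the kernels and the streamed copies actually overwrote both sides. The same pattern semantics also allow non-zero fills; a small fragment (independent of the sample, assuming a CUDA context already exists as above):

// Memset writes the given 32-bit pattern into each element,
// so 0xFFFFFFFF fills an int buffer with -1.
CudaDeviceVariable<int> flags = new CudaDeviceVariable<int>(256);
flags.Memset(0xFFFFFFFF);

int[] host = new int[256];
flags.CopyToHost(host);   // every element is now -1
flags.Dispose();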
Example 6: Reset
protected override void Reset()
{
    TextureHeight = BIN_PIXEL_HEIGHT;
    TextureWidth = BIN_PIXEL_WIDTH * BINS;

    if (m_d_HistogramData != null)
    {
        m_d_HistogramData.Dispose();
    }

    m_d_HistogramData = new CudaDeviceVariable<int>(BINS);
    m_d_HistogramData.Memset(0);
}