本文整理汇总了C#中Context.CreateBuffer方法的典型用法代码示例。如果您正苦于以下问题：C# Context.CreateBuffer方法的具体用法？C# Context.CreateBuffer怎么用？C# Context.CreateBuffer使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类Context的用法示例。
在下文中一共展示了Context.CreateBuffer方法的10个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的C#代码示例。
示例1: TestHostToDeviceTransferPinned
/// <summary>
/// Measures host-to-device copy bandwidth when the host data lives in a buffer created with
/// <c>MemoryFlags.AllocateHostPointer</c>.
/// </summary>
/// <param name="context">Context used to allocate the host-side and device-side buffers.</param>
/// <param name="commandQueue">Queue on which every map, write and unmap command is enqueued.</param>
/// <param name="memSize">Number of bytes copied per iteration.</param>
/// <param name="accessMode">
/// <c>AccessMode.Direct</c> copies with <c>EnqueueWriteBuffer</c>; any other value maps the device
/// buffer and copies with <c>CopyMemory</c>.
/// </param>
/// <returns>The bandwidth figure in MB/s, computed with the 2x factor described in the body.</returns>
private double TestHostToDeviceTransferPinned(Context context, CommandQueue commandQueue, int memSize, AccessMode accessMode)
{
    // Create a host buffer
    using (Buffer pinnedData = context.CreateBuffer(MemoryFlags.ReadWrite | MemoryFlags.AllocateHostPointer, memSize))
    {
        // Get a mapped pointer. The disposable result of each Enqueue* call is released
        // immediately because nothing here waits on it.
        IntPtr h_data;
        using (commandQueue.EnqueueMapBuffer(pinnedData, true, MapFlags.Write, 0, memSize, out h_data))
        {
        }

        // Initialize with a repeating byte pattern (the low 8 bits of the index)
        for (int i = 0; i < memSize; i++)
        {
            Marshal.WriteByte(h_data, i, (byte)i);
        }

        // Unmap and make data in the host buffer valid
        using (commandQueue.EnqueueUnmapMemObject(pinnedData, h_data))
        {
        }

        // Allocate device memory
        using (Buffer deviceData = context.CreateBuffer(MemoryFlags.ReadWrite, memSize))
        {
            // Sync queue to host so that earlier commands are not included in the timing
            commandQueue.Finish();
            Stopwatch timer = Stopwatch.StartNew();
            if (accessMode == AccessMode.Direct)
            {
                using (commandQueue.EnqueueMapBuffer(pinnedData, true, MapFlags.Read, 0, memSize, out h_data))
                {
                }

                // DIRECT: API access to device buffer
                for (int i = 0; i < MemoryCopyIterations; i++)
                {
                    using (commandQueue.EnqueueWriteBuffer(deviceData, false, 0, memSize, h_data))
                    {
                    }
                }

                // The writes above are non-blocking; wait for all of them before stopping the clock
                commandQueue.Finish();
            }
            else
            {
                // MAPPED: mapped pointers to device buffer for conventional pointer access
                IntPtr dm_idata;
                using (commandQueue.EnqueueMapBuffer(deviceData, true, MapFlags.Write, 0, memSize, out dm_idata))
                {
                }

                using (commandQueue.EnqueueMapBuffer(pinnedData, true, MapFlags.Read, 0, memSize, out h_data))
                {
                }

                for (int i = 0; i < MemoryCopyIterations; i++)
                {
                    CopyMemory(dm_idata, h_data, (UIntPtr)memSize);
                }

                using (commandQueue.EnqueueUnmapMemObject(deviceData, dm_idata))
                {
                }
            }

            // Get the elapsed time in seconds
            double elapsedTimeInSeconds = timer.Elapsed.TotalSeconds;

            // Both branches leave pinnedData mapped for reading. Release that mapping (outside the
            // timed region) so the buffer is not disposed while a mapping is still outstanding.
            using (commandQueue.EnqueueUnmapMemObject(pinnedData, h_data))
            {
            }

            commandQueue.Finish();

            // Calculate bandwidth in MB/s
            // This is for kernels that read and write GMEM simultaneously
            // Obtained Throughput for unidirectional block copies will be 1/2 of this #
            // NOTE(review): a host-to-device copy looks unidirectional, so the 2x factor may
            // overstate the result; it is kept so callers see the same figures as before.
            double bandwidthInMBs = 2.0 * ((double)memSize * (double)MemoryCopyIterations) / (elapsedTimeInSeconds * (double)(1 << 20));
            return bandwidthInMBs;
        }
    }
}
示例2: TestDeviceToDeviceTransfer
/// <summary>
/// Times <c>MemoryCopyIterations</c> buffer-to-buffer copies of <paramref name="memorySize"/> bytes
/// and converts the elapsed time into a bandwidth figure.
/// </summary>
/// <param name="context">Context that allocates the two buffers; must not be null.</param>
/// <param name="commandQueue">Queue on which the upload and the copies are enqueued.</param>
/// <param name="memorySize">Size in bytes of each buffer and of every copy.</param>
/// <returns>Bandwidth in MB/s, counting each copied byte twice (one read plus one write).</returns>
/// <exception cref="ArgumentNullException"><paramref name="context"/> is null.</exception>
private double TestDeviceToDeviceTransfer(Context context, CommandQueue commandQueue, int memorySize)
{
    if (context == null)
    {
        throw new ArgumentNullException("context");
    }

    // Host staging array, every byte set to 0xFF.
    byte[] hostBytes = new byte[memorySize];
    for (int index = hostBytes.Length - 1; index >= 0; index--)
    {
        hostBytes[index] = 0xFF;
    }

    using (Buffer sourceBuffer = context.CreateBuffer(MemoryFlags.ReadOnly, memorySize))
    using (Buffer targetBuffer = context.CreateBuffer(MemoryFlags.WriteOnly, memorySize))
    {
        // Blocking upload of the staging array into the source buffer.
        unsafe
        {
            fixed (byte* hostPointer = hostBytes)
            {
                using (commandQueue.EnqueueWriteBuffer(sourceBuffer, true, 0, memorySize, (IntPtr)hostPointer))
                {
                }
            }
        }

        // Drain the queue so only the copies below are measured.
        commandQueue.Finish();
        Stopwatch stopwatch = Stopwatch.StartNew();

        int iteration = 0;
        while (iteration < MemoryCopyIterations)
        {
            using (commandQueue.EnqueueCopyBuffer(sourceBuffer, targetBuffer, 0, 0, memorySize))
            {
            }

            iteration++;
        }

        // Wait for every enqueued copy to finish before reading the clock.
        commandQueue.Finish();
        double seconds = stopwatch.Elapsed.TotalSeconds;

        // The factor 2.0 accounts for each byte being both read and written;
        // a one-directional block copy would report half of this number.
        double bytesCopied = (double)memorySize * (double)MemoryCopyIterations;
        double bytesPerMegabyte = (double)(1 << 20);
        return 2.0 * bytesCopied / (seconds * bytesPerMegabyte);
    }
}
示例3: TestHostToDeviceTransferPaged
/// <summary>
/// Measures host-to-device copy bandwidth when the host data is an ordinary managed byte array
/// that is fixed in place for the duration of the test.
/// </summary>
/// <param name="context">Context used to allocate the device buffer.</param>
/// <param name="commandQueue">Queue on which the write, map and unmap commands are enqueued.</param>
/// <param name="memSize">Number of bytes copied per iteration.</param>
/// <param name="accessMode">
/// <c>AccessMode.Direct</c> copies with <c>EnqueueWriteBuffer</c>; any other value maps the device
/// buffer and copies with <c>CopyMemory</c>.
/// </param>
/// <returns>The bandwidth figure in MB/s, computed with the 2x factor described in the body.</returns>
private unsafe double TestHostToDeviceTransferPaged(Context context, CommandQueue commandQueue, int memSize, AccessMode accessMode)
{
    // Standard host allocation, filled with a repeating byte pattern (low 8 bits of the index)
    byte[] data = new byte[memSize];
    for (int i = 0; i < data.Length; i++)
    {
        data[i] = (byte)i;
    }

    fixed (byte* pdata = data)
    {
        // Allocate device memory
        using (Buffer deviceData = context.CreateBuffer(MemoryFlags.ReadWrite, memSize))
        {
            // Sync queue to host so that earlier commands are not included in the timing
            commandQueue.Finish();
            Stopwatch timer = Stopwatch.StartNew();
            if (accessMode == AccessMode.Direct)
            {
                // DIRECT: API access to device buffer. The disposable result of each enqueue
                // call is released immediately because nothing here waits on it.
                for (int i = 0; i < MemoryCopyIterations; i++)
                {
                    using (commandQueue.EnqueueWriteBuffer(deviceData, false, 0, memSize, (IntPtr)pdata))
                    {
                    }
                }

                // The writes above are non-blocking; wait for all of them before stopping the clock
                commandQueue.Finish();
            }
            else
            {
                // MAPPED: mapped pointers to device buffer for conventional pointer access
                IntPtr dm_idata;
                using (commandQueue.EnqueueMapBuffer(deviceData, true, MapFlags.Write, 0, memSize, out dm_idata))
                {
                }

                for (int i = 0; i < MemoryCopyIterations; i++)
                {
                    CopyMemory(dm_idata, (IntPtr)pdata, (UIntPtr)memSize);
                }

                using (commandQueue.EnqueueUnmapMemObject(deviceData, dm_idata))
                {
                }
            }

            // Get the elapsed time in seconds
            double elapsedTimeInSeconds = timer.Elapsed.TotalSeconds;

            // Calculate bandwidth in MB/s
            // This is for kernels that read and write GMEM simultaneously
            // Obtained Throughput for unidirectional block copies will be 1/2 of this #
            // NOTE(review): a host-to-device copy looks unidirectional, so the 2x factor may
            // overstate the result; it is kept so callers see the same figures as before.
            double bandwidthInMBs = 2.0 * ((double)memSize * (double)MemoryCopyIterations) / (elapsedTimeInSeconds * (double)(1 << 20));
            return bandwidthInMBs;
        }
    }
}
示例4: ECCTest
/// <summary>
/// Runs the "Test" kernel of the "ecc-p256" OpenCL program on the first GPU device and prints
/// the achieved rate as "mul/s".
/// </summary>
static void ECCTest()
{
    // One big integer is 8 x 32-bit words; a point is three of them. Each input work item is a
    // point followed by one more big integer, each output work item is a point.
    const int BigIntBytes = 4 * 8;
    const int PointBytes = BigIntBytes * 3;
    const int InputWorkItemBytes = PointBytes + BigIntBytes;
    const int OutputWorkItemBytes = PointBytes;

    uint[] x1 = { 0x895aa032, 0x0d07522a, 0x506abf79, 0xabbc5c54, 0x1c2d6914, 0xb758abae, 0x914fa51b, 0xdfa23008 };
    uint[] y1 = { 0xefa18861, 0x602dfbbd, 0xe98d5b8c, 0xf884eb9e, 0x9898b025, 0x022e6bad, 0x31f238ee, 0x0bf40155 };
    uint[] z1 = { 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 };
    uint[] x2 = { 0xd6d7e35d, 0x2febd950, 0x2f987f4d, 0xb30482f7, 0x1164ce2e, 0xfce2b6ce, 0x12367d71, 0x15c1cdd1 };
    uint[] y2 = { 0xc1add051, 0x2dcfd682, 0x0d53b2d6, 0xbd9ad440, 0xad0f523b, 0x559ebb59, 0x45d34876, 0xdd307c87 };
    uint[] z2 = { 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 };
    uint[] x3 = { 0xb75c6254, 0x7b278510, 0xf45598f8, 0xdb81bb86, 0x4c48ee2b, 0x1dfe6ba4, 0xcbb54aa0, 0x616966b1 };
    uint[] y3 = { 0x356c3d49, 0x3c98aa53, 0xff99ca5b, 0x3d58a64f, 0xc0ac8b7e, 0x65168611, 0x0bb52f28, 0x9defd775 };
    uint[] z3 = { 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 };
    uint[] s = { 0x9b2d206a, 0x8a022706, 0x5ce5a47a, 0x9f363b87, 0xcac90283, 0x2004790d, 0x1f2e5787, 0xadeba125 };

    // Receive the first output point after the kernel has run.
    uint[] x = new uint[8];
    uint[] y = new uint[8];
    uint[] z = new uint[8];

    using (Context context = new Context (DeviceType.GPU))
    using (CommandQueue queue = context.CreateCommandQueue (context.Devices[0], CommandQueueProperties.Default))
    using (CLProgram prog = context.CreateProgram (OclCodeStore.GetOclCode ("ecc-p256"), context.Devices, null))
    {
        // Half of the first-dimension work-item limit per compute unit.
        int halfWorkItemLimit = (int)queue.Device.MaxWorkItemSizes[0] / 2;
        int workItems = (int)queue.Device.MaxComputeUnits * halfWorkItemLimit;

        // Halve the group size until it no longer exceeds the total number of work items.
        int groupSize = halfWorkItemLimit;
        for (; groupSize > workItems; groupSize >>= 1)
        {
        }

        using (Memory inMem = context.CreateBuffer (MemoryFlags.ReadOnly, InputWorkItemBytes * workItems))
        using (Memory outMem = context.CreateBuffer (MemoryFlags.WriteOnly, OutputWorkItemBytes * workItems))
        using (Kernel kernel = prog.CreateKernel ("Test"))
        {
            kernel.SetArgument (0, inMem);
            kernel.SetArgument (1, outMem);

            // Every work item gets the same point (x1, y1, z1) followed by s; the first word of s
            // is incremented after each item so that consecutive items receive different values.
            for (int item = 0, offset = 0; item < workItems; item++, offset += InputWorkItemBytes)
            {
                queue.WriteBuffer (inMem, offset, x1, 0, BigIntBytes);
                queue.WriteBuffer (inMem, offset + BigIntBytes, y1, 0, BigIntBytes);
                queue.WriteBuffer (inMem, offset + 2 * BigIntBytes, z1, 0, BigIntBytes);
                queue.WriteBuffer (inMem, offset + PointBytes, s, 0, BigIntBytes);
                s[0]++;
            }

            TimeSpan elapsed = Execute (null, 1, 0, () => {
                queue.Execute (kernel, 0, workItems, groupSize);
            });
            Console.WriteLine ("{0} mul/s", workItems / elapsed.TotalSeconds);

            // Fetch the three coordinates of the first result point.
            queue.ReadBuffer (outMem, 0, x, 0, BigIntBytes);
            queue.ReadBuffer (outMem, BigIntBytes, y, 0, BigIntBytes);
            queue.ReadBuffer (outMem, 2 * BigIntBytes, z, 0, BigIntBytes);
        }
    }

    // Disabled diagnostic dump of the result words:
    // for (int i = 0; i < 8; i ++)
    //     Console.WriteLine ("x[{0}]=0x{1:x8} y[{0}]=0x{2:x8} z[{0}]=0x{3:x8}", i, x[i], y[i], z[i]);
    // Console.WriteLine ("cmpl");
    // Console.ReadLine ();
}
示例5: SHATest
/// <summary>
/// Benchmark for the "core256" kernel of the "sha-256" OpenCL program on the first GPU device.
/// Times three phases through the Execute helper (uploading the input, running the kernel and
/// reading the state back) and reports the accumulated time with WriteTime.
/// </summary>
/// <param name="parallels">
/// NOTE(review): the incoming value is never read; it is overwritten from the device limits
/// before its first use.
/// </param>
static void SHATest(int parallels)
{
// Repetition counts passed to Execute for the memory phases and the kernel phase
// (presumably the number of timed runs; verify against Execute).
const int memTests = 100;
const int updateTests = 100;
TimeSpan total = TimeSpan.Zero;
//int parallels;
int blocks_per_instance = 1024;
byte[] input, state, state_ref;
using (Context context = new Context (DeviceType.GPU))
using (CommandQueue queue = context.CreateCommandQueue (context.Devices[0], CommandQueueProperties.Default)) {
// Half of the first-dimension work-item limit, multiplied by the number of compute units.
int local_size = (int)queue.Device.MaxWorkItemSizes[0] / 2;
parallels = local_size * (int)queue.Device.MaxComputeUnits;
input = new byte[SHA256.MessageSize * parallels * blocks_per_instance];
state = new byte[parallels * SHA256.StateSize];
// Only used by the disabled verification code at the end of the method.
state_ref = new byte[parallels * SHA256.StateSize];
new Random ().NextBytes (input);
// balance
// NOTE(review): parallels was just computed as local_size * MaxComputeUnits, so this branch
// looks unreachable whenever MaxComputeUnits >= 1; confirm before relying on it.
if (parallels < local_size) {
local_size = parallels / (int)queue.Device.MaxComputeUnits;
// Round up to the next power of two.
local_size = (int)Math.Pow (2, Math.Ceiling (Math.Log (local_size, 2)));
if (local_size <= 0)
local_size = 1;
}
using (CLProgram prog = context.CreateProgram (OclCodeStore.GetOclCode ("sha-256"), context.Devices, null))
using (Memory inMem = context.CreateBuffer (MemoryFlags.ReadOnly, input.Length))
using (Memory stateMem = context.CreateBuffer (MemoryFlags.ReadWrite, state.Length))
using (Memory constMem = context.CreateBuffer (MemoryFlags.ReadOnly, 4 * SHA256.Constants.Length))
using (Kernel kernel = prog.CreateKernel ("core256")) {
// Copy constant values
queue.WriteBuffer (constMem, 0, SHA256.Constants, 0, SHA256.Constants.Length * 4);
// Init State
// Every instance starts from its own copy of SHA256.InitialValues.
uint[] temp = new uint[parallels * SHA256.StateSize / 4];
for (int i = 0; i < parallels; i++) {
for (int j = 0; j < SHA256.InitialValues.Length; j++)
temp[i * SHA256.InitialValues.Length + j] = SHA256.InitialValues[j];
}
queue.WriteBuffer (stateMem, 0, temp, 0, temp.Length * 4);
int global_size = parallels;
// NOTE(review): max_local_size is initialised from local_size itself, so the second half of
// the loop condition below can never be true; only the global_size comparison has an effect.
int max_local_size = local_size;
while (local_size > global_size || local_size > max_local_size)
local_size >>= 1;
// Setup Kernel Arguments
kernel.SetArgument (0, inMem);
kernel.SetArgument (1, stateMem);
kernel.SetArgument (2, constMem);
// Argument 3 is sized like the constant table (presumably a local-memory copy of it; verify
// against the kernel source).
kernel.SetLocalDataShare (3, 4 * SHA256.Constants.Length);
kernel.SetArgument (4, blocks_per_instance, 4);
total += Execute ("write", memTests, input.Length, delegate () {
queue.WriteBuffer (inMem, 0, input, 0, input.Length);
});
total += Execute ("kernel", updateTests, input.Length, delegate () {
queue.Execute (kernel, 0, global_size, local_size);
});
total += Execute ("read", memTests, state.Length, delegate () {
queue.ReadBuffer (stateMem, 0, state, 0, state.Length);
});
WriteTime ("total", total, input.Length);
}
}
// Disabled verification: recomputes the states on the host and prints "err" on the first
// mismatching byte.
#if false
SHA256.InitState (state_ref);
for (int i = 0; i < blocks_per_instance; i ++) {
SHA256.Update (input, i * SHA256.MessageSize * parallels, SHA256.MessageSize * parallels, state_ref);
}
for (int i = 0; i < state.Length; i++) {
if (state[i] != state_ref[i]) {
Console.ForegroundColor = ConsoleColor.Red;
Console.WriteLine ("err");
Console.ForegroundColor = ConsoleColor.White;
break;
}
}
#endif
//Console.WriteLine ("cmpl");
//Console.ReadLine ();
}
示例6: CamelliaTest2
/// <summary>
/// Benchmark for the "bitslice_camellia" OpenCL program on the first GPU device. Uploads random
/// input, runs the bitslice, shuffle_state1, encrypt, shuffle_state2 and bitslice kernels in that
/// order on a single buffer, reads the buffer back, and then compares it with the output of
/// Camellia.Encrypt, printing up to 64 bytes starting at the first mismatch.
/// The tail of this method is not included in this listing.
/// </summary>
static void CamelliaTest2()
{
// Repetition counts passed to Execute (presumably the number of timed runs; verify against Execute).
const int memTests = 1;
const int encryptTests = 1;
const int ProcessUnitDataSize = 16 * 32; // 32bit-width bitslice
byte[] key = new byte[16];
byte[] input = new byte[ProcessUnitDataSize * 2 * 1024 * 64];
byte[] output = new byte[input.Length];
byte[] output_ref = new byte[input.Length];
uint[] keyTable;
new Random ().NextBytes (key);
new Random ().NextBytes (input);
// NOTE(review): keyTable is generated here but is not passed to any kernel in the visible
// part of this method; confirm how the kernels obtain the key.
Camellia.GenerateKeyTable (key, out keyTable);
TimeSpan total = TimeSpan.Zero;
using (Context context = new Context (DeviceType.GPU))
using (CommandQueue queue = context.CreateCommandQueue (context.Devices[0], CommandQueueProperties.Default))
using (CLProgram prog = context.CreateProgram (OclCodeStore.GetOclCode ("bitslice_camellia"), context.Devices, null))
using (Memory mem = context.CreateBuffer (MemoryFlags.ReadWrite, input.Length)) {
/*using (Memory keyMem = context.CreateBuffer (MemoryFlags.ReadWrite, expandedKey.Length * 32))
using (Memory nonsliceKeyMem = context.CreateBuffer (MemoryFlags.WriteOnly, expandedKey.Length))
using (Kernel kernel = prog.CreateKernel ("bitslice_key")) {
kernel.SetArgument (0, nonsliceKeyMem);
kernel.SetArgument (1, keyMem);
queue.WriteBuffer (nonsliceKeyMem, 0, expandedKey, 0, expandedKey.Length);
queue.Execute (kernel, 0, expandedKey.Length * 8 / 4, 8);
}*/
// Only half of the device's local memory is budgeted for the sizing below.
int localMemorySize = (int)(queue.Device.LocalMemSize / 2);
int maxWorkItemSize = (int)queue.Device.MaxWorkItemSizes[0];
// global/local size setting for encrypt kernel
int global_size = input.Length / ProcessUnitDataSize;
// Starts unbounded and is then clamped to the device limit and to global_size.
int local_size = int.MaxValue;// localMemorySize / 512;
local_size = Math.Min (local_size, maxWorkItemSize);
local_size = Math.Min (local_size, global_size);
// global/local size setting for bitslice kernel
int slice_global_size = input.Length / ProcessUnitDataSize * 32;
int slice_local_size = (localMemorySize / 512) * 32;
slice_local_size = Math.Min (slice_local_size, maxWorkItemSize);
slice_local_size = Math.Min (slice_local_size, slice_global_size);
total += Execute ("write", memTests, input.Length, delegate () {
queue.WriteBuffer (mem, 0, input, 0, input.Length);
});
using (Kernel kernel_encrypt = prog.CreateKernel ("encrypt"))
using (Kernel kernel_bitslice = prog.CreateKernel ("bitslice_kernel"))
using (Kernel kernel_shuffle1 = prog.CreateKernel ("shuffle_state1"))
using (Kernel kernel_shuffle2 = prog.CreateKernel ("shuffle_state2")) {
// All kernels operate in place on the same buffer; the bitslice and shuffle kernels also
// get a local-data argument of 512 bytes per 32 work items of a group.
kernel_bitslice.SetArgument (0, mem);
kernel_bitslice.SetLocalDataShare (1, 512 * slice_local_size / 32);
kernel_shuffle1.SetArgument (0, mem);
kernel_shuffle1.SetLocalDataShare (1, 512 * slice_local_size / 32);
kernel_shuffle2.SetArgument (0, mem);
kernel_shuffle2.SetLocalDataShare (1, 512 * slice_local_size / 32);
kernel_encrypt.SetArgument (0, mem);
//kernel_encrypt.SetLocalDataShare (1, 512 * local_size);
total += Execute ("kernel(bitslice)", encryptTests, input.Length, delegate () {
queue.Execute (kernel_bitslice, 0, slice_global_size, slice_local_size);
});
total += Execute ("kernel(shuffle)", encryptTests, input.Length, delegate () {
queue.Execute (kernel_shuffle1, 0, slice_global_size, slice_local_size);
});
total += Execute ("kernel(encrypt)", encryptTests, input.Length, delegate () {
queue.Execute (kernel_encrypt, 0, global_size, local_size);
});
total += Execute ("kernel(shuffle)", encryptTests, input.Length, delegate () {
queue.Execute (kernel_shuffle2, 0, slice_global_size, slice_local_size);
});
// The same bitslice kernel is run a second time for the step labelled "unbitslice".
total += Execute ("kernel(unbitslice)", encryptTests, input.Length, delegate () {
queue.Execute (kernel_bitslice, 0, slice_global_size, slice_local_size);
});
}
total += Execute ("read", memTests, input.Length, delegate () {
queue.ReadBuffer (mem, 0, output, 0, output.Length);
});
}
WriteTime ("total", total, input.Length);
// Host-side verification: stop at the first differing byte and dump up to 64 bytes from there,
// colouring mismatching rows red and restoring the original console colour afterwards.
#if true
Camellia.Encrypt (key, input, output_ref);
for (int i = 0; i < output.Length; i++)
if (output[i] != output_ref[i]) {
ConsoleColor defColor = Console.ForegroundColor;
Console.ForegroundColor = ConsoleColor.Red;
Console.WriteLine ("err");
Console.WriteLine (" expected | actual | input");
for (int k = i; k < i + 64 && k < output.Length; k ++) {
Console.ForegroundColor = (output[k] == output_ref[k] ? defColor : ConsoleColor.Red);
Console.WriteLine (" {0}: {1:x2} | {2:x2} | {3:x2}", k, output_ref[k], output[k], input[k]);
}
Console.ForegroundColor = defColor;
break;
}
#endif
Console.WriteLine ("cmpl");
//......... part of the code is omitted here .........
示例7: CamelliaTest
/// <summary>
/// Benchmark for the table-based "camellia" OpenCL program on the first GPU device. Uploads the
/// key table and the four S-box tables, then times the input upload, the "encrypt1" kernel and
/// the output read-back through the Execute helper, and reports the total with WriteTime.
/// Waits for a line of console input before returning.
/// </summary>
static void CamelliaTest()
{
    // Repetition counts passed to Execute (presumably the number of timed runs; verify against Execute).
    const int memTests = 1;
    const int encryptTests = 1;
    byte[] key = new byte[16];
    byte[] input = new byte[1024 * 1024 * 64];
    byte[] output = new byte[input.Length];
    // Only used by the disabled verification code at the end of the method.
    byte[] output_ref = new byte[input.Length];
    uint[] keyTable;

    // One generator for both arrays: two Random instances created back to back can receive
    // the same time-based seed, which would make the key equal to the first bytes of the input.
    Random random = new Random ();
    random.NextBytes (key);
    random.NextBytes (input);
    Camellia.GenerateKeyTable (key, out keyTable);

    using (Context context = new Context (DeviceType.GPU))
    using (CommandQueue queue = context.CreateCommandQueue (context.Devices[0], CommandQueueProperties.Default))
    using (CLProgram prog = context.CreateProgram (OclCodeStore.GetOclCode ("camellia"), context.Devices, null))
    using (Memory inMem = context.CreateBuffer (MemoryFlags.ReadOnly, input.Length))
    using (Memory outMem = context.CreateBuffer (MemoryFlags.WriteOnly, input.Length))
    using (Memory keyMem = context.CreateBuffer (MemoryFlags.ReadOnly, keyTable.Length * 4))
    using (Memory sbox1Mem = context.CreateBuffer (MemoryFlags.ReadOnly, Camellia.SBOX1_1110.Length * 4))
    using (Memory sbox2Mem = context.CreateBuffer (MemoryFlags.ReadOnly, Camellia.SBOX2_0222.Length * 4))
    using (Memory sbox3Mem = context.CreateBuffer (MemoryFlags.ReadOnly, Camellia.SBOX3_3033.Length * 4))
    using (Memory sbox4Mem = context.CreateBuffer (MemoryFlags.ReadOnly, Camellia.SBOX4_4404.Length * 4)) {
        TimeSpan total = TimeSpan.Zero;

        // Upload the lookup tables (4 bytes per uint element).
        queue.WriteBuffer (keyMem, 0, keyTable, 0, keyTable.Length * 4);
        queue.WriteBuffer (sbox1Mem, 0, Camellia.SBOX1_1110, 0, Camellia.SBOX1_1110.Length * 4);
        queue.WriteBuffer (sbox2Mem, 0, Camellia.SBOX2_0222, 0, Camellia.SBOX2_0222.Length * 4);
        queue.WriteBuffer (sbox3Mem, 0, Camellia.SBOX3_3033, 0, Camellia.SBOX3_3033.Length * 4);
        queue.WriteBuffer (sbox4Mem, 0, Camellia.SBOX4_4404, 0, Camellia.SBOX4_4404.Length * 4);

        // Write zeros to both data buffers before the timed phases.
        queue.WriteBuffer (inMem, 0, new byte[inMem.Size], 0, (int)inMem.Size);
        queue.WriteBuffer (outMem, 0, new byte[outMem.Size], 0, (int)outMem.Size);

        // The kernel name is "encrypt" followed by the mode number. With mode fixed at 1 the
        // mode-2 adjustment below is never taken.
        const int mode = 1;
        int local_loops = 4;
        if (mode == 2 && local_loops < 4) local_loops = 4;

        using (Kernel kernel = prog.CreateKernel ("encrypt" + mode.ToString ())) {
            kernel.SetArgument (0, inMem);
            kernel.SetArgument (1, outMem);
            kernel.SetArgument (2, keyMem);
            kernel.SetArgument (3, sbox1Mem);
            kernel.SetArgument (4, sbox2Mem);
            kernel.SetArgument (5, sbox3Mem);
            kernel.SetArgument (6, sbox4Mem);
            // Arguments 7-10 are sized like the four S-box buffers (presumably local-memory
            // copies of them; verify against the kernel source).
            kernel.SetLocalDataShare (7, (int)sbox1Mem.Size);
            kernel.SetLocalDataShare (8, (int)sbox2Mem.Size);
            kernel.SetLocalDataShare (9, (int)sbox3Mem.Size);
            kernel.SetLocalDataShare (10, (int)sbox4Mem.Size);
            // Number of 16-byte blocks in the input.
            kernel.SetArgument (11, (uint)(input.Length / 16), 4);

            total += Execute ("write", memTests, input.Length, delegate () {
                queue.WriteBuffer (inMem, 0, input, 0, input.Length);
            });

            // One work item per local_loops blocks; halve the group size until it fits.
            int global_size = input.Length / 16 / local_loops;
            int local_size = (int)queue.Device.MaxWorkItemSizes[0];
            while (local_size > global_size)
                local_size >>= 1;

            total += Execute ("kernel(encrypt)", encryptTests, input.Length, delegate () {
                queue.Execute (kernel, 0, global_size, local_size);
            });
        }

        total += Execute ("read", memTests, input.Length, delegate () {
            queue.ReadBuffer (outMem, 0, output, 0, output.Length);
        });
        WriteTime ("total", total, input.Length);
    }

    // Disabled verification against the host implementation. When enabled it restores the
    // caller's console colour instead of forcing white.
#if false
    Camellia.Encrypt (key, input, output_ref);
    for (int i = 0; i < output.Length; i++)
        if (output[i] != output_ref[i]) {
            ConsoleColor defColor = Console.ForegroundColor;
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine ("err");
            Console.ForegroundColor = defColor;
            break;
        }
#endif
    Console.WriteLine ("cmpl");
    Console.ReadLine ();
}
示例8: AESTest2
/// <summary>
/// Benchmark for the bitsliced AES OpenCL program ("bitslice_aes4" when private_memory_mode is
/// true, otherwise "bitslice_aes3") on the first GPU device. Converts the expanded key with the
/// "bitslice_key" kernel, then times the input upload, the bitslice / encrypt / bitslice kernel
/// sequence and the read-back through the Execute helper. Finally compares the result with
/// AES.Encrypt and prints "err" on the first mismatching byte.
/// Waits for a line of console input before returning.
/// </summary>
static void AESTest2()
{
    // Repetition counts passed to Execute (presumably the number of timed runs; verify against Execute).
    const int memTests = 1;
    const int encryptTests = 1;
    const int ProcessUnitDataSize = 16 * 32; // 32bit-width bitslice
    byte[] key = new byte[16];
    byte[] input = new byte[ProcessUnitDataSize * 2 * 1024 * 64];
    byte[] output = new byte[input.Length];
    byte[] output_ref = new byte[input.Length];
    byte[] expandedKey;

    // One generator for both arrays: two Random instances created back to back can receive
    // the same time-based seed, which would make the key equal to the first bytes of the input.
    Random random = new Random ();
    random.NextBytes (key);
    random.NextBytes (input);
    AES.KeyExpansion (key, out expandedKey);

    TimeSpan total = TimeSpan.Zero;
    bool private_memory_mode = true;
    using (Context context = new Context (DeviceType.GPU))
    using (CommandQueue queue = context.CreateCommandQueue (context.Devices[0], CommandQueueProperties.Default))
    using (CLProgram prog = context.CreateProgram (OclCodeStore.GetOclCode (private_memory_mode ? "bitslice_aes4" : "bitslice_aes3"), context.Devices, null))
    using (Memory mem = context.CreateBuffer (MemoryFlags.ReadWrite, input.Length))
    using (Memory keyMem = context.CreateBuffer (MemoryFlags.ReadWrite, expandedKey.Length * 32)) {
        // Key setup: upload the expanded key and run "bitslice_key" to fill keyMem.
        // NOTE(review): nonsliceKeyMem is created WriteOnly although the host writes the key
        // into it for the kernel; confirm against the kernel source whether ReadOnly is intended.
        using (Memory nonsliceKeyMem = context.CreateBuffer (MemoryFlags.WriteOnly, expandedKey.Length))
        using (Kernel kernel = prog.CreateKernel ("bitslice_key")) {
            kernel.SetArgument (0, nonsliceKeyMem);
            kernel.SetArgument (1, keyMem);
            queue.WriteBuffer (nonsliceKeyMem, 0, expandedKey, 0, expandedKey.Length);
            queue.Execute (kernel, 0, expandedKey.Length * 8 / 4, 8);
        }

        // Only half of the device's local memory is budgeted for the sizing below.
        int localMemorySize = (int)(queue.Device.LocalMemSize / 2);
        int maxWorkItemSize = (int)queue.Device.MaxWorkItemSizes[0];

        // global/local size setting for encrypt kernel
        int global_size = (private_memory_mode ? input.Length / ProcessUnitDataSize : input.Length / ProcessUnitDataSize * 4);
        int local_size = (private_memory_mode ? int.MaxValue : (localMemorySize / 512) * 4);
        local_size = Math.Min (local_size, maxWorkItemSize);
        local_size = Math.Min (local_size, global_size);

        // global/local size setting for bitslice kernel
        int slice_global_size = input.Length / ProcessUnitDataSize * 32;
        int slice_local_size = (localMemorySize / 512) * 32;
        slice_local_size = Math.Min (slice_local_size, maxWorkItemSize);
        slice_local_size = Math.Min (slice_local_size, slice_global_size);

        total += Execute ("write", memTests, input.Length, delegate () {
            queue.WriteBuffer (mem, 0, input, 0, input.Length);
        });

        using (Kernel kernel_encrypt = prog.CreateKernel ("encrypt"))
        using (Kernel kernel_bitslice = prog.CreateKernel ("bitslice_kernel")) {
            kernel_bitslice.SetArgument (0, mem);
            kernel_bitslice.SetLocalDataShare (1, 512 * slice_local_size / 32);
            kernel_encrypt.SetArgument (0, mem);
            kernel_encrypt.SetArgument (1, keyMem);
            // The encrypt kernel only takes a local-data argument outside private-memory mode.
            if (!private_memory_mode)
                kernel_encrypt.SetLocalDataShare (2, 512 * local_size / 4);

            total += Execute ("kernel(bitslice)", encryptTests, input.Length, delegate () {
                queue.Execute (kernel_bitslice, 0, slice_global_size, slice_local_size);
            });
            total += Execute ("kernel(encrypt)", encryptTests, input.Length, delegate () {
                queue.Execute (kernel_encrypt, 0, global_size, local_size);
            });
            // The same bitslice kernel is run a second time for the step labelled "unbitslice".
            total += Execute ("kernel(unbitslice)", encryptTests, input.Length, delegate () {
                queue.Execute (kernel_bitslice, 0, slice_global_size, slice_local_size);
            });
        }

        total += Execute ("read", memTests, input.Length, delegate () {
            queue.ReadBuffer (mem, 0, output, 0, output.Length);
        });
    }
    WriteTime ("total", total, input.Length);

    // Host-side verification: report the first differing byte, then restore the caller's
    // console colour instead of forcing white.
#if true
    AES.Encrypt (key, input, output_ref);
    for (int i = 0; i < output.Length; i++)
        if (output[i] != output_ref[i]) {
            ConsoleColor defColor = Console.ForegroundColor;
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine ("err");
            Console.ForegroundColor = defColor;
            break;
        }
#endif
    Console.WriteLine ("cmpl");
    Console.ReadLine ();
}
示例9: AESTest
/// <summary>
/// Benchmark scaffold for several AES OpenCL program variants on the first GPU device. The
/// variant is selected at compile time with preprocessor conditions; both variants visible in
/// this listing ("bitslice_aes2" and "bitslice_aes") are disabled, and the remainder of the
/// method is not included in this listing.
/// </summary>
static void AESTest()
{
// Repetition counts passed to Execute (presumably the number of timed runs; verify against Execute).
const int memTests = 1;
const int encryptTests = 1;
byte[] key = new byte[16];
byte[] input = new byte[1024 * 1024 * 64];
byte[] output = new byte[input.Length];
byte[] output_ref = new byte[input.Length];
byte[] expandedKey;
new Random ().NextBytes (key);
new Random ().NextBytes (input);
AES.KeyExpansion (key, out expandedKey);
TimeSpan total = TimeSpan.Zero;
using (Context context = new Context (DeviceType.GPU))
using (CommandQueue queue = context.CreateCommandQueue (context.Devices[0], CommandQueueProperties.Default))
// Variant 1 (disabled): "bitslice_aes2", working in place on a single buffer.
#if false
using (CLProgram prog = context.CreateProgram (OclCodeStore.GetOclCode ("bitslice_aes2"), context.Devices, null))
using (Memory mem = context.CreateBuffer (MemoryFlags.ReadWrite, input.Length))
using (Memory keyMem = context.CreateBuffer (MemoryFlags.ReadWrite, expandedKey.Length * 32)) {
// Key setup: upload the expanded key and run "bitslice_key" to fill keyMem.
using (Memory nonsliceKeyMem = context.CreateBuffer (MemoryFlags.WriteOnly, expandedKey.Length))
using (Kernel kernel = prog.CreateKernel ("bitslice_key")) {
kernel.SetArgument (0, nonsliceKeyMem);
kernel.SetArgument (1, keyMem);
queue.WriteBuffer (nonsliceKeyMem, 0, expandedKey, 0, expandedKey.Length);
queue.Execute (kernel, 0, expandedKey.Length * 8 / 4, 8);
}
// One work item per 16 * 32 input bytes; the group size is derived from half of the local
// memory at 512 bytes per work item, then halved until it fits the global size.
int global_size = input.Length / (16 * 32);
int local_size = (int)queue.Device.LocalMemSize / 512 / 2;
while (local_size > global_size)
local_size >>= 1;
total += Execute ("write", memTests, input.Length, delegate () {
queue.WriteBuffer (mem, 0, input, 0, input.Length);
});
// Sub-variant (disabled): single "encrypt1" kernel.
#if false
using (Kernel kernel_encrypt = prog.CreateKernel ("encrypt1")) {
kernel_encrypt.SetArgument (0, mem);
kernel_encrypt.SetArgument (1, keyMem);
kernel_encrypt.SetLocalDataShare (2, 512 * local_size);
total += Execute ("kernel(encrypt)", encryptTests, input.Length, delegate () {
queue.Execute (kernel_encrypt, 0, global_size, local_size);
});
}
#else
// Sub-variant: bitslice, "encrypt2", then bitslice again.
using (Kernel kernel_encrypt2 = prog.CreateKernel ("encrypt2"))
using (Kernel kernel_bitslice = prog.CreateKernel ("bitslice_kernel")) {
kernel_bitslice.SetArgument (0, mem);
kernel_encrypt2.SetArgument (0, mem);
kernel_encrypt2.SetArgument (1, keyMem);
kernel_encrypt2.SetLocalDataShare (2, 512 * local_size);
// Each of the three steps is timed separately.
#if true
total += Execute ("kernel(bitslice)", encryptTests, input.Length, delegate () {
queue.Execute (kernel_bitslice, 0, global_size, local_size);
});
total += Execute ("kernel(encrypt2)", encryptTests, input.Length, delegate () {
queue.Execute (kernel_encrypt2, 0, global_size, local_size);
});
total += Execute ("kernel(unbitslice)", encryptTests, input.Length, delegate () {
queue.Execute (kernel_bitslice, 0, global_size, local_size);
});
#else
// Alternative: the three steps are chained through event handles and timed as one unit.
total += Execute ("kernel", encryptTests, input.Length, delegate () {
EventHandle bitslice_wait, encrypt_wait;
queue.ExecuteAsync (kernel_bitslice, 0, global_size, local_size, out bitslice_wait);
queue.ExecuteAsync (kernel_encrypt2, 0, global_size, local_size, new EventHandle[] { bitslice_wait }, out encrypt_wait);
queue.Execute (kernel_bitslice, 0, global_size, local_size, new EventHandle[] { encrypt_wait });
});
#endif
}
#endif
total += Execute ("read", memTests, input.Length, delegate () {
queue.ReadBuffer (mem, 0, output, 0, output.Length);
});
}
// Variant 2 (disabled): "bitslice_aes" with fixed group sizes of 128.
#elif false
using (CLProgram prog = context.CreateProgram (OclCodeStore.GetOclCode ("bitslice_aes"), context.Devices, null))
using (Memory inMem = context.CreateBuffer (MemoryFlags.ReadWrite, input.Length))
using (Memory keyMem = context.CreateBuffer (MemoryFlags.ReadWrite, expandedKey.Length * 32)) {
using (Memory nonsliceKeyMem = context.CreateBuffer (MemoryFlags.ReadOnly, expandedKey.Length))
using (Kernel kernel = prog.CreateKernel ("bitslice_key")) {
kernel.SetArgument (0, nonsliceKeyMem);
kernel.SetArgument (1, keyMem);
queue.WriteBuffer (nonsliceKeyMem, 0, expandedKey, 0, expandedKey.Length);
queue.Execute (kernel, 0, expandedKey.Length * 8, 128);
}
int global_size = input.Length / 16;
int local_size = 128;
total += Execute ("write", memTests, input.Length, delegate () {
queue.WriteBuffer (inMem, 0, input, 0, input.Length);
});
using (Kernel kernel = prog.CreateKernel ("encrypt")) {
//......... part of the code is omitted here .........
示例10: ExecuteKernel
/// <summary>
/// Runs <paramref name="kernel"/> once (optionally preceded by an untimed warm-up run) with
/// <paramref name="input"/> and <paramref name="output"/> bound as arguments 0 and 1, and
/// reports how long the execution and the result read-back took.
/// The tail of this method is not included in this listing.
/// </summary>
/// <param name="context">Context used to create the input and output buffers.</param>
/// <param name="device">Device queried for the kernel's maximum work-group size.</param>
/// <param name="commandQueue">Queue on which the kernel and the transfers are enqueued.</param>
/// <param name="kernel">Kernel to execute.</param>
/// <param name="input">Input data; its length determines the size of both buffers.</param>
/// <param name="output">Array backing (or receiving) the output buffer contents.</param>
/// <param name="globalWorkSize">Global work size for the one-dimensional launch.</param>
/// <param name="localWorkSize">Local work size; ignored when <paramref name="autoGroupSize"/> is true.</param>
/// <param name="warming">When true, the kernel is run once before the timed execution.</param>
/// <param name="useHostPointer">
/// When true the buffers are created with <c>UseHostPointer</c> and the result is obtained by
/// mapping; otherwise <c>CopyHostPointer</c> is used and the result is read back explicitly.
/// </param>
/// <param name="autoGroupSize">When true, no local work size is passed to the launch.</param>
/// <param name="enableProfiling">When true, <paramref name="profiledTime"/> is taken from the event's timestamps.</param>
/// <param name="stopwatchTime">Wall-clock time from enqueueing the kernel until its event completed.</param>
/// <param name="profiledTime">Time derived from the event's start/end timestamps, or zero when profiling is off.</param>
/// <param name="readTime">Wall-clock time spent obtaining the results.</param>
private unsafe void ExecuteKernel(
Context context,
Device device,
CommandQueue commandQueue,
Kernel kernel,
float[] input,
float[] output,
int globalWorkSize,
int localWorkSize,
bool warming,
bool useHostPointer,
bool autoGroupSize,
bool enableProfiling,
out TimeSpan stopwatchTime,
out TimeSpan profiledTime,
out TimeSpan readTime)
{
MemoryFlags inFlags = (useHostPointer ? MemoryFlags.UseHostPointer : MemoryFlags.CopyHostPointer) | MemoryFlags.ReadOnly;
MemoryFlags outFlags = (useHostPointer ? MemoryFlags.UseHostPointer : MemoryFlags.CopyHostPointer) | MemoryFlags.ReadWrite;
int taskSize = input.Length;
// allocate buffers
// Both arrays stay fixed while the buffers created from their addresses are alive.
// NOTE(review): the output buffer is sized from input.Length; this assumes output is at
// least as long as input — confirm at the call sites.
fixed (float* pinput = input, poutput = output)
{
using (Buffer inputBuffer = context.CreateBuffer(inFlags, sizeof(float) * taskSize, (IntPtr)pinput),
outputBuffer = context.CreateBuffer(outFlags, sizeof(float) * taskSize, (IntPtr)poutput))
{
kernel.Arguments[0].SetValue(inputBuffer);
kernel.Arguments[1].SetValue(outputBuffer);
Console.WriteLine("Original global work size {0}", globalWorkSize);
Console.WriteLine("Original local work size {0}", localWorkSize);
if (autoGroupSize)
{
Console.WriteLine("Run-time determines optimal workgroup size");
}
IntPtr workGroupSizeMaximum = kernel.GetWorkGroupSize(device);
Console.WriteLine("Maximum workgroup size for this kernel {0}", workGroupSizeMaximum.ToInt64());
if (warming)
{
// Untimed run; the returned event is disposed immediately and the queue is drained.
Console.Write("Warming up OpenCL execution...");
using (commandQueue.EnqueueNDRangeKernel(kernel, new[] { (IntPtr)globalWorkSize }, autoGroupSize ? null : new[] { (IntPtr)localWorkSize }))
{
}
commandQueue.Finish();
Console.WriteLine("Done");
}
Console.Write("Executing OpenCL kernel...");
Stopwatch timer = Stopwatch.StartNew();
// Execute the kernel; when autoGroupSize is set, null is passed as the local work size.
using (Event perfEvent = commandQueue.EnqueueNDRangeKernel(kernel, new[] { (IntPtr)globalWorkSize }, autoGroupSize ? null : new[] { (IntPtr)localWorkSize }))
{
Event.WaitAll(perfEvent);
stopwatchTime = timer.Elapsed;
Console.WriteLine("Done");
if (enableProfiling)
{
ulong start = perfEvent.CommandStartTime;
ulong end = perfEvent.CommandEndTime;
// a tick is 100ns
// (the division assumes the event timestamps are in nanoseconds — TODO confirm)
profiledTime = TimeSpan.FromTicks((long)(end - start) / 100);
}
else
{
profiledTime = TimeSpan.Zero;
}
}
// Time the retrieval of the results separately from the kernel execution.
timer.Restart();
if (useHostPointer)
{
// Map the output buffer for reading, check that the mapping is the original array
// address, then unmap it again.
IntPtr tmpPtr;
using (commandQueue.EnqueueMapBuffer(outputBuffer, true, MapFlags.Read, 0, sizeof(float) * taskSize, out tmpPtr))
{
}
Assert.AreEqual((IntPtr)poutput, tmpPtr, "EnqueueMapBuffer failed to return original pointer");
using (commandQueue.EnqueueUnmapMemObject(outputBuffer, tmpPtr))
{
}
}
else
{
// Blocking read of the output buffer into the output array.
using (commandQueue.EnqueueReadBuffer(outputBuffer, true, 0, sizeof(float) * taskSize, (IntPtr)poutput))
{
}
}
commandQueue.Finish();
readTime = timer.Elapsed;
}
//......... part of the code is omitted here .........