本文整理汇总了Python中plan.Plan.Ns方法的典型用法代码示例。如果您正苦于以下问题:Python Plan.Ns方法的具体用法?Python Plan.Ns怎么用?Python Plan.Ns使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类plan.Plan
的用法示例。
在下文中一共展示了Plan.Ns方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: plan_parallel_ragged_gather_gemv2
# 需要导入模块: from plan import Plan [as 别名]
# 或者: from plan.Plan import Ns [as 别名]
def plan_parallel_ragged_gather_gemv2(queue, Ms, Ns, alpha, A, A_js, X, X_js,
                                      beta, Y, group_size = 32, Y_in=None, tag=None):
    """Build an OpenCL kernel for a work-group-parallel ragged gather-GEMV.

    For each output row ``yi`` the kernel computes
    ``Y[yi] = beta[yi] * Y_in[yi] + (reduction of dot(A[a_ji], X[x_ji]))``
    over the index lists ``A_js[yi]`` / ``X_js[yi]``, using one work-group of
    ``group_size`` work-items per output and a local-memory tree reduction of
    the partial dot products.

    NOTE(review): this listing is truncated by the source site (the
    "部分代码省略" marker below) — the end of the kernel string, the final
    scaling by alpha, and the Plan construction/return are not visible here,
    so behavior past the start of the reduction loop is undocumented.

    Parameters (as far as the visible code shows):
      queue      -- OpenCL command queue; also supplies the context.
      Ms, Ns     -- ragged row/column lengths; only Ns is used in the visible code.
      alpha, beta-- scalar or per-row coefficients (scalars are broadcast below).
      A, X, Y    -- ragged device arrays with .cl_buf/.buf and .ocldtype attributes.
      A_js, X_js -- per-output index lists selecting which A rows dot which X rows.
      group_size -- work-items per work-group (local reduction width).
      Y_in       -- optional separate input Y; defaults to Y (in-place update).
    """
    # TODO: if alpha or beta is a float
    # then render it into the kernel text.
    # Broadcast a scalar alpha to one coefficient per output row; a sequence
    # raises TypeError on float() and is passed through unchanged.
    try:
        float(alpha)
        alpha = [alpha] * len(Y)
    except TypeError:
        pass
    # Same broadcast for beta.
    try:
        float(beta)
        beta = [beta] * len(Y)
    except TypeError:
        pass
    # Device copies of the per-row coefficients, stored in Y's dtype.
    cl_alpha = to_device(queue, np.asarray(alpha, Y.buf.dtype))
    cl_beta = to_device(queue, np.asarray(beta, Y.buf.dtype))
    if Y_in is None:
        Y_in = Y
    # XXX check for e.g. all Ns being the same thing
    # especially all Ns == 1
    cl_Ns = to_device(queue, np.asarray(Ns, 'int32'))
    # XXX check that all the ints are ints not longs
    # Values substituted into the Mako kernel template below.
    textconf = {
        'type_alpha': cl_alpha.ocldtype,
        'type_beta': cl_beta.ocldtype,
        'type_A': A.cl_buf.ocldtype,
        'type_X': X.cl_buf.ocldtype,
        'type_Y': Y.cl_buf.ocldtype,
        'y_len': len(Y),
        'lsize': group_size,
    }
    text = """
__kernel void fn(
const __global int *Ns,
const __global ${type_alpha} * alphas,
const __global int *A_starts,
const __global ${type_A} *A_data,
const __global int *A_js_starts,
const __global int *A_js_lens,
const __global int *A_js_data,
const __global int *X_starts,
const __global ${type_X} *X_data,
const __global int *X_js_starts,
const __global int *X_js_data,
const __global ${type_beta} * betas,
const __global int *Y_in_starts,
const __global ${type_Y} *Y_in_data,
const __global int *Y_starts,
const __global int *Y_lens,
__global ${type_Y} *Y_data)
{
//const int mm = get_global_id(1); //TODO
__local ${type_Y} partialDotProduct[${lsize}]; //Scratch space for the dot products
//Y is divided into groups of size group_size. Each work-item does enough dot-products to cover one of the groups
for (uint yi = get_group_id(0); yi < ${y_len}; yi += get_num_groups(0)) {
const __global int* X_js_row = X_js_data + X_js_starts[yi];
const __global int* A_js_row = A_js_data + A_js_starts[yi];
const ${type_alpha} alpha = alphas[yi];
const ${type_beta} beta = betas[yi];
int y_offset = Y_starts[yi];
int y_in_offset = Y_in_starts[yi];
Y_data[y_offset] = beta * Y_in_data[y_in_offset];
float sum = 0;
int n_dot_products = A_js_lens[yi]; //Do all of xjs dot products at same time
for(int j = 0; j < n_dot_products; j++) {
int x_ji = X_js_row[j];
int a_ji = A_js_row[j];
int N_i = Ns[a_ji];
const __global ${type_A}* A_row = A_data + A_starts[a_ji]; //Get the rows for the product
const __global ${type_X}* X_row = X_data + X_starts[x_ji];
//Each work item will do some fraction of the multiplications and store the result locally
for (uint x = get_local_id(0); x < N_i; x += get_local_size(0)) {
sum += A_row[x] * X_row[x];
}
}
partialDotProduct[get_local_id(0)] = sum;
//Parallel reduction of locally stored sums
for (uint stride = 1; stride < get_local_size(0); stride *= 2) {
barrier(CLK_LOCAL_MEM_FENCE);
#.........这里部分代码省略.........
示例2: plan_ragged_gather_gemv
# 需要导入模块: from plan import Plan [as 别名]
# 或者: from plan.Plan import Ns [as 别名]
def plan_ragged_gather_gemv(queue, Ms, Ns, alpha, A, A_js, X, X_js,
                            beta, Y, Y_in=None, tag=None):
    """Build an OpenCL kernel for a ragged gather-GEMV, one work-item per element.

    The kernel launches a 2-D grid: ``bb = get_global_id(1)`` indexes the
    output rows and ``mm = get_global_id(0)`` indexes elements within a row
    (guarded by ``mm < Y_lens[bb]``). Each work-item computes
    ``Y[bb][mm] = beta[bb] * Y_in[bb][mm]
                  + alpha[bb] * sum_j dot(X[X_js[bb][j]], A[A_js[bb][j]][mm, :])``
    with a sequential inner loop over each ragged dot product.

    NOTE(review): this listing is truncated by the source site (the
    "部分代码省略" marker below) — the Plan construction and return statement
    are not visible here.

    Parameters (as far as the visible code shows):
      queue      -- OpenCL command queue; also supplies the context.
      Ms, Ns     -- ragged row/column lengths; Ms sizes the global grid, Ns
                    bounds each dot product.
      alpha, beta-- scalar or per-row coefficients (scalars are broadcast below).
      A, X, Y    -- ragged device arrays with .cl_buf/.buf and .ocldtype attributes.
      A_js, X_js -- per-output index lists selecting which A rows dot which X rows.
      Y_in       -- optional separate input Y; defaults to Y (in-place update).
    """
    # TODO: if alpha or beta is a float
    # then render it into the kernel text.
    # Broadcast a scalar alpha to one coefficient per output row; a sequence
    # raises TypeError on float() and is passed through unchanged.
    try:
        float(alpha)
        alpha = [alpha] * len(Y)
    except TypeError:
        pass
    # Same broadcast for beta.
    try:
        float(beta)
        beta = [beta] * len(Y)
    except TypeError:
        pass
    # Device copies of the per-row coefficients, stored in Y's dtype.
    cl_alpha = to_device(queue, np.asarray(alpha, Y.buf.dtype))
    cl_beta = to_device(queue, np.asarray(beta, Y.buf.dtype))
    if Y_in is None:
        Y_in = Y
    # XXX check for e.g. all Ns being the same thing
    # especially all Ns == 1
    cl_Ns = to_device(queue, np.asarray(Ns, 'int32'))
    # XXX check that all the ints are ints not longs
    # Values substituted into the Mako kernel template below.
    textconf = {
        'type_alpha': cl_alpha.ocldtype,
        'type_beta': cl_beta.ocldtype,
        'type_A': A.cl_buf.ocldtype,
        'type_X': X.cl_buf.ocldtype,
        'type_Y': Y.cl_buf.ocldtype,
    }
    text = """
__kernel void fn(
__global int *Ns,
__global ${type_alpha} * alphas,
__global int *A_starts,
__global ${type_A} *A_data,
__global int *A_js_starts,
__global int *A_js_lens,
__global int *A_js_data,
__global int *X_starts,
__global ${type_X} *X_data,
__global int *X_js_starts,
__global int *X_js_data,
__global ${type_beta} * betas,
__global int *Y_in_starts,
__global ${type_Y} *Y_in_data,
__global int *Y_starts,
__global int *Y_lens,
__global ${type_Y} *Y_data)
{
const int mm = get_global_id(0);
const int bb = get_global_id(1);
const int M = Y_lens[bb];
if (mm < M)
{
const ${type_alpha} alpha = alphas[bb];
const ${type_beta} beta = betas[bb];
int n_dot_products = A_js_lens[bb];
int y_offset = Y_starts[bb];
int y_in_offset = Y_in_starts[bb];
X_js_data += X_js_starts[bb];
A_js_data += A_js_starts[bb];
Y_data[y_offset + mm] = beta * Y_in_data[y_in_offset + mm];
for (int ii = 0; ii < n_dot_products; ++ii)
{
int x_ji = X_js_data[ii];
int a_ji = A_js_data[ii];
int N_i = Ns[a_ji];
int x_offset = X_starts[x_ji];
int a_offset = A_starts[a_ji];
// compute the matrix-vector product
// dot(X[x_ji], A[a_ji])
${type_Y} y_sum = 0;
for (int nn = 0; nn < N_i; ++nn) //Parallel reduction. How big is N_i?
{
y_sum += X_data[x_offset + nn]
* A_data[a_offset + nn * M + mm];
}
Y_data[y_offset + mm] += alpha * y_sum;
}
}
}
"""
    # Render the Mako template, filling in the OpenCL element types.
    text = Template(text, output_encoding='ascii').render(**textconf)
    # Global size: max row length x number of outputs (matches mm/bb above);
    # let the implementation pick the work-group size.
    gsize = (int(max(Ms)), int(len(Y)),)
    lsize = None
    # Compile the kernel and fetch the entry point.
    _fn = cl.Program(queue.context, text).build().fn
#.........这里部分代码省略.........