This article collects typical usage examples of the Python function tvm.create_schedule. If you are wondering what create_schedule does, how to call it, or what real code that uses it looks like, the curated examples below should help.
Fifteen code examples of the create_schedule function are shown below, sorted by popularity by default.
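All of the examples use the legacy, pre-0.7 tvm API (tvm.placeholder, tvm.compute, tvm.create_schedule, tvm.build). As a warm-up, here is a minimal sketch, not taken from any of the examples below, of the usual workflow: declare a computation, call tvm.create_schedule on its op, then lower or build the schedule.

import numpy as np
import tvm

# Declare a symbolic computation: B[i] = A[i] * 2.
n = tvm.var('n')
A = tvm.placeholder((n,), name='A')
B = tvm.compute((n,), lambda i: A[i] * 2.0, name='B')

# create_schedule returns a default (unoptimized) schedule for B's op.
s = tvm.create_schedule(B.op)
print(tvm.lower(s, [A, B], simple_mode=True))  # inspect the lowered IR

# Compile for the CPU and run it.
f = tvm.build(s, [A, B], 'llvm')
ctx = tvm.cpu(0)
a = tvm.nd.array(np.random.uniform(size=16).astype(A.dtype), ctx)
b = tvm.nd.array(np.zeros(16, dtype=B.dtype), ctx)
f(a, b)
tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() * 2.0)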
Example 1: test_rfactor
def test_rfactor():
    n = tvm.var('n')
    k1 = tvm.reduce_axis((0, n), name="k1")
    k2 = tvm.reduce_axis((0, n), name="k2")
    A = tvm.placeholder((n, n, n), name='A')
    B = tvm.compute((n, ), lambda i: tvm.sum(A[i, k1, k2], axis=[k1, k2]))
    # normal schedule
    s = tvm.create_schedule(B.op)
    BF = s.rfactor(B, k1)
    assert(tuple(BF.shape) == (n, n))
    assert(set(BF.op.body[0].axis) == set([k2]))
    assert(s[B].op.body[0].axis[0].dom.extent == n)
    assert(len(s[B].all_iter_vars) == 2)
    # schedule with split
    s = tvm.create_schedule(B.op)
    ko, ki = s[B].split(k1, factor=4)
    xo, xi = s[B].split(B.op.axis[0], factor=8)
    BF = s.rfactor(B, ki)
    assert(BF.shape[0].value == 4)
    assert(BF.shape[1] == n)
    assert(BF.op.body[0].axis[0] == k2)
    assert(BF.op.body[0].axis[1].var == ko.var)
    assert(s[B].op.body[0].axis[0].dom.extent.value == 4)
    # schedule with factor_axis
    s = tvm.create_schedule(B.op)
    ko, ki = s[B].split(k1, factor=4)
    xo, xi = s[B].split(B.op.axis[0], factor=8)
    BF = s.rfactor(B, ki, 1)
    assert(n == BF.shape[0])
    assert(BF.shape[1].value == 4)
    assert(BF.op.body[0].axis[0] == k2)
    assert(BF.op.body[0].axis[1].var == ko.var)
    assert(s[B].op.body[0].axis[0].dom.extent.value == 4)
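The assertions above only inspect the rewritten compute definitions. A quick way to see what rfactor actually changed is to lower the schedule and read the IR; a minimal sketch along the lines of the second variant, reusing A, B and k1 from the example (not part of the original test):

s = tvm.create_schedule(B.op)
ko, ki = s[B].split(k1, factor=4)
BF = s.rfactor(B, ki)
# The lowered IR now contains a separate stage (typically named B.rf)
# that computes partial sums before the final reduction over ko.
print(tvm.lower(s, [A, B], simple_mode=True))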
Example 2: verify_full
def verify_full(shape, dtype, fill_value):
    A = tvm.placeholder(shape, dtype=dtype, name="A")
    B = topi.cpp.full_like(A, fill_value)
    C = topi.cpp.full(shape, dtype, fill_value)
    s1 = tvm.create_schedule([B.op])
    s2 = tvm.create_schedule([C.op])
    def get_ref_data():
        return np.full(shape, fill_value, dtype)
    np_nd = get_ref_data()
    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        target = topi.cpp.TEST_create_target(device)
        ctx = tvm.context(device, 0)
        out = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx)
        f = tvm.build(s1, [A, B], device, name="full_like")
        f(tvm.nd.array(np.zeros(shape, dtype), ctx), out)
        tvm.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5)
        f = tvm.build(s2, [C], device, name="full")
        f(out)
        tvm.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5)
    for device in ["llvm"]:
        check_device(device)
Example 3: test_rpc_module
def test_rpc_module():
    # graph
    n = tvm.convert(1024)
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
    temp = util.tempdir()
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=64)
    s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
    s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
    # Build the dynamic lib.
    # To skip Metal and run on the CPU only, build with `target` alone instead of "metal".
    f = tvm.build(s, [A, B], "metal", target_host=target, name="myadd")
    path_dso1 = temp.relpath("dev_lib.dylib")
    f.export_library(path_dso1, xcode.create_dylib,
                     arch=arch, sdk=sdk)
    xcode.codesign(path_dso1)
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=64)
    s[B].parallel(xi)
    s[B].pragma(xo, "parallel_launch_point")
    s[B].pragma(xi, "parallel_barrier_when_finish")
    f = tvm.build(s, [A, B], target, name="myadd_cpu")
    path_dso2 = temp.relpath("cpu_lib.dylib")
    f.export_library(path_dso2, xcode.create_dylib,
                     arch=arch, sdk=sdk)
    xcode.codesign(path_dso2)
    # Start RPC test server that contains the compiled library.
    server = xcode.popen_test_rpc(proxy_host, proxy_port, key,
                                  destination=destination,
                                  libs=[path_dso1, path_dso2])
    # connect to the proxy
    remote = rpc.connect(proxy_host, proxy_port, key=key)
    ctx = remote.metal(0)
    f1 = remote.load_module("dev_lib.dylib")
    a_np = np.random.uniform(size=1024).astype(A.dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
    time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
    cost = time_f(a, b).mean
    print('%g secs/op' % cost)
    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
    # CPU
    ctx = remote.cpu(0)
    f2 = remote.load_module("cpu_lib.dylib")
    a_np = np.random.uniform(size=1024).astype(A.dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
    time_f = f2.time_evaluator(f2.entry_name, ctx, number=10)
    cost = time_f(a, b).mean
    print('%g secs/op' % cost)
    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
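Example 3 refers to several names (arch, sdk, proxy_host, proxy_port, key, destination, target) that are not defined inside the function; in the original iOS RPC test script they come from module-level configuration. A purely hypothetical sketch of what that configuration might look like follows; every value is a placeholder assumption, not taken from the source.

# Hypothetical module-level configuration assumed by test_rpc_module above.
# Every value here is a placeholder, not copied from the original script.
proxy_host = "127.0.0.1"             # address of the RPC proxy
proxy_port = 9090                    # port of the RPC proxy
key = "iphone"                       # device key registered with the proxy
destination = "platform=iOS,id=..."  # xcodebuild destination string
arch = "arm64"                       # target CPU architecture
sdk = "iphoneos"                     # Xcode SDK to build against
target = "llvm -target=arm64-apple-darwin"  # host compilation target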
Example 4: test_add_pipeline
def test_add_pipeline():
    nn = 64
    max_threads = 4
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    def extern_generator(ins, outs):
        """Manually write the IR for the extern function, add pipeline"""
        ib = tvm.ir_builder.create()
        with ib.for_range(0, (n+1) // 2) as i:
            ib.emit(outs[0].vstore(i*2, ins[0].vload(i*2, "float32x2") + tvm.const(1, "float32x2")))
        return ib.get()
    def extern_generator_gpu(ins, outs):
        """Manually write the IR for the extern function, add pipeline"""
        ib = tvm.ir_builder.create()
        bx = tvm.thread_axis("blockIdx.x")
        tx = tvm.thread_axis("threadIdx.x")
        ib.scope_attr(bx, "thread_extent", (nn+max_threads-1) // max_threads)
        ib.scope_attr(tx, "thread_extent", max_threads)
        idx = bx.var * max_threads + tx.var
        with ib.if_scope(ib.likely(idx < n)):
            ib.emit(outs[0].vstore(idx*2, ins[0].vload(idx*2, "float32x2") + tvm.const(1, "float32x2")))
        return ib.get()
    C_cpu = tvm.extern(A.shape, [A], extern_generator, name='C')
    C_gpu = tvm.extern(A.shape, [A], extern_generator_gpu, name='C')
    s_cpu = tvm.create_schedule(C_cpu.op)
    s_gpu = tvm.create_schedule(C_gpu.op)
    print(tvm.lower(s_cpu, [A, C_cpu], simple_mode=True))
    print(tvm.lower(s_gpu, [A, C_gpu], simple_mode=True))
    def check_target(target):
        if not tvm.module.enabled(target):
            return
        s = s_gpu if target in ['opencl', 'cuda'] else s_cpu
        C = C_gpu if target in ['opencl', 'cuda'] else C_cpu
        # build and invoke the kernel.
        f = tvm.build(s, [A, C], target)
        ctx = tvm.context(target, 0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        f(a, c)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
    check_target("llvm")
    check_target("opencl")
    check_target("cuda")
Example 5: dump_graph_lib
def dump_graph_lib(target_dir):
    dim = 4
    A = tvm.placeholder((dim,), name='A')
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
    sched = tvm.create_schedule(B.op)
    node0 = {"op": "null", "name": "x", "inputs": []}
    node1 = {"op": "tvm_op", "name": "add",
             "inputs": [[0, 0, 0]],
             "attrs": {"func_name": "myadd",
                       "flatten_data": "1",
                       "num_inputs": "1",
                       "num_outputs": "1"}}
    nodes = [node0, node1]
    arg_nodes = [0]
    node_row_ptr = [0, 1, 2]
    outputs = [[1, 0, 0]]
    shape = (4,)
    attrs = {
        "shape": ["list_shape", [shape, shape]],
        "dltype": ["list_str", ["float32", "float32"]],
        "storage_id": ["list_int", [0, 1]],
    }
    graph = {"nodes": nodes,
             "arg_nodes": arg_nodes,
             "node_row_ptr": node_row_ptr,
             "heads": outputs,
             "attrs": attrs}
    graph = json.dumps(graph)
    mlib = tvm.build(sched, [A, B], "llvm", name="myadd")
    mlib.export_library(os.path.join(target_dir, "graph_addone_lib.so"))
    with open(os.path.join(target_dir, "graph_addone.json"), "w") as fo:
        fo.write(graph)
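A dumped graph JSON plus the exported library are typically consumed with the graph runtime. The following is a minimal sketch, not part of the original function, assuming the legacy tvm.contrib.graph_runtime API and that dump_graph_lib has already been run on target_dir:

import os
import numpy as np
import tvm
from tvm.contrib import graph_runtime

target_dir = "."  # placeholder: wherever dump_graph_lib wrote its output
lib = tvm.module.load(os.path.join(target_dir, "graph_addone_lib.so"))
with open(os.path.join(target_dir, "graph_addone.json")) as f:
    graph_json = f.read()

# Create the graph runtime, feed the input node "x", and check the result.
mod = graph_runtime.create(graph_json, lib, tvm.cpu(0))
x = np.random.uniform(size=(4,)).astype("float32")
mod.set_input("x", tvm.nd.array(x))
mod.run()
out = mod.get_output(0, tvm.nd.empty((4,), "float32"))
np.testing.assert_allclose(out.asnumpy(), x + 1.0)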
Example 6: intrin_vadd
def intrin_vadd(n, cache_read=False, cache_write=False):
    scope_ubuf = 'local'
    dtype = 'float32'
    x = tvm.placeholder((n,), dtype=dtype, name='vx')
    y = tvm.placeholder((n,), dtype=dtype, name='vy')
    z = tvm.compute(x.shape, lambda i: x[i] + y[i], name='z')
    s = tvm.create_schedule(z.op)
    def create_buffer(t):
        return tvm.decl_buffer(t.shape, t.dtype,
                               name='W'+t.name,
                               scope=scope_ubuf,
                               offset_factor=16)
    binds = {}
    if cache_read:
        binds[x] = create_buffer(x)
        binds[y] = create_buffer(y)
    if cache_write:
        binds[z] = create_buffer(z)
    def intrin_func(ins, outs):
        ib = tvm.ir_builder.create()
        ib.emit(tvm.call_extern(outs[0].dtype, 'vadd', ins[0].access_ptr("r"), ins[1].access_ptr('r'), outs[0].access_ptr('wr')))
        return ib.get()
    with tvm.build_config(offset_factor=16):
        return tvm.decl_tensor_intrin(z.op, intrin_func, binds=binds)
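intrin_vadd only declares the tensor intrinsic; it is consumed by tensorizing the inner loop of a matching elementwise add. A minimal sketch of such a use (an assumption about how the intrinsic might be applied, not code taken from this page):

import tvm

# Hedged sketch: tensorize a length-16 inner loop with the vadd intrinsic.
n = 64
x = tvm.placeholder((n,), name='x')
y = tvm.placeholder((n,), name='y')
z = tvm.compute((n,), lambda i: x[i] + y[i], name='z')
s = tvm.create_schedule(z.op)
xo, xi = s[z].split(z.op.axis[0], factor=16)
s[z].tensorize(xi, intrin_vadd(16))
# The lowered IR should now contain calls to the external 'vadd' routine.
print(tvm.lower(s, [x, y, z], simple_mode=True))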
Example 7: test_scan_group
def test_scan_group():
    m = tvm.var("m")
    n = tvm.var("n")
    x = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x")
    s_state = tvm.placeholder((m, n))
    s_init = tvm.compute((1, n), lambda _, i: x[0, i])
    s_update1 = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + x[t, i])
    s_update2 = tvm.compute((m, n), lambda t, i: s_update1[t, i] + 1)
    s_update3 = tvm.compute((m, n), lambda t, i: s_update2[t, i] + 1)
    res = tvm.scan(s_init, s_update3, s_state, inputs=x)
    s = tvm.create_schedule(res.op)
    assert s[s_update1].group is not None
    assert s[s_update2].group == s[s_update1].group
    # Assign within group, is valid
    s[s_update1].compute_at(s[s_update2], s_update2.op.axis[1])
    # create a new group, for [s_update2 and s_update1]
    g2 = s.create_group(outputs=s_update2, inputs=[s_state, x])
    assert g2.group is not None
    assert g2.group == s[s_update3].group
    assert s[s_update2].group == g2
    assert s[s_update1].group == g2
    g2.compute_at(s[s_update3], s_update3.op.axis[1])
    assert g2.attach_stage == s[s_update3]
    try:
        # compute outside group error.
        s[s_update2].compute_at(s[s_init], s_init.op.axis[0])
        assert False
    except tvm.TVMError:
        pass
Example 8: test_bound_nest_thread
def test_bound_nest_thread():
    m = tvm.var('m')
    A = tvm.placeholder((m), name='A')
    A1 = tvm.compute((m,), lambda i: A[i], name='A1')
    A2 = tvm.compute((m,), lambda i: A1[i] + 2, name='A2')
    A3 = tvm.compute((m,), lambda i: A2[i] + 3, name='A3')
    s = tvm.create_schedule(A3.op)
    s[A2].set_scope("shared")
    s[A1].set_scope("local")
    block_x = tvm.thread_axis("blockIdx.x")
    thread_x = tvm.thread_axis("threadIdx.x")
    bx, tx = s[A3].split(A3.op.axis[0], factor=32)
    s[A3].bind(bx, block_x)
    s[A3].bind(tx, thread_x)
    s[A2].compute_at(s[A3], tx)
    _, xi = s[A2].split(A2.op.axis[0], nparts=1)
    s[A2].bind(xi, thread_x)
    s[A1].compute_at(s[A3], tx)
    s = s.normalize()
    bounds = tvm.schedule.InferBound(s)
    assert(bounds[A1.op.axis[0]].extent.value == 1)
    assert(bounds[A2.op.axis[0]].extent.value == 32)
    assert(bounds[A3.op.axis[0]].extent == m)
Example 9: test_schedule_create
def test_schedule_create():
    m = tvm.var('m')
    n = tvm.var('n')
    l = tvm.var('l')
    A = tvm.placeholder((m, l), name='A')
    B = tvm.placeholder((n, l), name='B')
    AA = tvm.compute((m, l), lambda i, j: A[i, j])
    T = tvm.compute((m, n, l), lambda i, j, k: AA(i, k) * B(j, k))
    s = tvm.create_schedule(T.op)
    s[AA].set_scope("shared")
    xo, xi = s[T].split(T.op.axis[0], factor=10)
    xi1, xi2 = s[T].split(xi, factor=2)
    s[AA].compute_at(s[T], xi1)
    xo, xi = s[AA].split(AA.op.axis[0], factor=10)
    s[T].reorder(xi2, xi1)
    assert T.op.axis[1] in s[T].leaf_iter_vars
    # save/load json
    json_str = tvm.save_json(s)
    s_loaded = tvm.load_json(json_str)
    assert isinstance(s_loaded, tvm.schedule.Schedule)
    assert(str(s_loaded.outputs[0].body) == str(s.outputs[0].body))
    # pickle/unpickle
    dump = pkl.dumps(s)
    s_loaded = pkl.loads(dump)
    assert isinstance(s_loaded, tvm.schedule.Schedule)
    assert(str(s_loaded.outputs[0].body) == str(s.outputs[0].body))
Example 10: test_add
def test_add():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
    s = tvm.create_schedule(C.op)
    def check_c():
        mhost = tvm.build(s, [A, B, C], "c", name="fadd")
        temp = util.tempdir()
        path_dso = temp.relpath("temp.so")
        mhost.export_library(path_dso)
        m = tvm.module.load(path_dso)
        fadd = m['fadd']
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        fadd(a, b, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), a.asnumpy() + b.asnumpy())
    check_c()
Example 11: schedule_softmax
def schedule_softmax(outs):
    """Schedule for softmax

    Parameters
    ----------
    outs: Array of Tensor
        The computation graph description of softmax
        in the format of an array of tensors.

    Returns
    -------
    sch: Schedule
        The computation schedule for the op.
    """
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    s = tvm.create_schedule([x.op for x in outs])
    tvm.schedule.AutoInlineInjective(s)
    softmax = outs[0]
    max_elem = softmax.op.input_tensors[1]
    expsum = softmax.op.input_tensors[2]
    s[expsum].compute_at(s[softmax], s[softmax].op.axis[1])
    s[max_elem].compute_at(s[softmax], s[softmax].op.axis[1])
    px, x = s[softmax].split(softmax.op.axis[0], nparts=1)
    s[softmax].bind(px, tvm.thread_axis("pipeline"))
    return s
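A usage sketch for this schedule, assuming the softmax was built with topi.nn.softmax so that input_tensors[1] and input_tensors[2] are the max_elem and expsum stages the function indexes (an assumption about the topi compute layout, not something shown on this page):

import tvm
import topi

# Hedged sketch: schedule a 2-D softmax with schedule_softmax defined above.
A = tvm.placeholder((1, 64), name='A')
S = topi.nn.softmax(A)
s = schedule_softmax(S)
print(tvm.lower(s, [A, S], simple_mode=True))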
Example 12: test_pack_buffer_intermediate
def test_pack_buffer_intermediate():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute((n,), lambda i: A[i] + 1, name="B")
    def extern_generator(ins, outs):
        """Manually write the IR for the extern function, add pipeline."""
        return tvm.call_packed("my_extern_array_func2", ins[0], outs[0])
    C = tvm.extern(B.shape, [B], extern_generator, name='C')
    s = tvm.create_schedule(C.op)
    def check_target(target):
        if not tvm.module.enabled(target):
            return
        # build and invoke the kernel.
        f = tvm.build(s, [A, C], target)
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        @tvm.register_func
        def my_extern_array_func2(aa, bb):
            assert aa.shape == a.shape
            tvm.testing.assert_allclose(
                aa.asnumpy(), a.asnumpy() + 1)
            aa.copyto(bb)
        f(a, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), a.asnumpy() + 1)
    check_target("llvm")
Example 13: test_lstm_cell_inline
def test_lstm_cell_inline():
    num_step = 128
    num_input = 256
    num_hidden = 1152
    batch_size = 4
    # Global transition matrix
    X = tvm.placeholder((num_step - 1, batch_size, num_input), name="X")
    Wi2h = tvm.placeholder((4, num_hidden, num_input), name="Wi2h")
    Wh2h = tvm.placeholder((4, num_hidden, num_hidden), name="Wh2h")
    # h: output hidden state, c: cell state.
    s_state_h = tvm.placeholder((num_step, batch_size, num_hidden))
    s_state_c = tvm.placeholder((num_step, batch_size, num_hidden))
    s_init_c = tvm.compute((1, batch_size, num_hidden),
                           lambda *i: 0.0, name="init_c")
    s_init_h = tvm.compute((1, batch_size, num_hidden),
                           lambda *i: 0.0, name="init_h")
    # LSTM transition
    k = tvm.reduce_axis((0, num_input), name="ki2h")
    s_i2h = tvm.compute(
        (num_step, 4, batch_size, num_hidden),
        lambda t, x, i, j: tvm.sum(X[t - 1, i, k] * Wi2h[x, j, k], axis=k),
        name="s_i2h")
    k = tvm.reduce_axis((0, num_hidden), name="ki2h")
    s_h2h = tvm.compute(
        (num_step, 4, batch_size, num_hidden),
        lambda t, x, i, j: tvm.sum(s_state_h[t - 1, i, k] * Wh2h[x, j, k], axis=k),
        name="s_h2h")
    # Gate rules
    gates = tvm.compute(s_i2h.shape, lambda *i:
                        s_i2h(*i) + s_h2h(*i), name="gates")
    gshape = (num_step, batch_size, num_hidden)
    in_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 0, i, j]), name="in_gate")
    in_transform = tvm.compute(gshape, lambda t, i, j: tvm.tanh(gates[t, 1, i, j]), name="in_transform")
    forget_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 2, i, j]), name="forget_gate")
    out_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 3, i, j]), name="out_gate")
    next_c = tvm.compute(gshape,
                         lambda t, i, j:
                         forget_gate[t, i, j] * s_state_c[t - 1, i, j] +
                         in_gate[t, i, j] * in_transform[t, i, j], name="next_c")
    next_h = tvm.compute(gshape,
                         lambda t, i, j: out_gate[t, i, j] * tvm.tanh(next_c[t, i, j]), name="next_h")
    update_c = tvm.compute(gshape, lambda *i: next_c(*i), name="update_c")
    update_h = tvm.compute(gshape, lambda *i: next_h(*i), name="update_h")
    # wrap the recurrence in a scan
    scan_h, scan_c = tvm.scan(
        [s_init_h, s_init_c],
        [update_h, update_c],
        [s_state_h, s_state_c],
        inputs=[X],
        name="lstm_scan")
    # schedule
    s = tvm.create_schedule(scan_h.op)
    # Inline gate computations
    s[gates].compute_inline()
    s[in_gate].compute_inline()
    s[in_transform].compute_inline()
    s[forget_gate].compute_inline()
    s[out_gate].compute_inline()
    # verify we can lower correctly
    tvm.lower(s, [X, Wi2h, Wh2h, scan_h, scan_c])
Example 14: test_add_pipeline
def test_add_pipeline():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    def extern_generator(ins, outs):
        """Manually write the IR for the extern function, add pipeline"""
        ib = tvm.ir_builder.create()
        with ib.for_range(0, n/2) as i:
            ib.emit(outs[0].vstore(i*2, ins[0].vload(i*2, "float32x2") + tvm.const(1, "float32x2")))
        return ib.get()
    C = tvm.extern(A.shape, [A], extern_generator, name='C')
    s = tvm.create_schedule(C.op)
    print(tvm.lower(s, [A, C], simple_mode=True))
    def check_llvm():
        if not tvm.module.enabled("llvm"):
            return
        # build and invoke the kernel.
        f = tvm.build(s, [A, C], "llvm")
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        f(a, c)
        np.testing.assert_allclose(
            c.asnumpy(), a.asnumpy() + 1)
    check_llvm()
Example 15: test_sort_np
def test_sort_np():
    dshape = (1, 2, 3, 4, 5, 6)
    axis = 4
    reduced_shape = (1, 2, 3, 4, 6)
    is_descend = False
    data = tvm.placeholder(dshape, name='data')
    sort_num = tvm.placeholder(reduced_shape, name="sort_num", dtype="int32")
    out = tvm.extern(data.shape, [data, sort_num],
                     lambda ins, outs: tvm.call_packed(
                         "tvm.contrib.sort.argsort", ins[0],
                         ins[1], outs[0], axis, is_descend),
                     dtype='int32', name="sort_tensor")
    ctx = tvm.cpu(0)
    target = "llvm"
    s = tvm.create_schedule(out.op)
    f = tvm.build(s, [data, sort_num, out], target)
    np_data = np.random.uniform(size=dshape)
    np_out = np.argsort(np_data, axis=axis)
    sort_num_input = np.full(reduced_shape, dshape[axis])
    a = tvm.nd.array(np.array(np_data).astype(data.dtype), ctx)
    b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx)
    c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx)
    f(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), np_out, rtol=1e-5)