本文整理匯總了 Python 中 tvm.gpu 方法的典型用法代碼示例。如果您正苦於以下問題:Python tvm.gpu 方法的具體用法?Python tvm.gpu 怎麼用?Python tvm.gpu 使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類 tvm 的用法示例。
在下文中一共展示了 tvm.gpu 方法的 15 個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的 Python 代碼示例。
示例1: test_broadcast_to
# 需要導入模塊: import tvm [as 別名]
# 或者: from tvm import gpu [as 別名]
def test_broadcast_to(in_shape, out_shape):
    """Build ``topi.broadcast_to`` for CUDA and compare it with NumPy.

    Sets the module-level ``TASK`` name from the two shapes, compiles the
    broadcast kernel, runs it twice on ``tvm.gpu()`` and checks the device
    output against ``np.broadcast_to``.

    Parameters
    ----------
    in_shape
        Shape used for the input placeholder.
    out_shape
        Shape the input is broadcast to.
    """
    global TASK
    in_tag = "_".join(str(dim) for dim in in_shape)
    out_tag = "_".join(str(dim) for dim in out_shape)
    TASK = "bcast_to_i" + in_tag + "o" + out_tag

    # Declare the computation, schedule it and compile the kernel.
    A = tvm.te.placeholder(shape=in_shape, name="A")
    B = topi.broadcast_to(A, out_shape)
    sched = topi.cuda.schedule_broadcast(B)
    kernel = tvm.build(sched, [A, B], "cuda", name="broadcast_to")

    # Host-side input and NumPy reference result.
    host_in = np.random.uniform(size=in_shape).astype(A.dtype)
    expected = np.broadcast_to(host_in, out_shape)

    # Device buffers.
    dev_in = tvm.nd.array(host_in, tvm.gpu())
    dev_out = tvm.nd.array(np.empty(out_shape).astype(B.dtype), tvm.gpu())

    # The kernel is launched twice, as in the original test.
    for _run in range(2):
        kernel(dev_in, dev_out)
    tvm.testing.assert_allclose(dev_out.asnumpy(), expected)
示例2: test_matmul_add
# 需要導入模塊: import tvm [as 別名]
# 或者: from tvm import gpu [as 別名]
def test_matmul_add():
    """Check the cuBLAS extern matmul against ``np.dot`` on GPU 0.

    The check is skipped (with a printed message) when the target is not
    enabled or the ``tvm.contrib.cublas.matmul`` global function is missing.
    """
    rows, inner, cols = 1024, 128, 235
    A = tvm.placeholder((rows, inner), name='A')
    B = tvm.placeholder((inner, cols), name='B')
    C = cublas.matmul(A, B)
    s = tvm.create_schedule(C.op)

    def verify(target="cuda"):
        """Build for *target*, run once and compare with NumPy."""
        if not tvm.module.enabled(target):
            print("skip because %s is not enabled..." % target)
            return
        if not tvm.get_global_func("tvm.contrib.cublas.matmul", True):
            print("skip because extern function is not available")
            return

        ctx = tvm.gpu(0)
        matmul = tvm.build(s, [A, B, C], target)
        lhs = tvm.nd.array(
            np.random.uniform(size=(rows, inner)).astype(A.dtype), ctx)
        rhs = tvm.nd.array(
            np.random.uniform(size=(inner, cols)).astype(B.dtype), ctx)
        out = tvm.nd.array(np.zeros((rows, cols), dtype=C.dtype), ctx)
        matmul(lhs, rhs, out)

        actual = out.asnumpy()
        expected = np.dot(lhs.asnumpy(), rhs.asnumpy())
        np.testing.assert_allclose(actual, expected, rtol=1e-5)

    verify()
示例3: test_broadcast_to
# 需要導入模塊: import tvm [as 別名]
# 或者: from tvm import gpu [as 別名]
def test_broadcast_to(in_shape, out_shape):
    """Build ``topi.broadcast_to`` for CUDA and compare it with NumPy.

    Uses the legacy top-level ``tvm.placeholder`` API. Also records a task
    name derived from both shapes in the module-level ``TASK``.

    Parameters
    ----------
    in_shape
        Shape used for the input placeholder.
    out_shape
        Shape the input is broadcast to.
    """
    global TASK
    shape_tags = tuple(
        "_".join(map(str, shape)) for shape in (in_shape, out_shape))
    TASK = "bcast_to_i%so%s" % shape_tags

    # Build the logic and compile the function.
    A = tvm.placeholder(shape=in_shape, name="A")
    B = topi.broadcast_to(A, out_shape)
    schedule = topi.cuda.schedule_broadcast(B)
    fcuda = tvm.build(schedule, [A, B], "cuda", name="broadcast_to")

    # Reference data on the host.
    source = np.random.uniform(size=in_shape).astype(A.dtype)
    reference = np.broadcast_to(source, out_shape)

    # Matching buffers on the GPU.
    gpu_source = tvm.nd.array(source, tvm.gpu())
    gpu_result = tvm.nd.array(np.empty(out_shape).astype(B.dtype), tvm.gpu())

    # Two launches, as in the original test.
    for _attempt in range(2):
        fcuda(gpu_source, gpu_result)
    np.testing.assert_allclose(gpu_result.asnumpy(), reference)
示例4: build_tvm
# 需要導入模塊: import tvm [as 別名]
# 或者: from tvm import gpu [as 別名]
def build_tvm(self, net, torch_inputs):
    """Convert a torch network to a TVM graph and create the GPU inference context.

    Populates ``self.graph_pth``, ``self.refit_weight_dict``, ``self.params``,
    ``self.graph`` and ``self.ctx``.

    NOTE(review): the source this was recovered from had lost its
    indentation; the ``with`` block is assumed to cover only the
    ``torch2tvm`` call -- confirm against the upstream file.

    Parameters
    ----------
    net
        Network to convert; its ``forward`` is inspected for input names.
    torch_inputs
        Example inputs forwarded to the converter.
    """
    self.graph_pth = torch2trt.GraphModule(net, torch_inputs)
    with torch2trt.core.tvm_network():
        # The trace returned alongside the graph is not used here.
        _trace, converted = torch2trt.core.torch2tvm(
            net,
            torch_inputs,
            input_names=self.input_names,
            verbose=self.verbose)

    self.refit_weight_dict = converted.refit_weight_dict
    input_names = get_torch_forward_name(net.forward)
    self.graph_pth = converted

    resolved = converted.get_resolved_outputs()
    weights = converted.context.tvm_weight_dict
    # Parameters are keyed by each weight variable's name hint.
    self.params = {var.name_hint: value for var, value in weights.items()}
    print(len(self.params))

    graph = expr.Function(analysis.free_vars(resolved), resolved)
    self.graph = graph
    if self.graph_post_fn is not None:
        self.graph = self.graph_post_fn(self.graph)

    self.ctx = TVMInference(
        self.graph,
        self.params,
        input_names=input_names,
        ctx=tvm.gpu(0),
        cudnn=True)
示例5: test_cuda_lib
# 需要導入模塊: import tvm [as 別名]
# 或者: from tvm import gpu [as 別名]
def test_cuda_lib():
    """Export a CUDA add-one kernel as a shared library, reload it and run it.

    The test is skipped (with a printed message naming the first missing
    backend) unless both ``llvm`` and ``cuda`` are enabled.
    """
    ctx = tvm.gpu(0)

    # Find the first required backend that is not enabled, if any.
    missing = next(
        (dev for dev in ("llvm", "cuda") if not tvm.runtime.enabled(dev)),
        None)
    if missing is not None:
        print("skip because %s is not enabled..." % missing)
        return

    nn = 12
    n = tvm.runtime.convert(nn)
    A = te.placeholder((n,), name='A')
    B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')

    # Split the single axis and bind the pieces to block / thread indices.
    s = te.create_schedule(B.op)
    block_axis, thread_axis = s[B].split(B.op.axis[0], factor=4)
    s[B].bind(block_axis, te.thread_axis("blockIdx.x"))
    s[B].bind(thread_axis, te.thread_axis("threadIdx.x"))

    from tvm.contrib import util
    workdir = util.tempdir()
    built = tvm.build(s, [A, B], target="cuda", target_host="llvm", name="add")
    lib_path = workdir.relpath("deploy_lib.so")
    built.export_library(lib_path)

    # Reload the exported library and call the kernel by name.
    loaded = tvm.runtime.load_module(lib_path)
    src = tvm.nd.array(np.random.uniform(size=nn).astype(A.dtype), ctx)
    dst = tvm.nd.array(np.zeros(nn, dtype=A.dtype), ctx)
    loaded['add'](src, dst)
    np.testing.assert_equal(dst.asnumpy(), src.asnumpy() + 1)
示例6: test_cuda_reduction_binding
# 需要導入模塊: import tvm [as 別名]
# 或者: from tvm import gpu [as 別名]
def test_cuda_reduction_binding():
    """Check that a reduction with a reordered, block-bound axis builds for CUDA.

    Nothing is executed on the device; a successful ``tvm.build`` is the
    whole check. Skipped when GPU 0 is absent or CUDA is not enabled.
    """
    if not (tvm.gpu(0).exist and tvm.runtime.enabled("cuda")):
        print("skip because cuda is not enabled..")
        return

    k = te.reduce_axis((0, 32), 'k')
    A = te.placeholder((96, 32), name='A')
    B = te.compute((96,), lambda m: te.sum(A[m, k], axis=k), name='B')

    s = te.create_schedule(B.op)
    stage = s[B]
    # Put the reduction axis outermost before splitting the data axis.
    stage.reorder(B.op.reduce_axis[0], B.op.axis[0])
    outer, _inner = stage.split(B.op.axis[0], 32)
    stage.bind(outer, te.thread_axis("blockIdx.x"))

    # The built function itself is not needed.
    tvm.build(s, [A, B], "cuda")
示例7: test_cuda_const_float_to_half
# 需要導入模塊: import tvm [as 別名]
# 或者: from tvm import gpu [as 別名]
def test_cuda_const_float_to_half():
    """Compare a float16 tensor against a float16 constant on CUDA.

    Builds ``a > 0.5`` element-wise for a ``(2, 3, 4)`` float16 input, runs
    it on GPU 0 and checks the result against the same NumPy comparison.
    """
    # This import is required to use nvcc to perform code gen;
    # otherwise it is found that the code gen is done by nvrtc.
    from tvm import autotvm

    shape = (2, 3, 4)
    a = te.placeholder(shape, dtype='float16', name='a')
    b = tvm.tir.const(0.5, dtype='float16')
    c = te.compute(shape, lambda i, j, k: a[i, j, k] > b, name='c')

    # Fuse all axes into one, then split it across blocks and threads.
    s = te.create_schedule(c.op)
    stage = s[c]
    fused = stage.fuse(*list(c.op.axis))
    block_axis, thread_axis = stage.split(fused, factor=64)
    stage.bind(block_axis, te.thread_axis('blockIdx.x'))
    stage.bind(thread_axis, te.thread_axis('threadIdx.x'))

    func = tvm.build(s, [a, c], 'cuda')

    ctx = tvm.gpu(0)
    a_np = np.random.uniform(size=shape).astype(a.dtype)
    c_np = np.zeros(shape=shape, dtype=c.dtype)
    a_dev = tvm.nd.array(a_np, ctx)
    c_dev = tvm.nd.array(c_np, ctx)
    func(a_dev, c_dev)
    np.testing.assert_equal(c_dev.asnumpy(), a_np > b.value)
示例8: test_expand_dims
# 需要導入模塊: import tvm [as 別名]
# 或者: from tvm import gpu [as 別名]
def test_expand_dims():
    """Check ``relay.expand_dims`` against a NumPy reshape on every listed target."""
    # based on topi test
    def verify_expand_dims(dshape, dtype, oshape, axis, num_newaxis):
        """Run one expand_dims case on each ``(target, ctx)`` from ``ctx_list()``."""
        x = relay.Var("x", relay.TensorType(dshape, dtype))
        func = relay.Function([x], relay.expand_dims(x, axis, num_newaxis))
        for target, ctx in ctx_list():
            # float16 on CUDA is only tested when the GPU reports fp16 support.
            skip_fp16 = (
                dtype == 'float16'
                and target == 'cuda'
                and not have_fp16(tvm.gpu(0).compute_version)
            )
            if skip_fp16:
                continue
            data = np.random.uniform(size=dshape).astype(dtype)
            expected = data.reshape(oshape)
            executor = relay.create_executor("graph", ctx=ctx, target=target)
            result = executor.evaluate(func)(data)
            np.testing.assert_allclose(result.asnumpy(), expected, rtol=0.01)

    # (output shape, axis, number of new axes) for a (3, 10) input.
    cases = (
        ((3, 10, 1, 1), 2, 2),
        ((1, 3, 10), -3, 1),
    )
    for dtype in ('float16', 'float32'):
        for oshape, axis, num_newaxis in cases:
            verify_expand_dims((3, 10), dtype, oshape, axis, num_newaxis)
示例9: verify_matmul_add
# 需要導入模塊: import tvm [as 別名]
# 或者: from tvm import gpu [as 別名]
def verify_matmul_add(in_dtype, out_dtype, rtol=1e-5):
    """Check the cuBLAS extern matmul against ``np.dot`` for the given dtypes.

    Parameters
    ----------
    in_dtype
        dtype of both input placeholders.
    out_dtype
        dtype requested from ``cublas.matmul``.
    rtol : float
        Relative tolerance used in the final comparison.
    """
    rows, inner, cols = 1024, 128, 236
    A = te.placeholder((rows, inner), name='A', dtype=in_dtype)
    B = te.placeholder((inner, cols), name='B', dtype=in_dtype)
    C = cublas.matmul(A, B, dtype=out_dtype)
    s = te.create_schedule(C.op)

    def verify(target="cuda"):
        """Build for *target*, run once on GPU 0 and compare with NumPy."""
        if not tvm.runtime.enabled(target):
            print("skip because %s is not enabled..." % target)
            return
        if not tvm.get_global_func("tvm.contrib.cublas.matmul", True):
            print("skip because extern function is not available")
            return

        ctx = tvm.gpu(0)
        matmul = tvm.build(s, [A, B, C], target)
        lhs = tvm.nd.array(
            np.random.uniform(0, 128, size=(rows, inner)).astype(A.dtype), ctx)
        rhs = tvm.nd.array(
            np.random.uniform(0, 128, size=(inner, cols)).astype(B.dtype), ctx)
        out = tvm.nd.array(np.zeros((rows, cols), dtype=C.dtype), ctx)
        matmul(lhs, rhs, out)

        actual = out.asnumpy()
        # The reference product is computed in the output dtype.
        expected = np.dot(
            lhs.asnumpy().astype(C.dtype), rhs.asnumpy().astype(C.dtype))
        tvm.testing.assert_allclose(actual, expected, rtol=rtol)

    verify()
示例10: test_broadcast_to
# 需要導入模塊: import tvm [as 別名]
# 或者: from tvm import gpu [as 別名]
def test_broadcast_to(in_shape, out_shape):
    """Build ``topi.broadcast_to`` for CUDA and compare it with NumPy.

    Stores a task name derived from both shapes in the module-level
    ``TASK`` before building.

    Parameters
    ----------
    in_shape
        Shape used for the input placeholder.
    out_shape
        Shape the input is broadcast to.
    """
    global TASK
    TASK = "".join([
        "bcast_to_i",
        "_".join([str(extent) for extent in in_shape]),
        "o",
        "_".join([str(extent) for extent in out_shape]),
    ])

    # Build the logic and compile the function.
    A = te.placeholder(shape=in_shape, name="A")
    B = topi.broadcast_to(A, out_shape)
    bcast_schedule = topi.cuda.schedule_broadcast(B)
    bcast = tvm.build(bcast_schedule, [A, B], "cuda", name="broadcast_to")

    # Input data and the NumPy reference.
    np_input = np.random.uniform(size=in_shape).astype(A.dtype)
    np_expected = np.broadcast_to(np_input, out_shape)

    nd_input = tvm.nd.array(np_input, tvm.gpu())
    nd_output = tvm.nd.array(np.empty(out_shape).astype(B.dtype), tvm.gpu())

    # Launch twice, as in the original test.
    for _launch in range(2):
        bcast(nd_input, nd_output)
    tvm.testing.assert_allclose(nd_output.asnumpy(), np_expected)
示例11: verify_vectorization
# 需要導入模塊: import tvm [as 別名]
# 或者: from tvm import gpu [as 別名]
def verify_vectorization(n, m, dtype):
    """Check an element-wise ``A + 1`` kernel of shape ``(n, m)`` on CUDA.

    NOTE(review): the source this was recovered from had lost its
    indentation; the ``with tvm.target.create(device)`` block is assumed to
    extend to the end of ``check_device`` -- confirm against upstream.

    Parameters
    ----------
    n, m
        Extents of the two tensor dimensions.
    dtype
        Element type of the input placeholder.
    """
    def check_device(device):
        """Build and run the kernel on *device*, or print why it was skipped."""
        if not tvm.runtime.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        needs_fp16 = dtype == "float16" and device == "cuda"
        if needs_fp16 and not have_fp16(tvm.gpu(0).compute_version):
            print("Skip because gpu does not have fp16 support")
            return

        with tvm.target.create(device):
            ctx = tvm.context(device, 0)
            A = te.placeholder((n, m), name='A', dtype=dtype)
            B = te.compute(
                (n, m),
                lambda i, j: A[i, j] + tvm.tir.const(1, A.dtype),
                name='B')
            schedule = topi.testing.get_elemwise_schedule(device)(B)
            kernel = tvm.build(schedule, [A, B], device)

            random_input = np.random.uniform(size=(n, m))
            dev_in = tvm.nd.empty((n, m), A.dtype, ctx).copyfrom(random_input)
            dev_out = tvm.nd.empty((n, m), B.dtype, ctx)
            kernel(dev_in, dev_out)
            tvm.testing.assert_allclose(
                dev_out.asnumpy(), dev_in.asnumpy() + 1, rtol=1e-5)

    for device in ("cuda",):
        check_device(device)
示例12: run
# 需要導入模塊: import tvm [as 別名]
# 或者: from tvm import gpu [as 別名]
def run(args):
    """Run an ONNX test case through TVM on the GPU, verify it, then benchmark it.

    Parameters
    ----------
    args
        Options object. This function reads ``test_dir``, ``model_file``,
        ``frontend`` and ``iterations`` from it and forwards it unchanged to
        the graph builders.

    Returns
    -------
    Whatever ``run_onnx_util.run_benchmark`` returns for ``args.iterations``
    runs of the compiled graph.

    Raises
    ------
    RuntimeError
        If ``args.frontend`` is neither ``'nnvm'`` nor ``'relay'``.
    AssertionError
        If an output differs from the stored reference beyond
        ``rtol=1e-3`` / ``atol=1e-4``; the message names the output.
    """
    onnx_model = onnx.load_model(
        run_onnx_util.onnx_model_file(args.test_dir, args.model_file))
    ctx = tvm.gpu()

    input_names, output_names = run_onnx_util.onnx_input_output_names(
        os.path.join(args.test_dir, args.model_file))

    test_data_dir = os.path.join(args.test_dir, 'test_data_set_0')
    inputs, outputs = run_onnx_util.load_test_data(
        test_data_dir, input_names, output_names)
    inputs = dict(inputs)

    if args.frontend == 'nnvm':
        graph_module = build_graph_nnvm(args, ctx, onnx_model, inputs, input_names)
    elif args.frontend == 'relay':
        graph_module = build_graph_relay(args, ctx, onnx_model, inputs, input_names)
    else:
        raise RuntimeError('Invalid frontend: {}'.format(args.frontend))

    graph_module.run()

    for i, (name, expected) in enumerate(outputs):
        tvm_output = tvm.nd.empty(expected.shape, expected.dtype, ctx=ctx)
        actual = graph_module.get_output(i, tvm_output).asnumpy()
        # Pass the output name as err_msg so a mismatch says which output
        # failed; the original built a throwaway ``(None, name)`` tuple.
        np.testing.assert_allclose(expected, actual,
                                   rtol=1e-3, atol=1e-4, err_msg=name)
        print('%s: OK' % name)
    print('ALL OK')

    def compute():
        """Run the graph once and synchronize the current CuPy device."""
        graph_module.run()
        cupy.cuda.device.Device().synchronize()

    return run_onnx_util.run_benchmark(compute, args.iterations)
示例13: test_reduce_map
# 需要導入模塊: import tvm [as 別名]
# 或者: from tvm import gpu [as 別名]
def test_reduce_map(in_shape, axis, keepdims, type="sum", test_id=0):
    """Build a topi reduction for CUDA and compare it with NumPy.

    Parameters
    ----------
    in_shape
        Shape of the input placeholder and of the random test data.
    axis
        Reduction axis argument, forwarded to both topi and NumPy.
    keepdims
        Forwarded to both topi and NumPy.
    type : str
        One of ``"sum"``, ``"max"`` or ``"min"``.
    test_id : int
        Number embedded in the module-level ``TASK`` name.

    Raises
    ------
    NotImplementedError
        If ``type`` is not one of the supported reductions.
    """
    global TASK
    # Build the logic and compile the function
    A = tvm.te.placeholder(shape=in_shape, name="A")

    if type not in ("sum", "max", "min"):
        raise NotImplementedError
    TASK = "%s_map_id%d" % (type, test_id)
    # topi and ndarray both expose the reduction under the same name.
    B = getattr(topi, type)(A, axis=axis, keepdims=keepdims)

    s = topi.cuda.schedule_reduce(B)
    with tvm.build_config(auto_unroll_max_step=16,
                          auto_unroll_min_depth=0):
        # The kernel is named "sum" regardless of the reduction type.
        fcuda = tvm.build(s, [A, B], "cuda", name="sum")

    # Test
    in_npy = np.random.normal(size=in_shape).astype(np.float32)
    out_npy = getattr(in_npy, type)(axis=axis, keepdims=keepdims)

    data_tvm = tvm.nd.array(in_npy, ctx=tvm.gpu())
    out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=tvm.gpu())
    for _launch in range(2):
        fcuda(data_tvm, out_tvm)
    tvm.testing.assert_allclose(out_tvm.asnumpy(), out_npy, rtol=4e-4, atol=4e-4)
示例14: save_param_dict
# 需要導入模塊: import tvm [as 別名]
# 或者: from tvm import gpu [as 別名]
def save_param_dict(params):
    """Serialize a parameter dictionary to binary bytes.

    The resulting bytes can be loaded by the GraphModule through its
    "load_params" API.

    Parameters
    ----------
    params : dict of str to NDArray
        The parameter dictionary.

    Returns
    -------
    param_bytes: bytearray
        Serialized parameters.

    Examples
    --------
    .. code-block:: python

       # compile and save the modules to file.
       graph, lib, params = nnvm.compiler.build(
          graph, target, shape={"data": data_shape}, params=params)
       module = graph_runtime.create(graph, lib, tvm.gpu(0))
       # save the parameters as byte array
       param_bytes = nnvm.compiler.save_param_dict(params)
       # We can serialize the param_bytes and load it back later.
       # Pass in byte array to module to directly set parameters
       module["load_params"](param_bytes)
    """
    # Flatten to [name0, array0, name1, array1, ...] for the packed call.
    flat_args = []
    for name, value in params.items():
        flat_args.extend((name, tvm.nd.array(value)))
    return _save_param_dict(*flat_args)
示例15: test_bind
# 需要導入模塊: import tvm [as 別名]
# 或者: from tvm import gpu [as 別名]
def test_bind():
    """Check a hybrid-script kernel that binds its loop to ``threadIdx.x`` on CUDA.

    Skipped with a printed warning when GPU 0 does not exist.
    """
    gpu_present = tvm.gpu(0).exist
    if not gpu_present:
        print('[Warning] No GPU found! Skip bind test!')
        return

    # NOTE(review): `a` is accepted but never read; the kernel accumulates
    # `b` into `c`. Confirm this is intended rather than `a[tx] + b[tx]`.
    @script
    def vec_add(a, b, c):
        for tx in bind('threadIdx.x', 1000):
            c[tx] = b[tx] + c[tx]

    # Three float32 placeholders of length 1000, named after their variables.
    a, b, c = (
        tvm.placeholder((1000, ), dtype='float32', name=label)
        for label in ('a', 'b', 'c')
    )
    run_and_check(vec_add, [a, b, c], [c], target='cuda')