【TVM 教程】在 NVIDIA GPU 上调优高性能卷积(下)

微信图片_20241016191925.png

File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
    raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 4, 8, 8]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 64]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2482196
No: 14  GFLOPS: 0.00/260.18     result: Traceback (most recent call last):
  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
    func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
    func = build(s, args, target_host=task.target_host, runtime=runtime)
  File "/workspace/python/tvm/driver/build_module.py", line 228, in build
    input_mod = lower(inputs, args, name=name, binds=binds)
  File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
    return ffi.lower_schedule(inp, args, name, binds, simple_mode)
  File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
  File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
  File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
tvm._ffi.base.TVMError: Traceback (most recent call last):
  24: TVMFuncCall
        at ../src/runtime/c_runtime_api.cc:477
  23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
        at ../include/tvm/runtime/packed_func.h:1217
  22: Call
        at ../include/tvm/runtime/packed_func.h:1213
  21: operator()
        at ../include/tvm/runtime/packed_func.h:1731
  20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
        at ../include/tvm/runtime/packed_func.h:1671
  19: run<>
        at ../include/tvm/runtime/packed_func.h:1631
  18: run<tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1631
  17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1631
  16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1631
  15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1631
  14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1646
  13: operator()
        at ../src/driver/driver_api.cc:365
  12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, bool)
        at ../src/driver/driver_api.cc:352
  11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
        at ../src/driver/driver_api.cc:252
  10: tvm::transform::Pass::operator()(tvm::IRModule) const
        at ../src/ir/transform.cc:258
  9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
        at ../src/ir/transform.cc:274
  8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
        at ../src/ir/transform.cc:453
  7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
        at ../src/ir/transform.cc:274
  6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
        at ../src/tir/ir/transform.cc:100
  5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
        at ../include/tvm/runtime/packed_func.h:1750
  4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
        at ../include/tvm/runtime/packed_func.h:1694
  3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
        at ../include/tvm/runtime/packed_func.h:1618
  2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
        at ../include/tvm/runtime/packed_func.h:1217
  1: Call
        at ../include/tvm/runtime/packed_func.h:1213
  0: operator()
        at ../src/runtime/c_runtime_api.cc:534
  File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
    raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel

Traceback (most recent call last):
  24: TVMFuncCall
        at ../src/runtime/c_runtime_api.cc:477
  23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
        at ../include/tvm/runtime/packed_func.h:1217
  22: Call
        at ../include/tvm/runtime/packed_func.h:1213
  21: operator()
        at ../include/tvm/runtime/packed_func.h:1731
  20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
        at ../include/tvm/runtime/packed_func.h:1671
  19: run<>
        at ../include/tvm/runtime/packed_func.h:1631
  18: run<tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1631
  17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1631
  16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1631
  15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1631
  14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1646
  13: operator()
        at ../src/driver/driver_api.cc:365
  12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, bool)
        at ../src/driver/driver_api.cc:352
  11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
        at ../src/driver/driver_api.cc:252
  10: tvm::transform::Pass::operator()(tvm::IRModule) const
        at ../src/ir/transform.cc:258
  9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
        at ../src/ir/transform.cc:274
  8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
        at ../src/ir/transform.cc:453
  7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
        at ../src/ir/transform.cc:274
  6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
        at ../src/tir/ir/transform.cc:100
  5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
        at ../include/tvm/runtime/packed_func.h:1750
  4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
        at ../include/tvm/runtime/packed_func.h:1694
  3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
        at ../include/tvm/runtime/packed_func.h:1618
  2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
        at ../include/tvm/runtime/packed_func.h:1217
  1: Call
        at ../include/tvm/runtime/packed_func.h:1213
  0: operator()
        at ../src/runtime/c_runtime_api.cc:534
  File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
    raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 64, 1, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10306226
No: 15  GFLOPS: 5.42/260.18     result: MeasureResult(costs=(0.042678113000000004,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.8313236236572266, timestamp=1658801313.2114427)       [('tile_f', [-1, 2, 2, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 8]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5330964
No: 16  GFLOPS: 3.34/260.18     result: MeasureResult(costs=(0.0693738225,), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.546748638153076, timestamp=1658801314.4557946)        [('tile_f', [-1, 8, 4, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2140058
No: 17  GFLOPS: 0.00/260.18     result: Traceback (most recent call last):
  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 142, in build
    res = future.result()
  File "/usr/lib/python3.7/concurrent/futures/_base.py", line 435, in result
    return self.__get_result()
  File "/usr/lib/python3.7/concurrent/futures/_base.py", line 384, in __get_result
    raise self._exception
  File "/usr/lib/python3.7/concurrent/futures/thread.py", line 57, in run
    result = self.fn(*self.args, **self.kwargs)
  File "/workspace/python/tvm/contrib/popen_pool.py", line 404, in <lambda>
    worker = lambda *args: self._worker_run(*args)
  File "/workspace/python/tvm/contrib/popen_pool.py", line 373, in _worker_run
    return proc.recv()
  File "/workspace/python/tvm/contrib/popen_pool.py", line 297, in recv
    raise TimeoutError()
TimeoutError

        [('tile_f', [-1, 2, 2, 1]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 16]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10195251
No: 18  GFLOPS: 27.80/260.18    result: MeasureResult(costs=(0.008326810928571429,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.2733991146087646, timestamp=1658801325.4546382)       [('tile_f', [-1, 4, 8, 4]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6068603
No: 19  GFLOPS: 0.00/260.18     result: Traceback (most recent call last):
  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
    func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
    func = build(s, args, target_host=task.target_host, runtime=runtime)
  File "/workspace/python/tvm/driver/build_module.py", line 228, in build
    input_mod = lower(inputs, args, name=name, binds=binds)
  File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
    return ffi.lower_schedule(inp, args, name, binds, simple_mode)
  File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
  File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
  File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
tvm._ffi.base.TVMError: Traceback (most recent call last):
  24: TVMFuncCall
        at ../src/runtime/c_runtime_api.cc:477
  23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
        at ../include/tvm/runtime/packed_func.h:1217
  22: Call
        at ../include/tvm/runtime/packed_func.h:1213
  21: operator()
        at ../include/tvm/runtime/packed_func.h:1731
  20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
        at ../include/tvm/runtime/packed_func.h:1671
  19: run<>
        at ../include/tvm/runtime/packed_func.h:1631
  18: run<tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1631
  17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1631
  16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1631
  15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1631
  14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1646
  13: operator()
        at ../src/driver/driver_api.cc:365
  12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, bool)
        at ../src/driver/driver_api.cc:352
  11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
        at ../src/driver/driver_api.cc:252
  10: tvm::transform::Pass::operator()(tvm::IRModule) const
        at ../src/ir/transform.cc:258
  9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
        at ../src/ir/transform.cc:274
  8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
        at ../src/ir/transform.cc:453
  7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
        at ../src/ir/transform.cc:274
  6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
        at ../src/tir/ir/transform.cc:100
  5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
        at ../include/tvm/runtime/packed_func.h:1750
  4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
        at ../include/tvm/runtime/packed_func.h:1694
  3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
        at ../include/tvm/runtime/packed_func.h:1618
  2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
        at ../include/tvm/runtime/packed_func.h:1217
  1: Call
        at ../include/tvm/runtime/packed_func.h:1213
  0: operator()
        at ../src/runtime/c_runtime_api.cc:534
  File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
    raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel

Traceback (most recent call last):
  24: TVMFuncCall
        at ../src/runtime/c_runtime_api.cc:477
  23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
        at ../include/tvm/runtime/packed_func.h:1217
  22: Call
        at ../include/tvm/runtime/packed_func.h:1213
  21: operator()
        at ../include/tvm/runtime/packed_func.h:1731
  20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
        at ../include/tvm/runtime/packed_func.h:1671
  19: run<>
        at ../include/tvm/runtime/packed_func.h:1631
  18: run<tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1631
  17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1631
  16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1631
  15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1631
  14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1646
  13: operator()
        at ../src/driver/driver_api.cc:365
  12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, bool)
        at ../src/driver/driver_api.cc:352
  11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
        at ../src/driver/driver_api.cc:252
  10: tvm::transform::Pass::operator()(tvm::IRModule) const
        at ../src/ir/transform.cc:258
  9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
        at ../src/ir/transform.cc:274
  8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
        at ../src/ir/transform.cc:453
  7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
        at ../src/ir/transform.cc:274
  6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
        at ../src/tir/ir/transform.cc:100
  5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
        at ../include/tvm/runtime/packed_func.h:1750
  4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
        at ../include/tvm/runtime/packed_func.h:1694
  3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
        at ../include/tvm/runtime/packed_func.h:1618
  2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
        at ../include/tvm/runtime/packed_func.h:1217
  1: Call
        at ../include/tvm/runtime/packed_func.h:1213
  0: operator()
        at ../src/runtime/c_runtime_api.cc:534
  File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
    raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 16, 4, 8]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 128]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6956993
No: 20  GFLOPS: 0.00/260.18     result: Traceback (most recent call last):
  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
    func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
    func = build(s, args, target_host=task.target_host, runtime=runtime)
  File "/workspace/python/tvm/driver/build_module.py", line 228, in build
    input_mod = lower(inputs, args, name=name, binds=binds)
  File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
    return ffi.lower_schedule(inp, args, name, binds, simple_mode)
  File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
  File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
  File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
tvm._ffi.base.TVMError: Traceback (most recent call last):
  24: TVMFuncCall
        at ../src/runtime/c_runtime_api.cc:477
  23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
        at ../include/tvm/runtime/packed_func.h:1217
  22: Call
        at ../include/tvm/runtime/packed_func.h:1213
  21: operator()
        at ../include/tvm/runtime/packed_func.h:1731
  20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
        at ../include/tvm/runtime/packed_func.h:1671
  19: run<>
        at ../include/tvm/runtime/packed_func.h:1631
  18: run<tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1631
  17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1631
  16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1631
  15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1631
  14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1646
  13: operator()
        at ../src/driver/driver_api.cc:365
  12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, bool)
        at ../src/driver/driver_api.cc:352
  11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
        at ../src/driver/driver_api.cc:252
  10: tvm::transform::Pass::operator()(tvm::IRModule) const
        at ../src/ir/transform.cc:258
  9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
        at ../src/ir/transform.cc:274
  8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
        at ../src/ir/transform.cc:453
  7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
        at ../src/ir/transform.cc:274
  6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
        at ../src/tir/ir/transform.cc:100
  5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
        at ../include/tvm/runtime/packed_func.h:1750
  4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
        at ../include/tvm/runtime/packed_func.h:1694
  3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
        at ../include/tvm/runtime/packed_func.h:1618
  2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
        at ../include/tvm/runtime/packed_func.h:1217
  1: Call
        at ../include/tvm/runtime/packed_func.h:1213
  0: operator()
        at ../src/runtime/c_runtime_api.cc:534
  File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
    raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel

Traceback (most recent call last):
  24: TVMFuncCall
        at ../src/runtime/c_runtime_api.cc:477
  23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
        at ../include/tvm/runtime/packed_func.h:1217
  22: Call
        at ../include/tvm/runtime/packed_func.h:1213
  21: operator()
        at ../include/tvm/runtime/packed_func.h:1731
  20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
        at ../include/tvm/runtime/packed_func.h:1671
  19: run<>
        at ../include/tvm/runtime/packed_func.h:1631
  18: run<tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1631
  17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1631
  16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1631
  15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1631
  14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
        at ../include/tvm/runtime/packed_func.h:1646
  13: operator()
        at ../src/driver/driver_api.cc:365
  12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, bool)
        at ../src/driver/driver_api.cc:352
  11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
        at ../src/driver/driver_api.cc:252
  10: tvm::transform::Pass::operator()(tvm::IRModule) const
        at ../src/ir/transform.cc:258
  9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
        at ../src/ir/transform.cc:274
  8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
        at ../src/ir/transform.cc:453
  7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
        at ../src/ir/transform.cc:274
  6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
        at ../src/tir/ir/transform.cc:100
  5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
        at ../include/tvm/runtime/packed_func.h:1750
  4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
        at ../include/tvm/runtime/packed_func.h:1694
  3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
        at ../include/tvm/runtime/packed_func.h:1618
  2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
        at ../include/tvm/runtime/packed_func.h:1217
  1: Call
        at ../include/tvm/runtime/packed_func.h:1213
  0: operator()
        at ../src/runtime/c_runtime_api.cc:534
  File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
  File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
    raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel        [('tile_f', [-1, 16, 1, 2]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 32, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,3377719

最后从日志文件中检查最佳配置，检查正确性并测试运行时间。

# 检查最佳配置
dispatch_context = autotvm.apply_history_best(record_file)
best_config = dispatch_context.query(task.target, task.workload)
print("\nBest config:")
print(best_config)

# 从日志文件中应用历史最好记录
with autotvm.apply_history_best(record_file):
    with tvm.target.Target("cuda"):
        s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides, padding)
        func = tvm.build(s, arg_bufs)

# 验证正确性
a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
c_np = conv2d_nchw_python(a_np, w_np, strides, padding)

dev = tvm.cuda()
a_tvm = tvm.nd.array(a_np, device=dev)
w_tvm = tvm.nd.array(w_np, device=dev)
c_tvm = tvm.nd.empty(c_np.shape, device=dev)
func(a_tvm, w_tvm, c_tvm)

tvm.testing.assert_allclose(c_np, c_tvm.numpy(), rtol=1e-2)

# 评估运行时间。这里选择一个较大的重复次数（400）来减少噪音以及内核启动的开销。还可用 nvprof 来验证结果。
evaluator = func.time_evaluator(func.entry_name, dev, number=400)
print("Time cost of this operator: %f" % evaluator(a_tvm, w_tvm, c_tvm).mean)

输出结果：

Finish loading 20 records

Best config:
[('tile_f', [-1, 8, 2, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4264713
Finish loading 20 records
Time cost of this operator: 0.001299

下载 Python 源代码：tune_conv2d_cuda.py

下载 Jupyter notebook：tune_conv2d_cuda.ipynb

推荐阅读

目录