Hello!
I am currently working with a pre-created ONNX model. The model appears to have been exported with a fixed input shape of (10, 3, 32, 32). Does this mean the engine I build from it will only work with inputs of shape (10, 3, 32, 32)?
If so, how do I generalise the input shape?
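From what I have read, an engine built from an ONNX model with a static input shape is locked to that shape, and generalising it requires exporting the model with a dynamic batch axis and then building the engine with an optimization profile. Is something like the following the right approach? (A sketch only; the tensor name "input" and the min/opt/max shapes are my assumptions, and builder/network refer to the objects created in build_engine below.)

config = builder.create_builder_config()
config.max_workspace_size = 1 << 32
profile = builder.create_optimization_profile()
# "input" is a placeholder; the real name comes from the ONNX model's input tensor
profile.set_shape("input", min=(1, 3, 32, 32), opt=(10, 3, 32, 32), max=(32, 3, 32, 32))
config.add_optimization_profile(profile)
engine = builder.build_engine(network, config)
# At inference time the concrete shape would then be set on the context:
# context.set_binding_shape(0, (batch_size, 3, 32, 32))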
Another issue I am facing is that the engine returns garbage values at inference time: every output element is either -4.3160208e+08 or 0.
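To rule out the ONNX model itself, I plan to sanity-check it with onnxruntime first, roughly like this (dummy input shaped to match the model):

import onnxruntime as ort
import numpy as np

sess = ort.InferenceSession("model.onnx")
dummy = np.random.rand(10, 3, 32, 32).astype(np.float32)
outputs = sess.run(None, {sess.get_inputs()[0].name: dummy})
print([o.shape for o in outputs])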
Here is the code that I am using:
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # initialises the CUDA driver context

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
runtime = trt.Runtime(TRT_LOGGER)
model_path = 'model.onnx'
def print_network(network):
    for i in range(network.num_layers):
        layer = network.get_layer(i)
        print("\nLAYER {}".format(i))
        print("===========================================")
        layer_input = layer.get_input(0)
        if layer_input:
            print("\tInput Name: {}".format(layer_input.name))
            print("\tInput Shape: {}".format(layer_input.shape))
        layer_output = layer.get_output(0)
        if layer_output:
            print("\tOutput Name: {}".format(layer_output.name))
            print("\tOutput Shape: {}".format(layer_output.shape))
        print("===========================================")
def build_engine(model_path):
    # flags = 1 corresponds to the EXPLICIT_BATCH network creation flag
    with trt.Builder(TRT_LOGGER) as builder, \
         builder.create_network(flags=1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) as network, \
         trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = 1 << 32
        builder.max_batch_size = 1
        builder.fp16_mode = False
        with open(model_path, 'rb') as f:
            value = parser.parse(f.read())
        print("Parser: ", value)
        engine = builder.build_cuda_engine(network)
        print_network(network)
        print(engine)
        return engine
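While debugging the parse step, I also dump the parser errors right after parser.parse(...) inside build_engine, in case it fails silently:

if not value:
    for i in range(parser.num_errors):
        print(parser.get_error(i))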
engine = build_engine(model_path)
buf = engine.serialize()
with open("ssh.engine", 'wb') as f:
    f.write(buf)
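For completeness, this is how I load the serialized engine back later (using the runtime created at the top; a sketch):

with open("ssh.engine", 'rb') as f:
    engine = runtime.deserialize_cuda_engine(f.read())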
# Create buffers.
print(imu.dtype, "DTYPE")
context = engine.create_execution_context()
print(engine.get_binding_shape(0), engine.get_binding_shape(1), engine.get_binding_shape(2), engine.get_binding_shape(3), engine.get_binding_shape(4), "binding shape")
#print(imu[0].shape, "imu shape")
h_input = cuda.pagelocked_empty(trt.volume((10, 3, 32, 32)), dtype=np.float32)
h_output_hmap = cuda.pagelocked_empty(trt.volume((1, 1, 8, 8)), dtype=np.float32)
h_output_scale = cuda.pagelocked_empty(trt.volume((1, 2, 8, 8)), dtype=np.float32)
h_output_offset = cuda.pagelocked_empty(trt.volume((1, 2, 8, 8)), dtype=np.float32)
# Allocate device memory for inputs and outputs.
print(h_input.nbytes, “Hinput”)
print(h_output_hmap.nbytes, “Houtput_hmap”)
print(h_output_scale.nbytes, “Houtput_scale”)
print(h_output_offset.nbytes, “Houtput_offset”)
d_input = cuda.mem_alloc(h_input.nbytes)
d_output_hmap = cuda.mem_alloc(h_output_hmap.nbytes)
d_output_scale = cuda.mem_alloc(h_output_scale.nbytes)
d_output_offset = cuda.mem_alloc(h_output_offset.nbytes)
bindings = [int(d_input), int(d_output_hmap), int(d_output_scale), int(d_output_offset)]
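To rule out mistakes in the hard-coded shapes above, I also considered allocating the buffers generically from the engine's own bindings, roughly like this:

h_buffers, d_buffers, generic_bindings = [], [], []
for i in range(engine.num_bindings):
    shape = engine.get_binding_shape(i)
    dtype = trt.nptype(engine.get_binding_dtype(i))
    h_buf = cuda.pagelocked_empty(trt.volume(shape), dtype=dtype)  # host buffer
    d_buf = cuda.mem_alloc(h_buf.nbytes)                           # device buffer
    h_buffers.append(h_buf)
    d_buffers.append(d_buf)
    generic_bindings.append(int(d_buf))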
# Create a stream in which to copy inputs/outputs and run inference.
stream = cuda.Stream()
print(np.ascontiguousarray(np.array(imu[0])).shape)
with engine.create_execution_context() as context:
    # Transfer input data to the GPU.
    cuda.memcpy_htod_async(d_input, np.ascontiguousarray(np.array(imu)), stream)
    # Run inference.
    context.execute_async(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    cuda.memcpy_dtoh_async(h_output_hmap, d_output_hmap, stream)
    cuda.memcpy_dtoh_async(h_output_scale, d_output_scale, stream)
    cuda.memcpy_dtoh_async(h_output_offset, d_output_offset, stream)
    # Synchronize the stream.
    stream.synchronize()
    # Inspect the host output shapes.
    print(h_output_hmap.shape,
          h_output_scale.shape,
          h_output_offset.shape)
# print(h_output_hmap, "HMAP")
print(h_output_hmap)
# print(h_output_scale)
# print(h_output_offset)
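One thing I am not sure about: cuda.memcpy_htod_async copies raw bytes, so if imu is not already float32 the engine would read garbage. A staging step I am considering, assuming imu has shape (10, 3, 32, 32):

# Hypothetical staging: force float32 and copy through the pinned host buffer
np.copyto(h_input, np.asarray(imu, dtype=np.float32).ravel())
cuda.memcpy_htod_async(d_input, h_input, stream)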