Hello,
I'm using the code below to create a CUDA stream and run inference on an SSD MobileNet V2 320x320 model converted to TensorRT. The inference itself runs fast, but I'm seeing extreme slowness when moving data back from device to host in the d_to_h step: inference takes about 5 ms while the transfer takes about 20 ms.
Is there anything in the code I can change to improve the transfer speed, or could something else be the issue?
I'm using a Xavier and TensorRT 8.
Thanks
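For context, the standalone test below is how I would compare a raw device-to-host copy into ordinary (pageable) numpy memory against one into page-locked (pinned) memory allocated with cuda.pagelocked_empty, to see whether the copy itself or something else queued on the stream is what is slow. The 4 MB buffer size is arbitrary and not taken from my model.

# Standalone sketch: time a device-to-host copy with pageable vs. pinned host
# memory. The payload size is just an example, not the real SSD output size.
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
from time import perf_counter

nbytes = 4 * 1024 * 1024                                   # 4 MB example payload
d_buf = cuda.mem_alloc(nbytes)                             # device buffer
h_pageable = np.empty(nbytes, dtype=np.uint8)              # ordinary (pageable) numpy memory
h_pinned = cuda.pagelocked_empty(nbytes, dtype=np.uint8)   # page-locked (pinned) host memory
stream = cuda.Stream()

for name, host in (("pageable", h_pageable), ("pinned", h_pinned)):
    for rep in range(3):                                   # first rep may include warm-up overhead
        t0 = perf_counter()
        cuda.memcpy_dtoh_async(host, d_buf, stream)
        stream.synchronize()                               # make the timing honest
        print(name, "rep", rep, "d_to_h:", perf_counter() - t0)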
import os
import sys
import time
from time import sleep
import ctypes
import argparse
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import threading
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Process, Queue, Manager
import multiprocessing
import cv2
class TensorRTInfer:
    """
    Implements inference for the Model TensorRT engine.
    """

    def __init__(self, engine):
        """
        :param engine: The deserialized TensorRT engine to run inference with.
        """
        # Load TRT engine
        self.cfx = cuda.Device(0).make_context()
        self.stream = cuda.Stream()
        self.engine = engine
        self.context = self.engine.create_execution_context()

        # Setup I/O bindings
        self.inputs1 = []
        self.outputs1 = []
        self.allocations1 = []
        for i in range(self.engine.num_bindings):
            name = self.engine.get_binding_name(i)
            dtype = self.engine.get_binding_dtype(i)
            shape = self.engine.get_binding_shape(i)
            size = np.dtype(trt.nptype(dtype)).itemsize
            for s in shape:
                size *= s
            allocation1 = cuda.mem_alloc(size)
            binding1 = {
                'index': i,
                'name': name,
                'dtype': np.dtype(trt.nptype(dtype)),
                'shape': list(shape),
                'allocation': allocation1,
            }
            self.allocations1.append(allocation1)
            if self.engine.binding_is_input(i):
                self.inputs1.append(binding1)
            else:
                self.outputs1.append(binding1)

        # Host-side output buffers, one per output binding
        self.outputs2 = []
        for shape, dtype in self.output_spec():
            self.outputs2.append(np.zeros(shape, dtype))
        print("done building..")

    def input_spec(self):
        """
        Get the specs for the input tensor of the network. Useful to prepare memory allocations.
        :return: Two items, the shape of the input tensor and its (numpy) datatype.
        """
        return self.inputs1[0]['shape'], self.inputs1[0]['dtype']

    def output_spec(self):
        """
        Get the specs for the output tensors of the network. Useful to prepare memory allocations.
        :return: A list with two items per element, the shape and (numpy) datatype of each output tensor.
        """
        specs = []
        for o in self.outputs1:
            specs.append((o['shape'], o['dtype']))
        return specs

    def h_to_d(self, batch):
        # Copy the input batch to the device on this instance's stream
        self.batch = batch
        cuda.memcpy_htod_async(self.inputs1[0]['allocation'], np.ascontiguousarray(batch), self.stream)

    def destroy(self):
        self.cfx.pop()

    def d_to_h(self):
        # Copy every output binding back to its matching host buffer
        for o in range(len(self.outputs2)):
            cuda.memcpy_dtoh_async(self.outputs2[o], self.outputs1[o]['allocation'], self.stream)
        return self.outputs2

    def infer_this(self):
        self.cfx.push()
        self.context.execute_async(batch_size=1, bindings=self.allocations1, stream_handle=self.stream.handle)
        self.cfx.pop()
if __name__ == '__main__':
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, namespace="")
    with open('/home/zenith/Desktop/model1_16.trt', "rb") as f, trt.Runtime(logger) as runtime:
        engine1 = runtime.deserialize_cuda_engine(f.read())

    mat1 = cv2.imread('/home/zenith/Desktop/tf16/img108.jpg')
    stretch_near1 = cv2.resize(mat1, (640, 640))
    _image1 = np.expand_dims(stretch_near1, axis=0).astype(np.float32)
    images = np.random.rand(1, 640, 640, 3).astype(np.float32)

    trt_infer_big1 = TensorRTInfer(engine1)

    for n in range(100):
        tic = time.perf_counter()

        tiic = time.perf_counter()
        trt_infer_big1.h_to_d(_image1)
        tooc = time.perf_counter()
        print("h_to_d:" + str(tooc - tiic))

        act1 = time.perf_counter()
        trt_infer_big1.infer_this()
        act2 = time.perf_counter()
        print("inference:" + str(act2 - act1))

        teec = time.perf_counter()
        trt_infer_big1.d_to_h()
        toec = time.perf_counter()
        print("d_to_h:" + str(toec - teec))

        toc = time.perf_counter()
        print("whole time:" + str(toc - tic))
        sleep(0.05)
In the above for loop I'm trying to follow the CUDA concurrency pattern, which should reduce the time considerably compared with a purely sequential approach.
You will notice that d_to_h takes the largest amount of time in the loop, while the inference itself takes very little.
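One thing I'm unsure about with these numbers: as far as I understand, execute_async() only enqueues work on the stream and returns immediately, while memcpy_dtoh_async() into an ordinary (pageable) numpy array effectively blocks until the copy, and everything queued before it on the stream, has finished. So the d_to_h timing may be absorbing the wait for the inference itself. Below is a sketch of a variant that uses page-locked output buffers and an explicit synchronize; the helper name _build_pinned_outputs is made up, these are meant as drop-in methods for the TensorRTInfer class above, and I have not verified the effect on Xavier.

# Hedged sketch (not verified): same d_to_h step, but with pinned host buffers
# and an explicit synchronize, so the measured time covers only the copies
# rather than the wait for everything queued earlier on the stream.

def _build_pinned_outputs(self):
    # replaces the np.zeros() host buffers created in __init__
    self.outputs2 = [
        cuda.pagelocked_empty(shape, dtype) for shape, dtype in self.output_spec()
    ]

def d_to_h(self):
    for o in range(len(self.outputs2)):
        cuda.memcpy_dtoh_async(self.outputs2[o], self.outputs1[o]['allocation'], self.stream)
    self.stream.synchronize()   # wait for the inference + copies queued on self.stream
    return self.outputs2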
Thanks for sharing. However, it still goes through the samples by repeating the full cycle for each one, instead of processing them as a pipeline. The inference takes much less time than moving data between host and device.
That is, the sample code does the following: copy sample1's input from host to device, run inference, copy sample1's output from device to host; then copy sample2's input from host to device, run inference, copy sample2's output from device to host.
Is there a way to do this in a pipelined style, i.e. copy sample1's output from device to host while simultaneously copying sample2's input from host to device? Thanks!
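To make the question concrete, below is a rough sketch of the kind of double-buffered pipeline I have in mind (illustrative only, not tested on the Xavier): two slots, each with its own stream, execution context, device buffers and page-locked host buffers, so the output copy of one sample can overlap with the input copy and inference of the next. The names PipelineSlot and run_pipelined are made up, it assumes an explicit-batch engine with static shapes (hence execute_async_v2), and it leaves out the pycuda context push/pop used in the class above.

# Hedged sketch of a double-buffered pipeline: H2D, compute and D2H of
# consecutive samples are enqueued on two independent streams so they can overlap.
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit


class PipelineSlot:
    """One pipeline stage: its own stream, execution context and buffers."""

    def __init__(self, engine):
        self.stream = cuda.Stream()
        self.context = engine.create_execution_context()
        self.bindings = []
        self.outputs = []   # (device buffer, pinned host buffer) pairs
        for i in range(engine.num_bindings):
            dtype = np.dtype(trt.nptype(engine.get_binding_dtype(i)))
            shape = tuple(engine.get_binding_shape(i))
            dev = cuda.mem_alloc(int(dtype.itemsize * np.prod(shape)))
            self.bindings.append(int(dev))
            if engine.binding_is_input(i):
                self.input_dev = dev
            else:
                # pinned host memory so the async D2H copy can really overlap
                self.outputs.append((dev, cuda.pagelocked_empty(shape, dtype)))

    def enqueue(self, batch):
        # input copy, inference and output copies all go onto this slot's stream
        cuda.memcpy_htod_async(self.input_dev, np.ascontiguousarray(batch), self.stream)
        self.context.execute_async_v2(bindings=self.bindings,
                                      stream_handle=self.stream.handle)
        for dev, host in self.outputs:
            cuda.memcpy_dtoh_async(host, dev, self.stream)

    def wait(self):
        self.stream.synchronize()
        # copy out before the pinned buffers are reused by the next sample
        return [host.copy() for _, host in self.outputs]


def run_pipelined(engine, batches):
    slots = [PipelineSlot(engine), PipelineSlot(engine)]
    results = []
    for n, batch in enumerate(batches):
        if n >= 2:
            results.append(slots[n % 2].wait())  # finish sample n-2 before reusing its slot
        slots[n % 2].enqueue(batch)              # overlaps with the other slot's work
    for n in range(max(0, len(batches) - 2), len(batches)):
        results.append(slots[n % 2].wait())      # drain the last in-flight samples
    return results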