-
Notifications
You must be signed in to change notification settings - Fork 2
/
trtmodel.py
102 lines (85 loc) · 3.57 KB
/
trtmodel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import cv2
import numpy as np
import os
import csv
import copy
import argparse
from glob import glob
tags = None
include_characters = False
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt
class HostDeviceMem(object):
def __init__(self, host_mem, device_mem):
self.host = host_mem
self.device = device_mem
def __str__(self):
return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
def __repr__(self):
return self.__str__()
class TrtTagger:
def __init__(self, engine_path, max_batch_size=1):
self.engine_path = engine_path
self.dtype = np.float32
self.logger = trt.Logger(trt.Logger.ERROR)
self.runtime = trt.Runtime(self.logger)
self.engine = self.load_engine(self.runtime, self.engine_path)
shape = self.engine.get_binding_shape(0)
self.batch_size = shape[0]
self.height = shape[1]
self.width = shape[2]
self.max_batch_size = 1
self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers()
self.context = self.engine.create_execution_context()
@staticmethod
def load_engine(trt_runtime, engine_path):
trt.init_libnvinfer_plugins(None, "")
with open(engine_path, 'rb') as f:
engine_data = f.read()
engine = trt_runtime.deserialize_cuda_engine(engine_data)
return engine
def allocate_buffers(self):
inputs = []
outputs = []
bindings = []
stream = cuda.Stream()
for binding in self.engine:
size = trt.volume(self.engine.get_binding_shape(binding)) * self.max_batch_size
host_mem = cuda.pagelocked_empty(size, self.dtype)
device_mem = cuda.mem_alloc(host_mem.nbytes)
bindings.append(int(device_mem))
if self.engine.binding_is_input(binding):
inputs.append(HostDeviceMem(host_mem, device_mem))
else:
outputs.append(HostDeviceMem(host_mem, device_mem))
return inputs, outputs, bindings, stream
def __call__(self, pre_processed_img_list):
post_processed_img_list = []
for img in pre_processed_img_list:
h, w, c = img.shape
mx = max(h, w)
scale_x = self.width / mx
scale_y = self.height / mx
scale = min(scale_y, scale_y)
canvas = np.ones((self.height, self.width, 3), dtype=np.uint8) * 255
new_h = int(scale * h)
new_w = int(scale * w)
canvas[0:new_h, 0:new_w] = cv2.resize(img, (new_w, new_h))
shift_y = int((self.height - new_h) / 2)
shift_x = int((self.width - new_w) / 2)
canvas = np.roll(canvas, shift_y, axis=0)
canvas = np.roll(canvas, shift_x, axis=1)
post_processed_img_list += [canvas]
x = np.array(post_processed_img_list, dtype=self.dtype)
# x = canvas.astype(self.dtype)
np.copyto(self.inputs[0].host, x.ravel())
for inp in self.inputs:
cuda.memcpy_htod_async(inp.device, inp.host, self.stream)
self.context.execute_async(batch_size=self.batch_size, bindings=self.bindings,
stream_handle=self.stream.handle)
for out in self.outputs:
cuda.memcpy_dtoh_async(out.host, out.device, self.stream)
self.stream.synchronize()
outtmp=[out.host.reshape(self.batch_size, -1) for out in self.outputs]
return outtmp[0]