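"""
run.py

Summary (inferred from the code in this file): runs YOLOv4-tiny object
detection on a video stream or webcam, hands each detected region to the
captioning pipeline via `draw_prediction`, and overlays the running FPS.
`visualize_att` additionally renders per-word attention maps for a caption,
adapted from the Show, Attend and Tell visualization code.
"""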
import argparse
import time

import cv2
import imutils
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import skimage.transform  # `import skimage` alone may not expose skimage.transform

from YOLO.utils import *
from cv2 import cuda
from pipeline import *

# Use the first CUDA device when one is available.
if cuda.getCudaEnabledDeviceCount() > 0:
    cuda.setDevice(0)

cfgfile = "YOLO/yolov4-tiny.cfg"
weightfile = "YOLO/yolov4-tiny.weights"
namesfile = "YOLO/coco.names"
class_names = load_class_names(namesfile)
def predict_video(video, expand=0.05, backend="cuda", k=5, conf=0.7, nms=0.01):
    if video == "webcam":
        video = 0
    expanding_factor = 0

    # Load the class names, box colours and the network once, before the
    # frame loop (re-reading them on every frame was a performance bug).
    with open(namesfile, 'r') as f:
        classes = [line.strip() for line in f.readlines()]
    colors = np.random.uniform(0, 255, size=(len(classes), 3))
    net = cv2.dnn.readNet(weightfile, cfgfile)
    if backend == "cuda":
        net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
        net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)
    else:
        net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
        net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)

    cv2.namedWindow("window")
    cap = cv2.VideoCapture(video)
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:  # end of stream or failed read
            break
        start_time = time.time()

        # Round-trip the frame through GPU memory. No GPU processing is
        # applied here; this only exercises the CUDA upload/download path.
        gpu_frame = cv2.cuda_GpuMat(frame)
        stream = cv2.cuda_Stream()
        gpu_frame.upload(frame, stream)
        frame = gpu_frame.download(stream)

        # imutils.resize preserves aspect ratio; width takes precedence
        # over height when both are given.
        frame = imutils.resize(frame, width=1000, height=480)
        width = frame.shape[1]
        height = frame.shape[0]

        blob = cv2.dnn.blobFromImage(frame, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
        net.setInput(blob)
        outs = net.forward(get_output_layers(net))

        class_ids = []
        confidences = []
        boxes = []
        conf_threshold = conf
        nms_threshold = nms
        for out in outs:
            for detection in out:
                scores = detection[5:]
                class_id = np.argmax(scores)
                confidence = scores[class_id]
                # Coarse pre-filter; the final filtering happens in NMSBoxes
                # using the user-supplied conf threshold.
                if confidence > 0.5:
                    center_x = int(detection[0] * width)
                    center_y = int(detection[1] * height)
                    w = int(detection[2] * width)
                    h = int(detection[3] * height)
                    x = center_x - w / 2
                    y = center_y - h / 2
                    class_ids.append(class_id)
                    confidences.append(float(confidence))
                    boxes.append([x, y, w, h])

        indices = cv2.dnn.NMSBoxes(boxes, confidences, conf_threshold, nms_threshold)
        # NMSBoxes returns an (N, 1) array on older OpenCV builds and a flat
        # array on newer ones; flatten so both iterate over scalar indices.
        indices = np.array(indices).flatten() if len(indices) > 0 else []
        for i in indices:
            x, y, w, h = boxes[i]
            try:
                sentence, word_list, label = draw_prediction(frame, class_ids[i],
                                                             int(x - expanding_factor * x),
                                                             int(y - expanding_factor * y),
                                                             int(x + w + expanding_factor * (x + w)),
                                                             int(y + h + expanding_factor * (y + h)),
                                                             colors, classes, k)
            except TypeError:
                continue
            if not any(word == label for word in word_list):
                # only expand if within the frame boundaries
                if (x - expanding_factor * x) > 0 and (y - expanding_factor * y) > 0 and \
                        (x + w + expanding_factor * (x + w)) < width and \
                        (y + h + expanding_factor * (y + h)) < height:
                    expanding_factor += expand

        end_time = time.time()
        inference_time = end_time - start_time
        FPS = 1.0 / inference_time
        cv2.putText(frame, "FPS: {:.2f}".format(FPS), (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        cv2.imshow("window", frame)
        key = cv2.waitKey(20)
        if key > 0:  # exit by pressing any key
            cv2.destroyAllWindows()
            # Extra waitKey calls help the window actually close on some platforms.
            for i in range(1, 5):
                cv2.waitKey(1)
            break  # fall through to cap.release()
    cap.release()
def visualize_att(image, seq, alphas, rev_word_map, smooth=False):
    """
    Visualizes the caption with attention weights at every word.

    Adapted from the paper authors' repo:
    https://github.com/kelvinxu/arctic-captions/blob/master/alpha_visualization.ipynb

    :param image: image that has been captioned
    :param seq: caption (sequence of word indices)
    :param alphas: attention weights
    :param rev_word_map: reverse word mapping, i.e. ix2word
    :param smooth: whether to smooth the attention weights
    """
    # image = Image.open(image_path)
    # image = image.resize([1500, 1500], Image.LANCZOS)
    plt.figure(figsize=(100, 100))
    words = [rev_word_map[ind] for ind in seq]
    print(words)
    for t in range(len(words)):
        if t > 50:  # cap the grid at 50 words
            break
        plt.subplot(int(np.ceil(len(words) / 5)), 5, t + 1)
        plt.text(0, 1, '%s' % (words[t]), color='black', backgroundcolor='white', fontsize=12)
        plt.imshow(image)
        current_alpha = alphas[t, :]
        if smooth:
            alpha = skimage.transform.pyramid_expand(current_alpha.numpy(), upscale=24, sigma=8)
        else:
            alpha = skimage.transform.resize(current_alpha.numpy(), [np.shape(image)[0], np.shape(image)[1]])
        # Show the first word (<start>) without an attention overlay.
        if t == 0:
            plt.imshow(alpha, alpha=0)
        else:
            plt.imshow(alpha, alpha=0.8)
        plt.set_cmap(cm.Greys_r)
        plt.axis('off')
    plt.show()
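# Hypothetical usage sketch for visualize_att (the `captioner`, `image_tensor`
# and `rev_word_map` names are assumptions for illustration, not part of this
# file):
#   seq, alphas = captioner.caption(image_tensor, beam_size=5)
#   visualize_att(image, seq, alphas, rev_word_map, smooth=True)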
if __name__ == '__main__':
    print("Inference on: ", cuda.getCudaEnabledDeviceCount(), "GPU(s)")
    print("Device: ", device)
    parser = argparse.ArgumentParser()
    parser.add_argument("--video", type=str, default="videos/cow.mp4", help="path to video file")
    parser.add_argument("--expand", type=float, default=0.05, help="pyramid image expanding factor")
    parser.add_argument("--backend", type=str, default="cuda", help="backend to use (cuda or cpu)")
    parser.add_argument("--k", type=int, default=5, help="number of hypotheses held in the beam")
    parser.add_argument("--conf", type=float, default=0.7, help="confidence threshold")
    parser.add_argument("--nms", type=float, default=0.01, help="nms threshold")
    args = parser.parse_args()
    predict_video(args.video, args.expand, args.backend, args.k, args.conf, args.nms)
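# Example invocations (assuming the YOLOv4-tiny weights under YOLO/ and a
# test video are present):
#   python run.py --video videos/cow.mp4 --backend cuda
#   python run.py --video webcam --backend cpu --conf 0.5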