-
Notifications
You must be signed in to change notification settings - Fork 32
/
tracker.m
174 lines (143 loc) · 6.13 KB
/
tracker.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
function [positions, time] = tracker(video_path, img_files, pos, target_sz, ...
    padding, kernel, lambda, output_sigma_factor, interp_factor, cell_size, ...
    features, show_visualization)
%TRACKER Kernelized/Dual Correlation Filter (KCF/DCF) tracking.
%   This function implements the pipeline for tracking with the KCF (by
%   choosing a non-linear kernel) and DCF (by choosing a linear kernel).
%
%   It is meant to be called by the interface function RUN_TRACKER, which
%   sets up the parameters and loads the video information.
%
%   Parameters:
%     VIDEO_PATH is the location of the image files (must end with a slash
%      '/' or '\').
%     IMG_FILES is a cell array of image file names.
%     POS and TARGET_SZ are the initial position and size of the target
%      (both in format [rows, columns]).
%     PADDING is the additional tracked region, for context, relative to
%      the target size.
%     KERNEL is a struct describing the kernel. The field TYPE must be one
%      of 'gaussian', 'polynomial' or 'linear'. The optional fields SIGMA,
%      POLY_A and POLY_B are the parameters for the Gaussian and Polynomial
%      kernels. Any other TYPE raises the error 'tracker:unknownKernel'.
%     OUTPUT_SIGMA_FACTOR is the spatial bandwidth of the regression
%      target, relative to the target size.
%     INTERP_FACTOR is the adaptation rate of the tracker.
%     CELL_SIZE is the number of pixels per cell (must be 1 if using raw
%      pixels).
%     FEATURES is a struct describing the used features (see GET_FEATURES).
%      If it has a field DEEP that is true, multi-channel frames are passed
%      on unconverted and the label map size is rounded up; otherwise
%      frames are converted to grayscale and the size is rounded down.
%     SHOW_VISUALIZATION will show an interactive video if set to true.
%
%   Outputs:
%     POSITIONS is an Nx2 matrix of target positions over time (in the
%      format [rows, columns]). If the user stops the visualization early,
%      rows of frames that were not processed remain zero.
%     TIME is the tracker execution time, without video loading/rendering.
%
%   Joao F. Henriques, 2014

    %deep features are enabled only when the field exists AND is true.
    %evaluated once so that the label size and the colour handling below
    %always agree (a struct with deep == false behaves like one without
    %the field).
    use_deep = isfield(features, 'deep') && features.deep;

    %if the target is large, lower the resolution, we don't need that much
    %detail
    resize_image = (sqrt(prod(target_sz)) >= 100);  %geometric mean of the target sides >= threshold
    if resize_image
        pos = floor(pos / 2);
        target_sz = floor(target_sz / 2);
    end

    %window size, taking padding into account
    window_sz = floor(target_sz * (1 + padding));

    % %we could choose a size that is a power of two, for better FFT
    % %performance. in practice it is slower, due to the larger window size.
    % window_sz = 2 .^ nextpow2(window_sz);

    %create regression labels, gaussian shaped, with a bandwidth
    %proportional to target size
    output_sigma = sqrt(prod(target_sz)) * output_sigma_factor / cell_size;
    if use_deep
        %NOTE(review): rounding up presumably matches the spatial size of
        %the deep feature map produced by GET_FEATURES - confirm there.
        label_sz = ceil(window_sz / cell_size);
    else
        label_sz = floor(window_sz / cell_size);
    end
    yf = fft2(gaussian_shaped_labels(output_sigma, label_sz));

    %store pre-computed cosine window (same size as the label map)
    cos_window = hann(size(yf,1)) * hann(size(yf,2))';

    if show_visualization  %create video interface
        update_visualization = show_video(img_files, video_path, resize_image);
    end

    %note: variables ending with 'f' are in the Fourier domain.

    time = 0;  %to calculate FPS
    positions = zeros(numel(img_files), 2);  %to calculate precision

    for frame = 1:numel(img_files)
        %load image (not included in the measured time)
        im = imread([video_path img_files{frame}]);
        if ~use_deep && size(im,3) > 1
            im = rgb2gray(im);
        end
        if resize_image
            im = imresize(im, 0.5);
        end

        tic()

        if frame > 1
            %obtain a subwindow for detection at the position from last
            %frame, and convert to Fourier domain (its size is unchanged)
            patch = get_subwindow(im, pos, window_sz);
            zf = fft2(get_features(patch, features, cell_size, cos_window));

            %calculate response of the classifier at all shifts
            kzf = kernel_correlation(kernel, zf, model_xf);
            response = real(ifft2(model_alphaf .* kzf));  %equation for fast detection

            %target location is at the maximum response. we must take into
            %account the fact that, if the target doesn't move, the peak
            %will appear at the top-left corner, not at the center (this is
            %discussed in the paper). the responses wrap around cyclically.
            [vert_delta, horiz_delta] = find(response == max(response(:)), 1);
            if vert_delta > size(zf,1) / 2  %wrap around to negative half-space of vertical axis
                vert_delta = vert_delta - size(zf,1);
            end
            if horiz_delta > size(zf,2) / 2  %same for horizontal axis
                horiz_delta = horiz_delta - size(zf,2);
            end
            %indices are 1-based, so a peak at (1,1) means "no motion"
            pos = pos + cell_size * [vert_delta - 1, horiz_delta - 1];
        end

        %obtain a subwindow for training at newly estimated target position
        patch = get_subwindow(im, pos, window_sz);
        xf = fft2(get_features(patch, features, cell_size, cos_window));

        %Kernel Ridge Regression, calculate alphas (in Fourier domain)
        kf = kernel_correlation(kernel, xf, xf);
        alphaf = yf ./ (kf + lambda);  %equation for fast training

        if frame == 1  %first frame, train with a single image
            model_alphaf = alphaf;
            model_xf = xf;
        else
            %subsequent frames, interpolate model
            model_alphaf = (1 - interp_factor) * model_alphaf + interp_factor * alphaf;
            model_xf = (1 - interp_factor) * model_xf + interp_factor * xf;
        end

        %save position and timing
        positions(frame,:) = pos;
        time = time + toc();

        %visualization
        if show_visualization
            %box is [x, y, width, height], hence the [2,1] index swap
            box = [pos([2,1]) - target_sz([2,1])/2, target_sz([2,1])];
            stop = update_visualization(frame, box);
            if stop, break, end  %user pressed Esc, stop early

            drawnow
            % pause(0.05)  %uncomment to run slower
        end
    end

    %positions were tracked on the half-resolution image; map them back
    if resize_image
        positions = positions * 2;
    end
end


function kf = kernel_correlation(kernel, af, bf)
%KERNEL_CORRELATION Kernel correlation of AF and BF, selected by KERNEL.TYPE.
%   Forwards the two Fourier-domain inputs, in the given order, to
%   GAUSSIAN_CORRELATION, POLYNOMIAL_CORRELATION or LINEAR_CORRELATION
%   together with the matching kernel parameters. An unsupported TYPE
%   raises 'tracker:unknownKernel' instead of leaving the output undefined.

    switch kernel.type
    case 'gaussian'
        kf = gaussian_correlation(af, bf, kernel.sigma);
    case 'polynomial'
        kf = polynomial_correlation(af, bf, kernel.poly_a, kernel.poly_b);
    case 'linear'
        kf = linear_correlation(af, bf);
    otherwise
        error('tracker:unknownKernel', ...
            'Unknown kernel type ''%s''. Expected ''gaussian'', ''polynomial'' or ''linear''.', ...
            kernel.type);
    end
end