-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsystem.py
109 lines (88 loc) · 4.26 KB
/
system.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import numpy as np
from scipy.spatial.distance import cdist
from utils import load_model
## Module-level PCA state
# Filled in by image_to_reduced_feature: computed in 'train' mode, or copied
# from the saved model the first time inference runs with these still unset.
PCA_MEAN = PCA_STD_DEV = PCA_COMPONENTS = None
## KNN classifier (default k = 5) using inverse-distance-weighted voting
class KNN:
    """k-nearest-neighbour classifier with inverse-distance weighting.

    Besides the training data, an instance carries three PCA attributes
    (``pca_mean``, ``pca_std``, ``pca_components``). The classifier never
    reads them itself; they are slots that other code in this module fills
    in so the PCA parameters travel with the model object.
    """

    def __init__(self, k=5):
        """Create an unfitted classifier that votes among ``k`` neighbours."""
        self.k = k
        self.X_train = None
        self.y_train = None
        self.pca_mean = None
        self.pca_std = None
        self.pca_components = None

    def fit(self, training_features, training_labels):
        """Store the (dimension-reduced) features and labels for prediction.

        Inputs are coerced with ``np.asarray`` so that plain lists work with
        the index-array lookup performed in ``predict``; arrays are stored
        as-is without copying.
        """
        self.X_train = np.asarray(training_features)
        self.y_train = np.asarray(training_labels)

    def predict(self, test_features):
        """Predict a label for every row of ``test_features``.

        Returns:
            A 1-D ``np.int64`` array with one predicted label per test row.

        Raises:
            ValueError: if the classifier has not been fitted or the
                training set is empty.
        """
        if self.X_train is None or self.y_train is None:
            raise ValueError("KNN.predict() called before fit()")

        # Euclidean distance between every test row and every training row
        # (Euclidean was chosen over cosine as it is easier to implement and
        # does not hurt performance much).
        distances = cdist(test_features, self.X_train, metric='euclidean')

        n_train = distances.shape[1]
        if n_train == 0:
            raise ValueError("KNN.predict() requires at least one training sample")
        # Never ask for more neighbours than there are training samples.
        k = min(self.k, n_train)

        predictions = []
        for neighbor_dist_vector in distances:
            if k < n_train:
                # Partial partition: finds the k nearest without sorting the
                # whole row. argpartition needs kth < len(row), hence the guard.
                nn_indices = np.argpartition(neighbor_dist_vector, k)[:k]
            else:
                # Every training sample is a neighbour.
                nn_indices = np.arange(n_train)
            nn_labels = self.y_train[nn_indices]
            nn_distances = neighbor_dist_vector[nn_indices]

            # Inverse-distance weights; the epsilon avoids division by zero
            # when a test point coincides with a training point.
            weights = 1.0 / (nn_distances + 1e-9)

            # Sum the weights per label and pick the label with the largest total.
            label_weights = {}
            for lbl, w in zip(nn_labels, weights):
                label_weights[lbl] = label_weights.get(lbl, 0.0) + w
            predictions.append(max(label_weights, key=label_weights.get))

        return np.array(predictions, dtype=np.int64)
# PCA for dimensionality reduction
def image_to_reduced_feature(images, mode='inference', n_components=55):
    """Standardise ``images`` and project them onto PCA components.

    Args:
        images: 2-D array-like with one sample per row. Values are cast to
            ``np.int64`` before any arithmetic.
        mode: ``'train'`` computes the per-feature mean and standard
            deviation plus the PCA components from ``images`` and stores
            them in the module-level ``PCA_MEAN`` / ``PCA_STD_DEV`` /
            ``PCA_COMPONENTS``. Any other value is treated as inference and
            reuses the stored parameters.
        n_components: maximum number of principal components kept in
            ``'train'`` mode (default 55). Ignored in inference, where the
            stored components decide the output width.

    Returns:
        The reduced feature matrix, one row per input sample.
    """
    global PCA_MEAN, PCA_STD_DEV, PCA_COMPONENTS

    # Cast to int64 to prevent under/overflow in narrow integer image dtypes.
    images = np.asarray(images).astype(np.int64)

    if mode == 'train':
        # Per-feature mean and sample standard deviation of the training set.
        PCA_MEAN = np.mean(images, axis=0)
        PCA_STD_DEV = np.std(images, axis=0, ddof=1)
        # Guard against division by zero for constant features.
        # NOTE(review): such a small epsilon magnifies any deviation seen at
        # inference in a feature that was constant during training; consider
        # using 1.0 instead.
        PCA_STD_DEV[PCA_STD_DEV == 0] = 1e-9

        # SVD of the standardised data yields the principal directions
        # (used instead of forming the covariance matrix, for efficiency).
        stdised_data = (images - PCA_MEAN) / PCA_STD_DEV
        _, _, right_singular_vects = np.linalg.svd(stdised_data, full_matrices=False)

        # Keep the leading components and project the training data onto them.
        PCA_COMPONENTS = right_singular_vects[:n_components]
        return np.dot(stdised_data, PCA_COMPONENTS.T)

    # Inference: if the parameters are not in memory yet, take them from the
    # saved model.
    if PCA_MEAN is None or PCA_STD_DEV is None or PCA_COMPONENTS is None:
        existing_model = load_model('trained_model.pkl')
        PCA_MEAN = existing_model.pca_mean
        PCA_STD_DEV = existing_model.pca_std
        PCA_COMPONENTS = existing_model.pca_components

    # Standardise with the stored statistics, then project.
    stdised_data = (images - PCA_MEAN) / PCA_STD_DEV
    return np.dot(stdised_data, PCA_COMPONENTS.T)
# Builds the KNN classifier, fits it, and attaches the current PCA params
def training_model(feature_vectors, labels):
    """Return a fitted 5-NN classifier that also carries the PCA parameters.

    The module-level PCA_MEAN / PCA_STD_DEV / PCA_COMPONENTS are copied onto
    the classifier so they are stored together with it (the inference path
    reads them back from the model loaded from trained_model.pkl).
    """
    classifier = KNN(k=5)

    # Fit on the PCA-reduced features.
    classifier.fit(feature_vectors, labels)

    # Attach whatever PCA parameters are currently held at module level.
    classifier.pca_mean, classifier.pca_std, classifier.pca_components = (
        PCA_MEAN,
        PCA_STD_DEV,
        PCA_COMPONENTS,
    )
    return classifier