evaluate_clustering.py
# built-in imports
import argparse
import os
import sys
import json
from copy import deepcopy

# standard imports
import numpy as np
import pandas as pd

# local imports
from source.constants import RANDOM_SEED
from source.constants import FEATURE_VECTORS_SAVE_DIR, ANNOTATIONS_SAVE_DIR
from source.constants import ALL_CANCER_TYPES, ALL_IMG_NORMS, ALL_EXTRACTOR_MODELS, ALL_DIMENSIONALITY_REDUCTION_METHODS, ALL_CLUSTERING_ALGORITHMS, ALL_DISTANCE_METRICS
from source.eval_utils import reduce_feature_dimensionality, get_clustering_labels, compute_connectivity_matrix, compute_clustering_metrics, precision_at_1, precision_at_k


def get_imgpaths_2_intids(ids_2_imgpaths):
    assert len(set(ids_2_imgpaths.values())) == len(list(ids_2_imgpaths.values())), \
        "Can only reverse a bijective mapping, duplicate values found."
    imgpaths_2_intids = {v: int(k) for k, v in ids_2_imgpaths.items()}
    return imgpaths_2_intids
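
# Illustrative example (hypothetical paths): {'0': 'imgs/a.png', '1': 'imgs/b.png'}
# is reversed to {'imgs/a.png': 0, 'imgs/b.png': 1}.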


def extract_connectivity_vector(connectivity_matrix):
    """
    Extract a 1D connectivity vector of shape (n*(n-1)/2,) from a 2D connectivity matrix of shape (n, n).

    Since the matrix is symmetric and its diagonal is trivial, it only makes sense to consider
    the upper triangular part excluding the diagonal (k=1); these elements are flattened into a 1D array.

    Parameters:
        connectivity_matrix (array-like): 2D connectivity matrix of shape (n, n)

    Returns:
        array-like: 1D connectivity vector of shape (n*(n-1)/2,)
    """
    connectivity_vector = connectivity_matrix[np.triu_indices(connectivity_matrix.shape[0], k=1)]
    return connectivity_vector
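
# Illustrative example: for the 3x3 connectivity matrix [[1, 1, 0], [1, 1, 0], [0, 0, 1]],
# np.triu_indices(3, k=1) selects entries (0, 1), (0, 2) and (1, 2),
# so the extracted connectivity vector is [1, 0, 0].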


def get_true_connectivity(
    manual_annotations_dir,
    ids_2_imgpaths,
):
    imgpaths_2_intids = get_imgpaths_2_intids(ids_2_imgpaths=ids_2_imgpaths)

    # the true clustering was saved as JSON in a dictionary format: {'0': ['pathA', 'pathB'], '1': ['pathC', 'pathD']}
    with open(f"{manual_annotations_dir}/final_clusters.json", "r") as f:
        annotated_clusters_dict_str = json.load(f)
    # replace string keys with int keys
    annotated_clusters_dict = {int(k): deepcopy(v) for k, v in annotated_clusters_dict_str.items()}
    num_true_clusters = len(annotated_clusters_dict)
    num_total_images = sum(len(cluster) for cluster in annotated_clusters_dict.values())

    # compute the cluster labels vector, with images ordered the same way as in the features array
    true_cluster_labels = -1 * np.ones(num_total_images, dtype=int)
    for cluster_id, img_paths in annotated_clusters_dict.items():
        for img_path in img_paths:
            true_cluster_labels[imgpaths_2_intids[img_path]] = cluster_id
    assert np.all(true_cluster_labels != -1)

    # connectivity_matrix[i, j] is 1 if samples (i) and (j) are in the same cluster, 0 if in different clusters
    true_connectivity_matrix = compute_connectivity_matrix(
        clusters_dict=annotated_clusters_dict, imgpaths_2_intids=imgpaths_2_intids)
    true_connectivity_vector = extract_connectivity_vector(true_connectivity_matrix)

    return true_connectivity_matrix, true_connectivity_vector, true_cluster_labels, num_true_clusters
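
# Illustrative example: with the dictionary {'0': ['pathA', 'pathB'], '1': ['pathC', 'pathD']}
# and image ids pathA -> 0, ..., pathD -> 3, this yields true_cluster_labels = [0, 0, 1, 1]
# and a block-diagonal connectivity matrix whose upper triangle flattens to [1, 0, 0, 0, 0, 1].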


def get_predicted_connectivity(
    predicted_cluster_labels,
    ids_2_imgpaths,
):
    imgpaths_2_intids = get_imgpaths_2_intids(ids_2_imgpaths=ids_2_imgpaths)

    # compute a clusters dict in the same format as annotated_clusters_dict
    predicted_clusters_dict = {}
    for i, predicted_cluster_label in enumerate(predicted_cluster_labels):
        if predicted_cluster_label not in predicted_clusters_dict:
            predicted_clusters_dict[predicted_cluster_label] = []
        # append in any case
        predicted_clusters_dict[predicted_cluster_label].append(ids_2_imgpaths[str(i)])

    # connectivity_matrix[i, j] is 1 if samples (i) and (j) are in the same cluster, 0 if in different clusters
    predicted_connectivity_matrix = compute_connectivity_matrix(
        clusters_dict=predicted_clusters_dict, imgpaths_2_intids=imgpaths_2_intids)
    predicted_connectivity_vector = extract_connectivity_vector(predicted_connectivity_matrix)

    return predicted_connectivity_vector
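
# Illustrative example (hypothetical paths): predicted labels [0, 1, 0] with ids_2_imgpaths
# {'0': 'a.png', '1': 'b.png', '2': 'c.png'} give
# predicted_clusters_dict = {0: ['a.png', 'c.png'], 1: ['b.png']}.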


# TODO: split into separate clustering and evaluation steps - to be reused here, in 3-evaluation.ipynb, and in 2-clustering-interactive.ipynb
def evaluate_clustering(
    features_save_dir,
    manual_annotations_dir,
    dimensionality_reduction,
    clustering,
    distance_metric,
    verbose,
):
    print("Features path:", features_save_dir)
    print("Dimensionality reduction:", dimensionality_reduction)
    print("Clustering:", clustering)
    print("Distance metric:", distance_metric)
    print()

    features_npy_path = f'{features_save_dir}/features.npy'
    ids_2_imgpaths_json_path = f'{features_save_dir}/ids_2_img_paths.json'
    assert os.path.isfile(features_npy_path), f"File does not exist: \n\t{features_npy_path}"
    assert os.path.isfile(ids_2_imgpaths_json_path), f"File does not exist: \n\t{ids_2_imgpaths_json_path}"

    # load features
    features = np.load(features_npy_path)
    if distance_metric == 'cosine':
        # after L2 normalisation, euclidean distance is equivalent to cosine distance;
        # KMeans does not support cosine distance, so we can't just pass distance_metric to KMeans as a parameter
        features = features / np.linalg.norm(features, axis=1, keepdims=True)
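        # (for unit vectors, ||a - b||^2 = 2 - 2*cos(a, b), so euclidean distances on the
        # normalised features are a monotone function of cosine distances)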

    with open(ids_2_imgpaths_json_path, 'r') as f:
        ids_2_imgpaths = json.load(f)
    # check that the values are unique - this makes the mapping bijective
    assert len(set(ids_2_imgpaths.values())) == len(ids_2_imgpaths.values())

    # true connectivity info
    true_connectivity_matrix, true_connectivity_vector, true_cluster_labels, num_true_clusters = get_true_connectivity(
        manual_annotations_dir=manual_annotations_dir,
        ids_2_imgpaths=ids_2_imgpaths,
    )

    # dimensionality reduction
    features_reduced = reduce_feature_dimensionality(features=features, method=dimensionality_reduction)

    # compute precision@1 and precision@5; use euclidean distance - features were already normalised above
    precision_at_1_value = precision_at_1(features_reduced, true_connectivity_matrix, metric='euclidean')
    precision_at_5_value = precision_at_k(features_reduced, true_connectivity_matrix, k=5, metric='euclidean')

    # clustering
    predicted_cluster_labels = get_clustering_labels(
        features=features_reduced,
        n_clusters=num_true_clusters,
        method=clustering,
        random_state=RANDOM_SEED,
    )

    # predicted connectivity info
    predicted_connectivity_vector = get_predicted_connectivity(
        predicted_cluster_labels=predicted_cluster_labels,
        ids_2_imgpaths=ids_2_imgpaths,
    )
    assert predicted_connectivity_vector.shape == true_connectivity_vector.shape

    # compute metrics
    metrics = compute_clustering_metrics(
        true_connectivity_vector, predicted_connectivity_vector, true_cluster_labels, predicted_cluster_labels)
    metrics['precision@1'] = precision_at_1_value
    metrics['precision@5'] = precision_at_5_value

    if verbose:
        for metric, value in metrics.items():
            if isinstance(value, int):
                print(f"{metric}: {value}")
            elif isinstance(value, float):
                print(f"{metric}: {value:.4f}")
            else:
                print(f"{metric}:\n {value}")
        print()

    return metrics
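
# Illustrative stand-alone call (the directory values and the 'pca'/'kmeans' method names are
# hypothetical - the actual choices come from source.constants):
#   metrics = evaluate_clustering(
#       features_save_dir='feature_vectors/lung_aca/UNI/resize_only',
#       manual_annotations_dir='annotations/lung_aca/UNI/resize_only',
#       dimensionality_reduction='pca',
#       clustering='kmeans',
#       distance_metric='cosine',
#       verbose=True,
#   )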


def main():
    parser = argparse.ArgumentParser(
        description='Evaluate the extractor-reduction-clustering pipeline.')
    parser.add_argument('--cancer_type', type=str,
                        default='lung_aca',
                        choices=ALL_CANCER_TYPES,
                        help='Cancer type name.')
    parser.add_argument('--extractor_name', type=str,
                        default='all',
                        choices=list(ALL_EXTRACTOR_MODELS) + ['all'],
                        help='Feature extractor name.')
    parser.add_argument('--img_norm', type=str,
                        default='all',
                        choices=list(ALL_IMG_NORMS) + ['all'],
                        help='Image normalisation name.')
    parser.add_argument('--distance_metric', type=str,
                        default='all',
                        choices=list(ALL_DISTANCE_METRICS) + ['all'],
                        help='Distance metric to use for clustering evaluation.')
    parser.add_argument('--dimensionality_reduction', type=str,
                        default='all',
                        choices=list(ALL_DIMENSIONALITY_REDUCTION_METHODS) + ['all'],
                        help='Dimensionality reduction algorithm name.')
    parser.add_argument('--clustering', type=str,
                        default='all',
                        choices=list(ALL_CLUSTERING_ALGORITHMS) + ['all'],
                        help='Clustering algorithm name.')
    parser.add_argument('--manual_annotations_dir', type=str,
                        default=None,
                        help='Directory path with manual annotations.')
    parser.add_argument('--overwrite', action='store_true',
                        help='Overwrite existing results.')
    parser.add_argument('--verbose', action='store_true',
                        help='Print metrics to stdout.')
    args = parser.parse_args()

    print('\n', "-" * 48, '\n')
    print("Arguments passed to the script:")
    print("\nargs - raw:")
    print(sys.argv)
    print("\nargs:")
    print(args)
    print('\n', "-" * 48, '\n')

    if args.manual_annotations_dir is not None:
        assert os.path.isdir(args.manual_annotations_dir)
        assert f'/{args.cancer_type}/' in args.manual_annotations_dir, \
            f"--cancer_type={args.cancer_type} must match --manual_annotations_dir={args.manual_annotations_dir}"
        manual_annotations_dir = args.manual_annotations_dir
    else:
        manual_annotations_dir = f'{ANNOTATIONS_SAVE_DIR}/{args.cancer_type}/UNI/resize_only'
        assert os.path.isdir(manual_annotations_dir)

    # evaluate all combinations of distance metrics, dimensionality reductions and clustering algorithms
    distance_metrics = [args.distance_metric] if args.distance_metric != 'all' else ALL_DISTANCE_METRICS
    dimensionality_reductions = [args.dimensionality_reduction] if args.dimensionality_reduction != 'all' else ALL_DIMENSIONALITY_REDUCTION_METHODS
    clusterings = [args.clustering] if args.clustering != 'all' else ALL_CLUSTERING_ALGORITHMS

    all_features_save_dir = f"{FEATURE_VECTORS_SAVE_DIR}/{args.cancer_type}"
    # use directories with precomputed features instead of ALL_EXTRACTOR_MODELS - features are not computed here
    extractor_names = [args.extractor_name] if args.extractor_name != 'all' else os.listdir(all_features_save_dir)
    # img_norms is defined later, once the extractor_name is known

    eval_results_dir = 'eval_results'
    # the same file is reused for a given cancer type, so results accumulate across runs
    filename_base = f"cancer_type={args.cancer_type}#extractor_name=all#img_norm=all#distance_metric=all#dimensionality_reduction=all#clustering=all"

    # load existing results - initialise with an empty dictionary if the file does not exist
    try:
        with open(f"{eval_results_dir}/{filename_base}.json", 'r') as f:
            all_metrics = json.load(f)
        print(f"Will append to existing file {eval_results_dir}/{filename_base}.json", end='\n\n')
    except FileNotFoundError:
        print(f"File {eval_results_dir}/{filename_base}.json not found, initializing with empty dictionary.", end='\n\n')
        all_metrics = {}

    for extractor_name in extractor_names:
        img_norms = [args.img_norm] if args.img_norm != 'all' else os.listdir(f"{all_features_save_dir}/{extractor_name}")
        for img_norm in img_norms:
            features_save_dir = f"{all_features_save_dir}/{extractor_name}/{img_norm}"
            for distance_metric in distance_metrics:
                for dimensionality_reduction in dimensionality_reductions:
                    for clustering in clusterings:
                        combo_key = f"{extractor_name}#{img_norm}#{distance_metric}#{dimensionality_reduction}#{clustering}"
                        if (combo_key in all_metrics) and (not args.overwrite):
                            print(f"Skipping {combo_key} - already computed")
                            continue
                        print(f"\nComputing {combo_key} ...\n")
                        current_metrics = evaluate_clustering(
                            features_save_dir=features_save_dir,
                            manual_annotations_dir=manual_annotations_dir,
                            dimensionality_reduction=dimensionality_reduction,
                            clustering=clustering,
                            distance_metric=distance_metric,
                            verbose=args.verbose,
                        )
                        all_metrics[combo_key] = current_metrics

                        # save intermediate results to a JSON file - the file is rewritten after every combination
                        with open(f"{eval_results_dir}/{filename_base}.json", 'w') as f:
                            json.dump(all_metrics, f, indent=4)

    # save final results to a CSV file
    df = pd.DataFrame(all_metrics).T
    df.to_csv(f"{eval_results_dir}/{filename_base}.csv")


if __name__ == "__main__":
    main()