-
Notifications
You must be signed in to change notification settings - Fork 2
/
split_generator.py
113 lines (97 loc) · 4.54 KB
/
split_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# Copyright 2022 CRS4 (http://www.crs4.it/)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from crs4.cassandra_utils._split_generator import split_generator
class imagenet_split_generator(split_generator):
    """Create class-balanced dataset splits from the metadata table.

    The methods below read ``self._df``, ``self._metadata_id_col`` and
    ``self._metadata_label_col`` and write into ``self.split_metadata``; none
    of them is defined in this class, so they are expected to be provided by
    the ``split_generator`` base class.
    """

    def __init__(
        self,
        metadata_id_col="id",
        metadata_label_col="label",
        data_col="data",
        label_type="int",
    ):
        """Forward the column names and the label type to the base class."""
        super().__init__(
            metadata_id_col=metadata_id_col,
            metadata_label_col=metadata_label_col,
            data_col=data_col,
            label_type=label_type,
        )

    @staticmethod
    def _resolve_balance(balance, class_count):
        """Turn the ``balance`` argument into a normalized per-class weight vector."""
        num_classes = len(class_count)
        if balance is None:
            # The default used to fall through to the "unsupported type" error;
            # keeping the dataset's own class distribution is the neutral choice.
            balance = "original"

        if isinstance(balance, str):
            if balance == "random":
                weights = np.random.rand(num_classes)
            elif balance == "original":
                weights = np.asarray(class_count, dtype=float)
            else:
                raise ValueError("The legal string values are {random|original}")
        elif isinstance(balance, (list, tuple, np.ndarray)):
            weights = np.asarray(balance, dtype=float)
            if weights.shape != (num_classes,):
                raise ValueError(
                    "The balance vector size must be equal to the number of classes"
                )
        else:
            raise TypeError(
                "This method takes either a string or a list or a numpy array "
                "with the size equal to the number of classes"
            )

        total = weights.sum()
        if not np.isfinite(total) or total <= 0 or np.any(weights < 0):
            raise ValueError(
                "The balance weights must be non-negative and sum to a positive value"
            )
        return weights / total

    @staticmethod
    def _quota_per_class(balance, class_count):
        """Return how many samples to draw per class without exceeding what exists."""
        weighted = balance > 0
        # The class with the smallest count/weight ratio runs out of samples
        # first, so it fixes the largest total that still respects the weights.
        usable_rows = np.min(class_count[weighted] / balance[weighted])
        quota = np.trunc(balance * usable_rows).astype(np.int64)
        # Guard against floating point rounding pushing a quota above the
        # number of available samples (sampling is done without replacement).
        return np.minimum(quota, class_count).astype(np.int32)

    def create_split(self, split_ratio_list, balance=None):
        """Populate ``self.split_metadata`` with class-balanced split information.

        Args:
            split_ratio_list: weight vector with one element per split
                (e.g. ``[7, 2, 1]``). It is normalized before use.
            balance: ``"random"``, ``"original"`` or a weight vector with one
                element per class (ordered by sorted class label). The vector
                is normalized before use. ``None`` behaves like ``"original"``.

        Raises:
            ValueError: if the metadata is empty, ``balance`` is an unknown
                string or has the wrong size / invalid weights, or
                ``split_ratio_list`` is empty or has invalid weights.
            TypeError: if ``balance`` is neither a string nor a sequence.

        Side effects:
            Sets the ``"row_keys"``, ``"split"``, ``"label_type"`` and
            ``"num_classes"`` entries of ``self.split_metadata``. Each element
            of ``"split"`` is an array of index labels of ``self._df``.
        """
        # NOTE(review): the label type stored in the metadata is always "int",
        # regardless of the ``label_type`` passed to the constructor - confirm.
        label_type = "int"
        df = self._df
        label_col = self._metadata_label_col
        id_col = self._metadata_id_col

        split_ratios = list(split_ratio_list)
        sum_split_ratio = np.sum(split_ratios)
        if (
            not split_ratios
            or sum_split_ratio <= 0
            or any(ratio < 0 for ratio in split_ratios)
        ):
            raise ValueError(
                "split_ratio_list must contain non-negative weights with a positive sum"
            )

        grouped = df.groupby(label_col)
        # Occurrences of each class, ordered by class label.
        per_class = grouped[id_col].count().sort_index()
        class_labels = per_class.index.tolist()
        class_count = per_class.to_numpy()
        num_classes = len(class_labels)
        if num_classes == 0:
            raise ValueError("Cannot create a split from empty metadata")

        weights = self._resolve_balance(balance, class_count)
        samples_per_class = self._quota_per_class(weights, class_count)
        # Look quotas up by label so that labels do not have to be 0..N-1.
        quota_by_label = dict(zip(class_labels, samples_per_class.tolist()))

        # Group per class and slice every class into the requested splits, so
        # each split gets an almost equal share of every class.
        split = [[] for _ in split_ratios]
        last_split = len(split_ratios) - 1
        for current_class in grouped.groups:
            index = grouped.get_group(current_class).index.tolist()
            # Redundant with the sampling below, but kept so the sequence of
            # random draws matches the previous implementation.
            np.random.shuffle(index)
            tot_num = quota_by_label[current_class]
            # Randomly sample tot_num indexes.
            sel_index = np.random.choice(index, tot_num, replace=False)
            offset = 0
            for ix, ratio in enumerate(split_ratios):
                if ix == last_split:
                    # The last split absorbs the whole rounding remainder so
                    # that no selected sample is dropped.
                    stop = tot_num
                else:
                    stop = offset + int((tot_num * ratio) / sum_split_ratio)
                split[ix] += sel_index[offset:stop].tolist()
                offset = stop

        split = [np.array(members) for members in split]
        row_keys = df[id_col].to_numpy()
        self.split_metadata["row_keys"] = row_keys
        self.split_metadata["split"] = split
        self.split_metadata["label_type"] = label_type
        self.split_metadata["num_classes"] = num_classes