From 69118663c1bdf74489b24da20ff8ef101ebc4df6 Mon Sep 17 00:00:00 2001
From: Timo Kaufmann
Date: Wed, 18 Nov 2020 19:11:16 +0100
Subject: [PATCH] wip: simplify pareto choice

---
 .../choicefunctions/choice_data_generator.py |   6 +-
 csrank/dataset_reader/dataset_reader.py      |   1 -
 poc/datasets/variable_choice_datasets.py     | 186 ++++++------------
 3 files changed, 69 insertions(+), 124 deletions(-)

diff --git a/csrank/dataset_reader/choicefunctions/choice_data_generator.py b/csrank/dataset_reader/choicefunctions/choice_data_generator.py
index 57341fcf..8ea1c7f7 100644
--- a/csrank/dataset_reader/choicefunctions/choice_data_generator.py
+++ b/csrank/dataset_reader/choicefunctions/choice_data_generator.py
@@ -30,6 +30,7 @@ def make_globular_pareto_choices(
     cluster_size=10,
     **kwargs,
 ):
+    print(f"Cluster size is {cluster_size}")
     def pareto_front(X, signs=None):
         n_points, n_attributes = X.shape
         if signs is None:
@@ -56,7 +57,10 @@ def make_randn_pareto_choices(
 
     This should be the easiest possible Pareto-problem, since the model can learn
     a latent-utility which scores how likely a point is on the front (independent
-    of the other points)."""
+    of the other points).
+
+    Generates n_instances clusters around the same centroid.
+    """
     rand = check_random_state(data_seed)
     X = rand.randn(n_instances, n_objects, n_features)
     Y = np.empty((n_instances, n_objects), dtype=bool)
diff --git a/csrank/dataset_reader/dataset_reader.py b/csrank/dataset_reader/dataset_reader.py
index b676f983..94bc6da2 100644
--- a/csrank/dataset_reader/dataset_reader.py
+++ b/csrank/dataset_reader/dataset_reader.py
@@ -138,7 +138,6 @@ def get_dataset_dictionaries(self, lengths=[5, 6]):
     def get_single_train_test_split(self):
         raise NotImplementedError
 
-    @abstractmethod
     def get_train_test_datasets(self, n_datasets=5):
         splits = np.array(n_datasets)
         return self.splitter(splits)
diff --git a/poc/datasets/variable_choice_datasets.py b/poc/datasets/variable_choice_datasets.py
index 0830821f..7a773992 100644
--- a/poc/datasets/variable_choice_datasets.py
+++ b/poc/datasets/variable_choice_datasets.py
@@ -56,6 +56,48 @@ def __init__(
         super().__init__(x, y_true)
 
 
+def _compute_pareto_front(X, signs=None):
+    """Compute the Pareto front of a set of points.
+
+    Parameters
+    ----------
+    X
+        A tensor of d-dimensional points.
+    signs
+        A factor by which each point is scaled. If this is negative,
+        the Pareto Front is inverted (an object is dominated if some
+        other object has *smaller* values for each attribute). Can be
+        set on a per-attribute basis. Defaults to -1 for each attribute
+        if no signs are given.
+
+
+    Returns
+    -------
+    tensor
+        The Pareto front, that is those points that are not dominated.
+        A point is dominated if some other point has a larger value in
+        every dimension.
+    """
+    n_points, n_attributes = X.shape
+
+    if signs is None:
+        # TODO maybe remove this functionality or at least flip the default;
+        # this is confusing (kept to match the implementation in the original
+        # cs-ranking)
+        signs = -np.ones(n_attributes)
+
+    # A binary flag array. First assume all points are on the Pareto
+    # front, then check if that is actually true for each point.
+    pareto = np.ones(n_points, dtype=bool)
+    for i, attr in enumerate(X):
+        # A binary flag array that has an entry for each other point: 1
+        # if and only if it is "better" than the other point in at
+        # least one dimension (i.e. it is not dominated by that point).
+        not_dominated = np.any((X * signs[None, :]) <= (attr * signs), axis=1)
+        pareto[i] = np.all(not_dominated)
+    return pareto
+
+
 class ParetoChoiceProblem(TensorDataset):
     """Generate a choice problem based on choosing pareto points.
 
@@ -69,128 +111,28 @@ def __init__(
         n_features: int,
         random_state: np.random.RandomState,
     ):
-        # TODO
-        # generate set of points uniformly at random in R^2, find the
-        # Pareto-set (follow steps 1-3 in B.4.1) containing only the non-domnated objects.
-        # Consider an n-features dimension space, generate points in the
-        # hypersphere from a isometric normal distribution (see
-        # csrank/dataset_reader/choicefunctions/choice_data_generator); Move
-        # the whole ball to some other center; Generate pareto front of each cluster.
         self.random_state = random_state
-        (x, y_true) = self.make_globular_pareto_choices(
-            n_instances,
-            n_objects,
-            n_features,
-            cluster_spread=1.0,
-            cluster_size=10,
-            random_state=random_state,
+        self.n_instances = n_instances
+        self.n_objects = n_objects
+        self.n_features = n_features
+        x_instances = []
+        y_instances = []
+        for _ in range(n_instances):
+            x_instance = self.draw_x_instance()
+            pareto_front_flags = _compute_pareto_front(x_instance)
+            x_instances.append(x_instance)
+            y_instances.append(pareto_front_flags)
+        super().__init__(
+            torch.tensor(x_instances).float(), torch.tensor(y_instances).float()
         )
-        super().__init__(torch.tensor(x).float(), torch.tensor(y_true).float())
 
-    def make_globular_pareto_choices(
-        self,
-        n_instances: int,
-        n_objects: int,
-        n_features: int,
-        cluster_spread: float,
-        cluster_size: int,
-    ):
-        def pareto_front(X, signs=None):
-            """Compute the Pareto front of a set of points.
-
-            Parameters
-            ----------
-            X
-                A tensor of d-dimensional points.
-            signs
-                A factor by which each point is scaled. If this is negative,
-                the Pareto Front is inverted (an object is dominated if some
-                other object has *smaller* values for each attribute). Can be
-                set on a per-attribute basis. Defaults to -1 for each attribute
-                if no signs are given.
-
-
-            Returns
-            -------
-            tensor
-                The Pareto front, that is those points that are not dominated.
-                A point is dominated if some other point has a larger value in
-                every dimension.
-            """
-            n_points, n_attributes = X.shape
-
-            if signs is None:
-                signs = -np.ones(n_attributes)
-
-            # A binary flag array. First assume all points are on the Pareto
-            # front, then check if that is actually true for each point.
-            pareto = np.ones(n_points, dtype=bool)
-            for i, attr in enumerate(X):
-                # A binary flag array that has an entry for each other point: 1
-                # if and only if it is "better" than the other point in at
-                # least one dimension (i.e. it is not dominated by that point).
-                not_dominated = np.any((X * signs[None, :]) <= (attr * signs), axis=1)
-                pareto[i] = np.all(not_dominated)
-            return pareto
-
-        def sample_unit_ball(
-            n_objects: int,
-            n_features: int,
-            radius: float,
-        ):
-            """Samples some points in a d-dimensional ball.
-
-            Parameters
-            ----------
-            n_objects
-                How many points to sample.
-            n_features
-                The dimension of the space.
-            radius
-                The radius of the ball.
-            """
-            X = self.random_state.randn(n_objects, n_features)
-            u = self.random_state.uniform(size=n_objects)[:, None]
-            X /= np.linalg.norm(X, axis=1, ord=2)[:, None]
-            X *= radius * u
-            return X
-
-        def make_randn_pareto_choices(
-            cluster_size: int,
-            n_features: int,
-            n_objects: int,
-            center: np.array,
-        ):
-            """Generate random objects from a d-dimensional isometric normal distribution.
-
-            This should be the easiest possible Pareto-problem, since the model can learn
-            a latent-utility which scores how likely a point is on the front (independent
-            of the other points)."""
-            # Generate a single cluster, uniformly at random.
-            X = self.random_state.randn(cluster_size, n_objects, n_features)
-            Y = np.empty((cluster_size, n_objects), dtype=bool)
-            for i in range(cluster_size):
-                Y[i] = pareto_front(X[i])
-            # Return the shifted cluster with its Pareto front.
-            return X + center, Y
-
-        X = np.empty((n_instances, n_objects, n_features))
-        Y = np.empty((n_instances, n_objects), dtype=int)
-        for i in range(int(n_instances / cluster_size)):
-            center = sample_unit_ball(
-                n_inst=1, n_features=n_features, radius=cluster_spread
-            )
-            # Cluster of points sampled uniformly at random and shifted to
-            # center. Center is some point on the unit ball (cluster_spread).
-            x, y = make_randn_pareto_choices(
-                cluster_size=cluster_size,
-                n_features=n_features,
-                n_objects=n_objects,
-                center=center,
-            )
-            # TODO(next) how do clusters and instances relate? Is each cluster
-            # one instance? Then why the n_instances / cluster_size in the for
-            # loop?
-            X[i * cluster_size : (i + 1) * cluster_size] = x
-            Y[i * cluster_size : (i + 1) * cluster_size] = y
-        return X, Y
+    def draw_centroid(self):
+        # Draw a centroid from a standard normal distribution.
+        return self.random_state.randn(self.n_features)
+
+    def draw_x_instance(self):
+        # Draw n_objects points (n_features each) around a random centroid.
+        return (
+            self.random_state.randn(self.n_objects, self.n_features)
+            + self.draw_centroid()
+        )