wip: simplify pareto choice

kiudee · Nov 18, 2020 · 6911866 · 6911866
1 parent c3c2465
commit 6911866
Show file tree

Hide file tree

Showing 3 changed files with 69 additions and 124 deletions.
diff --git a/csrank/dataset_reader/choicefunctions/choice_data_generator.py b/csrank/dataset_reader/choicefunctions/choice_data_generator.py
@@ -30,6 +30,7 @@ def make_globular_pareto_choices(
         cluster_size=10,
         **kwargs,
     ):
+        print(f"Cluster size is {cluster_size}")
         def pareto_front(X, signs=None):
             n_points, n_attributes = X.shape
             if signs is None:
@@ -56,7 +57,10 @@ def make_randn_pareto_choices(
 
             This should be the easiest possible Pareto-problem, since the model can learn
             a latent-utility which scores how likely a point is on the front (independent
-            of the other points)."""
+            of the other points).
+
+            Generates n_instances clusters around the same centroid.
+            """
             rand = check_random_state(data_seed)
             X = rand.randn(n_instances, n_objects, n_features)
             Y = np.empty((n_instances, n_objects), dtype=bool)

diff --git a/csrank/dataset_reader/dataset_reader.py b/csrank/dataset_reader/dataset_reader.py
@@ -138,7 +138,6 @@ def get_dataset_dictionaries(self, lengths=[5, 6]):
     def get_single_train_test_split(self):
         raise NotImplementedError
 
-    @abstractmethod
     def get_train_test_datasets(self, n_datasets=5):
         splits = np.array(n_datasets)
         return self.splitter(splits)
diff --git a/poc/datasets/variable_choice_datasets.py b/poc/datasets/variable_choice_datasets.py
@@ -56,6 +56,48 @@ def __init__(
         super().__init__(x, y_true)
 
 
+def _compute_pareto_front(X, signs=None):
+    """Flag which points of a set lie on its Pareto front.
+
+    Parameters
+    ----------
+    X
+        A 2-d array with one row per point and one column per attribute.
+    signs
+        A 1-d array with one factor per attribute; every point is
+        multiplied by it before the comparison. If all factors are -1, a
+        point is dominated if some other point has a strictly *smaller*
+        value in every attribute; if all are +1, if some other point has
+        a strictly *larger* value in every attribute. Defaults to -1 for
+        each attribute if no signs are given.
+
+    Returns
+    -------
+    numpy.ndarray
+        A boolean array with one flag per point: True if the point is
+        on the Pareto front (not dominated by any other point), False
+        otherwise.
+    """
+    n_points, n_attributes = X.shape
+
+    if signs is None:
+        # TODO maybe remove this functionality or at least flip the default;
+        # this is confusing (kept to match the implementation in the original
+        # cs-ranking)
+        signs = -np.ones(n_attributes)
+
+    # A boolean flag array. First assume all points are on the Pareto
+    # front, then check if that is actually true for each point.
+    pareto = np.ones(n_points, dtype=bool)
+    for i, attr in enumerate(X):
+        # One flag per point (including point i itself): True if point
+        # i's scaled value is >= that point's scaled value in at least
+        # one attribute, i.e. point i is not dominated by that point.
+        not_dominated = np.any((X * signs[None, :]) <= (attr * signs), axis=1)
+        pareto[i] = np.all(not_dominated)
+    return pareto
+
+
 class ParetoChoiceProblem(TensorDataset):
     """Generate a choice problem based on choosing pareto points.
 
@@ -69,128 +111,28 @@ def __init__(
         n_features: int,
         random_state: np.random.RandomState,
     ):
-        # TODO
-        # generate set of points uniformly at random in R^2, find the
-        # Pareto-set (follow steps 1-3 in B.4.1) containing only the non-domnated objects.
-        # Consider an n-features dimension space, generate points in the
-        # hypersphere from a isometric normal distribution (see
-        # csrank/dataset_reader/choicefunctions/choice_data_generator); Move
-        # the whole ball to some other center; Generate pareto front of each cluster.
         self.random_state = random_state
-        (x, y_true) = self.make_globular_pareto_choices(
-            n_instances,
-            n_objects,
-            n_features,
-            cluster_spread=1.0,
-            cluster_size=10,
-            random_state=random_state,
+        self.n_instances = n_instances
+        self.n_objects = n_objects
+        self.n_features = n_features
+        x_instances = []
+        y_instances = []
+        for instance in range(n_instances):
+            x_instance = self.draw_x_instance()
+            pareto_front_flags = _compute_pareto_front(x_instance)
+            x_instances.append(x_instance)
+            y_instances.append(pareto_front_flags)
+        super().__init__(
+            torch.tensor(x_instances).float(), torch.tensor(y_instances).float()
         )
-        super().__init__(torch.tensor(x).float(), torch.tensor(y_true).float())
 
-    def make_globular_pareto_choices(
-        self,
-        n_instances: int,
-        n_objects: int,
-        n_features: int,
-        cluster_spread: float,
-        cluster_size: int,
-    ):
-        def pareto_front(X, signs=None):
-            """Compute the Pareto front of a set of points.
-
-            Parameters
-            ----------
-            X
-                A tensor of d-dimensional points.
-            signs
-                A factor by which each point is scaled. If this is negative,
-                the Pareto Front is inverted (an object is dominated if some
-                other object has *smaller* values for each attribute). Can be
-                set on a per-attribute basis. Defaults to -1 for each attribute
-                if no signs are given.
-
-
-            Returns
-            -------
-            tensor
-                The Pareto front, that is those points that are not dominated.
-                A point is dominated if some other point has a larger value in
-                every dimension.
-            """
-            n_points, n_attributes = X.shape
-
-            if signs is None:
-                signs = -np.ones(n_attributes)
-
-            # A binary flag array. First assume all points are on the Pareto
-            # front, then check if that is actually true for each point.
-            pareto = np.ones(n_points, dtype=bool)
-            for i, attr in enumerate(X):
-                # A binary flag array that has an entry for each other point: 1
-                # if and only if it is "better" than the other point in at
-                # least one dimension (i.e. it is not dominated by that point).
-                not_dominated = np.any((X * signs[None, :]) <= (attr * signs), axis=1)
-                pareto[i] = np.all(not_dominated)
-            return pareto
-
-        def sample_unit_ball(
-            n_objects: int,
-            n_features: int,
-            radius: float,
-        ):
-            """Samples some points in a d-dimensional ball.
-
-            Parameters
-            ----------
-            n_objects
-                How many points to sample.
-            n_features
-                The dimension of the space.
-            radius
-                The radius of the ball.
-            """
-            X = self.random_state.randn(n_objects, n_features)
-            u = self.random_state.uniform(size=n_objects)[:, None]
-            X /= np.linalg.norm(X, axis=1, ord=2)[:, None]
-            X *= radius * u
-            return X
-
-        def make_randn_pareto_choices(
-            cluster_size: int,
-            n_features: int,
-            n_objects: int,
-            center: np.array,
-        ):
-            """Generate random objects from a d-dimensional isometric normal distribution.
-
-            This should be the easiest possible Pareto-problem, since the model can learn
-            a latent-utility which scores how likely a point is on the front (independent
-            of the other points)."""
-            # Generate a single cluster, uniformly at random.
-            X = self.random_state.randn(cluster_size, n_objects, n_features)
-            Y = np.empty((cluster_size, n_objects), dtype=bool)
-            for i in range(cluster_size):
-                Y[i] = pareto_front(X[i])
-            # Return the shifted cluster with its Pareto front.
-            return X + center, Y
-
-        X = np.empty((n_instances, n_objects, n_features))
-        Y = np.empty((n_instances, n_objects), dtype=int)
-        for i in range(int(n_instances / cluster_size)):
-            center = sample_unit_ball(
-                n_inst=1, n_features=n_features, radius=cluster_spread
-            )
-            # Cluster of points sampled uniformly at random and shifted to
-            # center. Center is some point on the unit ball (cluster_spread).
-            x, y = make_randn_pareto_choices(
-                cluster_size=cluster_size,
-                n_features=n_features,
-                n_objects=n_objects,
-                center=center,
-            )
-            # TODO(next) how do clusters and instances relate? Is each cluster
-            # one instance? Then why the n_instances / cluster_size in the for
-            # loop?
-            X[i * cluster_size : (i + 1) * cluster_size] = x
-            Y[i * cluster_size : (i + 1) * cluster_size] = y
-        return X, Y
+    def draw_centroid(self):
+        # Draw a centroid with n_features i.i.d. standard-normal coordinates.
+        return self.random_state.randn(self.n_features)
+
+    def draw_x_instance(self):
+        # Draws an (n_objects, n_features) cluster around a randomly drawn centroid.
+        return (
+            self.random_state.randn(self.n_objects, self.n_features)
+            + self.draw_centroid()
+        )