Skip to content

Commit

Permalink
wip: simplify pareto choice
Browse files Browse the repository at this point in the history
  • Loading branch information
timokau committed Nov 18, 2020
1 parent c3c2465 commit 6911866
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 124 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def make_globular_pareto_choices(
cluster_size=10,
**kwargs,
):
print(f"Cluster size is {cluster_size}")
def pareto_front(X, signs=None):
n_points, n_attributes = X.shape
if signs is None:
Expand All @@ -56,7 +57,10 @@ def make_randn_pareto_choices(
This should be the easiest possible Pareto-problem, since the model can learn
a latent-utility which scores how likely a point is on the front (independent
of the other points)."""
of the other points).
Generates n_instances clusters around the same centroid.
"""
rand = check_random_state(data_seed)
X = rand.randn(n_instances, n_objects, n_features)
Y = np.empty((n_instances, n_objects), dtype=bool)
Expand Down
1 change: 0 additions & 1 deletion csrank/dataset_reader/dataset_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,6 @@ def get_dataset_dictionaries(self, lengths=[5, 6]):
def get_single_train_test_split(self):
raise NotImplementedError

@abstractmethod
def get_train_test_datasets(self, n_datasets=5):
    """Return whatever ``self.splitter`` produces for ``n_datasets``.

    Parameters
    ----------
    n_datasets
        Value forwarded to the splitter after being wrapped with
        ``np.array``. Defaults to 5.

    Returns
    -------
    The result of ``self.splitter``, passed through unchanged.
    """
    # NOTE(review): presumably n_datasets is the number of splits to
    # generate -- confirm against the splitter implementation.
    return self.splitter(np.array(n_datasets))
186 changes: 64 additions & 122 deletions poc/datasets/variable_choice_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,48 @@ def __init__(
super().__init__(x, y_true)


def _compute_pareto_front(X, signs=None):
    """Flag which points of ``X`` lie on the Pareto front.

    Parameters
    ----------
    X
        Array-like of shape ``(n_points, n_attributes)``.
    signs
        Array-like with one factor per attribute (anything that
        broadcasts against a row of ``X``). Every attribute is multiplied
        by its factor before points are compared. With a positive factor
        *larger* values are better; with a negative factor the comparison
        is inverted and *smaller* values are better. Defaults to -1 for
        each attribute if no signs are given.

    Returns
    -------
    numpy.ndarray
        Boolean array of length ``n_points``. Entry ``i`` is ``True`` if
        point ``i`` is on the Pareto front, i.e. it is not dominated. A
        point is dominated if some other point has a strictly larger
        *scaled* value in every attribute (with the default signs: a
        strictly smaller raw value in every attribute).
    """
    X = np.asarray(X)
    n_points, n_attributes = X.shape

    if signs is None:
        # TODO maybe remove this functionality or at least flip the default;
        # this is confusing (kept to match the implementation in the original
        # cs-ranking)
        signs = -np.ones(n_attributes)
    else:
        # Accept lists/tuples as well as arrays.
        signs = np.asarray(signs)

    # Scale once instead of once per point; the loop below only compares.
    scaled = X * signs

    # A binary flag array. First assume all points are on the Pareto
    # front, then check if that is actually true for each point.
    pareto = np.ones(n_points, dtype=bool)
    for i, point in enumerate(scaled):
        # One entry per point j (including i itself): True if and only if
        # point i is at least as good as point j in at least one attribute,
        # i.e. point i is not dominated by point j.
        not_dominated = np.any(scaled <= point, axis=1)
        pareto[i] = np.all(not_dominated)
    return pareto


class ParetoChoiceProblem(TensorDataset):
"""Generate a choice problem based on choosing pareto points.
Expand All @@ -69,128 +111,28 @@ def __init__(
n_features: int,
random_state: np.random.RandomState,
):
# TODO
# Generate a set of points uniformly at random in R^2, then find the
# Pareto set (follow steps 1-3 in B.4.1) containing only the non-dominated objects.
# Consider an n_features-dimensional space: generate points in the
# hypersphere from an isotropic normal distribution (see
# csrank/dataset_reader/choicefunctions/choice_data_generator), move
# the whole ball to some other center, and generate the Pareto front of each cluster.
self.random_state = random_state
(x, y_true) = self.make_globular_pareto_choices(
n_instances,
n_objects,
n_features,
cluster_spread=1.0,
cluster_size=10,
random_state=random_state,
self.n_instances = n_instances
self.n_objects = n_objects
self.n_features = n_features
x_instances = []
y_instances = []
for instance in range(n_instances):
x_instance = self.draw_x_instance()
pareto_front_flags = _compute_pareto_front(x_instance)
x_instances.append(x_instance)
y_instances.append(pareto_front_flags)
super().__init__(
torch.tensor(x_instances).float(), torch.tensor(y_instances).float()
)
super().__init__(torch.tensor(x).float(), torch.tensor(y_true).float())

def make_globular_pareto_choices(
    self,
    n_instances: int,
    n_objects: int,
    n_features: int,
    cluster_spread: float,
    cluster_size: int,
    random_state=None,
):
    """Generate Pareto choice instances grouped into shifted clusters.

    Instances are produced in groups of ``cluster_size``. Every group
    shares one center, drawn inside a ball of radius ``cluster_spread``;
    the objects of each instance are standard-normal points shifted by
    that center. If ``n_instances`` is not a multiple of ``cluster_size``
    the last group is simply smaller.

    Parameters
    ----------
    n_instances
        Total number of instances to generate.
    n_objects
        Number of objects (points) per instance.
    n_features
        Dimension of each object.
    cluster_spread
        Radius of the ball the cluster centers are drawn from.
    cluster_size
        Number of instances that share one center. Must be at least 1.
    random_state
        Optional random number generator offering ``randn`` and
        ``uniform`` (e.g. ``numpy.random.RandomState``). Defaults to
        ``self.random_state``.

    Returns
    -------
    X : numpy.ndarray
        Float array of shape ``(n_instances, n_objects, n_features)``.
    Y : numpy.ndarray
        Integer array of shape ``(n_instances, n_objects)``; 1 where the
        object is on the Pareto front of its instance, 0 otherwise.

    Raises
    ------
    ValueError
        If ``cluster_size`` is smaller than 1.
    """
    if cluster_size < 1:
        raise ValueError("cluster_size must be at least 1")
    rng = self.random_state if random_state is None else random_state

    def pareto_front(X, signs=None):
        """Flag which points of ``X`` lie on the Pareto front.

        Parameters
        ----------
        X
            Array of shape ``(n_points, n_attributes)``.
        signs
            One factor per attribute, applied before comparing. If
            negative, the Pareto front is inverted (an object is
            dominated if some other object has *smaller* values for
            each attribute). Defaults to -1 for each attribute.

        Returns
        -------
        numpy.ndarray
            Boolean array of length ``n_points``; ``True`` for points
            that are not dominated.
        """
        n_points, n_attributes = X.shape

        if signs is None:
            signs = -np.ones(n_attributes)

        # Scale once; the loop below only compares.
        scaled = X * signs[None, :]

        # First assume all points are on the Pareto front, then check
        # if that is actually true for each point.
        pareto = np.ones(n_points, dtype=bool)
        for i, point in enumerate(scaled):
            # One entry per point j: True if and only if point i is at
            # least as good as j in at least one dimension (i.e. it is
            # not dominated by j).
            not_dominated = np.any(scaled <= point, axis=1)
            pareto[i] = np.all(not_dominated)
        return pareto

    def sample_unit_ball(
        n_objects: int,
        n_features: int,
        radius: float,
    ):
        """Sample some points in a d-dimensional ball.

        Parameters
        ----------
        n_objects
            How many points to sample.
        n_features
            The dimension of the space.
        radius
            The radius of the ball.
        """
        X = rng.randn(n_objects, n_features)
        u = rng.uniform(size=n_objects)[:, None]
        # Normalise to the unit sphere, then pull each point inwards by a
        # factor in [0, radius).
        X /= np.linalg.norm(X, axis=1, ord=2)[:, None]
        X *= radius * u
        return X

    def make_randn_pareto_choices(
        cluster_size: int,
        n_features: int,
        n_objects: int,
        center: np.array,
    ):
        """Generate one cluster of instances from an isotropic normal.

        Draws ``cluster_size`` instances of ``n_objects`` standard-normal
        points each, computes the Pareto front flags per instance and
        returns the points shifted by ``center`` together with the flags.
        """
        X = rng.randn(cluster_size, n_objects, n_features)
        Y = np.empty((cluster_size, n_objects), dtype=bool)
        for i in range(cluster_size):
            Y[i] = pareto_front(X[i])
        # The shift is applied after flagging; the flags are identical
        # either way because a common shift keeps all comparisons.
        return X + center, Y

    X = np.empty((n_instances, n_objects, n_features))
    Y = np.empty((n_instances, n_objects), dtype=int)
    # Walk over the output in slices of cluster_size instances. The last
    # slice may be shorter so that no row of X/Y stays uninitialised.
    for start in range(0, n_instances, cluster_size):
        stop = min(start + cluster_size, n_instances)
        # One center per cluster, somewhere inside the ball of radius
        # cluster_spread; shape (1, n_features) broadcasts over the cluster.
        center = sample_unit_ball(
            n_objects=1, n_features=n_features, radius=cluster_spread
        )
        x, y = make_randn_pareto_choices(
            cluster_size=stop - start,
            n_features=n_features,
            n_objects=n_objects,
            center=center,
        )
        X[start:stop] = x
        Y[start:stop] = y
    return X, Y
def draw_centroid(self):
    """Sample a cluster centroid from a standard normal distribution.

    Returns
    -------
    numpy.ndarray
        One-dimensional array with ``self.n_features`` entries drawn from
        ``self.random_state``.
    """
    dimension = self.n_features
    return self.random_state.randn(dimension)

def draw_x_instance(self):
    """Draw one instance: a cluster of objects around a random centroid.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(self.n_objects, self.n_features)``: one
        standard-normal point per object, each shifted by the same
        centroid obtained from ``self.draw_centroid()``.
    """
    # One row per object, one column per feature. The second dimension
    # must be n_features (not n_instances) so the points can be shifted
    # by the centroid returned from draw_centroid().
    points = self.random_state.randn(self.n_objects, self.n_features)
    # The centroid is drawn after the points, matching the evaluation
    # order of the original expression.
    return points + self.draw_centroid()

0 comments on commit 6911866

Please sign in to comment.