RasmussenLab · enryH · Jun 7, 2024 · Jan 4, 2023 · Jan 4, 2023 · Jan 4, 2023
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
@@ -1,16 +1,74 @@
 name: release on pypi
 on:
   push:
-    branches:
-      - main
+    # No branch filter: the checks run on every push; publishing is
+    # restricted to tag pushes by the `if:` on the publish job.
+    # branches:
+    #   - main
+
 jobs:
+  # Check code formatting with black.
+  format:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: psf/black@stable
+  lint:
+    name: Lint with flake8
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install flake8
+        run: pip install flake8 flake8-bugbear
+      - name: Lint with flake8
+        run: flake8 src
+  # Run the tutorial tasks with the installed `move-dl` command.
+  run-tutorial:
+    name: Run tutorial
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install dependencies
+        run: pip install .
+      - name: Prepare tutorial data
+        run: |
+          cd tutorial
+          move-dl data=random_small task=encode_data
+      - name: Train model and analyze latent space
+        run: |
+          cd tutorial
+          move-dl data=random_small task=random_small__latent
+      - name: Identify associations
+        run: |
+          cd tutorial
+          move-dl data=random_small task=random_small__id_assoc_ttest
+          move-dl data=random_small task=random_small__id_assoc_bayes
   publish:
     name: Publish package
     runs-on: ubuntu-latest
+    # Publish only for tag pushes, and only after every check job succeeded.
+    if: startsWith(github.ref, 'refs/tags')
+    needs:
+      - format
+      - lint
+      - run-tutorial
     steps:
-      - uses: actions/checkout@v3
-      - name: Publish package
-        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install twine and build
+        run: python -m pip install --upgrade twine build
+      - name: Build
+        run: python -m build
+      - name: Publish package
         uses: pypa/gh-action-pypi-publish@release/v1
         with:
           user: __token__

diff --git a/.gitignore b/.gitignore
@@ -40,6 +40,11 @@ tutorial/*
 !tutorial/notebooks/*.ipynb
 !tutorial/README.md
 
+# Supplementary files
+supplementary_files/*.png
+supplementary_files/*.tsv
+supplementary_files/*.txt
+
 # Virtual environment
 venv/
 virtualvenv/
@@ -48,6 +53,12 @@ virtualvenv/
 docs/build/
 docs/source/_templates/
 
+# VS Code settings
+.vscode
+
+# macOS
+.DS_Store
+
 # Root folder
 /*.*
 !/.gitignore
@@ -58,3 +69,4 @@ docs/source/_templates/
 !/pyproject.toml
 !/requirements.txt
 !/setup.cfg
+!/.github
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -38,7 +38,7 @@
 
 html_theme = "sphinx_rtd_theme"
 html_theme_options = {
-    "collapse_navigation" : False,
+    "collapse_navigation": False,
 }
 html_static_path = []
 

diff --git a/setup.cfg b/setup.cfg
@@ -21,7 +21,7 @@ install_requires =
     matplotlib
     seaborn
     scikit-learn
-    scipy
+    scipy>=1.10.0
 
 package_dir =
     = src
@@ -34,3 +34,9 @@ where = src
 [options.entry_points]
 console_scripts =
     move-dl=move.__main__:main
+
+[flake8]
+max-line-length = 120
+aggressive = 2
+extend-select = B950
+extend-ignore = E203,E501,E701
diff --git a/src/move/__init__.py b/src/move/__init__.py
@@ -1,11 +1,11 @@
 from __future__ import annotations
 
+HYDRA_VERSION_BASE = "1.2"
+
+from move import conf, data, models  # noqa:E402
+from move.models.vae import VAE  # noqa:E402
+from move.training.training_loop import training_loop  # noqa:E402
+
 __license__ = "MIT"
 __version__ = (1, 4, 10)
 __all__ = ["conf", "data", "models", "training_loop", "VAE"]
-
-HYDRA_VERSION_BASE = "1.2"
-
-from move import conf, data, models
-from move.models.vae import VAE
-from move.training.training_loop import training_loop
diff --git a/src/move/analysis/metrics.py b/src/move/analysis/metrics.py
@@ -81,3 +81,28 @@ def norm(x: np.ma.MaskedArray, axis: int = 1) -> np.ma.MaskedArray:
         1D array with the specified axis removed.
     """
     return np.sqrt(np.sum(x**2, axis=axis))
+
+
+def get_2nd_order_polynomial(x_array, y_array, n_points=100):
+    """Fit a second-order polynomial to the data by least squares.
+
+    Args:
+        x_array: x coordinates of the data points.
+        y_array: y coordinates of the data points.
+        n_points: number of evenly spaced points, between the minimum and
+            maximum of `x_array`, at which the fitted polynomial is evaluated.
+
+    Returns:
+        Tuple of three elements:
+            x_pol: x coordinates at which the polynomial is evaluated.
+            y_pol: values of the fitted polynomial at `x_pol`.
+            (a2, a1, a): fitted coefficients, highest degree first, so that
+                ``y = a2 * x**2 + a1 * x + a``.
+    """
+    a2, a1, a = np.polyfit(x_array, y_array, deg=2)
+
+    x_pol = np.linspace(np.min(x_array), np.max(x_array), n_points)
+    # Evaluate on the whole grid at once instead of looping over its points.
+    y_pol = a2 * x_pol * x_pol + a1 * x_pol + a
+
+    return x_pol, y_pol, (a2, a1, a)
diff --git a/src/move/conf/main.yaml b/src/move/conf/main.yaml
@@ -20,6 +20,7 @@ hydra:
   job:
     config:
       override_dirname:
+        item_sep: ";"
         exclude_keys:
           - experiment
 

diff --git a/src/move/conf/schema.py b/src/move/conf/schema.py
@@ -28,6 +28,7 @@ class InputConfig:
     name: str
     weight: int = 1
 
+
 @dataclass
 class ContinuousInputConfig(InputConfig):
     scale: bool = True
@@ -185,6 +186,27 @@ class IdentifyAssociationsTTestConfig(IdentifyAssociationsConfig):
     num_latent: list[int] = MISSING
 
 
+@dataclass
+class IdentifyAssociationsKSConfig(IdentifyAssociationsConfig):
+    """Configure the Kolmogorov-Smirnov approach to identify associations.
+
+    For each perturbed feature - target feature pair, we will plot:
+
+    - Input vs. reconstruction correlation plot: to assess reconstruction
+      quality of both target and perturbed features.
+    - Distribution of reconstruction values for the target feature before
+      and after the perturbation of the perturbed feature.
+
+    Args:
+        perturbed_feature_names: names of the perturbed features of interest.
+        target_feature_names: names of the target features of interest.
+
+    """
+
+    perturbed_feature_names: list[str] = field(default_factory=list)
+    target_feature_names: list[str] = field(default_factory=list)
+
+
 @dataclass
 class MOVEConfig:
     defaults: list[Any] = field(default_factory=lambda: [dict(data="base_data")])
@@ -237,6 +259,11 @@ def extract_names(configs: list[InputConfig]) -> list[str]:
     name="identify_associations_ttest_schema",
     node=IdentifyAssociationsTTestConfig,
 )
+cs.store(
+    group="task",
+    name="identify_associations_ks_schema",
+    node=IdentifyAssociationsKSConfig,
+)
 
 # Register custom resolvers
 OmegaConf.register_new_resolver("weights", extract_weights)

diff --git a/src/move/conf/task/identify_associations_bayes.yaml b/src/move/conf/task/identify_associations_bayes.yaml
@@ -32,3 +32,5 @@ training_loop:
     - 25
   early_stopping: false
   patience: 0
+
+
diff --git a/src/move/conf/task/identify_associations_ttest.yaml b/src/move/conf/task/identify_associations_ttest.yaml
@@ -35,3 +35,5 @@ training_loop:
     - 25
   early_stopping: false
   patience: 0
+
+
diff --git a/src/move/data/perturbations.py b/src/move/data/perturbations.py
@@ -1,12 +1,17 @@
 __all__ = ["perturb_categorical_data", "perturb_continuous_data"]
 
-from typing import cast
+from pathlib import Path
+from typing import Literal, Optional, cast
 
 import numpy as np
 import torch
 from torch.utils.data import DataLoader
 
 from move.data.dataloaders import MOVEDataset
+from move.data.preprocessing import feature_stats
+from move.visualization.dataset_distributions import plot_value_distributions
+
+ContinuousPerturbationType = Literal["minimum", "maximum", "plus_std", "minus_std"]
 
 
 def perturb_categorical_data(
@@ -111,3 +116,108 @@ def perturb_continuous_data(
         dataloaders.append(perturbed_dataloader)
 
     return dataloaders
+
+
+def perturb_continuous_data_extended(
+    baseline_dataloader: DataLoader,
+    con_dataset_names: list[str],
+    target_dataset_name: str,
+    perturbation_type: ContinuousPerturbationType,
+    output_subpath: Optional[Path] = None,
+) -> list[DataLoader]:
+    """Add perturbations to continuous data.
+
+    For each feature in the target dataset, create a copy of the data in
+    which the value of that feature is changed in all samples (rows) by
+    either:
+
+    - substituting it by the feature's minimum or maximum value, or
+    - adding or subtracting one standard deviation of the feature.
+
+    Args:
+        baseline_dataloader: Baseline dataloader
+        con_dataset_names: List of continuous dataset names
+        target_dataset_name: Target continuous dataset to perturb
+        perturbation_type: 'minimum', 'maximum', 'plus_std' or 'minus_std'.
+        output_subpath: Directory where a figure showing the perturbed values
+            is saved. If None, no figure is made.
+
+    Returns:
+        List of dataloaders, one per feature of the target dataset, each
+        containing the data with that feature perturbed.
+
+    Raises:
+        ValueError: If `perturbation_type` is not one of the supported types.
+
+    Note:
+        This function was created so that it could generalize to non-normalized
+        datasets. Scaling is done per dataset, not per feature -> slightly
+        different stds feature to feature.
+
+        The perturbed values of all features are collapsed into one single
+        plot, saved as `perturbation_distribution_{target_dataset_name}.png`.
+    """
+    valid_types = ("minimum", "maximum", "plus_std", "minus_std")
+    if perturbation_type not in valid_types:
+        # Fail early instead of silently returning unperturbed copies
+        raise ValueError(
+            f"Unknown perturbation type: {perturbation_type!r}. "
+            f"Expected one of {valid_types}."
+        )
+
+    baseline_dataset = cast(MOVEDataset, baseline_dataloader.dataset)
+    assert baseline_dataset.con_shapes is not None
+    assert baseline_dataset.con_all is not None
+
+    target_idx = con_dataset_names.index(target_dataset_name)  # dataset index
+    splits = np.cumsum([0] + baseline_dataset.con_shapes)
+    slice_ = slice(*splits[target_idx : target_idx + 2])
+
+    num_features = baseline_dataset.con_shapes[target_idx]
+    dataloaders = []
+    perturbations_list = []
+
+    # The statistics come from the unperturbed data, so they are the same for
+    # every feature: compute them once, outside of the loop
+    min_feat_val_list, max_feat_val_list, std_feat_val_list = feature_stats(
+        baseline_dataset.con_all[:, slice_]
+    )
+
+    for i in range(num_features):
+        perturbed_con = baseline_dataset.con_all.clone()
+        # View into `perturbed_con`: in-place changes modify the cloned data
+        target_dataset = perturbed_con[:, slice_]
+        if perturbation_type == "minimum":
+            target_dataset[:, i] = torch.FloatTensor([min_feat_val_list[i]])
+        elif perturbation_type == "maximum":
+            target_dataset[:, i] = torch.FloatTensor([max_feat_val_list[i]])
+        elif perturbation_type == "plus_std":
+            target_dataset[:, i] += torch.FloatTensor([std_feat_val_list[i]])
+        elif perturbation_type == "minus_std":
+            target_dataset[:, i] -= torch.FloatTensor([std_feat_val_list[i]])
+
+        perturbations_list.append(target_dataset[:, i].numpy())
+
+        perturbed_dataset = MOVEDataset(
+            baseline_dataset.cat_all,
+            perturbed_con,
+            baseline_dataset.cat_shapes,
+            baseline_dataset.con_shapes,
+        )
+
+        perturbed_dataloader = DataLoader(
+            perturbed_dataset,
+            shuffle=False,
+            batch_size=baseline_dataloader.batch_size,
+        )
+        dataloaders.append(perturbed_dataloader)
+
+    # Plot the perturbations for all features, collapsed in one plot:
+    if output_subpath is not None:
+        fig = plot_value_distributions(np.array(perturbations_list).transpose())
+        fig_path = str(
+            output_subpath / f"perturbation_distribution_{target_dataset_name}.png"
+        )
+        fig.savefig(fig_path)
+
+    return dataloaders
diff --git a/src/move/data/preprocessing.py b/src/move/data/preprocessing.py
@@ -57,7 +57,7 @@ def one_hot_encode_single(mapping: dict[str, int], value: Optional[str]) -> IntA
     Returns:
         2D array
     """
-    encoded_value = np.zeros((1, len(mapping)))
+    encoded_value = np.zeros((1, len(mapping)), dtype=int)
     if not pd.isna(value):
         code = mapping[str(value)]
         encoded_value[0, code] = 1
@@ -79,3 +79,22 @@ def scale(x: np.ndarray) -> tuple[FloatArray, BoolArray]:
     scaled_x = standardize(logx[:, mask_1d], axis=0)
     scaled_x[np.isnan(scaled_x)] = 0
     return scaled_x, mask_1d
+
+
+def feature_stats(x: ArrayLike) -> tuple[FloatArray, FloatArray, FloatArray]:
+    """Compute per-feature summary statistics of continuous data.
+
+    NaN values are ignored when computing each statistic.
+
+    Args:
+        x: 2D array with samples in its rows and features in its columns
+
+    Returns:
+        Tuple of three arrays with, respectively, the minimum, the maximum,
+        and the standard deviation of each feature (column)
+    """
+    return (
+        np.nanmin(x, axis=0),
+        np.nanmax(x, axis=0),
+        np.nanstd(x, axis=0),
+    )
Original file line number	Diff line number	Diff line change
Expand Up		@@ -32,3 +32,5 @@ training_loop:
		- 25
		early_stopping: false
		patience: 0
Original file line number	Diff line number	Diff line change
Expand Up		@@ -35,3 +35,5 @@ training_loop:
		- 25
		early_stopping: false
		patience: 0