diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 1f7a8044..f0589f2f 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -10,9 +10,27 @@ jobs:
     steps:
      - uses: actions/checkout@v4
      - uses: psf/black@stable
+  lint:
+    name: Lint with flake8
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install flake8
+        run: pip install flake8 flake8-bugbear
+      - name: Lint with flake8
+        run: flake8 src
+
   publish:
     name: Publish package
     runs-on: ubuntu-latest
+    if: startsWith(github.ref, 'refs/tags')
+    needs:
+      - format
+      - lint
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -24,8 +42,7 @@ jobs:
         run: python -m pip install --upgrade twine build
       - name: Build
         run: python -m build
-      - name: Publish package
-        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
+      - name: Publish package
         uses: pypa/gh-action-pypi-publish@release/v1
         with:
           user: __token__
diff --git a/setup.cfg b/setup.cfg
index cb3ead97..88946402 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,3 +34,7 @@ where = src
 [options.entry_points]
 console_scripts =
     move-dl=move.__main__:main
+
+[flake8]
+max-line-length = 120
+aggressive = 2
\ No newline at end of file
diff --git a/src/move/__init__.py b/src/move/__init__.py
index a4afcdcb..ae9b8e45 100644
--- a/src/move/__init__.py
+++ b/src/move/__init__.py
@@ -1,11 +1,10 @@
 from __future__ import annotations
 
+from move.training.training_loop import training_loop
+from move.models.vae import VAE
+from move import conf, data, models
 __license__ = "MIT"
 __version__ = (1, 4, 10)
 __all__ = ["conf", "data", "models", "training_loop", "VAE"]
 
 HYDRA_VERSION_BASE = "1.2"
-
-from move import conf, data, models
-from move.models.vae import VAE
-from move.training.training_loop import training_loop
diff --git a/src/move/data/perturbations.py b/src/move/data/perturbations.py
index 4249428c..21ebddbf 100644
--- a/src/move/data/perturbations.py
+++ b/src/move/data/perturbations.py
@@ -11,7 +11,6 @@
 from move.data.preprocessing import feature_stats
 from move.visualization.dataset_distributions import plot_value_distributions
 
-
 ContinuousPerturbationType = Literal["minimum", "maximum", "plus_std", "minus_std"]
 
 
@@ -42,7 +41,7 @@ def perturb_categorical_data(
     splits = np.cumsum(
         [0] + [int.__mul__(*shape) for shape in baseline_dataset.cat_shapes]
     )
-    slice_ = slice(*splits[target_idx : target_idx + 2])
+    slice_ = slice(*splits[target_idx: target_idx + 2])
 
     target_shape = baseline_dataset.cat_shapes[target_idx]
     num_features = target_shape[0]  # CHANGE
@@ -94,7 +93,7 @@ def perturb_continuous_data(
 
     target_idx = con_dataset_names.index(target_dataset_name)
     splits = np.cumsum([0] + baseline_dataset.con_shapes)
-    slice_ = slice(*splits[target_idx : target_idx + 2])
+    slice_ = slice(*splits[target_idx: target_idx + 2])
 
     num_features = baseline_dataset.con_shapes[target_idx]
 
@@ -155,7 +154,7 @@ def perturb_continuous_data_extended(
     target_idx = con_dataset_names.index(target_dataset_name)  # dataset index
     splits = np.cumsum([0] + baseline_dataset.con_shapes)
-    slice_ = slice(*splits[target_idx : target_idx + 2])
+    slice_ = slice(*splits[target_idx: target_idx + 2])
 
     num_features = baseline_dataset.con_shapes[target_idx]
 
     dataloaders = []
diff --git a/src/move/models/vae.py b/src/move/models/vae.py
index cd42f7de..23de0124 100644
--- a/src/move/models/vae.py
+++ b/src/move/models/vae.py
@@ -43,7 +43,7 @@ def __init__(
         continuous_shapes: Optional[list[int]] = None,
         categorical_weights: Optional[list[int]] = None,
         continuous_weights: Optional[list[int]] = None,
-        num_hidden: list[int] = [200, 200],
+        num_hidden: list[int] = (200, 200),
         num_latent: int = 20,
         beta: float = 0.01,
         dropout: float = 0.2,
@@ -99,11 +99,11 @@ def __init__(
 
         # Initialize simple attributes
         self.beta = beta
-        self.num_hidden = num_hidden
+        self.num_hidden = list(num_hidden)
         self.num_latent = num_latent
         self.dropout = dropout
 
-        self.device = torch.device("cuda" if cuda == True else "cpu")
+        self.device = torch.device("cuda" if cuda else "cpu")
 
         # Activation functions
         self.relu = nn.LeakyReLU()
@@ -116,7 +116,7 @@ def __init__(
         self.decoderlayers = nn.ModuleList()
         self.decodernorms = nn.ModuleList()
 
-        ### Layers
+        # Layers
         # Hidden layers
         for nin, nout in zip([self.input_size] + self.num_hidden, self.num_hidden):
             self.encoderlayers.append(nn.Linear(nin, nout))
@@ -190,7 +190,7 @@ def decompose_categorical(self, reconstruction: torch.Tensor) -> list[torch.Tens
         cat_out = []
         pos = 0
         for cat_shape in self.categorical_shapes:
-            cat_dataset = cat_tmp[:, pos : (cat_shape[0] * cat_shape[1] + pos)]
+            cat_dataset = cat_tmp[:, pos: (cat_shape[0] * cat_shape[1] + pos)]
             cat_out_tmp = cat_dataset.view(
                 cat_dataset.shape[0], cat_shape[0], cat_shape[1]
             )
@@ -287,7 +287,7 @@ def calculate_cat_error(
         cat_errors = []
         pos = 0
         for cat_shape in self.categorical_shapes:
-            cat_dataset = cat_in[:, pos : (cat_shape[0] * cat_shape[1] + pos)]
+            cat_dataset = cat_in[:, pos: (cat_shape[0] * cat_shape[1] + pos)]
 
             cat_dataset = cat_dataset.view(cat_in.shape[0], cat_shape[0], cat_shape[1])
             cat_target = cat_dataset
@@ -327,8 +327,8 @@ def calculate_con_error(
         total_shape = 0
         con_errors_list: list[torch.Tensor] = []
         for s in self.continuous_shapes:
-            c_in = con_in[:, total_shape : (s + total_shape - 1)]
-            c_re = con_out[:, total_shape : (s + total_shape - 1)]
+            c_in = con_in[:, total_shape: (s + total_shape - 1)]
+            c_re = con_out[:, total_shape: (s + total_shape - 1)]
             error = loss(c_re, c_in) / batch_size
             con_errors_list.append(error)
             total_shape += s
@@ -451,7 +451,7 @@ def encoding(
             elif self.num_continuous > 0:
                 tensor = con
             else:
-                assert False, "Must have at least 1 categorial or 1 continuous feature"
+                raise ValueError("Must have at least 1 categorial or 1 continuous feature")
 
             optimizer.zero_grad()
 
@@ -538,21 +538,21 @@ def get_cat_recon(
         shape_1 = 0
         for cat_shape in self.categorical_shapes:
             # Get input categorical data
-            cat_in_tmp = cat[:, pos : (cat_shape[0] * cat_shape[1] + pos)]
+            cat_in_tmp = cat[:, pos: (cat_shape[0] * cat_shape[1] + pos)]
             cat_in_tmp = cat_in_tmp.view(cat.shape[0], cat_shape[0], cat_shape[1])
 
             # Calculate target values for input
             cat_target_tmp = cat_in_tmp
             cat_target_tmp = torch.argmax(cat_target_tmp.detach(), dim=2)
             cat_target_tmp[cat_in_tmp.sum(dim=2) == 0] = -1
-            cat_target[:, shape_1 : (cat_shape[0] + shape_1)] = (
+            cat_target[:, shape_1: (cat_shape[0] + shape_1)] = (
                 cat_target_tmp  # .numpy()
             )
 
             # Get reconstructed categorical data
             cat_out_tmp = cat_out[count]
             cat_out_tmp = cat_out_tmp.transpose(1, 2)
-            cat_out_class[:, shape_1 : (cat_shape[0] + shape_1)] = torch.argmax(
+            cat_out_class[:, shape_1: (cat_shape[0] + shape_1)] = torch.argmax(
                 cat_out_tmp, dim=2
             )  # .numpy()
 
@@ -694,7 +694,7 @@ def latent(
             elif self.num_continuous > 0:
                 tensor = con
             else:
-                assert False, "Must have at least 1 categorial or 1 continuous feature"
+                raise ValueError("Must have at least 1 categorial or 1 continuous feature")
 
             # Evaluate
             cat_out, con_out, mu, logvar = self(tensor)
@@ -713,14 +713,14 @@ def latent(
                 cat_out_class, cat_target = self.get_cat_recon(
                     batch, cat_total_shape, cat, cat_out
                 )
-                cat_recon[row : row + len(cat_out_class)] = torch.Tensor(cat_out_class)
-                cat_class[row : row + len(cat_target)] = torch.Tensor(cat_target)
+                cat_recon[row: row + len(cat_out_class)] = torch.Tensor(cat_out_class)
+                cat_class[row: row + len(cat_target)] = torch.Tensor(cat_target)
 
             if self.num_continuous > 0:
-                con_recon[row : row + len(con_out)] = con_out
+                con_recon[row: row + len(con_out)] = con_out
 
-            latent_var[row : row + len(logvar)] = logvar
-            latent[row : row + len(mu)] = mu
+            latent_var[row: row + len(logvar)] = logvar
+            latent[row: row + len(mu)] = mu
             row += len(mu)
 
         test_loss /= len(dataloader)
diff --git a/src/move/tasks/analyze_latent.py b/src/move/tasks/analyze_latent.py
index 788d08d5..c6d59125 100644
--- a/src/move/tasks/analyze_latent.py
+++ b/src/move/tasks/analyze_latent.py
@@ -48,17 +48,17 @@ def find_feature_values(
         Tuple containing (1) index of dataset containing feature and (2) values
         corresponding to the feature
     """
-    dataset_index, feature_index = [None] * 2
-    for dataset_index, feature_names in enumerate(feature_names_lists):
+    _dataset_index, feature_index = [None] * 2
+    for _dataset_index, feature_names in enumerate(feature_names_lists):
         try:
             feature_index = feature_names.index(feature_name)
         except ValueError:
             continue
         break
 
-    if dataset_index is not None and feature_index is not None:
+    if _dataset_index is not None and feature_index is not None:
         return (
-            dataset_index,
-            np.take(feature_values[dataset_index], feature_index, axis=1),
+            _dataset_index,
+            np.take(feature_values[_dataset_index], feature_index, axis=1),
         )
     raise KeyError(f"Feature '{feature_name}' not in any dataset.")
@@ -98,7 +98,7 @@ def analyze_latent(config: MOVEConfig) -> None:
     df_index = pd.Index(sample_names, name="sample")
 
     assert task_config.model is not None
-    device = torch.device("cuda" if task_config.model.cuda == True else "cpu")
+    device = torch.device("cuda" if task_config.model.cuda else "cpu")
     model: VAE = hydra.utils.instantiate(
         task_config.model,
         continuous_shapes=test_dataset.con_shapes,
diff --git a/src/move/tasks/identify_associations.py b/src/move/tasks/identify_associations.py
index c099624c..94c9bc2a 100644
--- a/src/move/tasks/identify_associations.py
+++ b/src/move/tasks/identify_associations.py
@@ -14,7 +14,6 @@
 from torch.utils.data import DataLoader
 
 from move.analysis.metrics import get_2nd_order_polynomial
-
 from move.conf.schema import (
     IdentifyAssociationsBayesConfig,
     IdentifyAssociationsConfig,
@@ -202,7 +201,7 @@ def _bayes_approach(
 ) -> tuple[Union[IntArray, FloatArray], ...]:
 
     assert task_config.model is not None
-    device = torch.device("cuda" if task_config.model.cuda == True else "cpu")
+    device = torch.device("cuda" if task_config.model.cuda else "cpu")
 
     # Train models
     logger = get_logger(__name__)
@@ -319,7 +318,7 @@ def _ttest_approach(
     from scipy.stats import ttest_rel
 
     assert task_config.model is not None
-    device = torch.device("cuda" if task_config.model.cuda == True else "cpu")
+    device = torch.device("cuda" if task_config.model.cuda else "cpu")
 
     # Train models
     logger = get_logger(__name__)
@@ -463,7 +462,7 @@ def _ks_approach(
     """
 
     assert task_config.model is not None
-    device = torch.device("cuda" if task_config.model.cuda == True else "cpu")
+    device = torch.device("cuda" if task_config.model.cuda else "cpu")
 
     figure_path = output_path / "figures"
     figure_path.mkdir(exist_ok=True, parents=True)
@@ -524,7 +523,7 @@ def _ks_approach(
         min_baseline = np.min(baseline_recon, axis=0)
         max_baseline = np.max(baseline_recon, axis=0)
 
-        ############ QC of feature's reconstruction ##############################
+        # QC of feature's reconstruction ##############################
         logger.debug("Calculating quality control of the feature reconstructions")
         # Correlation and slope for each feature's reconstruction
         feature_names = reduce(list.__add__, con_names)
@@ -549,7 +548,7 @@ def _ks_approach(
                 dpi=50,
             )
 
-    ################## Calculate perturbed reconstruction and shifts #############################
+    # Calculate perturbed reconstruction and shifts #############################
     logger.debug("Computing KS scores")
 
     # Save original latent space for first refit:
@@ -646,7 +645,7 @@ def _ks_approach(
     qc_df = pd.DataFrame({"Feature names": feature_names})
     qc_df["slope"] = np.nanmean(slope, axis=0)
     qc_df["reconstruction_correlation"] = np.nanmean(rec_corr, axis=0)
-    qc_df.to_csv(output_path / f"QC_summary_KS.tsv", sep="\t", index=False)
+    qc_df.to_csv(output_path / "QC_summary_KS.tsv", sep="\t", index=False)
 
     # Return first idx associations: redefined for reasonable threshold
 
@@ -739,8 +738,8 @@ def identify_associations(config: MOVEConfig) -> None:
     2) Evaluate associations using bayes or ttest approach.
     3) Save results.
     """
-    #################### DATA PREPARATION ######################
-    ####### Read original data and create perturbed datasets####
+    # DATA PREPARATION ######################
+    # Read original data and create perturbed datasets####
     logger = get_logger(__name__)
     task_config = cast(IdentifyAssociationsConfig, config.task)
 
@@ -811,7 +810,7 @@ def identify_associations(config: MOVEConfig) -> None:
     num_perturbed = len(dataloaders) - 1  # P
     logger.debug(f"# perturbed features: {num_perturbed}")
 
-    ################# APPROACH EVALUATION ##########################
+    # APPROACH EVALUATION ##########################
 
     if task_type == "bayes":
         task_config = cast(IdentifyAssociationsBayesConfig, task_config)
@@ -870,7 +869,7 @@ def identify_associations(config: MOVEConfig) -> None:
     else:
         raise ValueError()
 
-    ###################### RESULTS ################################
+    # RESULTS ################################
     save_results(
         config,
         con_shapes,
diff --git a/src/move/tasks/tune_model.py b/src/move/tasks/tune_model.py
index 89bc6f1f..9d530028 100644
--- a/src/move/tasks/tune_model.py
+++ b/src/move/tasks/tune_model.py
@@ -1,7 +1,6 @@
 __all__ = ["tune_model"]
 
 from pathlib import Path
-from random import shuffle
 from typing import Any, Literal, cast
 
 import hydra
@@ -26,7 +25,7 @@
     TuneModelStabilityConfig,
 )
 from move.core.logging import get_logger
-from move.core.typing import BoolArray, FloatArray
+from move.core.typing import BoolArray
 from move.data import io
 from move.data.dataloaders import MOVEDataset, make_dataloader, split_samples
 from move.models.vae import VAE
@@ -87,7 +86,7 @@ def tune_model(config: MOVEConfig) -> float:
     )
 
     assert task_config.model is not None
-    device = torch.device("cuda" if task_config.model.cuda == True else "cpu")
+    device = torch.device("cuda" if task_config.model.cuda is True else "cpu")
 
     def _tune_stability(
         task_config: TuneModelStabilityConfig,
diff --git a/src/move/training/training_loop.py b/src/move/training/training_loop.py
index 8f59afc4..9a54fdf6 100644
--- a/src/move/training/training_loop.py
+++ b/src/move/training/training_loop.py
@@ -23,14 +23,18 @@ def dilate_batch(dataloader: DataLoader) -> DataLoader:
     return DataLoader(dataset, batch_size, shuffle=True, drop_last=True)
 
 
+BATCH_DILATION_STEPS = []
+KLD_WARMUP_STEPS = []
+
+
 def training_loop(
     model: VAE,
     train_dataloader: DataLoader,
     valid_dataloader: Optional[DataLoader] = None,
     lr: float = 1e-4,
     num_epochs: int = 100,
-    batch_dilation_steps: list[int] = [],
-    kld_warmup_steps: list[int] = [],
+    batch_dilation_steps: list[int] = BATCH_DILATION_STEPS,
+    kld_warmup_steps: list[int] = KLD_WARMUP_STEPS,
     early_stopping: bool = False,
     patience: int = 0,
 ) -> TrainingLoopOutput:
@@ -41,13 +45,17 @@ def training_loop(
     Args:
         model (VAE): trained VAE model object
         train_dataloader (DataLoader): An object feeding data to the VAE with training data
-        valid_dataloader (Optional[DataLoader], optional): An object feeding data to the VAE with validation data. Defaults to None.
+        valid_dataloader (Optional[DataLoader], optional): An object feeding data to the VAE with validation data.
+            Defaults to None.
         lr (float, optional): learning rate. Defaults to 1e-4.
         num_epochs (int, optional): number of epochs. Defaults to 100.
-        batch_dilation_steps (list[int], optional): a list with integers corresponding to epochs when batch size is increased. Defaults to [].
-        kld_warmup_steps (list[int], optional): a list with integers corresponding to epochs when kld is decreased by the selected rate. Defaults to [].
+        batch_dilation_steps (list[int], optional): a list with integers corresponding to epochs when batch size is
+            increased. Defaults to [].
+        kld_warmup_steps (list[int], optional): a list with integers corresponding to epochs when kld is decreased by
+            the selected rate. Defaults to [].
         early_stopping (bool, optional): boolean if use early stopping . Defaults to False.
-        patience (int, optional): number of epochs to wait before early stop if no progress on the validation set . Defaults to 0.
+        patience (int, optional): number of epochs to wait before early stop if no progress on the validation set.
+            Defaults to 0.
 
     Returns:
         (tuple): a tuple containing:
diff --git a/src/move/visualization/dataset_distributions.py b/src/move/visualization/dataset_distributions.py
index 3af4b1d2..ba448d2c 100644
--- a/src/move/visualization/dataset_distributions.py
+++ b/src/move/visualization/dataset_distributions.py
@@ -70,9 +70,9 @@ def plot_reconstruction_diff(
     Plot the reconstruction differences as a heatmap.
""" with style_settings(style): - if vmin == None: + if vmin is None: vmin = np.min(diff_array) - elif vmax == None: + elif vmax is None: vmax = np.max(diff_array) fig = plt.figure(layout="constrained", figsize=(7, 7)) plt.imshow(diff_array, cmap=colormap, vmin=vmin, vmax=vmax) @@ -140,7 +140,7 @@ def plot_feature_association_graph( with_labels = True elif layout == "circular": pos = nx.circular_layout(G) - texts = [ + _ = [ plt.text( pos[node][0], pos[node][1], @@ -269,7 +269,7 @@ def plot_cumulative_distributions( (edges[:-1] + edges[1:]) / 2, np.cumsum(hist_pert), color="red", - label=f"Perturbed", + label="Perturbed", alpha=0.5, ) diff --git a/src/move/visualization/feature_importance.py b/src/move/visualization/feature_importance.py index 1d6208d6..f80f4b14 100644 --- a/src/move/visualization/feature_importance.py +++ b/src/move/visualization/feature_importance.py @@ -10,11 +10,10 @@ from matplotlib.colors import TwoSlopeNorm from move.core.typing import FloatArray -from move.visualization.style import ( +from move.visualization.style import ( # color_cycle, DEFAULT_DIVERGING_PALETTE, DEFAULT_PLOT_STYLE, DEFAULT_QUALITATIVE_PALETTE, - color_cycle, style_settings, ) diff --git a/src/move/visualization/latent_space.py b/src/move/visualization/latent_space.py index 4ebccf66..c53f2384 100644 --- a/src/move/visualization/latent_space.py +++ b/src/move/visualization/latent_space.py @@ -159,8 +159,8 @@ def plot_3D_latent_and_displacement( ax.view_init(altitude, azimuth) if show_baseline: - vmin, vmax = np.min(feature_values[::step]), np.max(feature_values[::step]) - abs_max = np.max([abs(vmin), abs(vmax)]) + # vmin, vmax = np.min(feature_values[::step]), np.max(feature_values[::step]) + # abs_max = np.max([abs(vmin), abs(vmax)]) ax.scatter( mu_baseline[::step, 0], mu_baseline[::step, 1], @@ -192,10 +192,14 @@ def plot_3D_latent_and_displacement( v = mu_perturbed[::step, 1] - mu_baseline[::step, 1] w = mu_perturbed[::step, 2] - mu_baseline[::step, 2] - module = np.sqrt(u * u + v * v + w * w) + # module = np.sqrt(u * u + v * v + w * w) max_u, max_v, max_w = np.max(abs(u)), np.max(abs(v)), np.max(abs(w)) - # Arrow colors will be weighted contributions of red -> dim1, green -> dim2, and blue-> dim3. I.e. purple arrow means movement in dims 1 and 3 + # Arrow colors will be weighted contributions of + # red -> dim1, + # green -> dim2, + # and blue-> dim3. + # I.e. 
purple arrow means movement in dims 1 and 3 colors = [ (abs(du) / max_u, abs(dv) / max_v, abs(dw) / max_w, 0.7) for du, dv, dw in zip(u, v, w) diff --git a/src/move/visualization/style.py b/src/move/visualization/style.py index d414c2ef..8b29a48a 100644 --- a/src/move/visualization/style.py +++ b/src/move/visualization/style.py @@ -28,11 +28,11 @@ def color_cycle(colormap: str) -> ContextManager: Returns: Context manager """ - registry: ColormapRegistry = getattr(matplotlib, "colormaps") + registry: ColormapRegistry = matplotlib.colormaps colormap = registry[colormap] if not isinstance(colormap, ListedColormap): raise ValueError("Only colormaps that are list of colors supported.") - prop_cycle = cycler(color=getattr(colormap, "colors")) + prop_cycle = cycler(color=colormap.colors) return matplotlib.rc_context({"axes.prop_cycle": prop_cycle}) diff --git a/src/move/visualization/vae_visualization.py b/src/move/visualization/vae_visualization.py index 1cb00d07..4bd226cf 100644 --- a/src/move/visualization/vae_visualization.py +++ b/src/move/visualization/vae_visualization.py @@ -59,7 +59,7 @@ def plot_vae( latent_node_distance = 550 latent_sep = 5 * latent_node_distance - ########################### Adding nodes to the graph ############################## + # Adding nodes to the graph ############################## # Bias nodes G.add_node( "input_bias", @@ -150,7 +150,7 @@ def plot_vae( color=0.0, ) - ########################## Adding weights to the graph ######################### + # Adding weights to the graph ######################### if plot_edges: for layer, values in model_weights.items(): @@ -166,7 +166,7 @@ def plot_vae( elif layer == "encoderlayers.0.bias": for j in range(values.shape[0]): # encoder_hidden G.add_edge( - f"input_bias", f"encoder_hidden_{j}", weight=values.numpy()[j] + "input_bias", f"encoder_hidden_{j}", weight=values.numpy()[j] ) elif layer == "mu.weight": @@ -180,7 +180,7 @@ def plot_vae( elif layer == "mu.bias": for i in range(values.shape[0]): # encoder_hidden - G.add_edge(f"mu_bias", f"mu_{i}", weight=values.numpy()[i]) + G.add_edge("mu_bias", f"mu_{i}", weight=values.numpy()[i]) elif layer == "var.weight": for j in range(values.shape[1]): # encoder hidden @@ -193,7 +193,7 @@ def plot_vae( elif layer == "var.bias": for i in range(values.shape[0]): # encoder_hidden - G.add_edge(f"var_bias", f"var_{i}", weight=values.numpy()[i]) + G.add_edge("var_bias", f"var_{i}", weight=values.numpy()[i]) # Sampled layer from mu and var: elif layer == "decoderlayers.0.weight": @@ -209,7 +209,7 @@ def plot_vae( elif layer == "decoderlayers.0.bias": for j in range(values.shape[0]): # decoder_hidden G.add_edge( - f"sam_bias", f"decoder_hidden_{j}", weight=values.numpy()[j] + "sam_bias", f"decoder_hidden_{j}", weight=values.numpy()[j] ) elif layer == "out.weight": @@ -223,7 +223,7 @@ def plot_vae( elif layer == "out.bias": for k in range(values.shape[0]): # output - G.add_edge(f"out_bias", f"output_{k}", weight=values.numpy()[k]) + G.add_edge("out_bias", f"output_{k}", weight=values.numpy()[k]) fig = plt.figure(figsize=(60, 60)) pos = nx.get_node_attributes(G, "pos") @@ -237,7 +237,7 @@ def plot_vae( abs_max = np.max([abs(np.min(color)), abs(np.max(color))]) abs_max_edge = np.max([abs(np.min(edge_color)), abs(np.max(edge_color))]) - sm_node = cm.ScalarMappable( + _ = cm.ScalarMappable( cmap=node_cmap, norm=matplotlib.colors.Normalize(vmin=-abs_max, vmax=abs_max) ) sm_edge = cm.ScalarMappable(