Skip to content

Commit

Permalink
✨ add log2 option
Browse files Browse the repository at this point in the history
- decide whether to log2-transform
- default: do not, in order to allow negative features, which are then standard-normalized
  • Loading branch information
Henry committed Jul 9, 2024
1 parent dc9020e commit 9cd2a7b
Show file tree
Hide file tree
Showing 5 changed files with 19 additions and 8 deletions.
1 change: 1 addition & 0 deletions src/move/conf/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class InputConfig:
@dataclass
class ContinuousInputConfig(InputConfig):
    """Configuration for a continuous-valued input dataset."""

    # Standard-normalize each feature (zero mean, unit variance).
    scale: bool = True
    # Apply a log2(x + 1) transform before scaling. Off by default so that
    # datasets containing negative feature values can still be standardized.
    log2: bool = False


@dataclass
Expand Down
7 changes: 4 additions & 3 deletions src/move/data/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,19 +64,20 @@ def one_hot_encode_single(mapping: dict[str, int], value: Optional[str]) -> IntA
return encoded_value


def scale(x: np.ndarray) -> tuple[FloatArray, BoolArray]:
def scale(x: np.ndarray, log2: bool = False) -> tuple[FloatArray, BoolArray]:
    """Center to mean and scale to unit variance. Convert NaN values to 0.

    Args:
        x: 2D array with samples in its rows and features in its columns
        log2: whether to apply a log2(x + 1) transformation before scaling

    Returns:
        Tuple containing (1) scaled output and (2) a 1D mask marking columns
        (i.e., features) without zero variance
    """
    # log2 is opt-in: the transform assumes non-negative input (log2(x + 1)
    # is undefined for x <= -1), whereas plain standardization also handles
    # negative feature values.
    logx = x
    if log2:
        logx = np.log2(x + 1)
    # Mask out zero-variance features: standardizing them would divide by a
    # (near-)zero standard deviation.
    mask_1d = ~np.isclose(np.nanstd(logx, axis=0), 0.0)
    scaled_x = standardize(logx[:, mask_1d], axis=0)
    scaled_x[np.isnan(scaled_x)] = 0
Expand Down
2 changes: 1 addition & 1 deletion src/move/tasks/encode_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def encode_data(config: DataConfig):
fig.savefig(fig_path)

if scale:
values, mask_1d = preprocessing.scale(values)
values, mask_1d = preprocessing.scale(values, input_config.log2)
names = names[mask_1d]
logger.debug(f"Columns with zero variance: {np.sum(~mask_1d)}")
io.dump_names(interim_data_path / f"{input_config.name}.txt", names)
Expand Down
9 changes: 7 additions & 2 deletions tutorial/config/data/random_continuous.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,10 @@ sample_names:
categorical_inputs: [] # no categorical inputs

continuous_inputs: # a list of continuous datasets
- name: random.continuous.proteomics
- name: random.continuous.metagenomics
- name: random.continuous.proteomics # filename in raw_data_path
log2: true # log2 transform data
scale: true # scale data
- name: random.continuous.metagenomics # filename in raw_data_path
log2: true # log2 transform data
scale: true # scale data

8 changes: 6 additions & 2 deletions tutorial/config/data/random_small.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,9 @@ categorical_inputs: # a list of categorical datasets
- name: random.small.drugs

continuous_inputs: # a list of continuous datasets
- name: random.small.proteomics
- name: random.small.metagenomics
- name: random.small.proteomics # filename in raw_data_path
scale: true # scale data
log2: true # log2 transform data
- name: random.small.metagenomics # filename in raw_data_path
scale: true # scale data
log2: true # log2 transform data

0 comments on commit 9cd2a7b

Please sign in to comment.