Skip to content

Commit

Permalink
✨ add log2 option
Browse files Browse the repository at this point in the history
- decide whether to log2-transform
- default: do not, in order to allow negative features, which are then standard-normalized
  • Loading branch information
Henry committed Jul 9, 2024
1 parent dc9020e commit 9cd2a7b
Show file tree
Hide file tree
Showing 5 changed files with 19 additions and 8 deletions.
1 change: 1 addition & 0 deletions src/move/conf/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class InputConfig:
@dataclass
class ContinuousInputConfig(InputConfig):
    """Configuration for a continuous-valued input dataset."""

    # Standard-normalize each feature (zero mean, unit variance).
    scale: bool = True
    # Apply a log2(x + 1) transform before scaling. Off by default so that
    # datasets containing negative feature values can still be standardized.
    log2: bool = False


@dataclass
Expand Down
7 changes: 4 additions & 3 deletions src/move/data/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,19 +64,20 @@ def one_hot_encode_single(mapping: dict[str, int], value: Optional[str]) -> IntA
return encoded_value


def scale(x: np.ndarray) -> tuple[FloatArray, BoolArray]:
def scale(x: np.ndarray, log2: bool = False) -> tuple[FloatArray, BoolArray]:
    """Center to mean and scale to unit variance. Convert NaN values to 0.

    Args:
        x: 2D array with samples in its rows and features in its columns
        log2: whether to apply a log2(x + 1) transformation before scaling

    Returns:
        Tuple containing (1) scaled output and (2) a 1D mask marking columns
        (i.e., features) without zero variance
    """
    # log2 is opt-in: the transform assumes non-negative input (log2(x + 1)
    # is undefined for x <= -1), whereas plain standardization also handles
    # negative feature values.
    logx = x
    if log2:
        logx = np.log2(x + 1)
    # Mask out zero-variance features: standardizing them would divide by a
    # (near-)zero standard deviation.
    mask_1d = ~np.isclose(np.nanstd(logx, axis=0), 0.0)
    scaled_x = standardize(logx[:, mask_1d], axis=0)
    scaled_x[np.isnan(scaled_x)] = 0
Expand Down
2 changes: 1 addition & 1 deletion src/move/tasks/encode_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def encode_data(config: DataConfig):
fig.savefig(fig_path)

if scale:
values, mask_1d = preprocessing.scale(values)
values, mask_1d = preprocessing.scale(values, input_config.log2)
names = names[mask_1d]
logger.debug(f"Columns with zero variance: {np.sum(~mask_1d)}")
io.dump_names(interim_data_path / f"{input_config.name}.txt", names)
Expand Down
9 changes: 7 additions & 2 deletions tutorial/config/data/random_continuous.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,10 @@ sample_names:
categorical_inputs: [] # no categorical inputs

continuous_inputs: # a list of continuous datasets
- name: random.continuous.proteomics
- name: random.continuous.metagenomics
- name: random.continuous.proteomics # filename in raw_data_path
log2: true # log2 transform data
scale: true # scale data
- name: random.continuous.metagenomics # filename in raw_data_path
log2: true # log2 transform data
scale: true # scale data

8 changes: 6 additions & 2 deletions tutorial/config/data/random_small.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,9 @@ categorical_inputs: # a list of categorical datasets
- name: random.small.drugs

continuous_inputs: # a list of continuous datasets
- name: random.small.proteomics
- name: random.small.metagenomics
- name: random.small.proteomics # filename in raw_data_path
scale: true # scale data
log2: true # log2 transform data
- name: random.small.metagenomics # filename in raw_data_path
scale: true # scale data
log2: true # log2 transform data

0 comments on commit 9cd2a7b

Please sign in to comment.