Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug fixes for Joint-RPCA #98

Merged
merged 13 commits into from
Oct 1, 2024
13 changes: 13 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,17 @@

v0.0.12 (2024-10-01)

### Bug fixes

* Joint-RPCA re-centering bug.
* see issue #97
* Updated for the newest version of Pandas.
* see issue #96
* Reduce unecessary numpy warnings regarding log on unseen data.
* see issue #78
* Fix filter and ordering with TEMPTED
* see issue #92 and issue #93

v0.0.11 (2024-06-24)

### Bug fixes
Expand Down
2 changes: 1 addition & 1 deletion gemelli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

__version__ = "0.0.11"
__version__ = "0.0.12"
2 changes: 0 additions & 2 deletions gemelli/optspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,10 +374,8 @@ def joint_solve(self, multiple_obs):
S_shared = np.diag(S_shared)
S_shared = S_shared / np.linalg.norm(S_shared)
U_shared = np.average(sample_loadings, axis=0)
U_shared -= U_shared.mean(0)
cameronmartino marked this conversation as resolved.
Show resolved Hide resolved
feature_loadings = [(S_shared).dot(v_i.T).T
for v_i in feature_loadings]

# compensates the smaller average size of
# observed values vs. missing
S_shared = S_shared / rescal_param
Expand Down
9 changes: 6 additions & 3 deletions gemelli/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,9 +420,11 @@ def matrix_rclr(mat, branch_lengths=None):
raise ValueError('Data-matrix contains nans')
# take the log of the sample centered data
if branch_lengths is not None:
mat = np.log(matrix_closure(matrix_closure(mat) * branch_lengths))
with np.errstate(divide='ignore'):
mat = np.log(matrix_closure(matrix_closure(mat) * branch_lengths))
else:
mat = np.log(matrix_closure(mat))
with np.errstate(divide='ignore'):
mat = np.log(matrix_closure(mat))
# generate a mask of missing values
mask = [True] * mat.shape[0] * mat.shape[1]
mask = np.array(mat).reshape(mat.shape)
Expand Down Expand Up @@ -586,7 +588,8 @@ def matrix_closure(mat):
"""

mat = np.atleast_2d(mat)
mat = mat / mat.sum(axis=1, keepdims=True)
with np.errstate(divide='ignore', invalid='ignore'):
mat = mat / mat.sum(axis=1, keepdims=True)

return mat.squeeze()

Expand Down
2 changes: 1 addition & 1 deletion gemelli/q2/tests/test_transformer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas as pd
import qiime2
import numpy as np
import pandas.util.testing as pdt
import pandas.testing as pdt
from skbio.util import get_data_path
from qiime2.plugin.testing import TestPluginBase
from gemelli.q2._format import TrajectoryFormat
Expand Down
28 changes: 21 additions & 7 deletions gemelli/rpca.py
Original file line number Diff line number Diff line change
Expand Up @@ -596,8 +596,9 @@ def frequency_filter(val, id_, md):
raise ValueError('Data-table contains duplicate sample IDs')
if len(table.ids('observation')) != len(set(table.ids('observation'))):
raise ValueError('Data-table contains duplicate feature IDs')
# ensure empty samples / features are removed
table = table.remove_empty()
if min_sample_count is not None:
# ensure empty samples / features are removed
table = table.remove_empty(inplace=False)

return table

Expand Down Expand Up @@ -746,10 +747,22 @@ def joint_rpca(tables: biom.Table,
pd.DataFrame):
sample_metadata = sample_metadata.to_dataframe()
if sample_metadata is None or train_test_column is None:
test_samples = sorted(list(shared_all_samples))[:n_test_samples]
# choose samples based on RPCA loadings on first table
# a somewhat intelligent way of making sure the test
# samples are representative of the data.
ord_sort, _ = optspace_helper(rclr_tables[0].T.values,
rclr_tables[0].index,
rclr_tables[0].columns,
n_components=n_components)
ord_sort = list(ord_sort.samples.iloc[:, 0].sort_values().index)
test_samples = np.round(np.linspace(0,
len(ord_sort) - 1,
n_test_samples)
).astype(int)
test_samples = [ord_sort[i] for i in test_samples]
train_samples = list(set(shared_all_samples) - set(test_samples))
else:
sample_metadata = sample_metadata.loc[shared_all_samples, :]
sample_metadata = sample_metadata.loc[list(shared_all_samples), :]
train_samples = sample_metadata[train_test_column] == 'train'
test_samples = sample_metadata[train_test_column] == 'test'
train_samples = sample_metadata[train_samples].index
Expand Down Expand Up @@ -909,9 +922,10 @@ def transform(ordination: OrdinationResults,
' the features in the ordination.')
elif subset_tables:
unshared_N = len(set(rclr_table_df.index)) - len(shared_features)
warnings.warn('Removing %i features(s) in table(s)'
' but not the ordination.'
% (unshared_N), RuntimeWarning)
if unshared_N != 0:
warnings.warn('Removing %i features(s) in table(s)'
' but not the ordination.'
% (unshared_N), RuntimeWarning)
else:
raise ValueError('Features in the input table(s) not in'
' the features in the ordination.'
Expand Down
400 changes: 200 additions & 200 deletions gemelli/scripts/tests/rpca_data/distance-matrix.tsv

Large diffs are not rendered by default.

402 changes: 201 additions & 201 deletions gemelli/scripts/tests/rpca_data/expected-joint-distance-matrix.tsv

Large diffs are not rendered by default.

Loading
Loading