-
Notifications
You must be signed in to change notification settings - Fork 7
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Generate random (gaussian or realistic) distance matrices #45
Changes from 4 commits
71bf377
46af739
df74f16
1497fae
19e414f
55c8127
d3ddea2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
#!/usr/bin/env python | ||
|
||
import os | ||
|
||
import click | ||
import numpy as np | ||
from skbio import DistanceMatrix | ||
from skbio.stats.distance import randdm | ||
from bayesian_regression.util.generators import band_table, block_table | ||
from skbio.diversity import beta_diversity | ||
|
||
|
||
@click.command() | ||
@click.option('--structure', 'structure', type=str, | ||
help='Optional. Generate structure in the random data that ' | ||
'models distributions of microbial communities. ' | ||
'Options: "band" or "block". If not specified, then ' | ||
'draws from gaussian distribution to generate distance ' | ||
'matrices.', default=None) | ||
@click.option('--dim', 'dimensions', type=int, multiple=True, | ||
help='Dimension of the input matrix/matrices to generate') | ||
@click.option('--seed', 'seed', type=int, | ||
help='Random number generator seed value.') | ||
@click.option('--sub', 'subsample_dims', type=str, multiple=True, | ||
help='Generate subsampled distance matrix with given' | ||
'dimension (must be smaller than original dimension)' | ||
' for each randomly generated distance matrix.' | ||
' Can specify as int or percentage') | ||
@click.option('--overwrite', 'overwrite', type=bool, default=False, | ||
help='Overwrite output directory if it already exists. Do not ' | ||
'overwrite, by default.') | ||
@click.argument('output_dir', type=click.Path(), default=None) | ||
def generate(dimensions, output_dir, seed, subsample_dims, structure, | ||
overwrite): | ||
""" | ||
Generate random distance matrix | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add a description of the parameters and the outputs? |
||
""" | ||
|
||
if seed is not None: | ||
np.random.seed(seed) | ||
|
||
if (not overwrite and os.path.exists(output_dir) and not click.confirm( | ||
'The output directory %s exists. ' | ||
'Do you want to overwrite?' % output_dir)): | ||
click.echo('Aborting.') | ||
return | ||
|
||
if not os.path.exists(output_dir): | ||
os.makedirs(output_dir) | ||
|
||
for dim in dimensions: | ||
outpath = os.path.join(output_dir, 'randdm-{}.txt'.format(dim)) | ||
|
||
click.echo('Generating random distance matrix: %s' % outpath) | ||
|
||
# Generate random distance matrix | ||
if structure is not None: | ||
click.echo('Generating synthetic OTU table with "{}"-like ' | ||
'microbial distribution...'.format(structure)) | ||
if structure == 'band': | ||
biom_table = band_table(dim, dim, seed=seed)[0] | ||
elif structure == 'block': | ||
biom_table = block_table(dim, dim, seed=seed)[0] | ||
else: | ||
raise ValueError('Invalid value for --structure parameter.') | ||
|
||
otu_table = biom_table.matrix_data.todense() | ||
|
||
click.echo('Generating distance matrix from OTU table...') | ||
distance_matrix = beta_diversity('braycurtis', otu_table, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are we ok with hard-coding the distance measure? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I changed it to be a CLI param with braycurtis as default value. |
||
validate=False) | ||
|
||
else: | ||
click.echo('Generating random distance matrix ' | ||
'(gaussian distribution)') | ||
distance_matrix = randdm(dim, constructor=DistanceMatrix) | ||
|
||
# Serialize distance matrix | ||
distance_matrix.write(outpath) | ||
|
||
# Subsampling | ||
for subsample_dim in subsample_dims: | ||
# Parse parameter values into integers | ||
if '%' in subsample_dim: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So in theory you can pass: 100%, 200%, -10001%, XX for this value ... perhaps worth adding a nicer validation step so it doesn't brake cause it can't transform to int; this applies to all parameters. |
||
percent = float(subsample_dim[:-1]) / 100 | ||
subsample_dim = int(percent * dim) | ||
else: | ||
subsample_dim = int(subsample_dim) | ||
|
||
if subsample_dim >= dim: | ||
raise ValueError('Subsample dimension %d must be smaller than' | ||
'original matrix dimension %d' % ( | ||
subsample_dim, | ||
dim)) | ||
subsampled_outpath = os.path.join(output_dir, | ||
'randdm-{}-sub-{}.txt' | ||
.format(dim, subsample_dim)) | ||
|
||
click.echo( | ||
'Subsampling original randomly generated distance matrix ' | ||
'with subsample dimension %d: %s' % (subsample_dim, | ||
subsampled_outpath)) | ||
|
||
ids = distance_matrix.ids | ||
# Subsample without replacement | ||
subsampled_ids = np.random.choice(ids, subsample_dim, | ||
replace=False) | ||
subsampled_matrix = distance_matrix.filter(subsampled_ids) | ||
subsampled_matrix.write(subsampled_outpath) | ||
|
||
click.echo('Done.') | ||
|
||
|
||
if __name__ == '__main__': | ||
generate() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you specify the version of gneiss you are using? We may not guarantee backwards compatibility in the future.