Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate random (gaussian or realistic) distance matrices #45

Merged
merged 7 commits into from
May 7, 2018
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
bayesian-regression/

# OS generated files
.DS_Store
.DS_Store?
Expand Down Expand Up @@ -53,4 +55,4 @@ pip-log.txt
# Generated PCoA output of algorithms
out/*

.env
.env
Binary file added bayesian_regression-0.1-py3-none-any.whl
Binary file not shown.
14 changes: 11 additions & 3 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,21 @@ channels:
- defaults
- bioconda
- biocore
- anaconda
dependencies:
- scikit-bio=0.4.2
- click=6.7
- python=3
- scikit-bio
- click=6.7 # install from anaconda channel
- python=3.5
- numpy
- scikit-learn=0.19.1
- scipy
- pandas
- jupyter
- matplotlib
- gneiss
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you specify the version of gneiss you are using? We may not guarantee backwards compatibility in the future.

- pytz
- tensorflow
- pip
# also need to install bayesian-regression:
- pip:
- bayesian_regression-0.1-py3-none-any.whl
115 changes: 115 additions & 0 deletions scripts/randdm
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#!/usr/bin/env python

import os

import click
import numpy as np
from skbio import DistanceMatrix
from skbio.stats.distance import randdm
from bayesian_regression.util.generators import band_table, block_table
from skbio.diversity import beta_diversity


@click.command()
@click.option('--structure', 'structure', type=str,
help='Optional. Generate structure in the random data that '
'models distributions of microbial communities. '
'Options: "band" or "block". If not specified, then '
'draws from gaussian distribution to generate distance '
'matrices.', default=None)
@click.option('--dim', 'dimensions', type=int, multiple=True,
help='Dimension of the input matrix/matrices to generate')
@click.option('--seed', 'seed', type=int,
help='Random number generator seed value.')
@click.option('--sub', 'subsample_dims', type=str, multiple=True,
help='Generate subsampled distance matrix with given'
'dimension (must be smaller than original dimension)'
' for each randomly generated distance matrix.'
' Can specify as int or percentage')
@click.option('--overwrite', 'overwrite', type=bool, default=False,
help='Overwrite output directory if it already exists. Do not '
'overwrite, by default.')
@click.argument('output_dir', type=click.Path(), default=None)
def generate(dimensions, output_dir, seed, subsample_dims, structure,
overwrite):
"""
Generate random distance matrix
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a description of the parameters and the outputs?

"""

if seed is not None:
np.random.seed(seed)

if (not overwrite and os.path.exists(output_dir) and not click.confirm(
'The output directory %s exists. '
'Do you want to overwrite?' % output_dir)):
click.echo('Aborting.')
return

if not os.path.exists(output_dir):
os.makedirs(output_dir)

for dim in dimensions:
outpath = os.path.join(output_dir, 'randdm-{}.txt'.format(dim))

click.echo('Generating random distance matrix: %s' % outpath)

# Generate random distance matrix
if structure is not None:
click.echo('Generating synthetic OTU table with "{}"-like '
'microbial distribution...'.format(structure))
if structure == 'band':
biom_table = band_table(dim, dim, seed=seed)[0]
elif structure == 'block':
biom_table = block_table(dim, dim, seed=seed)[0]
else:
raise ValueError('Invalid value for --structure parameter.')

otu_table = biom_table.matrix_data.todense()

click.echo('Generating distance matrix from OTU table...')
distance_matrix = beta_diversity('braycurtis', otu_table,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we ok with hard-coding the distance measure?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I changed it to be a CLI param with braycurtis as default value.

validate=False)

else:
click.echo('Generating random distance matrix '
'(gaussian distribution)')
distance_matrix = randdm(dim, constructor=DistanceMatrix)

# Serialize distance matrix
distance_matrix.write(outpath)

# Subsampling
for subsample_dim in subsample_dims:
# Parse parameter values into integers
if '%' in subsample_dim:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So in theory you can pass: 100%, 200%, -10001%, XX for this value ... perhaps worth adding a nicer validation step so it doesn't brake cause it can't transform to int; this applies to all parameters.

percent = float(subsample_dim[:-1]) / 100
subsample_dim = int(percent * dim)
else:
subsample_dim = int(subsample_dim)

if subsample_dim >= dim:
raise ValueError('Subsample dimension %d must be smaller than'
'original matrix dimension %d' % (
subsample_dim,
dim))
subsampled_outpath = os.path.join(output_dir,
'randdm-{}-sub-{}.txt'
.format(dim, subsample_dim))

click.echo(
'Subsampling original randomly generated distance matrix '
'with subsample dimension %d: %s' % (subsample_dim,
subsampled_outpath))

ids = distance_matrix.ids
# Subsample without replacement
subsampled_ids = np.random.choice(ids, subsample_dim,
replace=False)
subsampled_matrix = distance_matrix.filter(subsampled_ids)
subsampled_matrix.write(subsampled_outpath)

click.echo('Done.')


if __name__ == '__main__':
generate()
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
url='http://github.com/biocore/mds-approximations/',
test_suite='nose.collector',
packages=find_packages(),
scripts=['scripts/mdsa', 'scripts/kruskal', 'scripts/procrustes'],
scripts=['scripts/mdsa', 'scripts/kruskal', 'scripts/procrustes',
'scripts/randdm'],
extras_require={'test': ["nose", "pep8", "flake8"]},
classifiers=classifiers)