diff --git a/qiime/filter.py b/qiime/filter.py index 6133ae2a49..1620cfff86 100644 --- a/qiime/filter.py +++ b/qiime/filter.py @@ -13,7 +13,7 @@ from collections import defaultdict from random import shuffle, sample -from numpy import array, inf +from numpy import array, inf, apply_along_axis, in1d from skbio.parse.sequences import parse_fasta, parse_fastq from skbio.format.sequences import format_fastq_record @@ -386,48 +386,68 @@ def keep_seq(seq_id): def filter_mapping_file(map_data, map_header, good_sample_ids, - include_repeat_cols=False, column_rename_ids=None): - """Filters map according to several criteria. - - - keep only sample ids in good_sample_ids - - drop cols that are different in every sample (except id) - - drop cols that are the same in every sample - """ - # keeping samples - to_keep = [] - to_keep.extend([i for i in map_data if i[0] in good_sample_ids]) - - # keeping columns - headers = [] - to_keep = zip(*to_keep) - headers.append(map_header[0]) - result = [to_keep[0]] - - if column_rename_ids: - # reduce in 1 as we are not using the first colum (SampleID) - column_rename_ids = column_rename_ids - 1 - for i, l in enumerate(to_keep[1:-1]): - if i == column_rename_ids: - if len(set(l)) != len(result[0]): - raise ValueError( - "The column to rename the samples is not unique.") - result.append(result[0]) - result[0] = l - headers.append('SampleID_was_' + map_header[i + 1]) - elif include_repeat_cols or len(set(l)) > 1: - headers.append(map_header[i + 1]) - result.append(l) - else: - for i, l in enumerate(to_keep[1:-1]): - if include_repeat_cols or len(set(l)) > 1: - headers.append(map_header[i + 1]) - result.append(l) - headers.append(map_header[-1]) - result.append(to_keep[-1]) - - result = map(list, zip(*result)) - - return headers, result + include_repeat_cols=False, include_unique_cols=True): + '''Filter samples from a mapping file. + + This script filters out all samples not in `good_sample_ids` (i.e. removing + rows of a mapping file). It defaults to removing metadata columns if every + sample in the `good_sample_ids` has the same value for that column. It can + optionally remove metadata columns if every sample in the `good_sample_ids` + has a unique value for that colum (excluding the 0th column - the sample id + column). + + If `good_sample_ids` only contains one sample then that sample will be + returned along with all columns because uniquness for metadata values is + no longer meaningful. + + Parameters + ---------- + map_data : list + List of lists of mapping file data. Expected format is that produced by + qiime.parse.parse_mapping_file. + map_header : list + List of strs of mapping file headers. Expected format is that produced + by qiime.parse.parse_mapping_file. + good_sample_ids : list + List of sample ids to retain. + include_repeat_cols : boolean + If true, retain columns where all retained samples have the same value + for the column. + include_unique_cols : boolean + If true, retain columns where all retained samples have a unique value + for the column. + + Returns + ------- + list + List of headers that have been retained. + list + List of lists of mapping data including retained rows (samples) and + columns. + ''' + # this breaks backward compatability on one line mapping files. + if len(good_sample_ids) == 1: + return map_header, [r for r in map_data if r[0] == good_sample_ids[0]] + + data = array(map_data) + headers = array(map_header) + rd = data[in1d(data[:, 0], good_sample_ids)] + + # calculate number of unique values in each column of the mapping file + num_unique_vals = apply_along_axis(lambda col: len(set(col)), 0, rd) + + cols_to_keep = array([True] * rd.shape[1]) + if not include_repeat_cols: + cols_to_keep[(num_unique_vals == 1)] = False + if not include_unique_cols: + cols_to_keep[(num_unique_vals == rd.shape[0])] = False + + # we will always keep the 0th column - the sample id column, so we are + # doing more calculations than necessary, but its simpler to just set the + # first entry of cols_to_keep to True. + cols_to_keep[0] = True + + return list(headers[cols_to_keep]), map(list, rd[:, cols_to_keep]) def filter_mapping_file_from_mapping_f( diff --git a/qiime_test_data/filter_samples_from_otu_table/new_map.txt b/qiime_test_data/filter_samples_from_otu_table/new_map.txt new file mode 100644 index 0000000000..accede626d --- /dev/null +++ b/qiime_test_data/filter_samples_from_otu_table/new_map.txt @@ -0,0 +1 @@ +#SampleID BarcodeSequence LinkerPrimerSequence Treatment DOB Description PC.354 AGCACGAGCCTA YATGCTGCCTCCCGTAGGAGT Control 20061218 Control_mouse__I.D._354 PC.355 AACTCGTCGATG YATGCTGCCTCCCGTAGGAGT Control 20061218 Control_mouse__I.D._355 PC.356 ACAGACCACTCA YATGCTGCCTCCCGTAGGAGT Control 20061126 Control_mouse__I.D._356 PC.481 ACCAGCGACTAG YATGCTGCCTCCCGTAGGAGT Control 20070314 Control_mouse__I.D._481 PC.593 AGCAGCACTTGT YATGCTGCCTCCCGTAGGAGT Control 20071210 Control_mouse__I.D._593 \ No newline at end of file diff --git a/scripts/filter_samples_from_otu_table.py b/scripts/filter_samples_from_otu_table.py index 26aa8e5719..25bf5f7874 100755 --- a/scripts/filter_samples_from_otu_table.py +++ b/scripts/filter_samples_from_otu_table.py @@ -9,7 +9,8 @@ "Jesse Stombaugh", "Dan Knights", "Daniel McDonald", - "Jai Ram Rideout"] + "Jai Ram Rideout", + "Will Van Treuren"] __license__ = "GPL" __version__ = "1.9.1-dev" __maintainer__ = "Greg Caporaso" @@ -39,6 +40,8 @@ "%prog -i otu_table.biom -o otu_table_control_only.biom -m map.txt -s 'Treatment:Control'"), ("Metadata-based filtering (negative)", "Filter samples from the table, keeping samples where the value for 'Treatment' in the mapping file is not 'Control'", "%prog -i otu_table.biom -o otu_table_not_control.biom -m map.txt -s 'Treatment:*,!Control'"), + ("Metadata-based filtering (positive)", "Filter samples from the table, keeping samples where the value for 'Treatment' in the mapping file is 'Control' and output a new mapping file", + "%prog -i otu_table.biom -o otu_table_control_only.biom -m map.txt -s 'Treatment:Control' --output_mapping_fp new_map.txt"), ("ID-based filtering", "Keep samples where the id is listed in ids.txt", "%prog -i otu_table.biom -o filtered_otu_table.biom --sample_id_fp ids.txt"), ("ID-based filtering (negation)", "Discard samples where the id is listed in ids.txt", "%prog -i otu_table.biom -o filtered_otu_table.biom --sample_id_fp ids.txt --negate_sample_id_fp")] @@ -149,7 +152,9 @@ def main(): filter_mapping_file( mapping_data, mapping_headers, - filtered_otu_table.ids()) + filtered_otu_table.ids(), + include_repeat_cols=True, + include_unique_cols=True) open( output_mapping_fp, 'w').write( diff --git a/tests/test_filter.py b/tests/test_filter.py index f55c54cccf..a907876be8 100755 --- a/tests/test_filter.py +++ b/tests/test_filter.py @@ -4,7 +4,8 @@ __author__ = "Greg Caporaso" __copyright__ = "Copyright 2011, The QIIME Project" -__credits__ = ["Greg Caporaso", "Jai Ram Rideout", "Yoshiki Vazquez Baeza"] +__credits__ = ["Greg Caporaso", "Jai Ram Rideout", "Yoshiki Vazquez Baeza", + "Will Van Treuren"] __license__ = "GPL" __version__ = "1.9.1-dev" __maintainer__ = "Greg Caporaso" @@ -105,12 +106,81 @@ def test_negate_tips_to_keep(self): self.assertItemsEqual(negate_tips_to_keep(tips_to_keep, t), expected) def test_filter_mapping_file(self): - """filter_mapping_file should filter map file according to sample ids""" + """filter_mapping_file should filter map file according to sample ids + """ + # make sure new code passes old tests self.assertEqual(filter_mapping_file(self.map_data, self.map_headers, - ['a', 'b', 'c', 'd', 'e', 'f']), (self.map_headers, self.map_data)) - self.assertEqual( - filter_mapping_file(self.map_data, self.map_headers, ['a']), - (['SampleID', 'Description'], ['a\tx'.split('\t')])) + ['a', 'b', 'c', 'd', 'e', 'f']), + (self.map_headers, self.map_data)) + + # this test has been removed - we now won't filter 1 sample mapping + # files because the idea of uniqueness doesn't make sense. + # self.assertEqual(filter_mapping_file(self.map_data, self.map_headers, + # ['a']), + # (['SampleID', 'Description'], ['a\tx'.split('\t')])) + + # test: non-unique colums removed, unique columns retained + obs_headers, obs_data = filter_mapping_file(self.map_data, + self.map_headers, + ['a', 'b', 'e'], + include_repeat_cols=False, + include_unique_cols=True) + exp_headers = ['SampleID', 'Study', 'Description'] + exp_data = [['a', 'Dog', 'x'], + ['b', 'Dog', 'y'], + ['e', 'WholeBody', 'b']] + self.assertEqual(obs_headers, exp_headers) + self.assertEqual(obs_data, exp_data) + + # test: non-unique colums retained, unique columns removed + obs_headers, obs_data = filter_mapping_file(self.map_data, + self.map_headers, + ['a', 'b'], + include_repeat_cols=True, + include_unique_cols=False) + exp_headers = ['SampleID', 'Study', 'BodySite'] + exp_data = [['a', 'Dog', 'Stool'], + ['b', 'Dog', 'Stool']] + self.assertEqual(obs_headers, exp_headers) + self.assertEqual(obs_data, exp_data) + + # test: non-unique colums remove, unique columns removed + obs_headers, obs_data = filter_mapping_file(self.map_data, + self.map_headers, + ['a', 'b'], + include_repeat_cols=False, + include_unique_cols=False) + exp_headers = ['SampleID'] + exp_data = [['a'], + ['b']] + self.assertEqual(obs_headers, exp_headers) + self.assertEqual(obs_data, exp_data) + + # test with new data - retain both + data, headers, _ = parse_mapping_file(StringIO(map_str3)) + + obs_headers, obs_data = filter_mapping_file(data, headers, + ['a', 'b', 'd', 'e'], + include_repeat_cols=True, + include_unique_cols=True) + exp_headers = ['SampleID', 'Study', 'BodySite', 'Description'] + exp_data = [['a', 'study1', 'Stool', 'x'], + ['b', 'study1', 'Eye', 'y'], + ['d', 'study1', 'Palm', 'a'], + ['e', 'study1', 'Lung', 'b']] + self.assertEqual(obs_headers, exp_headers) + self.assertEqual(obs_data, exp_data) + + # test that with one sample we get back that sample and all columns + obs_headers, obs_data = filter_mapping_file(self.map_data, + self.map_headers, + ['a'], + include_repeat_cols=False, + include_unique_cols=False) + exp_headers = ['SampleID', 'Study', 'BodySite', 'Description'] + exp_data = [['a', 'Dog', 'Stool', 'x']] + self.assertEqual(obs_headers, exp_headers) + self.assertEqual(obs_data, exp_data) def test_filter_mapping_file_from_mapping_f(self): """ filter_mapping_file_from_mapping_f functions as expected """ @@ -1252,6 +1322,13 @@ def test_get_seq_ids_from_seq_id_file(self): f\tI1\t3\tPalm\tc g\tI1\t2\tStool\td""" +map_str3 = """#SampleID\tStudy\tBodySite\tDescription +a\tstudy1\tStool\tx +b\tstudy1\tEye\ty +c\tstudy1\tEar\tz +d\tstudy1\tPalm\ta +e\tstudy1\tLung\tb""" + filter_fasta_expected1 = """>Seq1 some comment ACCTTGG >s2 some other comment