biocore · wdwvt1 · Jul 20, 2015 · Jul 20, 2015
diff --git a/qiime/filter.py b/qiime/filter.py
@@ -13,7 +13,7 @@
 
 from collections import defaultdict
 from random import shuffle, sample
-from numpy import array, inf
+from numpy import array, inf, apply_along_axis, in1d
 
 from skbio.parse.sequences import parse_fasta, parse_fastq
 from skbio.format.sequences import format_fastq_record
@@ -386,48 +386,68 @@ def keep_seq(seq_id):
 
 
 def filter_mapping_file(map_data, map_header, good_sample_ids,
-                        include_repeat_cols=False, column_rename_ids=None):
-    """Filters map according to several criteria.
-
-    - keep only sample ids in good_sample_ids
-    - drop cols that are different in every sample (except id)
-    - drop cols that are the same in every sample
-    """
-    # keeping samples
-    to_keep = []
-    to_keep.extend([i for i in map_data if i[0] in good_sample_ids])
-
-    # keeping columns
-    headers = []
-    to_keep = zip(*to_keep)
-    headers.append(map_header[0])
-    result = [to_keep[0]]
-
-    if column_rename_ids:
-        # reduce in 1 as we are not using the first colum (SampleID)
-        column_rename_ids = column_rename_ids - 1
-        for i, l in enumerate(to_keep[1:-1]):
-            if i == column_rename_ids:
-                if len(set(l)) != len(result[0]):
-                    raise ValueError(
-                        "The column to rename the samples is not unique.")
-                result.append(result[0])
-                result[0] = l
-                headers.append('SampleID_was_' + map_header[i + 1])
-            elif include_repeat_cols or len(set(l)) > 1:
-                headers.append(map_header[i + 1])
-                result.append(l)
-    else:
-        for i, l in enumerate(to_keep[1:-1]):
-            if include_repeat_cols or len(set(l)) > 1:
-                headers.append(map_header[i + 1])
-                result.append(l)
-    headers.append(map_header[-1])
-    result.append(to_keep[-1])
-
-    result = map(list, zip(*result))
-
-    return headers, result
+                        include_repeat_cols=False, include_unique_cols=True):
+    '''Filter samples from a mapping file.
+
+    This script filters out all samples not in `good_sample_ids` (i.e. removing
+    rows of a mapping file). It defaults to removing metadata columns if every
+    sample in the `good_sample_ids` has the same value for that column. It can
+    optionally remove metadata columns if every sample in the `good_sample_ids`
+    has a unique value for that colum (excluding the 0th column - the sample id
+    column).
+
+    If `good_sample_ids` only contains one sample then that sample will be
+    returned along with all columns because uniquness for metadata values is
+    no longer meaningful.
+
+    Parameters
+    ----------
+    map_data : list
+        List of lists of mapping file data. Expected format is that produced by
+        qiime.parse.parse_mapping_file.
+    map_header : list
+        List of strs of mapping file headers. Expected format is that produced
+        by qiime.parse.parse_mapping_file.
+    good_sample_ids : list
+        List of sample ids to retain.
+    include_repeat_cols : boolean
+        If true, retain columns where all retained samples have the same value
+        for the column.
+    include_unique_cols : boolean
+        If true, retain columns where all retained samples have a unique value
+        for the column.
+
+    Returns
+    -------
+    list
+        List of headers that have been retained.
+    list
+        List of lists of mapping data including retained rows (samples) and
+        columns.
+    '''
+    # this breaks backward compatability on one line mapping files.
+    if len(good_sample_ids) == 1:
+        return map_header, [r for r in map_data if r[0] == good_sample_ids[0]]
+
+    data = array(map_data)
+    headers = array(map_header)
+    rd = data[in1d(data[:, 0], good_sample_ids)]
+
+    # calculate number of unique values in each column of the mapping file
+    num_unique_vals = apply_along_axis(lambda col: len(set(col)), 0, rd)
+
+    cols_to_keep = array([True] * rd.shape[1])
+    if not include_repeat_cols:
+        cols_to_keep[(num_unique_vals == 1)] = False
+    if not include_unique_cols:
+        cols_to_keep[(num_unique_vals == rd.shape[0])] = False
+
+    # we will always keep the 0th column - the sample id column, so we are
+    # doing more calculations than necessary, but its simpler to just set the
+    # first entry of cols_to_keep to True.
+    cols_to_keep[0] = True
+
+    return list(headers[cols_to_keep]), map(list, rd[:, cols_to_keep])
 
 
 def filter_mapping_file_from_mapping_f(

diff --git a/qiime_test_data/filter_samples_from_otu_table/new_map.txt b/qiime_test_data/filter_samples_from_otu_table/new_map.txt
@@ -0,0 +1 @@
+#SampleID	BarcodeSequence	LinkerPrimerSequence	Treatment	DOB	DescriptionPC.354	AGCACGAGCCTA	YATGCTGCCTCCCGTAGGAGT	Control	20061218	Control_mouse__I.D._354PC.355	AACTCGTCGATG	YATGCTGCCTCCCGTAGGAGT	Control	20061218	Control_mouse__I.D._355PC.356	ACAGACCACTCA	YATGCTGCCTCCCGTAGGAGT	Control	20061126	Control_mouse__I.D._356PC.481	ACCAGCGACTAG	YATGCTGCCTCCCGTAGGAGT	Control	20070314	Control_mouse__I.D._481PC.593	AGCAGCACTTGT	YATGCTGCCTCCCGTAGGAGT	Control	20071210	Control_mouse__I.D._593

diff --git a/scripts/filter_samples_from_otu_table.py b/scripts/filter_samples_from_otu_table.py
@@ -9,7 +9,8 @@
                "Jesse Stombaugh",
                "Dan Knights",
                "Daniel McDonald",
-               "Jai Ram Rideout"]
+               "Jai Ram Rideout",
+               "Will Van Treuren"]
 __license__ = "GPL"
 __version__ = "1.9.1-dev"
 __maintainer__ = "Greg Caporaso"
@@ -39,6 +40,8 @@
                         "%prog -i otu_table.biom -o otu_table_control_only.biom -m map.txt -s 'Treatment:Control'"),
                        ("Metadata-based filtering (negative)", "Filter samples from the table, keeping samples where the value for 'Treatment' in the mapping file is not 'Control'",
                         "%prog -i otu_table.biom -o otu_table_not_control.biom -m map.txt -s 'Treatment:*,!Control'"),
+                       ("Metadata-based filtering (positive)", "Filter samples from the table, keeping samples where the value for 'Treatment' in the mapping file is 'Control' and output a new mapping file",
+                        "%prog -i otu_table.biom -o otu_table_control_only.biom -m map.txt -s 'Treatment:Control' --output_mapping_fp new_map.txt"),
                        ("ID-based filtering", "Keep samples where the id is listed in ids.txt", "%prog -i otu_table.biom -o filtered_otu_table.biom --sample_id_fp ids.txt"),
                        ("ID-based filtering (negation)", "Discard samples where the id is listed in ids.txt", "%prog -i otu_table.biom -o filtered_otu_table.biom --sample_id_fp ids.txt --negate_sample_id_fp")]
 
@@ -149,7 +152,9 @@ def main():
             filter_mapping_file(
                 mapping_data,
                 mapping_headers,
-                filtered_otu_table.ids())
+                filtered_otu_table.ids(),
+                include_repeat_cols=True,
+                include_unique_cols=True)
         open(
             output_mapping_fp,
             'w').write(

diff --git a/tests/test_filter.py b/tests/test_filter.py
@@ -4,7 +4,8 @@
 
 __author__ = "Greg Caporaso"
 __copyright__ = "Copyright 2011, The QIIME Project"
-__credits__ = ["Greg Caporaso", "Jai Ram Rideout", "Yoshiki Vazquez Baeza"]
+__credits__ = ["Greg Caporaso", "Jai Ram Rideout", "Yoshiki Vazquez Baeza",
+               "Will Van Treuren"]
 __license__ = "GPL"
 __version__ = "1.9.1-dev"
 __maintainer__ = "Greg Caporaso"
@@ -105,12 +106,81 @@ def test_negate_tips_to_keep(self):
         self.assertItemsEqual(negate_tips_to_keep(tips_to_keep, t), expected)
 
     def test_filter_mapping_file(self):
-        """filter_mapping_file should filter map file according to sample ids"""
+        """filter_mapping_file should filter map file according to sample ids
+        """
+        # make sure new code passes old tests
         self.assertEqual(filter_mapping_file(self.map_data, self.map_headers,
-                                             ['a', 'b', 'c', 'd', 'e', 'f']), (self.map_headers, self.map_data))
-        self.assertEqual(
-            filter_mapping_file(self.map_data, self.map_headers, ['a']),
-            (['SampleID', 'Description'], ['a\tx'.split('\t')]))
+                                             ['a', 'b', 'c', 'd', 'e', 'f']),
+                                            (self.map_headers, self.map_data))
+
+        # this test has been removed - we now won't filter 1 sample mapping
+        # files because the idea of uniqueness doesn't make sense.
+        # self.assertEqual(filter_mapping_file(self.map_data, self.map_headers,
+        #                                      ['a']),
+        #                 (['SampleID', 'Description'], ['a\tx'.split('\t')]))
+
+        # test: non-unique colums removed, unique columns retained
+        obs_headers, obs_data = filter_mapping_file(self.map_data,
+                                                    self.map_headers,
+                                                    ['a', 'b', 'e'],
+                                                    include_repeat_cols=False,
+                                                    include_unique_cols=True)
+        exp_headers = ['SampleID', 'Study', 'Description']
+        exp_data = [['a', 'Dog', 'x'],
+                    ['b', 'Dog', 'y'],
+                    ['e', 'WholeBody', 'b']]
+        self.assertEqual(obs_headers, exp_headers)
+        self.assertEqual(obs_data, exp_data)
+
+        # test: non-unique colums retained, unique columns removed
+        obs_headers, obs_data = filter_mapping_file(self.map_data,
+                                                    self.map_headers,
+                                                    ['a', 'b'],
+                                                    include_repeat_cols=True,
+                                                    include_unique_cols=False)
+        exp_headers = ['SampleID', 'Study', 'BodySite']
+        exp_data = [['a', 'Dog', 'Stool'],
+                    ['b', 'Dog', 'Stool']]
+        self.assertEqual(obs_headers, exp_headers)
+        self.assertEqual(obs_data, exp_data)
+
+        # test: non-unique colums remove, unique columns removed
+        obs_headers, obs_data = filter_mapping_file(self.map_data,
+                                                    self.map_headers,
+                                                    ['a', 'b'],
+                                                    include_repeat_cols=False,
+                                                    include_unique_cols=False)
+        exp_headers = ['SampleID']
+        exp_data = [['a'],
+                    ['b']]
+        self.assertEqual(obs_headers, exp_headers)
+        self.assertEqual(obs_data, exp_data)
+
+        # test with new data - retain both
+        data, headers, _ = parse_mapping_file(StringIO(map_str3))
+
+        obs_headers, obs_data = filter_mapping_file(data, headers,
+                                                    ['a', 'b', 'd', 'e'],
+                                                    include_repeat_cols=True,
+                                                    include_unique_cols=True)
+        exp_headers = ['SampleID', 'Study', 'BodySite', 'Description']
+        exp_data = [['a', 'study1', 'Stool', 'x'],
+                    ['b', 'study1', 'Eye', 'y'],
+                    ['d', 'study1', 'Palm', 'a'],
+                    ['e', 'study1', 'Lung', 'b']]
+        self.assertEqual(obs_headers, exp_headers)
+        self.assertEqual(obs_data, exp_data)
+
+        # test that with one sample we get back that sample and all columns
+        obs_headers, obs_data = filter_mapping_file(self.map_data,
+                                                    self.map_headers,
+                                                    ['a'],
+                                                    include_repeat_cols=False,
+                                                    include_unique_cols=False)
+        exp_headers = ['SampleID', 'Study', 'BodySite', 'Description']
+        exp_data = [['a', 'Dog', 'Stool', 'x']]
+        self.assertEqual(obs_headers, exp_headers)
+        self.assertEqual(obs_data, exp_data)
 
     def test_filter_mapping_file_from_mapping_f(self):
         """ filter_mapping_file_from_mapping_f functions as expected """
@@ -1252,6 +1322,13 @@ def test_get_seq_ids_from_seq_id_file(self):
 f\tI1\t3\tPalm\tc
 g\tI1\t2\tStool\td"""
 
+map_str3 = """#SampleID\tStudy\tBodySite\tDescription
+a\tstudy1\tStool\tx
+b\tstudy1\tEye\ty
+c\tstudy1\tEar\tz
+d\tstudy1\tPalm\ta
+e\tstudy1\tLung\tb"""
+
 filter_fasta_expected1 = """>Seq1 some comment
 ACCTTGG
 >s2 some other comment