Skip to content
This repository has been archived by the owner on Nov 9, 2023. It is now read-only.

Filter mapping file issue #2062

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 63 additions & 43 deletions qiime/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from collections import defaultdict
from random import shuffle, sample
from numpy import array, inf
from numpy import array, inf, apply_along_axis, in1d

from skbio.parse.sequences import parse_fasta, parse_fastq
from skbio.format.sequences import format_fastq_record
Expand Down Expand Up @@ -386,48 +386,68 @@ def keep_seq(seq_id):


def filter_mapping_file(map_data, map_header, good_sample_ids,
include_repeat_cols=False, column_rename_ids=None):
"""Filters map according to several criteria.

- keep only sample ids in good_sample_ids
- drop cols that are different in every sample (except id)
- drop cols that are the same in every sample
"""
# keeping samples
to_keep = []
to_keep.extend([i for i in map_data if i[0] in good_sample_ids])

# keeping columns
headers = []
to_keep = zip(*to_keep)
headers.append(map_header[0])
result = [to_keep[0]]

if column_rename_ids:
# reduce in 1 as we are not using the first colum (SampleID)
column_rename_ids = column_rename_ids - 1
for i, l in enumerate(to_keep[1:-1]):
if i == column_rename_ids:
if len(set(l)) != len(result[0]):
raise ValueError(
"The column to rename the samples is not unique.")
result.append(result[0])
result[0] = l
headers.append('SampleID_was_' + map_header[i + 1])
elif include_repeat_cols or len(set(l)) > 1:
headers.append(map_header[i + 1])
result.append(l)
else:
for i, l in enumerate(to_keep[1:-1]):
if include_repeat_cols or len(set(l)) > 1:
headers.append(map_header[i + 1])
result.append(l)
headers.append(map_header[-1])
result.append(to_keep[-1])

result = map(list, zip(*result))

return headers, result
include_repeat_cols=False, include_unique_cols=True):
'''Filter samples from a mapping file.

This script filters out all samples not in `good_sample_ids` (i.e. removing
rows of a mapping file). It defaults to removing metadata columns if every
sample in the `good_sample_ids` has the same value for that column. It can
optionally remove metadata columns if every sample in the `good_sample_ids`
has a unique value for that colum (excluding the 0th column - the sample id
column).

If `good_sample_ids` only contains one sample then that sample will be
returned along with all columns because uniquness for metadata values is
no longer meaningful.

Parameters
----------
map_data : list
List of lists of mapping file data. Expected format is that produced by
qiime.parse.parse_mapping_file.
map_header : list
List of strs of mapping file headers. Expected format is that produced
by qiime.parse.parse_mapping_file.
good_sample_ids : list
List of sample ids to retain.
include_repeat_cols : boolean
If true, retain columns where all retained samples have the same value
for the column.
include_unique_cols : boolean
If true, retain columns where all retained samples have a unique value
for the column.

Returns
-------
list
List of headers that have been retained.
list
List of lists of mapping data including retained rows (samples) and
columns.
'''
# this breaks backward compatability on one line mapping files.
if len(good_sample_ids) == 1:
return map_header, [r for r in map_data if r[0] == good_sample_ids[0]]

data = array(map_data)
headers = array(map_header)
rd = data[in1d(data[:, 0], good_sample_ids)]

# calculate number of unique values in each column of the mapping file
num_unique_vals = apply_along_axis(lambda col: len(set(col)), 0, rd)

cols_to_keep = array([True] * rd.shape[1])
if not include_repeat_cols:
cols_to_keep[(num_unique_vals == 1)] = False
if not include_unique_cols:
cols_to_keep[(num_unique_vals == rd.shape[0])] = False

# we will always keep the 0th column - the sample id column, so we are
# doing more calculations than necessary, but its simpler to just set the
# first entry of cols_to_keep to True.
cols_to_keep[0] = True

return list(headers[cols_to_keep]), map(list, rd[:, cols_to_keep])


def filter_mapping_file_from_mapping_f(
Expand Down
1 change: 1 addition & 0 deletions qiime_test_data/filter_samples_from_otu_table/new_map.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#SampleID BarcodeSequence LinkerPrimerSequence Treatment DOB DescriptionPC.354 AGCACGAGCCTA YATGCTGCCTCCCGTAGGAGT Control 20061218 Control_mouse__I.D._354PC.355 AACTCGTCGATG YATGCTGCCTCCCGTAGGAGT Control 20061218 Control_mouse__I.D._355PC.356 ACAGACCACTCA YATGCTGCCTCCCGTAGGAGT Control 20061126 Control_mouse__I.D._356PC.481 ACCAGCGACTAG YATGCTGCCTCCCGTAGGAGT Control 20070314 Control_mouse__I.D._481PC.593 AGCAGCACTTGT YATGCTGCCTCCCGTAGGAGT Control 20071210 Control_mouse__I.D._593
Expand Down
9 changes: 7 additions & 2 deletions scripts/filter_samples_from_otu_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
"Jesse Stombaugh",
"Dan Knights",
"Daniel McDonald",
"Jai Ram Rideout"]
"Jai Ram Rideout",
"Will Van Treuren"]
__license__ = "GPL"
__version__ = "1.9.1-dev"
__maintainer__ = "Greg Caporaso"
Expand Down Expand Up @@ -39,6 +40,8 @@
"%prog -i otu_table.biom -o otu_table_control_only.biom -m map.txt -s 'Treatment:Control'"),
("Metadata-based filtering (negative)", "Filter samples from the table, keeping samples where the value for 'Treatment' in the mapping file is not 'Control'",
"%prog -i otu_table.biom -o otu_table_not_control.biom -m map.txt -s 'Treatment:*,!Control'"),
("Metadata-based filtering (positive)", "Filter samples from the table, keeping samples where the value for 'Treatment' in the mapping file is 'Control' and output a new mapping file",
"%prog -i otu_table.biom -o otu_table_control_only.biom -m map.txt -s 'Treatment:Control' --output_mapping_fp new_map.txt"),
("ID-based filtering", "Keep samples where the id is listed in ids.txt", "%prog -i otu_table.biom -o filtered_otu_table.biom --sample_id_fp ids.txt"),
("ID-based filtering (negation)", "Discard samples where the id is listed in ids.txt", "%prog -i otu_table.biom -o filtered_otu_table.biom --sample_id_fp ids.txt --negate_sample_id_fp")]

Expand Down Expand Up @@ -149,7 +152,9 @@ def main():
filter_mapping_file(
mapping_data,
mapping_headers,
filtered_otu_table.ids())
filtered_otu_table.ids(),
include_repeat_cols=True,
include_unique_cols=True)
open(
output_mapping_fp,
'w').write(
Expand Down
89 changes: 83 additions & 6 deletions tests/test_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["Greg Caporaso", "Jai Ram Rideout", "Yoshiki Vazquez Baeza"]
__credits__ = ["Greg Caporaso", "Jai Ram Rideout", "Yoshiki Vazquez Baeza",
"Will Van Treuren"]
__license__ = "GPL"
__version__ = "1.9.1-dev"
__maintainer__ = "Greg Caporaso"
Expand Down Expand Up @@ -105,12 +106,81 @@ def test_negate_tips_to_keep(self):
self.assertItemsEqual(negate_tips_to_keep(tips_to_keep, t), expected)

def test_filter_mapping_file(self):
"""filter_mapping_file should filter map file according to sample ids"""
"""filter_mapping_file should filter map file according to sample ids
"""
# make sure new code passes old tests
self.assertEqual(filter_mapping_file(self.map_data, self.map_headers,
['a', 'b', 'c', 'd', 'e', 'f']), (self.map_headers, self.map_data))
self.assertEqual(
filter_mapping_file(self.map_data, self.map_headers, ['a']),
(['SampleID', 'Description'], ['a\tx'.split('\t')]))
['a', 'b', 'c', 'd', 'e', 'f']),
(self.map_headers, self.map_data))

# this test has been removed - we now won't filter 1 sample mapping
# files because the idea of uniqueness doesn't make sense.
# self.assertEqual(filter_mapping_file(self.map_data, self.map_headers,
# ['a']),
# (['SampleID', 'Description'], ['a\tx'.split('\t')]))

# test: non-unique colums removed, unique columns retained
obs_headers, obs_data = filter_mapping_file(self.map_data,
self.map_headers,
['a', 'b', 'e'],
include_repeat_cols=False,
include_unique_cols=True)
exp_headers = ['SampleID', 'Study', 'Description']
exp_data = [['a', 'Dog', 'x'],
['b', 'Dog', 'y'],
['e', 'WholeBody', 'b']]
self.assertEqual(obs_headers, exp_headers)
self.assertEqual(obs_data, exp_data)

# test: non-unique colums retained, unique columns removed
obs_headers, obs_data = filter_mapping_file(self.map_data,
self.map_headers,
['a', 'b'],
include_repeat_cols=True,
include_unique_cols=False)
exp_headers = ['SampleID', 'Study', 'BodySite']
exp_data = [['a', 'Dog', 'Stool'],
['b', 'Dog', 'Stool']]
self.assertEqual(obs_headers, exp_headers)
self.assertEqual(obs_data, exp_data)

# test: non-unique colums remove, unique columns removed
obs_headers, obs_data = filter_mapping_file(self.map_data,
self.map_headers,
['a', 'b'],
include_repeat_cols=False,
include_unique_cols=False)
exp_headers = ['SampleID']
exp_data = [['a'],
['b']]
self.assertEqual(obs_headers, exp_headers)
self.assertEqual(obs_data, exp_data)

# test with new data - retain both
data, headers, _ = parse_mapping_file(StringIO(map_str3))

obs_headers, obs_data = filter_mapping_file(data, headers,
['a', 'b', 'd', 'e'],
include_repeat_cols=True,
include_unique_cols=True)
exp_headers = ['SampleID', 'Study', 'BodySite', 'Description']
exp_data = [['a', 'study1', 'Stool', 'x'],
['b', 'study1', 'Eye', 'y'],
['d', 'study1', 'Palm', 'a'],
['e', 'study1', 'Lung', 'b']]
self.assertEqual(obs_headers, exp_headers)
self.assertEqual(obs_data, exp_data)

# test that with one sample we get back that sample and all columns
obs_headers, obs_data = filter_mapping_file(self.map_data,
self.map_headers,
['a'],
include_repeat_cols=False,
include_unique_cols=False)
exp_headers = ['SampleID', 'Study', 'BodySite', 'Description']
exp_data = [['a', 'Dog', 'Stool', 'x']]
self.assertEqual(obs_headers, exp_headers)
self.assertEqual(obs_data, exp_data)

def test_filter_mapping_file_from_mapping_f(self):
""" filter_mapping_file_from_mapping_f functions as expected """
Expand Down Expand Up @@ -1252,6 +1322,13 @@ def test_get_seq_ids_from_seq_id_file(self):
f\tI1\t3\tPalm\tc
g\tI1\t2\tStool\td"""

map_str3 = """#SampleID\tStudy\tBodySite\tDescription
a\tstudy1\tStool\tx
b\tstudy1\tEye\ty
c\tstudy1\tEar\tz
d\tstudy1\tPalm\ta
e\tstudy1\tLung\tb"""

filter_fasta_expected1 = """>Seq1 some comment
ACCTTGG
>s2 some other comment
Expand Down