diff --git a/test-data/deduplicate.nosortids.fasta b/test-data/deduplicate.nosortids.fasta index 83bedbb8..69c9d381 100644 --- a/test-data/deduplicate.nosortids.fasta +++ b/test-data/deduplicate.nosortids.fasta @@ -8,3 +8,5 @@ AAaa CCCC >6 last! GGGG +>7;7 representing 2 records +ATGC diff --git a/test-data/deduplicate.sortids.fasta b/test-data/deduplicate.sortids.fasta index 64ecc632..fa373a71 100644 --- a/test-data/deduplicate.sortids.fasta +++ b/test-data/deduplicate.sortids.fasta @@ -8,3 +8,5 @@ AAaa CCCC >6 last! GGGG +>7;7 representing 2 records +ATGC diff --git a/test-data/duplicates.fasta b/test-data/duplicates.fasta index c216fd68..9d3b3472 100644 --- a/test-data/duplicates.fasta +++ b/test-data/duplicates.fasta @@ -10,3 +10,7 @@ CCCC acgt >6 last! GGGG +>7 +ATGC +>7 +ATGC diff --git a/test-data/duplicates.fasta.gz b/test-data/duplicates.fasta.gz index 6df71e64..ead903f5 100644 Binary files a/test-data/duplicates.fasta.gz and b/test-data/duplicates.fasta.gz differ diff --git a/test-data/duplicates.nr.fasta b/test-data/duplicates.nr.fasta index 2a62db2b..4c3a31b8 100644 --- a/test-data/duplicates.nr.fasta +++ b/test-data/duplicates.nr.fasta @@ -8,3 +8,5 @@ ACgt CCCC >6 last! GGGG +>7;7 representing 2 records +ATGC diff --git a/tools/make_nr/make_nr.py b/tools/make_nr/make_nr.py index 4a1199f8..c4fae513 100755 --- a/tools/make_nr/make_nr.py +++ b/tools/make_nr/make_nr.py @@ -45,30 +45,17 @@ """ parser = OptionParser(usage=usage) -parser.add_option( - "-s", - "--sep", - dest="sep", - default=";", - help="Separator character for combining identifiers " - "of duplicated records e.g. '|' or ';' (required)", -) -parser.add_option( - "-a", - "--alphasort", - action="store_true", - help="When merging duplicated records sort their " - "identifiers alphabetically before combining them. " - "Default is input file order.", -) -parser.add_option( - "-o", - "--output", - dest="output", - default="/dev/stdout", - metavar="FILE", - help="Output filename (defaults to stdout)", -) +parser.add_option("-s", "--sep", dest="sep", + default=";", + help="Separator character for combining identifiers " + "of duplicated records e.g. '|' or ';' (required)") +parser.add_option("-a", "--alphasort", action="store_true", + help="When merging duplicated records sort their " + "identifiers alphabetically before combining them. " + "Default is input file order.") +parser.add_option("-o", "--output", dest="output", + default="/dev/stdout", metavar="FILE", + help="Output filename (defaults to stdout)") options, args = parser.parse_args() if not args: @@ -79,18 +66,33 @@ def gzip_open(filename): """Open a possibly gzipped text file.""" with open(filename, "rb") as h: magic = h.read(2) - if magic == b"\x1f\x8b": + if magic == b'\x1f\x8b': return gzip.open(filename, "rt") else: return open(filename) +def make_p_seq(ids, records, seq, sort_ids): + """Make the printable sequence.""" + if sort_ids: + return(">{} representing {} records\n{}\n".format( + ";".join(sorted(ids)), + records, + seq)) + else: + return(">{} representing {} records\n{}\n".format( + ";".join(ids), + records, + seq)) + + def make_nr(input_fasta, output_fasta, sep=";", sort_ids=False): """Make the sequences in FASTA files non-redundant. Argument input_fasta is a list of filenames. """ by_seq = dict() + duplicate = 0 try: from Bio.SeqIO.FastaIO import SimpleFastaParser except ImportError: @@ -102,46 +104,38 @@ def make_nr(input_fasta, output_fasta, sep=";", sort_ids=False): seq = seq.upper() try: by_seq[seq].append(idn) + duplicate += 1 except KeyError: by_seq[seq] = [idn] unique = 0 - representatives = dict() - duplicates = set() - for cluster in by_seq.values(): - if len(cluster) > 1: - # Is it useful to offer to sort here? - # if sort_ids: - # cluster.sort() - representatives[cluster[0]] = cluster - duplicates.update(cluster[1:]) - else: - unique += 1 - del by_seq - if duplicates: - # TODO - refactor as a generator with single SeqIO.write(...) call + representative = 0 + sequences = set() + if duplicate: with open(output_fasta, "w") as handle: for f in input_fasta: with gzip_open(f) as in_handle: - for title, seq in SimpleFastaParser(in_handle): + for title, one_seq in SimpleFastaParser(in_handle): idn = title.split(None, 1)[0] # first word only - if idn in representatives: - cluster = representatives[idn] - if sort_ids: - cluster.sort() - idn = sep.join(cluster) - title = "%s representing %i records" % (idn, len(cluster)) - elif idn in duplicates: - continue - # TODO - line wrapping - handle.write(">%s\n%s\n" % (title, seq)) - sys.stderr.write( - "%i unique entries; removed %i duplicates " - "leaving %i representative records\n" - % (unique, len(duplicates), len(representatives)) - ) + if one_seq.upper() not in sequences: + if len(by_seq[one_seq.upper()]) > 1: + pseq = make_p_seq(by_seq[one_seq.upper()], + len(by_seq[one_seq.upper()]), + one_seq, + sort_ids) + handle.write(pseq) + representative += 1 + else: + handle.write(">{}\n{}\n".format(title, + one_seq)) + unique += 1 + sequences.update([one_seq.upper()]) + sys.stderr.write("%i unique entries; removed %i duplicates " + "leaving %i representative records\n" + % (unique, duplicate, representative)) else: os.symlink(os.path.abspath(input_fasta), output_fasta) - sys.stderr.write("No perfect duplicates in file, %i unique entries\n" % unique) + sys.stderr.write("No perfect duplicates in file, %i unique entries\n" + % len(by_seq)) make_nr(args, options.output, options.sep, options.alphasort)