From a0be96d4f50bdda6ec33d834303b514d1f74fb79 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Fri, 2 Feb 2024 10:02:24 +0100 Subject: [PATCH] implement issue #55 add output file that contains the boundaries of genes in the concatenated MSA. --- read2tree/Aligner.py | 27 +++++++++++++++++---------- read2tree/utils/seq_utils.py | 7 +++++-- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/read2tree/Aligner.py b/read2tree/Aligner.py index 81e48de..df4efcd 100644 --- a/read2tree/Aligner.py +++ b/read2tree/Aligner.py @@ -373,6 +373,7 @@ def concat_alignment(self): use_alignments = self.alignments alignments_aa = [] alignments_dna = [] + grp_names = [] def sorter_groups(grp): if grp.startswith('OG') and grp[2:].isdigit(): @@ -384,22 +385,28 @@ def sorter_groups(grp): value = use_alignments[key] alignments_aa.append(value.aa) alignments_dna.append(value.dna) + grp_names.append(key) concatination_aa = concatenate(alignments_aa) concatination_dna = concatenate(alignments_dna) if concatination_aa: - align_output = open(os.path.join(self.args.output_path, - "concat_" + self._species_name + "_aa.phy"), "w") - AlignIO.write(concatination_aa, align_output, "phylip-relaxed") - align_output.close() + outfn = os.path.join(self.args.output_path, "concat_" + self._species_name + "_aa.phy") + self._write_concation(outfn, concatination_aa, grp_names) if concatination_dna: - align_output = open(os.path.join(self.args.output_path, - "concat_" + self._species_name + "_dna.phy"), "w") - AlignIO.write(concatination_dna, align_output, "phylip-relaxed") - align_output.close() - - return (concatination_aa, concatination_dna) + outfn = os.path.join(self.args.output_path,"concat_" + self._species_name + "_dna.phy") + self._write_concation(outfn, concatination_dna, grp_names) + return concatination_aa, concatination_dna + + def _write_concation(self, outfn, msa, group_names): + with open(outfn, "w") as align_output: + AlignIO.write(msa, align_output, "phylip-relaxed") + if 'boundaries' in msa.annotations: + with open(outfn.replace(".phy", ".group_boundaries"), "w") as fh: + start_pos = 0 + for name, end_pos in zip(group_names, msa.annotations['boundaries']): + fh.write(f"{name} = {start_pos + 1}-{end_pos + 1}") + start_pos = end_pos def add_to_alignment(self, mapper): """ diff --git a/read2tree/utils/seq_utils.py b/read2tree/utils/seq_utils.py index 421d3e2..119106f 100644 --- a/read2tree/utils/seq_utils.py +++ b/read2tree/utils/seq_utils.py @@ -121,6 +121,7 @@ def concatenate(alignments): else: unknown_char = '?' + boundaries, cumlength = [], 0 for aln in alignments: length = aln.get_alignment_length() @@ -137,8 +138,10 @@ def concatenate(alignments): # else stuff the string representation into the tmp dict for rec in aln: tmp[rec.id].append(str(rec.seq)) + cumlength += length + boundaries.append(cumlength) # Stitch all the substrings together using join (most efficient way), # and build the Biopython data structures Seq, SeqRecord and MultipleSeqAlignment - return MultipleSeqAlignment(SeqRecord(Seq(''.join(v)), id=k) - for (k, v) in tmp.items()) + return MultipleSeqAlignment([SeqRecord(Seq(''.join(v)), id=k) for (k, v) in tmp.items()], + annotations={'boundaries': boundaries})