Skip to content

Commit

Permalink
Run multiple alignment with mafft, debug tree construction.
Browse files Browse the repository at this point in the history
  • Loading branch information
aghozlane committed Oct 16, 2024
1 parent 10276fa commit 7e0eecd
Show file tree
Hide file tree
Showing 5 changed files with 76 additions and 37 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Besides python packages dependencies, Meteor requires:
- python 3.10+
- [bowtie2 2.3.5+](https://github.com/BenLangmead/bowtie2)
- [freebayes 1.3.6+](https://github.com/freebayes/freebayes)
- [mafft 7.407+](https://mafft.cbrc.jp/alignment/software/)

## Installation

Expand Down
2 changes: 2 additions & 0 deletions conda_recipe/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,13 @@ requirements:
- biom-format
- py-bgzip
- freebayes >=1.3.6
- mafft >=7.407
test:
commands:
- meteor -h
- bowtie2 -h
- freebayes -h
- mafft -h
imports:
- pysam
- pandas
Expand Down
107 changes: 71 additions & 36 deletions meteor/phylogeny.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
"""Effective phylogeny"""
import re
import logging
import sys
from subprocess import run, Popen, PIPE
from packaging.version import parse

# import sys
from dataclasses import dataclass, field
Expand All @@ -28,7 +31,7 @@
from collections import OrderedDict
from datetime import datetime
from typing import Iterable, Tuple
from cogent3 import load_unaligned_seqs, make_aligned_seqs
from cogent3 import load_aligned_seqs
from cogent3.evolve.distance import EstimateDistances
from cogent3.evolve.models import GTR
from cogent3.cluster.UPGMA import upgma
Expand Down Expand Up @@ -98,7 +101,7 @@ def set_tree_config(self):
config = {
"meteor_version": self.meteor.version,
"phylogeny": {
"phylogeny_tool": "cogent3",
"" "phylogeny_tool": "cogent3",
"phylogeny_date": datetime.now().strftime("%Y-%m-%d"),
"tree_files": ",".join([tree.name for tree in self.tree_files]),
},
Expand All @@ -122,48 +125,80 @@ def process_msp_file(
msp_file.name.replace(".fasta", ""),
)
tree_file = tree_dir / f"{msp_file.stem}.tree"
ali_file = tree_dir / f"{msp_file.stem}_aligned.fasta"
self.tree_files: list[Path] = []

with NamedTemporaryFile(mode="wt", dir=tmp_dir, suffix=".fasta") as temp_clean:
# Clean sites
logging.info("Clean sites for %s", msp_file.name)
_, info_sites = self.clean_sites(msp_file, temp_clean)

if info_sites < self.min_info_sites:
logging.info(
"Only %d informative sites (< %d threshold) left after cleaning, skipping %s.",
info_sites,
self.min_info_sites,
msp_file.name.replace(".fasta", ""),
)
return tree_file, False # Return False to indicate skipping

# Perform alignments and UPGMA
logging.info("Running UPGMA and Distance Estimation")
aligned_seqs = make_aligned_seqs(
load_unaligned_seqs(temp_clean.name, moltype="dna"),
moltype="dna",
array_align=True,
)
d = EstimateDistances(aligned_seqs, submodel=GTR())
d.run(show_progress=False)

# Create UPGMA Tree
mycluster = upgma(d.get_pairwise_distances())
mycluster = mycluster.unrooted_deepcopy()
with NamedTemporaryFile(
suffix=".fasta", dir=tmp_dir, delete=True
) as temp_ali_file:
# Start alignment
with Popen(
[
"mafft",
"--thread",
str(2),
"--quiet",
str(msp_file.resolve()),
],
stdout=temp_ali_file, # Redirect stdout to the temp file
stderr=PIPE,
) as align_exec:
_, error = align_exec.communicate()
if align_exec.returncode != 0:
raise RuntimeError(f"MAFFT failed with error: {error}")
logging.info("Clean sites for %s", msp_file.name)
with ali_file.open("w") as aligned_seq:
_, info_sites = self.clean_sites(
Path(temp_ali_file.name), aligned_seq
)

with tree_file.open("w") as f:
f.write(
self.remove_edge_labels(mycluster.get_newick(with_distances=True))
)
if info_sites < self.min_info_sites:
logging.info(
"Only %d informative sites (< %d threshold) left after cleaning, skipping %s.",
info_sites,
self.min_info_sites,
msp_file.name.replace(".fasta", ""),
)
return tree_file, False # Return False to indicate skipping
cleaned_alignment = load_aligned_seqs(ali_file, moltype="dna")
d = EstimateDistances(cleaned_alignment, submodel=GTR())
d.run(show_progress=False)

# Create UPGMA Tree
mycluster = upgma(d.get_pairwise_distances())
mycluster = mycluster.unrooted_deepcopy()

with tree_file.open("w") as f:
f.write(
self.remove_edge_labels(
mycluster.get_newick(with_distances=True)
)
)
# Perform alignments and UPGMA
logging.info("Align sequences")

return tree_file, tree_file.exists()
return tree_file, tree_file.exists()

def execute(self) -> None:
logging.info("Launch phylogeny analysis")
start = perf_counter()

self.tree_files: list[Path] = []
msp_count = len(self.msp_file_list)
# Check the mafft version
mafft_exec = run(["mafft", "--version"], check=False, capture_output=True)
if mafft_exec.returncode != 0:
logging.error(
"Checking mafft version failed:\n%s",
mafft_exec.stderr.decode("utf-8"),
)
sys.exit(1)
mafft_version = str(mafft_exec.stderr.decode("utf-8")).split(" ")[0][1:]
if parse(mafft_version) < self.meteor.MIN_MAFFT_VERSION:
logging.error(
"The mafft version %s is outdated for meteor. Please update mafft to >= %s.",
mafft_version,
self.meteor.MIN_MAFFT_VERSION,
)
sys.exit(1)
# Using ProcessPoolExecutor to parallelize the MSP file processing
with ProcessPoolExecutor(max_workers=self.meteor.threads) as executor:
futures = {
Expand Down
1 change: 1 addition & 0 deletions meteor/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ class Component:

MIN_BOWTIE2_VERSION: ClassVar[Version] = Version("2.3.5")
MIN_FREEBAYES_VERSION: ClassVar[Version] = Version("1.3.6")
MIN_MAFFT_VERSION: ClassVar[Version] = Version("7.407")
DEFAULT_GAP_CHAR: ClassVar[str] = "N"
DEFAULT_CORE_SIZE: ClassVar[int] = 100

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "meteor"
version = "2.0.16"
version = "2.0.17"
description = "Meteor - A plateform for quantitative metagenomic profiling of complex ecosystems"
authors = ["Amine Ghozlane <[email protected]>", "Florence Thirion <[email protected]>", "Florian Plaza-Oñate <[email protected]>"]
license = "GPL-3.0-or-later"
Expand Down

0 comments on commit 7e0eecd

Please sign in to comment.