Skip to content

Commit

Permalink
Compression of intermediate files in strain analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
aghozlane committed Mar 28, 2024
1 parent 26a830f commit 7c00ca2
Show file tree
Hide file tree
Showing 7 changed files with 36 additions and 17 deletions.
7 changes: 5 additions & 2 deletions meteor/mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,13 @@
from re import findall
from meteor.session import Session, Component
from time import perf_counter
from tempfile import mkstemp

# from tempfile import mkstemp
import pysam
import logging
import sys


@dataclass
class Mapper(Session):
"""Run the bowtie"""
Expand Down Expand Up @@ -126,7 +128,8 @@ def execute(self) -> None:
[
"bowtie2",
parameters,
"--mm", "--no-unal",
"--mm",
"--no-unal",
"-x",
str(bowtie_index.resolve()),
"-U",
Expand Down
4 changes: 3 additions & 1 deletion meteor/phylogeny.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,9 @@ def execute(self) -> None:
with NamedTemporaryFile(
mode="wt", dir=self.meteor.tmp_dir, suffix=".fasta", delete=False
) as temp_clean:
tree_file = self.meteor.tree_dir / f"{msp_file.stem}.tree"
tree_file = self.meteor.tree_dir / f"{msp_file.name}".replace(
".fasta", ".tree"
)
# Clean sites
self.clean_sites(msp_file, temp_clean)
with tree_file.open("wt", encoding="UTF-8") as tree:
Expand Down
2 changes: 1 addition & 1 deletion meteor/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def get_sequences(self, fasta_file: Path) -> Iterator[tuple[int, str]]:
"""
gene_id: int = 0
seq: str = ""
with fasta_file.open("rt", encoding="UTF-8") as fasta:
with lzma.open(fasta_file, "rt") as fasta:
for line in fasta:
if line.startswith(">"):
if len(seq) > 0:
Expand Down
7 changes: 4 additions & 3 deletions meteor/strain.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import sys
import logging
import lzma
import pandas as pd
from dataclasses import dataclass, field
from meteor.variantcalling import VariantCalling
Expand Down Expand Up @@ -160,7 +161,7 @@ def get_msp_variant(
len(msp_with_overlapping_genes["msp_name"].values),
)
for msp_name in msp_with_overlapping_genes["msp_name"].values:
msp_file = self.json_data["directory"] / Path(msp_name + ".fasta")
msp_file = self.json_data["directory"] / Path(msp_name + ".fasta.xz")
msp_seq = ""
for gene_id in msp_content[msp_content["msp_name"] == msp_name][
"gene_id"
Expand All @@ -169,7 +170,7 @@ def get_msp_variant(
msp_seq += gene_dict[gene_id]
else:
msp_seq += "-" * len(gene_dict[gene_id])
with msp_file.open("wt", encoding="UTF-8") as msp:
with lzma.open(msp_file, "wt", preset=0) as msp:
print(
f">{self.json_data['census']['sample_info']['sample_name']}\n{msp_seq}\n",
file=msp,
Expand Down Expand Up @@ -247,7 +248,7 @@ def execute(self) -> None:
)
consensus_file = (
self.json_data["directory"]
/ f"{sample_info['sample_name']}_consensus.fasta"
/ f"{sample_info['sample_name']}_consensus.fasta.xz"
)
# count_file = (
# self.json_data["mapped_sample_dir"] / f"{sample_info['sample_name']}.tsv"
Expand Down
2 changes: 1 addition & 1 deletion meteor/tests/test_variantcalling.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,6 @@ def test_execute(vc_builder: VariantCalling) -> None:
assert output_vcf.exists()
output_consensus = (
vc_builder.census["directory"]
/ f"{vc_builder.census['census']['sample_info']['sample_name']}_consensus.fasta"
/ f"{vc_builder.census['census']['sample_info']['sample_name']}_consensus.fasta.xz"
)
assert output_consensus.exists()
10 changes: 7 additions & 3 deletions meteor/treebuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import logging
import sys
import pandas as pd
import lzma


@dataclass
Expand All @@ -48,10 +49,12 @@ def concatenate(self, msp_file_dict: dict[str, list[Path]]) -> list[Path]:
# Concatenate files that occur in more than one directory
for filename, paths in msp_file_dict.items():
if len(paths) > 1:
res = self.meteor.tree_dir / f"{filename}"
res = self.meteor.tree_dir / f"{filename}".replace(
".fasta.xz", ".fasta"
)
with res.open("wt", encoding="UTF-8") as outfile:
for path in paths:
with open(path, "r") as infile:
with lzma.open(path, "rt") as infile:
outfile.write(infile.read())
msp_list += [res]
return msp_list
Expand Down Expand Up @@ -79,7 +82,7 @@ def execute(self) -> None:
else:
logging.info("%d samples have been detected.", len(all_census))
msp_file_dict = defaultdict(list)
for filepath in self.meteor.strain_dir.glob("**/*.fasta"):
for filepath in self.meteor.strain_dir.glob("**/*.fasta.xz"):
if not filepath.name.endswith("_consensus.fasta"):
msp_file_dict[filepath.name].append(filepath)
# Concatenate msp files
Expand All @@ -91,6 +94,7 @@ def execute(self) -> None:
phylogeny_process.execute()
# Analyze tree data
for msp_file in msp_file_list:
print("here WTF:", msp_file)
tree_file = self.meteor.tree_dir / f"{msp_file.stem}.tree"
img_file = self.meteor.tree_dir / f"{msp_file.stem}.{self.format}"
try:
Expand Down
21 changes: 15 additions & 6 deletions meteor/variantcalling.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
import logging
import sys
import tempfile
from subprocess import check_call, run
import lzma
from subprocess import check_call, run, Popen, PIPE
from dataclasses import dataclass
from pathlib import Path
from datetime import datetime
Expand Down Expand Up @@ -232,7 +233,7 @@ def execute(self) -> None:
)
consensus_file = (
self.census["directory"]
/ f"{self.census['census']['sample_info']['sample_name']}_consensus.fasta"
/ f"{self.census['census']['sample_info']['sample_name']}_consensus.fasta.xz"
)
reference_file = (
self.meteor.ref_dir
Expand Down Expand Up @@ -388,7 +389,7 @@ def execute(self) -> None:
perf_counter() - startlowcovbed,
)
startlowcov = perf_counter()
check_call(
bcftools_process = Popen(
[
"bcftools",
"consensus",
Expand All @@ -398,11 +399,19 @@ def execute(self) -> None:
"-",
"-f",
str(reference_file.resolve()),
"-o",
str(consensus_file.resolve()),
str(vcf_file.resolve()),
]
],
stdout=PIPE,
)
# capture output of bcftools_process
bcftools_output = bcftools_process.communicate()[0]

# compress output using lzma
compressed_output = lzma.compress(bcftools_output)

# write compressed output to file
with open(str(consensus_file.resolve()), "wb") as f:
f.write(compressed_output)
logging.info(
"Completed low coverage regions filtering step in %f seconds",
perf_counter() - startlowcov,
Expand Down

0 comments on commit 7c00ca2

Please sign in to comment.