Compression of intermediate files in strain analysis

metagenopolis · Mar 28, 2024 · 7c00ca2 · 7c00ca2
1 parent 26a830f
commit 7c00ca2
Show file tree

Hide file tree

Showing 7 changed files with 36 additions and 17 deletions.
diff --git a/meteor/mapper.py b/meteor/mapper.py
@@ -20,11 +20,13 @@
 from re import findall
 from meteor.session import Session, Component
 from time import perf_counter
-from tempfile import mkstemp
+
+# from tempfile import mkstemp
 import pysam
 import logging
 import sys
 
+
 @dataclass
 class Mapper(Session):
     """Run the bowtie"""
@@ -126,7 +128,8 @@ def execute(self) -> None:
             [
                 "bowtie2",
                 parameters,
-                "--mm", "--no-unal",
+                "--mm",
+                "--no-unal",
                 "-x",
                 str(bowtie_index.resolve()),
                 "-U",

diff --git a/meteor/phylogeny.py b/meteor/phylogeny.py
@@ -111,7 +111,9 @@ def execute(self) -> None:
             with NamedTemporaryFile(
                 mode="wt", dir=self.meteor.tmp_dir, suffix=".fasta", delete=False
             ) as temp_clean:
-                tree_file = self.meteor.tree_dir / f"{msp_file.stem}.tree"
+                tree_file = self.meteor.tree_dir / f"{msp_file.name}".replace(
+                    ".fasta", ".tree"
+                )
                 # Clean sites
                 self.clean_sites(msp_file, temp_clean)
                 with tree_file.open("wt", encoding="UTF-8") as tree:

diff --git a/meteor/session.py b/meteor/session.py
@@ -159,7 +159,7 @@ def get_sequences(self, fasta_file: Path) -> Iterator[tuple[int, str]]:
         """
         gene_id: int = 0
         seq: str = ""
-        with fasta_file.open("rt", encoding="UTF-8") as fasta:
+        with lzma.open(fasta_file, "rt") as fasta:
             for line in fasta:
                 if line.startswith(">"):
                     if len(seq) > 0:

diff --git a/meteor/strain.py b/meteor/strain.py
@@ -14,6 +14,7 @@
 
 import sys
 import logging
+import lzma
 import pandas as pd
 from dataclasses import dataclass, field
 from meteor.variantcalling import VariantCalling
@@ -160,7 +161,7 @@ def get_msp_variant(
             len(msp_with_overlapping_genes["msp_name"].values),
         )
         for msp_name in msp_with_overlapping_genes["msp_name"].values:
-            msp_file = self.json_data["directory"] / Path(msp_name + ".fasta")
+            msp_file = self.json_data["directory"] / Path(msp_name + ".fasta.xz")
             msp_seq = ""
             for gene_id in msp_content[msp_content["msp_name"] == msp_name][
                 "gene_id"
@@ -169,7 +170,7 @@ def get_msp_variant(
                     msp_seq += gene_dict[gene_id]
                 else:
                     msp_seq += "-" * len(gene_dict[gene_id])
-            with msp_file.open("wt", encoding="UTF-8") as msp:
+            with lzma.open(msp_file, "wt", preset=0) as msp:
                 print(
                     f">{self.json_data['census']['sample_info']['sample_name']}\n{msp_seq}\n",
                     file=msp,
@@ -247,7 +248,7 @@ def execute(self) -> None:
                 )
             consensus_file = (
                 self.json_data["directory"]
-                / f"{sample_info['sample_name']}_consensus.fasta"
+                / f"{sample_info['sample_name']}_consensus.fasta.xz"
             )
             # count_file = (
             #     self.json_data["mapped_sample_dir"] / f"{sample_info['sample_name']}.tsv"

diff --git a/meteor/tests/test_variantcalling.py b/meteor/tests/test_variantcalling.py
@@ -71,6 +71,6 @@ def test_execute(vc_builder: VariantCalling) -> None:
     assert output_vcf.exists()
     output_consensus = (
         vc_builder.census["directory"]
-        / f"{vc_builder.census['census']['sample_info']['sample_name']}_consensus.fasta"
+        / f"{vc_builder.census['census']['sample_info']['sample_name']}_consensus.fasta.xz"
     )
     assert output_consensus.exists()
diff --git a/meteor/treebuilder.py b/meteor/treebuilder.py
@@ -22,6 +22,7 @@
 import logging
 import sys
 import pandas as pd
+import lzma
 
 
 @dataclass
@@ -48,10 +49,12 @@ def concatenate(self, msp_file_dict: dict[str, list[Path]]) -> list[Path]:
         # Concatenate files that occur in more than one directory
         for filename, paths in msp_file_dict.items():
             if len(paths) > 1:
-                res = self.meteor.tree_dir / f"{filename}"
+                res = self.meteor.tree_dir / f"{filename}".replace(
+                    ".fasta.xz", ".fasta"
+                )
                 with res.open("wt", encoding="UTF-8") as outfile:
                     for path in paths:
-                        with open(path, "r") as infile:
+                        with lzma.open(path, "rt") as infile:
                             outfile.write(infile.read())
                 msp_list += [res]
         return msp_list
@@ -79,7 +82,7 @@ def execute(self) -> None:
         else:
             logging.info("%d samples have been detected.", len(all_census))
         msp_file_dict = defaultdict(list)
-        for filepath in self.meteor.strain_dir.glob("**/*.fasta"):
+        for filepath in self.meteor.strain_dir.glob("**/*.fasta.xz"):
             if not filepath.name.endswith("_consensus.fasta"):
                 msp_file_dict[filepath.name].append(filepath)
         # Concatenate msp files
@@ -91,6 +94,7 @@ def execute(self) -> None:
         phylogeny_process.execute()
         # Analyze tree data
         for msp_file in msp_file_list:
+            print("here WTF:", msp_file)
             tree_file = self.meteor.tree_dir / f"{msp_file.stem}.tree"
             img_file = self.meteor.tree_dir / f"{msp_file.stem}.{self.format}"
             try:

diff --git a/meteor/variantcalling.py b/meteor/variantcalling.py
@@ -15,7 +15,8 @@
 import logging
 import sys
 import tempfile
-from subprocess import check_call, run
+import lzma
+from subprocess import check_call, run, Popen, PIPE
 from dataclasses import dataclass
 from pathlib import Path
 from datetime import datetime
@@ -232,7 +233,7 @@ def execute(self) -> None:
         )
         consensus_file = (
             self.census["directory"]
-            / f"{self.census['census']['sample_info']['sample_name']}_consensus.fasta"
+            / f"{self.census['census']['sample_info']['sample_name']}_consensus.fasta.xz"
         )
         reference_file = (
             self.meteor.ref_dir
@@ -388,7 +389,7 @@ def execute(self) -> None:
                         perf_counter() - startlowcovbed,
                     )
                     startlowcov = perf_counter()
-                    check_call(
+                    bcftools_process = Popen(
                         [
                             "bcftools",
                             "consensus",
@@ -398,11 +399,19 @@ def execute(self) -> None:
                             "-",
                             "-f",
                             str(reference_file.resolve()),
-                            "-o",
-                            str(consensus_file.resolve()),
                             str(vcf_file.resolve()),
-                        ]
+                        ],
+                        stdout=PIPE,
                     )
+                    # capture output of bcftools_process
+                    bcftools_output = bcftools_process.communicate()[0]
+
+                    # compress output using lzma
+                    compressed_output = lzma.compress(bcftools_output)
+
+                    # write compressed output to file
+                    with open(str(consensus_file.resolve()), "wb") as f:
+                        f.write(compressed_output)
                 logging.info(
                     "Completed low coverage regions filtering step in %f seconds",
                     perf_counter() - startlowcov,