Fix issue #56, fix reference builder, fix default parameters in strain.

metagenopolis · Nov 19, 2024 · dbe9387 · dbe9387
1 parent 18f296f
commit dbe9387
Show file tree

Hide file tree

Showing 6 changed files with 18 additions and 19 deletions.
diff --git a/meteor/profiler.py b/meteor/profiler.py
@@ -33,7 +33,7 @@ class Profiler(Session):
     NO_RAREFACTION: ClassVar[int] = 0
     DEFAULT_RAREFACTION_LEVEL: ClassVar[int] = NO_RAREFACTION
     DEFAULT_RANDOM_SEED: ClassVar[int] = 1234
-    NORMALIZATIONS: ClassVar[list[str | None]] = [None, "coverage", "fpkm", "raw"]
+    NORMALIZATIONS: ClassVar[list[str]] = ["coverage", "fpkm", "raw"]
     DEFAULT_NORMALIZATION: ClassVar[str] = "coverage"
     DEFAULT_COVERAGE_FACTOR: ClassVar[float] = 100.0
     DEFAULT_MSP_FILTER_COMPLETE: ClassVar[float] = 0.1
@@ -101,19 +101,15 @@ def __post_init__(self):
             self.msp_filter = self.msp_filter_user
 
         # Get the associated count table
-        self.input_count_table = (
-            self.meteor.mapping_dir / self.sample_name
-        ).with_suffix(".tsv.xz")
+        self.input_count_table = self.meteor.mapping_dir / f"{self.sample_name}.tsv.xz"
         try:
             assert self.input_count_table.is_file()
         except AssertionError:
             logging.error("The count table %s does not exist.", self.input_count_table)
             sys.exit(1)
 
         # Add a symlink to get the raw count table in the profile directory (for merging purposes)
-        raw_count_table_symlink = (
-            self.stage2_dir / f"{self.sample_name}_raw"
-        ).with_suffix(".tsv.xz")
+        raw_count_table_symlink = self.stage2_dir / f"{self.sample_name}_raw.tsv.xz"
         try:
             raw_count_table_symlink.symlink_to(self.input_count_table.resolve())
         except FileExistsError:

diff --git a/meteor/referencebuilder.py b/meteor/referencebuilder.py
@@ -17,6 +17,7 @@
 import bz2
 import lzma
 import sys
+import bgzip
 from subprocess import check_call, run
 from pathlib import Path
 from dataclasses import dataclass, field
@@ -55,7 +56,8 @@ def __post_init__(self) -> None:
         self.output_annotation_file = (
             self.database_dir / f"{self.meteor.ref_name}_annotation.tsv"
         )
-        self.output_fasta_file = self.fasta_dir / f"{self.meteor.ref_name}.fasta"
+
+        self.output_fasta_file = self.fasta_dir / f"{self.meteor.ref_name}.fasta.gz"
         self.output_index_file = self.fasta_dir / f"{self.meteor.ref_name}.dict"
 
         # Write configuration file
@@ -119,12 +121,13 @@ def create_reference(self):
             "wt", encoding="UTF-8"
         ) as output_annotation:
             output_annotation.write("gene_id\tgene_name\tgene_length\n")
-            with self.output_fasta_file.open("wt", encoding="UTF-8") as output_fasta:
-                for gene_id, (header, len_seq, seq) in enumerate(
-                    self.read_reference(), start=1
-                ):
-                    output_annotation.write(f"{gene_id}\t{header}\t{len_seq}\n")
-                    output_fasta.write(f">{gene_id}\n{seq}\n")
+            with self.output_fasta_file.open("wb") as output_fasta:
+                with bgzip.BGZipWriter(output_fasta) as fh:
+                    for gene_id, (header, len_seq, seq) in enumerate(
+                        self.read_reference(), start=1
+                    ):
+                        output_annotation.write(f"{gene_id}\t{header}\t{len_seq}\n")
+                        fh.write(f">{gene_id}\n{seq}\n".encode("utf-8"))
 
     def execute(self) -> None:
         """Build the database"""

diff --git a/meteor/session.py b/meteor/session.py
@@ -232,7 +232,7 @@ def load_data(self, file_path: Path):
         :return:  pd.DataFrame: Data loaded into a pandas DataFrame.
         """
         # Choose the appropriate pandas function based on extension
-        if "".join(file_path.suffixes) in [".tsv", ".tsv.xz"]:
+        if "".join(file_path.suffixes[-2:]) in [".tsv", ".tsv.xz"]:
             return pd.read_csv(
                 file_path,
                 sep="\t",

diff --git a/meteor/strain.py b/meteor/strain.py
@@ -36,11 +36,11 @@ class Strain(Session):
     MIN_MIN_SNP_DEPTH: ClassVar[int] = 1
     MAX_MIN_SNP_DEPTH: ClassVar[int] = 10000
     DEFAULT_MIN_SNP_DEPTH: ClassVar[int] = 3
-    DEFAULT_MIN_FREQUENCY: ClassVar[float] = 0.01
+    DEFAULT_MIN_FREQUENCY: ClassVar[float] = 0.1
     DEFAULT_PLOIDY: ClassVar[int] = 1
     MIN_MIN_MSP_COVERAGE: ClassVar[int] = 1
     MAX_MIN_MSP_COVERAGE: ClassVar[int] = 100
-    DEFAULT_MIN_MSP_COVERAGE: ClassVar[int] = 50
+    DEFAULT_MIN_MSP_COVERAGE: ClassVar[int] = 80
     DEFAULT_MIN_GENE_COVERAGE: ClassVar[float] = 0.5
     DEFAULT_NUM_THREADS: ClassVar[int] = 1
     DEFAULT_MIN_DEPTH: ClassVar[int] = 3

diff --git a/meteor/tests/test_profiler.py b/meteor/tests/test_profiler.py
@@ -31,7 +31,7 @@ def profiler_standard(datadir: Path, tmp_path: Path) -> Profiler:
         rarefaction_level=-1,
         seed=12345,
         coverage_factor=100.0,
-        normalization=None,
+        normalization="raw",
         core_size=4,
         msp_filter_user=0.5,
         completeness=0.6,

diff --git a/meteor/tests/test_reference_builder.py b/meteor/tests/test_reference_builder.py
@@ -62,7 +62,7 @@ def test_read_reference(builder_defec: ReferenceBuilder):
     (
         pytest.param(
             "be4ea162246d2f23ed8b33bdf9b209d8",
-            "2912b682a8e7554025cc5feadd641570",
+            "55b4a418bd2814f14dd84b7217762b8b",
             id="Accurate output",
         ),
     ),