metagenopolis · aghozlane · Apr 5, 2024 · Apr 2, 2024 · Apr 2, 2024 · Apr 2, 2024
diff --git a/meteor/counter.py b/meteor/counter.py
@@ -51,7 +51,7 @@ class Counter(Session):
 
     def __post_init__(self) -> None:
         if self.counting_type not in Counter.COUNTING_TYPES:
-            raise ValueError(f'{self.counting_type} is not a valid counting type')
+            raise ValueError(f"{self.counting_type} is not a valid counting type")
 
         if self.meteor.tmp_path:
             self.meteor.tmp_path.mkdir(exist_ok=True)
@@ -79,7 +79,6 @@ def launch_mapping(self) -> None:
                 self.mapping_type,
                 self.trim,
                 self.alignment_number,
-                self.counting_type,
                 self.identity_threshold,
             )
             mapping_process.execute()

diff --git a/meteor/downloader.py b/meteor/downloader.py
@@ -51,7 +51,7 @@ def load_catalogues_config() -> dict:
         except FileNotFoundError:
             logging.error("The file %s is missing in meteor source", Downloader.CONFIG_DATA_FILE.name)
             sys.exit(1)
-    
+
     @staticmethod
     def get_available_catalogues() -> list[str]:
         catalogues_config = Downloader.load_catalogues_config()
@@ -130,7 +130,7 @@ def execute(self) -> None:
             print(flush=True)
             if self.choice == Downloader.TEST_CATALOGUE:
                 for sample in self.catalogues_config[self.choice]["samples"]:
-                    logging.info(f"Download {sample} fastq file")
+                    logging.info("Download %s fastq file", sample)
                     url_fastq = self.catalogues_config[self.choice]["samples"][sample][
                         "catalogue"
                     ]

diff --git a/meteor/mapper.py b/meteor/mapper.py
@@ -45,12 +45,11 @@ class Mapper(Session):
     mapping_type: str
     trim: int
     alignment_number: int
-    counting_type: str
     identity_threshold: float
 
     def __post_init__(self) -> None:
         if self.mapping_type not in Mapper.MAPPING_TYPES:
-            raise ValueError(f'{self.mapping_type} is not a valid mapping type')
+            raise ValueError(f"{self.mapping_type} is not a valid mapping type")
 
     def set_mapping_config(
         self,
@@ -117,11 +116,10 @@ def execute(self) -> None:
         if self.trim > Mapper.NO_TRIM:
             parameters += f"--trim-to {self.trim} "
         if self.alignment_number > 1:
-            # and self.counting_type != "best"
             parameters += f"-k {self.alignment_number} "
         # Check the bowtie2 version
-        bowtie_exec = run(["bowtie2", "--version"], capture_output=True)
-        bowtie_version = str(bowtie_exec.stdout).split("\\n")[0].split(" ")[2]
+        bowtie_exec = run(["bowtie2", "--version"], check=False, capture_output=True)
+        bowtie_version = str(bowtie_exec.stdout).split("\\n", maxsplit=1)[0].split(" ")[2]
         if bowtie_exec.returncode != 0:
             logging.error(
                 "Checking bowtie2 version failed:\n%s",
@@ -136,7 +134,7 @@ def execute(self) -> None:
             sys.exit(1)
         # Start mapping
         start = perf_counter()
-        mapping_exec = Popen(
+        with Popen(
             [
                 "bowtie2",
                 parameters,
@@ -149,51 +147,37 @@ def execute(self) -> None:
             ],
             stdout=PIPE,
             stderr=PIPE,
-        )
-        # cramfile_unsorted = Path(mkstemp(dir=self.meteor.tmp_dir)[1])
-        assert mapping_exec.stdout is not None and mapping_exec.stderr is not None
-        with pysam.AlignmentFile(
-            mapping_exec.stdout,
-            "r",
-        ) as samdesc:
+        ) as mapping_exec:
+            assert mapping_exec.stdout is not None and mapping_exec.stderr is not None
             with pysam.AlignmentFile(
-                str(cram_file.resolve()),
-                # cramfile_unsorted,
-                "wc",
-                template=samdesc,
-                reference_filename=str(reference.resolve()),
-            ) as cram:
-                for element in samdesc:
-                    cram.write(element)
-        # pysam.sort(
-        #     "-o",
-        #     str(cram_file.resolve()),
-        #     "-@",
-        #     str(self.meteor.threads),
-        #     "-O",
-        #     "cram",
-        #     str(cramfile_unsorted.resolve()),
-        #     catch_stdout=False,
-        # )
-        # pysam.index(str(cram_file.resolve()))
-        # Read standard error from the process (non-blocking read)
-        mapping_result = mapping_exec.stderr.read().decode("utf-8")
-        mapping_exec.stderr.close()
-
-        # Wait for the process to finish and get the exit code
-        exit_code = mapping_exec.wait()
-
-        # Check for errors and print the error output if necessary
-        if exit_code != 0:
-            logging.error("bowtie2 failed:\n%s" % mapping_result)
-            sys.exit(1)
-        try:
-            mapping_log = findall(r"([0-9]+)\s+\(", mapping_result)
-            assert len(mapping_log) == 4
-            mapping_data = [int(i) for i in mapping_log]
-        except AssertionError:
-            logging.error("Could not access the mapping result from bowtie2")
-            sys.exit(1)
+                mapping_exec.stdout,
+                "r",
+            ) as samdesc:
+                with pysam.AlignmentFile(
+                    str(cram_file.resolve()),
+                    # cramfile_unsorted,
+                    "wc",
+                    template=samdesc,
+                    reference_filename=str(reference.resolve()),
+                ) as cram:
+                    for element in samdesc:
+                        cram.write(element)
+            # Read standard error from the process (non-blocking read)
+            mapping_result = mapping_exec.stderr.read().decode("utf-8")
+            mapping_exec.stderr.close()
+            # Wait for the process to finish and get the exit code
+            exit_code = mapping_exec.wait()
+            # Check for errors and print the error output if necessary
+            if exit_code != 0:
+                logging.error("bowtie2 failed:\n%s", mapping_result)
+                sys.exit(1)
+            try:
+                mapping_log = findall(r"([0-9]+)\s+\(", mapping_result)
+                assert len(mapping_log) == 4
+                mapping_data = [int(i) for i in mapping_log]
+            except AssertionError:
+                logging.error("Could not access the mapping result from bowtie2")
+                sys.exit(1)
         logging.info("Completed mapping creation in %f seconds", perf_counter() - start)
         config = self.set_mapping_config(cram_file, bowtie_version, mapping_data)
         self.save_config(config, self.census["Stage1FileName"])
diff --git a/meteor/merging.py b/meteor/merging.py
@@ -18,9 +18,9 @@
 from pathlib import Path
 import logging
 import sys
-import numpy as np
 from biom.table import Table # type: ignore
 from typing import ClassVar
+from functools import partial
 
 
 @dataclass
@@ -41,7 +41,7 @@ class Merging(Session):
     min_msp_occurrence: int
     remove_sample_with_no_msp: bool
     output_mpa: bool
-    mpa_taxonomic_level: str
+    mpa_taxonomic_level: str|None
     output_biom: bool
     output_gene_matrix: bool
     ranks: dict[str, str] = field(
@@ -84,7 +84,7 @@ def find_files_to_merge(
             for my_sample, my_dir in input_dir.items()
         }
         # Check that there is exactly one element in each list
-        len_list = list(set([len(value) for value in list(dict_to_merge.values())]))
+        len_list = list({len(value) for value in list(dict_to_merge.values())})
         assert len(len_list) == 1
         assert len_list[0] == 1
         files_to_merge = {
@@ -103,7 +103,7 @@ def extract_json_info(
         """
         # Check that sections are present
         try:
-            assert all([my_section in config for my_section in list(param_dict.keys())])
+            assert all(my_section in config for my_section in param_dict.keys())
         except AssertionError:
             logging.error("Missing required section in census json file.")
             sys.exit(1)
@@ -115,11 +115,9 @@ def extract_json_info(
         # Check that required fields are present
         try:
             assert all(
-                [
-                    my_field in config[my_section]
-                    for my_section in param_dict
-                    for my_field in param_dict[my_section]
-                ]
+                my_field in config[my_section]
+                for my_section in param_dict
+                for my_field in param_dict[my_section]
             )
         except AssertionError:
             logging.error("Missing required fields in census ini file.")
@@ -259,12 +257,7 @@ def execute(self) -> None:
         # Save database_type for later use
         try:
             database_type_all = list(
-                set(
-                    [
-                        my_info["database_type"]
-                        for my_info in list(all_information.values())
-                    ]
-                )
+                {my_info["database_type"] for my_info in list(all_information.values())}
             )
             assert len(database_type_all) == 1
             database_type = database_type_all[0]
@@ -365,7 +358,7 @@ def execute(self) -> None:
                     # Apply the prefixes to each taxonomic rank
                     for rank, prefix in self.ranks.items():
                         annotation[rank] = annotation[rank].apply(
-                            lambda x: f"{prefix}{x}"
+                            partial(lambda prefix, x: f"{prefix}{x}", prefix)
                         )
                     observ_metadata = [
                         {"taxonomy": row.iloc[1:].tolist()}
@@ -382,7 +375,7 @@ def execute(self) -> None:
                     # Generate JSON representation of the BIOM table
                     biom_json = biom_table.to_json(generated_by="Meteor")
                     # Write the JSON to a file
-                    with open(output_name.with_suffix(".biom"), "wt") as f:
+                    with open(output_name.with_suffix(".biom"), "wt", encoding="UTF-8") as f:
                         f.write(biom_json)
                     # with h5py.File(output_name.with_suffix(".biom"), "w") as f:
                     #     table.to_hdf5(f, generated_by="Meteor", compress=True)

diff --git a/meteor/meteor.py b/meteor/meteor.py
@@ -783,7 +783,7 @@ def main() -> None:  # pragma: no cover
             args.min_msp_occurrence,
             args.remove_sample_with_no_msp,
             False,
-            "a",
+            None,
             #args.output_mpa,
             #args.taxonomic_level,
             args.output_biom,

diff --git a/meteor/phylogeny.py b/meteor/phylogeny.py
@@ -98,7 +98,7 @@ def execute(self) -> None:
         # Define the regex pattern to match the version number
         version_pattern = re.compile(r"RAxML-NG v\. (\d+\.\d+\.\d+)")
         raxml_ng_help = run(
-            ["raxml-ng", "--version"], capture_output=True
+            ["raxml-ng", "--version"], check=False, capture_output=True
         ).stdout.decode("utf-8")
         match = version_pattern.search(raxml_ng_help)
         # Check if a match is found
@@ -152,7 +152,7 @@ def execute(self) -> None:
                         "MSP %s have less than 4 sequences, we compute the mutation rate",
                         msp_file.name,
                     )
-                    with open(tree_file.parent / "cleaned_sequences.fasta", "w") as f:
+                    with open(tree_file.parent / "cleaned_sequences.fasta", "w", encoding="UTF-8") as f:
                         for seq_name, sequence in cleaned_seqs.items():
                             f.write(f">{seq_name}\n{sequence}\n")
                     mutation_rate = []
@@ -163,7 +163,7 @@ def execute(self) -> None:
                             seq2 = cleaned_seqs[seq_ids[j]]
                             mutation_rate += [self.compute_mutation_rate(seq1, seq2)]
                     # Construct Newick format string
-                    with open(tree_file.with_suffix(".tree"), "wt") as tree:
+                    with open(tree_file.with_suffix(".tree"), "wt", encoding="UTF-8") as tree:
                         if len(seq_ids) == 2:
                             tree.write(
                                 f"({seq_ids[0]}:{mutation_rate[0]}, {seq_ids[1]}:{mutation_rate[0]});"
@@ -174,11 +174,15 @@ def execute(self) -> None:
                                 min_rate_idx == 0
                             ):  # seq1 and seq2 have the smallest distance
                                 tree.write(
-                                    f"(({seq_ids[0]}:{mutation_rate[0]}, {seq_ids[1]}:{mutation_rate[0]}):{mutation_rate[1]}, {seq_ids[2]}:{mutation_rate[1]});"
+                                    f"(({seq_ids[0]}:{mutation_rate[0]}, "
+                                    f"{seq_ids[1]}:{mutation_rate[0]}):{mutation_rate[1]}, "
+                                    f"{seq_ids[2]}:{mutation_rate[1]});"
                                 )
                             else:  # seq1 and seq3 have the smallest distance
                                 tree.write(
-                                    f"(({seq_ids[0]}:{mutation_rate[1]}, {seq_ids[2]}:{mutation_rate[1]}):{mutation_rate[0]}, {seq_ids[1]}:{mutation_rate[0]});"
+                                    f"(({seq_ids[0]}:{mutation_rate[1]}, "
+                                    f"{seq_ids[2]}:{mutation_rate[1]}):{mutation_rate[0]}, "
+                                    f"{seq_ids[1]}:{mutation_rate[0]});"
                                 )
                 tree_files.append(tree_file)
             logging.info("Completed MSP tree %d/%d", idx, msp_count)

diff --git a/meteor/profiler.py b/meteor/profiler.py
@@ -43,16 +43,16 @@ class Profiler(Session):
     meteor: type[Component]
     rarefaction_level: int
     seed: int
-    normalization: str
+    normalization: str|None
     core_size: int
     msp_filter: float
     completeness: float
     coverage_factor: float
 
     def __post_init__(self):
         if self.normalization not in Profiler.NORMALIZATIONS:
-            raise ValueError(f'{self.normalization} is not a valid normalization')
-        
+            raise ValueError(f"{self.normalization} is not a valid normalization")
+
         # Get the json file
         self.sample_config = self.get_census_stage(self.meteor.mapping_dir, 1)
 

diff --git a/meteor/referencebuilder.py b/meteor/referencebuilder.py
@@ -132,7 +132,7 @@ def execute(self) -> None:
         # Prepare the reference for meteor
         self.create_reference()
         # Check the bowtie2 version
-        bowtie_exec = run(["bowtie2", "--version"], capture_output=True)
+        bowtie_exec = run(["bowtie2", "--version"], check=False, capture_output=True)
         bowtie_version = bowtie_exec.stdout.decode("utf-8").split(" ")[2].split("\n")[0]
         if bowtie_exec.returncode != 0:
             logging.error(

diff --git a/meteor/strain.py b/meteor/strain.py
@@ -31,7 +31,7 @@
 @dataclass
 class Strain(Session):
     """Counter session map and count"""
-    
+
     DEFAULT_MAX_DEPTH: ClassVar[int] = 100
     MIN_MIN_SNP_DEPTH: ClassVar[int] = 1
     MAX_MIN_SNP_DEPTH: ClassVar[int] = 10000
@@ -166,9 +166,7 @@ def get_msp_variant(
                 consensus_file,
             )
             sys.exit(1)
-        gene_dict = {
-            gene_id: seq for gene_id, seq in self.get_sequences(consensus_file)
-        }
+        gene_dict = dict(self.get_sequences(consensus_file))
         logging.info(
             "%s MSPs have sufficient signal for SNP analysis ",
             len(msp_with_overlapping_genes["msp_name"].values),

diff --git a/meteor/tests/test_counter.py b/meteor/tests/test_counter.py
@@ -284,8 +284,8 @@ def test_save_cram(counter_unique: Counter, datadir: Path, tmp_path: Path) -> No
         reads, _ = counter_unique.filter_alignments(
             cramdesc
         )  # pylint: disable=unused-variable
-        read_list = list(chain(reads.values()))
-        merged_list = list(chain.from_iterable(read_list))
+        read_list = reads.values()
+        merged_list = chain.from_iterable(read_list)
         tmpcramfile = tmp_path / "test"
         counter_unique.save_cram_strain(tmpcramfile, cramdesc, merged_list, ref_json)
         assert tmpcramfile.exists()

diff --git a/meteor/tests/test_fastq_importer.py b/meteor/tests/test_fastq_importer.py
@@ -80,13 +80,13 @@ def test_replace_ext(builder: FastqImporter, fastq_filename: str, name: str) ->
 
 
 @pytest.mark.parametrize(
-    ("fastq_filename", "tag"),
+    ("fastq_filename"),
     (
-        ("test.fastq.gz", ""),
-        pytest.param("pretty.complex_pain.fasta", "", id="fasta"),
+        ("test.fastq.gz"),
+        pytest.param("pretty.complex_pain.fasta", id="fasta"),
     ),
 )
-def test_get_tag_none(builder: FastqImporter, fastq_filename: str, tag: str) -> None:
+def test_get_tag_none(builder: FastqImporter, fastq_filename: str) -> None:
     assert builder.get_tag(fastq_filename) is None
 
 @pytest.mark.parametrize(

diff --git a/meteor/tests/test_mapper.py b/meteor/tests/test_mapper.py
@@ -52,7 +52,6 @@ def mapping_builder(datadir: Path, tmp_path: Path) -> Mapper:
         "end-to-end",
         80,
         10000,
-        "smart_shared_reads",
         0.95,
     )
-Original file line number
+Diff line change
@@ Expand Up / @@ -52,7 +52,6 @@ def mapping_builder(datadir: Path, tmp_path: Path) -> Mapper: @@
             "end-to-end",
             80,
             10000,
-            "smart_shared_reads",
             0.95,
         )
@@ Expand Down @@