From 3a783c62fa5a7a004e0c21ff7c2631bf9561e862 Mon Sep 17 00:00:00 2001 From: Amine Ghozlane Date: Thu, 31 Oct 2024 18:00:17 +0100 Subject: [PATCH] Fix issue #57 --- meteor/counter.py | 36 ++++++++++++------- meteor/fastqimporter.py | 2 +- meteor/mapper.py | 9 ++--- meteor/merging.py | 6 ++-- meteor/tests/test_counter.py | 6 ++-- .../test/part2/part2_census_stage_1.json | 8 ++--- meteor/tests/test_merging.py | 5 +-- .../test_project_census_stage_2_report.tsv | 8 ++--- .../sample1/sample1_census_stage_1.json | 6 ++-- .../sample2/sample2_census_stage_1.json | 6 ++-- .../sample3/sample3_census_stage_1.json | 6 ++-- .../sample1/sample1_census_stage_2.json | 10 +++--- .../sample2/sample2_census_stage_2.json | 10 +++--- .../sample3/sample3_census_stage_2.json | 9 +++-- .../map/test/test_census_stage_1.json | 29 ++++++++++++++- .../eva71_bench_census_stage_1.json | 8 ++--- 16 files changed, 100 insertions(+), 64 deletions(-) diff --git a/meteor/counter.py b/meteor/counter.py index 0d29b56..6ebf8a8 100644 --- a/meteor/counter.py +++ b/meteor/counter.py @@ -121,12 +121,17 @@ def get_aligned_nucleotides(self, element) -> Iterator[int]: """ yield from (item[1] for item in element.cigartuples if item[0] < 3) - def set_counter_config(self, counted_reads): - """Define the count of reads""" + def set_counter_config(self, counted_reads: float, count_file: Path) -> dict: + """Save in the json essential info + :param counted_read: (float) Number of reads counted + :param count_file: (Path) Count file + :return: (Dict) dictionnary data + """ return { "counting": { - "counted_reads": counted_reads, + "counted_reads": int(round(counted_reads, 0)), "identity_threshold": round(self.identity_threshold, 2), + "count_file": count_file.name, } } @@ -412,6 +417,7 @@ def launch_counting( count_file: Path, ref_json: dict, census_json: dict, + Stage1Json: Path, ): """Function that count reads from a cram file, using the given methods in count: "total" or "shared" or "unique". @@ -493,13 +499,9 @@ def launch_counting( catch_stdout=False, ) total_read_count = self.write_table(cramfile_sorted, count_file) - config = self.set_counter_config(total_read_count) - Stage1Json = ( - self.meteor.mapping_dir - / f"{census_json['sample_info']['sample_name']}_census_stage_1.json" - ) - - self.save_config(census_json.update(config), Stage1Json) + config = self.set_counter_config(total_read_count, count_file) + census_json.update(config) + self.save_config(census_json, Stage1Json) if self.keep_filtered_alignments: cramfile_strain_unsorted = Path(mkstemp(dir=self.meteor.tmp_dir)[1]) self.save_cram_strain( @@ -556,7 +558,6 @@ def execute(self) -> None: # mapping of each sample against reference for library in census_json_files: - print(library) census_json = self.read_json(library) sample_info = census_json["sample_info"] stage1_dir = self.meteor.mapping_dir / sample_info["sample_name"] @@ -596,8 +597,19 @@ def execute(self) -> None: / f"{sample_info['sample_name']}.tsv.xz" ) start = perf_counter() + Stage1Json = ( + self.meteor.mapping_dir + / sample_info["sample_name"] + / f"{sample_info['sample_name']}_census_stage_1.json" + ) + census_json = self.read_json(Stage1Json) self.launch_counting( - raw_cram_file, cram_file, count_file, ref_json, census_json + raw_cram_file, + cram_file, + count_file, + ref_json, + census_json, + Stage1Json, ) logging.info("Completed counting in %f seconds", perf_counter() - start) if not self.keep_all_alignments: diff --git a/meteor/fastqimporter.py b/meteor/fastqimporter.py index 0f79e03..ba95e65 100644 --- a/meteor/fastqimporter.py +++ b/meteor/fastqimporter.py @@ -163,7 +163,7 @@ def execute(self) -> None: samples_names.add(sample_name) sample_dir = self.meteor.fastq_dir / sample_name sample_dir.mkdir(exist_ok=True, parents=True) - sym_fastq = Path(sample_dir / fastq_file.name) + sym_fastq = Path(sample_dir / fastq_file.name).resolve() if not sym_fastq.is_symlink(): sym_fastq.symlink_to(fastq_file.resolve()) # Create a configuration diff --git a/meteor/mapper.py b/meteor/mapper.py index 234629b..e75eb78 100644 --- a/meteor/mapper.py +++ b/meteor/mapper.py @@ -62,10 +62,10 @@ def set_mapping_config( :param cram_file: A path to the raw cram file :return: (Dict) A dict object with the census 1 config """ + del self.census["census"]["sample_info"]["full_sample_name"] config = { "meteor_version": self.meteor.version, "sample_info": self.census["census"]["sample_info"], - "sample_file": self.census["census"]["sample_file"], "mapping": { "mapping_tool": "bowtie2", "mapping_tool_version": bowtie_version, @@ -81,10 +81,8 @@ def set_mapping_config( "overall_alignment_rate": round( (mapping_data[2] + mapping_data[3]) / mapping_data[0] * 100, 2 ), - "fastq_files": ",".join(self.fastq_list), - }, - "mapping_file": { - "bowtie_file": cram_file.name, + "fastq_files": self.fastq_list, + "mapping_file": cram_file.name, }, } return config @@ -176,7 +174,6 @@ def execute(self) -> None: mapping_log = findall(r"([0-9]+)\s+\(", mapping_result) assert len(mapping_log) == 4 mapping_data = [int(i) for i in mapping_log] - print(mapping_data) except AssertionError: logging.error("Could not access the mapping result from bowtie2") sys.exit(1) diff --git a/meteor/merging.py b/meteor/merging.py index a2e3683..d49816e 100644 --- a/meteor/merging.py +++ b/meteor/merging.py @@ -263,9 +263,11 @@ def execute(self) -> None: "trim", "alignment_number", "mapping_type", - "identity_threshold", "database_type", ], + "counting": [ + "identity_threshold", + ], "profiling_parameters": [""], } all_information = { @@ -284,7 +286,7 @@ def execute(self) -> None: # Force to taxo in no consensus database_type = "taxo" - # Merge ini information + # Merge json information logging.info("Merging json information...") # Get all values from all fields from all sections from all json files all_information_to_save = { diff --git a/meteor/tests/test_counter.py b/meteor/tests/test_counter.py index 664a612..a5e9545 100644 --- a/meteor/tests/test_counter.py +++ b/meteor/tests/test_counter.py @@ -310,7 +310,7 @@ def test_launch_counting_unique(counter_unique: Counter, datadir: Path, tmp_path counter_unique.meteor.ref_dir / "mock_reference.json" ) counter_unique.launch_counting( - raw_cramfile, cramfile, countfile, ref_json, census_json + raw_cramfile, cramfile, countfile, ref_json, census_json, census_json_file ) with countfile.open("rb") as out: assert md5(out.read()).hexdigest() == "f5bc528dcbf594b5089ad7f6228ebab5" @@ -326,7 +326,7 @@ def test_launch_counting_total(counter_total: Counter, datadir: Path, tmp_path: counter_total.meteor.ref_dir / "mock_reference.json" ) counter_total.launch_counting( - raw_cramfile, cramfile, countfile, ref_json, census_json + raw_cramfile, cramfile, countfile, ref_json, census_json, census_json_file ) with countfile.open("rb") as out: assert md5(out.read()).hexdigest() == "f010e4136323ac408d4c127e243756c2" @@ -346,7 +346,7 @@ def test_launch_counting_smart_shared( counter_smart_shared.meteor.ref_dir / "mock_reference.json" ) counter_smart_shared.launch_counting( - raw_cramfile, cramfile, countfile, ref_json, census_json + raw_cramfile, cramfile, countfile, ref_json, census_json, census_json_file ) # with countfile.open("rb") as out: # assert md5(out.read()).hexdigest() == "4bdd7327cbad8e71d210feb0c6375077" diff --git a/meteor/tests/test_counter/test/part2/part2_census_stage_1.json b/meteor/tests/test_counter/test/part2/part2_census_stage_1.json index 08076a0..c56dc9a 100644 --- a/meteor/tests/test_counter/test/part2/part2_census_stage_1.json +++ b/meteor/tests/test_counter/test/part2/part2_census_stage_1.json @@ -28,11 +28,7 @@ "matches": 10000, "is_local_mapping": 1, "mapping_software": "Meteor", - "mapping_software_version": "3.3" - }, - "mapping_file": { - "mapping_file_count": 1, - "bowtie_file_1": "part2.bam", - "mapping_file_format": "sam" + "mapping_software_version": "3.3", + "mapping_file": "part2.bam" } } \ No newline at end of file diff --git a/meteor/tests/test_merging.py b/meteor/tests/test_merging.py index 9a03464..92eb696 100644 --- a/meteor/tests/test_merging.py +++ b/meteor/tests/test_merging.py @@ -113,13 +113,13 @@ def test_extract_json_info(merging_profiles: Merging) -> None: config, param_dict={ "profiling_parameters": ["msp_filter", "modules_def"], - "mapping_file": [""], + "mapping": ["mapping_file"], }, ) assert info == { "msp_filter": 0.1, "modules_def": "modules_definition.tsv", - "bowtie_file": "sample1.sam", + "mapping_file": "sample1.sam", } @@ -228,6 +228,7 @@ def test_execute1(merging_profiles: Merging, datadir: Path) -> None: datadir / "expected_output" / "test_project_census_stage_2_report.tsv" ) real_output_df = pd.read_table(real_output) + expected_output_df = pd.read_table(expected_output) real_output_df = ( real_output_df.sort_values(by=["sample"]) diff --git a/meteor/tests/test_merging/expected_output/test_project_census_stage_2_report.tsv b/meteor/tests/test_merging/expected_output/test_project_census_stage_2_report.tsv index ed6b306..18fc894 100644 --- a/meteor/tests/test_merging/expected_output/test_project_census_stage_2_report.tsv +++ b/meteor/tests/test_merging/expected_output/test_project_census_stage_2_report.tsv @@ -1,4 +1,4 @@ -sample sample_name census_status full_sample_name fastq_file mapping_tool mapping_tool_version mapping_date reference_name trim alignment_number mapping_type meteor_version identity_threshold total_read_count mapped_read_count overall_alignment_rate gene_count msp_count msp_signal mustard_signal fastq_files database_type bowtie_file profiling_date normalization rarefaction_level seed msp_core_size msp_filter msp_def mustard_filename modules_db modules_db_filenames modules_def module_completeness -sample1 sample1 0 sample1_trimmed.Q17.converted.noHost sample1_trimmed.Q17.converted.noHost.fastq.gz bowtie2 2.5.1 2023-11-17 IGC2 80 10000 end-to-end 2.0.9 0.95 19234567 16234987 84.41 627516 297 0.63 0.0 ['fastq1.fastq.gz', 'fastq2.fastq.gz'] complete sample1.sam 2023-11-17 fpkm 5000000 1234 100 0.1 IGC2_1990MSPs.tsv IGC2_mustard.tsv kegg IGC2_kegg_107.tsv modules_definition.tsv 0.9 -sample2 sample2 0 sample2_trimmed.Q17.converted.noHost sample2_trimmed.Q17.converted.noHost.fastq.gz bowtie2 2.5.1 2023-11-17 IGC2 80 10000 end-to-end 2.0.9 0.95 15000000 10000000 66.67 687432 325 0.87 0.1 ['fastq1.fastq.gz', 'fastq2.fastq.gz'] complete sample2.sam 2023-11-17 fpkm 5000000 1234 100 0.1 IGC2_1990MSPs.tsv IGC2_mustard.tsv kegg IGC2_kegg_107.tsv modules_definition.tsv 0.9 -sample3 sample3 0 sample3_trimmed.Q17.converted.noHost sample3_trimmed.Q17.converted.noHost.fastq.gz bowtie2 2.5.1 2023-11-17 IGC2 80 10000 end-to-end 2.0.9 0.95 20000000 10000000 50.00 599999 354 0.56 0.3 ['fastq1.fastq.gz', 'fastq2.fastq.gz'] complete sample3.sam 2023-11-17 fpkm 5000000 1234 90 0.1 IGC2_1990MSPs.tsv IGC2_mustard.tsv kegg IGC2_kegg_107.tsv modules_definition.tsv 0.9 \ No newline at end of file +sample sample_name census_status full_sample_name fastq_file mapping_tool mapping_tool_version mapping_date reference_name trim alignment_number mapping_type meteor_version identity_threshold total_read_count mapped_read_count overall_alignment_rate gene_count msp_count msp_signal mustard_signal fastq_files database_type mapping_file profiling_date normalization rarefaction_level seed msp_core_size msp_filter msp_def mustard_filename modules_db modules_db_filenames modules_def module_completeness count_file counted_reads +sample1 sample1 0 sample1_trimmed.Q17.converted.noHost sample1_trimmed.Q17.converted.noHost.fastq.gz bowtie2 2.5.1 2023-11-17 IGC2 80 10000 end-to-end 2.0.9 0.95 19234567 16234987 84.41 627516 297 0.63 0.0 ['fastq1.fastq.gz', 'fastq2.fastq.gz'] complete sample1.sam 2023-11-17 fpkm 5000000 1234 100 0.1 IGC2_1990MSPs.tsv IGC2_mustard.tsv kegg IGC2_kegg_107.tsv modules_definition.tsv 0.9 sample1.tsv.xz 14591228 +sample2 sample2 0 sample2_trimmed.Q17.converted.noHost sample2_trimmed.Q17.converted.noHost.fastq.gz bowtie2 2.5.1 2023-11-17 IGC2 80 10000 end-to-end 2.0.9 0.95 15000000 10000000 66.67 687432 325 0.87 0.1 ['fastq1.fastq.gz', 'fastq2.fastq.gz'] complete sample2.sam 2023-11-17 fpkm 5000000 1234 100 0.1 IGC2_1990MSPs.tsv IGC2_mustard.tsv kegg IGC2_kegg_107.tsv modules_definition.tsv 0.9 sample2.tsv.xz 10000000 +sample3 sample3 0 sample3_trimmed.Q17.converted.noHost sample3_trimmed.Q17.converted.noHost.fastq.gz bowtie2 2.5.1 2023-11-17 IGC2 80 10000 end-to-end 2.0.9 0.95 20000000 10000000 50.00 599999 354 0.56 0.3 ['fastq1.fastq.gz', 'fastq2.fastq.gz'] complete sample3.sam 2023-11-17 fpkm 5000000 1234 90 0.1 IGC2_1990MSPs.tsv IGC2_mustard.tsv kegg IGC2_kegg_107.tsv modules_definition.tsv 0.9 sample3.tsv.xz 10000000 diff --git a/meteor/tests/test_merging/mapping/sample1/sample1_census_stage_1.json b/meteor/tests/test_merging/mapping/sample1/sample1_census_stage_1.json index 5c01eae..95753c3 100644 --- a/meteor/tests/test_merging/mapping/sample1/sample1_census_stage_1.json +++ b/meteor/tests/test_merging/mapping/sample1/sample1_census_stage_1.json @@ -23,9 +23,7 @@ "fastq_files": [ "fastq1.fastq.gz", "fastq2.fastq.gz" - ] - }, - "mapping_file": { - "bowtie_file": "sample1.sam" + ], + "mapping_file": "sample1.sam" } } \ No newline at end of file diff --git a/meteor/tests/test_merging/mapping/sample2/sample2_census_stage_1.json b/meteor/tests/test_merging/mapping/sample2/sample2_census_stage_1.json index 3f842f8..c3b6423 100644 --- a/meteor/tests/test_merging/mapping/sample2/sample2_census_stage_1.json +++ b/meteor/tests/test_merging/mapping/sample2/sample2_census_stage_1.json @@ -23,9 +23,7 @@ "fastq_files": [ "fastq1.fastq.gz", "fastq2.fastq.gz" - ] - }, - "mapping_file": { - "bowtie_file": "sample2.sam" + ], + "mapping_file": "sample2.sam" } } \ No newline at end of file diff --git a/meteor/tests/test_merging/mapping/sample3/sample3_census_stage_1.json b/meteor/tests/test_merging/mapping/sample3/sample3_census_stage_1.json index 5c0088c..10781ea 100644 --- a/meteor/tests/test_merging/mapping/sample3/sample3_census_stage_1.json +++ b/meteor/tests/test_merging/mapping/sample3/sample3_census_stage_1.json @@ -23,9 +23,7 @@ "fastq_files": [ "fastq1.fastq.gz", "fastq2.fastq.gz" - ] - }, - "mapping_file": { - "bowtie_file": "sample3.sam" + ], + "mapping_file": "sample3.sam" } } \ No newline at end of file diff --git a/meteor/tests/test_merging/profiles/sample1/sample1_census_stage_2.json b/meteor/tests/test_merging/profiles/sample1/sample1_census_stage_2.json index 3c06920..4ac200a 100644 --- a/meteor/tests/test_merging/profiles/sample1/sample1_census_stage_2.json +++ b/meteor/tests/test_merging/profiles/sample1/sample1_census_stage_2.json @@ -15,7 +15,6 @@ "trim": 80, "alignment_number": 10000, "mapping_type": "end-to-end", - "identity_threshold": 0.95, "total_read_count": 19234567, "mapped_read_count": 16234987, "overall_alignment_rate": 84.41, @@ -23,10 +22,13 @@ "fastq1.fastq.gz", "fastq2.fastq.gz" ], - "database_type": "complete" + "database_type": "complete", + "mapping_file": "sample1.sam" }, - "mapping_file": { - "bowtie_file": "sample1.sam" + "counting": { + "counted_reads": 14591228, + "identity_threshold": 0.95, + "count_file": "sample1.tsv.xz" }, "profiling_session": { "meteor_version": "2.0.9", diff --git a/meteor/tests/test_merging/profiles/sample2/sample2_census_stage_2.json b/meteor/tests/test_merging/profiles/sample2/sample2_census_stage_2.json index 46ede55..039be05 100644 --- a/meteor/tests/test_merging/profiles/sample2/sample2_census_stage_2.json +++ b/meteor/tests/test_merging/profiles/sample2/sample2_census_stage_2.json @@ -15,7 +15,6 @@ "trim": 80, "alignment_number": 10000, "mapping_type": "end-to-end", - "identity_threshold": 0.95, "total_read_count": 15000000, "mapped_read_count": 10000000, "overall_alignment_rate": 66.67, @@ -23,10 +22,13 @@ "fastq1.fastq.gz", "fastq2.fastq.gz" ], - "database_type": "complete" + "database_type": "complete", + "mapping_file": "sample2.sam" }, - "mapping_file": { - "bowtie_file": "sample2.sam" + "counting": { + "counted_reads": 10000000, + "identity_threshold": 0.95, + "count_file": "sample2.tsv.xz" }, "profiling_session": { "meteor_version": "2.0.9", diff --git a/meteor/tests/test_merging/profiles/sample3/sample3_census_stage_2.json b/meteor/tests/test_merging/profiles/sample3/sample3_census_stage_2.json index b30d9e4..f36d708 100644 --- a/meteor/tests/test_merging/profiles/sample3/sample3_census_stage_2.json +++ b/meteor/tests/test_merging/profiles/sample3/sample3_census_stage_2.json @@ -23,10 +23,13 @@ "fastq1.fastq.gz", "fastq2.fastq.gz" ], - "database_type": "complete" + "database_type": "complete", + "mapping_file": "sample3.sam" }, - "mapping_file": { - "bowtie_file": "sample3.sam" + "counting": { + "counted_reads": 10000000, + "identity_threshold": 0.95, + "count_file": "sample3.tsv.xz" }, "profiling_session": { "meteor_version": "2.0.9", diff --git a/meteor/tests/test_strain/map/test/test_census_stage_1.json b/meteor/tests/test_strain/map/test/test_census_stage_1.json index a14b684..bf72317 100644 --- a/meteor/tests/test_strain/map/test/test_census_stage_1.json +++ b/meteor/tests/test_strain/map/test/test_census_stage_1.json @@ -1 +1,28 @@ -{"meteor_version": "2.0.6", "sample_info": {"sample_name": "test", "tag": "single", "full_sample_name": "test"}, "sample_file": {"fastq_file": "test.fastq.gz"}, "mapping": {"mapping_tool": "bowtie2", "mapping_tool_version": "2.5.3", "mapping_date": "2024-06-11", "reference_name": "mock", "trim": "80", "alignment_number": 10000, "mapping_type": "end-to-end", "identity_threshold": 0.95, "total_read_count": 827509, "mapped_read_count": 793977, "overall_alignment_rate": 95.95, "fastq_files": "fastq/test/test.fastq.gz"}, "mapping_file": {"bowtie_file": "test_raw.cram"}} \ No newline at end of file +{ + "meteor_version": "2.0.6", + "sample_info": { + "sample_name": "test", + "tag": "single", + "full_sample_name": "test" + }, + "sample_file": { + "fastq_file": "test.fastq.gz" + }, + "mapping": { + "mapping_tool": "bowtie2", + "mapping_tool_version": "2.5.3", + "mapping_date": "2024-06-11", + "reference_name": "mock", + "trim": "80", + "alignment_number": 10000, + "mapping_type": "end-to-end", + "identity_threshold": 0.95, + "total_read_count": 827509, + "mapped_read_count": 793977, + "overall_alignment_rate": 95.95, + "fastq_files": [ + "fastq/test/test.fastq.gz" + ], + "mapping_file": "test_raw.cram" + } +} \ No newline at end of file diff --git a/meteor/tests/test_variantcalling/eva71_bench/eva71_bench_census_stage_1.json b/meteor/tests/test_variantcalling/eva71_bench/eva71_bench_census_stage_1.json index e79c75c..d55ebf8 100644 --- a/meteor/tests/test_variantcalling/eva71_bench/eva71_bench_census_stage_1.json +++ b/meteor/tests/test_variantcalling/eva71_bench/eva71_bench_census_stage_1.json @@ -20,9 +20,9 @@ "total_read_count": 1480, "mapped_read_count": 1480, "overall_alignment_rate": 100.0, - "fastq_files": ["eva71_bench.fq.gz"] - }, - "mapping_file": { - "bowtie_file": "eva71_bench.sam" + "fastq_files": [ + "eva71_bench.fq.gz" + ], + "mapping_file": "eva71_bench.sam" } } \ No newline at end of file