Skip to content

Commit

Permalink
Dev fpo (#28)
Browse files Browse the repository at this point in the history
* Better detection of fastq files

* Fix bug when directory does not exist

* Improve robustness and logging of fastqimporter

* Remove unnecessary cast

* Split merged parameters in bowtie2

* Remove poetry.lock from index

* Reindex poetry.lock

* Reduce memory usage in counter
  • Loading branch information
Florian Plaza Oñate authored Mar 16, 2024
1 parent b545dbc commit bb84292
Show file tree
Hide file tree
Showing 6 changed files with 32 additions and 20 deletions.
4 changes: 2 additions & 2 deletions meteor/counter.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,7 +454,7 @@ def launch_counting(
template=cramdesc,
reference_filename=str(reference.resolve()),
) as total_reads:
for element in chain(*reads.values()):
for element in chain.from_iterable(reads.values()):
# if int(element.reference_name) in ref_json["reference_file"]:
total_reads.write(element)
sort(
Expand All @@ -473,7 +473,7 @@ def launch_counting(
self.save_cram_strain(
cramfile_strain_unsorted,
cramdesc,
chain(*reads.values()),
chain.from_iterable(reads.values()),
ref_json,
)
sort(
Expand Down
36 changes: 25 additions & 11 deletions meteor/fastqimporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,10 @@
import sys
import logging
import re
from itertools import product
from itertools import product, chain
from pathlib import Path
from dataclasses import dataclass, field
from typing import Iterator
from itertools import chain
from meteor.session import Session, Component


Expand Down Expand Up @@ -79,13 +78,14 @@ def get_paired_dirname(self, fastq_filename: str, tag: str) -> str:
fastq_filename = fastq_filename.replace(e, "")
return fastq_filename

def get_fastq_files(self) -> Iterator: # pragma: no cover
def get_fastq_files(self) -> Iterator[Path]: # pragma: no cover
"""Find all fastq files in the given input"""
return chain.from_iterable(
files_expected_ext = chain.from_iterable(
self.input_fastq_dir.glob("*" + e) for e in self.short_extension()
)
return (f for f in files_expected_ext if f.is_file())

def get_tag(self, fastq_filename: str) -> str|None:
def get_tag(self, fastq_filename: str) -> str | None:
"""Extract paired-end info
:param fastq_filename: Name of the fastq file
Expand Down Expand Up @@ -122,19 +122,26 @@ def set_fastq_config(

def execute(self) -> None:
"""Dispatch the fastq file"""
logging.info("Start importing task")
logging.info("Starting import of fastq files from %s", self.input_fastq_dir)
fastq_files = list(self.get_fastq_files())
if not fastq_files:
logging.error("No fastq file detected in %s", self.input_fastq_dir)
logging.error("No fastq file detected")
sys.exit(1)
else:
logging.info("%d fastq files detected", len(fastq_files))

num_imported_fastq_files = 0
samples_names = set()
for fastq_file in fastq_files:
# Get rid of all possible extension
full_sample_name = self.replace_ext(fastq_file.name)
if self.ispaired:
# Extract paired-end info
tag = self.get_tag(fastq_file.name)
if not tag:
logging.error('Pairing tag (1 or 2) is not detected in %s', fastq_file)
logging.error(
"Pairing tag (1 or 2) is not detected in %s", fastq_file
)
sys.exit(1)
else:
tag = "single"
Expand All @@ -145,18 +152,18 @@ def execute(self) -> None:
self.mask_sample_name, full_sample_name
)
if full_sample_name_array:
logging.info("Import %s", fastq_file)
sample_name = full_sample_name_array[0]
else:
# sample do not match the mask
logging.warning("Regular expression does not match %s", fastq_file)
continue
else:
if self.ispaired:
sample_name = self.get_paired_dirname(fastq_file.name, tag)
else:
sample_name = full_sample_name
logging.info("Import %s", fastq_file)
logging.info("Importing %s in sample %s", fastq_file, sample_name)
# Create directory for the sample and symlink fastq file into
samples_names.add(sample_name)
sample_dir = self.meteor.fastq_dir / sample_name
sample_dir.mkdir(exist_ok=True, parents=True)
sym_fastq = Path(sample_dir / fastq_file.name)
Expand All @@ -168,3 +175,10 @@ def execute(self) -> None:
)
config_path = sample_dir / f"{full_sample_name}_census_stage_0.json"
self.save_config(config_fastq, config_path)
num_imported_fastq_files += 1
logging.info(
"%d/%d fastq files imported in %d samples",
num_imported_fastq_files,
len(fastq_files),
len(samples_names),
)
2 changes: 1 addition & 1 deletion meteor/mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def execute(self) -> None:
[
"bowtie2",
parameters,
"--mm --no-unal",
"--mm", "--no-unal",
"-x",
str(bowtie_index.resolve()),
"-U",
Expand Down
6 changes: 3 additions & 3 deletions meteor/merging.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,8 @@ def compare_section_info(
}
if len(problematic_config) > 0:
logging.warning(
"There was %s files that do not match the reference json file",
str(len(problematic_config)),
"There was %d files that do not match the reference json file",
len(problematic_config),
)
for i in problematic_config:
logging.warning(
Expand Down Expand Up @@ -213,7 +213,7 @@ def execute(self) -> None:
logging.error("No census stage 2 found in the specified repository.")
sys.exit(1)
else:
logging.info("%s census files have been detected.", str(len(all_census)))
logging.info("%d census files have been detected.", len(all_census))
# Create the dict: path -> Dict
all_census_dict = {
my_census.parent: self.read_json(my_census) for my_census in all_census
Expand Down
2 changes: 0 additions & 2 deletions meteor/meteor.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,14 +85,12 @@ def isdir(path: str) -> Path: # pragma: no cover
:return: (str) Path object of the directory
"""
mydir = Path(path)
# if not mydir.is_dir():
if not mydir.is_dir():
if not mydir.exists() :
msg = f"{mydir.name} does not exist."
else:
msg = f"{mydir.name} is not a directory."
raise ArgumentTypeError(msg)

return mydir


Expand Down
2 changes: 1 addition & 1 deletion meteor/treebuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def execute(self) -> None:
logging.error("No census stage found in the specified repository.")
sys.exit(1)
else:
logging.info("%s samples have been detected.", str(len(all_census)))
logging.info("%d samples have been detected.", len(all_census))
msp_file_dict = defaultdict(list)
for filepath in self.meteor.strain_dir.glob("**/" + "msp_*"):
msp_file_dict[filepath.name].append(filepath)
Expand Down

0 comments on commit bb84292

Please sign in to comment.