Skip to content

Commit

Permalink
Compile the necessary scripts to identify sequence errors in FASTQ fi…
Browse files Browse the repository at this point in the history
…les. It is not necessary to perform SAM or MIDSV conversions for all alleles; instead, these operations are only applied to the control allele, reducing computational costs.
  • Loading branch information
akikuno committed Oct 29, 2024
1 parent 5be0f60 commit 363aa8e
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 21 deletions.
1 change: 1 addition & 0 deletions docs/RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@

## 🌟 New Features
## 🐛 Bug Fixes

## 🔧 Maintenance

+ With the end of security support for Python 3.8 in October 2024, we have updated DAJIN2 to support Python 3.9 or later. [[Commit Detail](https://github.com/akikuno/DAJIN2/commit/0967c463386a48639a614849b3d3e4453079c8b1)]
Expand Down
61 changes: 40 additions & 21 deletions src/DAJIN2/core/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def execute_control(arguments: dict):
logger.info(f"{arguments['control']} is now processing...")

###########################################################
# Preprocess
# Setup arguments
###########################################################
ARGS: FormattedInputs = preprocess.format_inputs(arguments)
preprocess.create_temporal_directories(ARGS.tempdir, ARGS.control_name, is_control=True)
Expand All @@ -37,38 +37,56 @@ def execute_control(arguments: dict):
logger.info(f"Preprocess {arguments['control']}...")

###########################################################
# Merge fastq files
# Concatenate fastq files
###########################################################

fastx_handler.save_inputs_as_single_fastq(ARGS, is_control=True)

path_fastq = Path(ARGS.tempdir, ARGS.control_name, "fastq", f"{ARGS.control_name}.fastq.gz")
fastx_handler.overwrite_with_downsampled_fastq(path_fastq, num_reads=20_000)

###########################################################
# Export fasta files as single-FASTA format
###########################################################
fastx_handler.export_fasta_files(ARGS, is_control=True)

###########################################################
# Separate fastq files by sequence error
###########################################################
paths_fasta = Path(ARGS.tempdir, ARGS.control_name, "fasta").glob("control.fasta")
preprocess.generate_sam(ARGS, paths_fasta, is_control=True, is_sv=False)
preprocess.generate_midsv(ARGS, is_control=True, is_sv=False)
# ============================================================
# Detect sequence error reads
# ============================================================
preprocess.detect_sequence_error_reads(ARGS, is_control=True)
preprocess.split_fastq_by_sequence_error(ARGS, is_control=True)

fastx_handler.save_inputs_as_single_fastx(ARGS.tempdir, ARGS.path_control, ARGS.control_name)
# ============================================================
# Save subsetted fastq if the read number is too large (> 10,000 reads)
fastx_handler.save_subsetted_fastx(
Path(ARGS.tempdir, ARGS.control_name, "fastq", f"{ARGS.control_name}.fastq.gz"), num_reads=10_000
# ============================================================
path_fastq = Path(ARGS.tempdir, ARGS.control_name, "fastq", f"{ARGS.control_name}.fastq.gz")
path_fastq_with_error = Path(
ARGS.tempdir, ARGS.control_name, "fastq", f"{ARGS.control_name}_sequence_error.fastq.gz"
)
fastx_handler.overwrite_with_downsampled_fastq(path_fastq, num_reads=10_000)
fastx_handler.overwrite_with_downsampled_fastq(path_fastq_with_error, num_reads=10_000)

###########################################################
# Mapping
# Mapping filtered control reads
###########################################################

# Export fasta files as single-FASTA format
fastx_handler.export_fasta_files(ARGS.tempdir, ARGS.fasta_alleles, ARGS.control_name)

# ============================================================
# Mapping using mappy
# ============================================================
paths_fasta = Path(ARGS.tempdir, ARGS.control_name, "fasta").glob("*.fasta")
preprocess.generate_sam(ARGS, paths_fasta, is_control=True, is_sv=False)

###########################################################
# ============================================================
# MIDSV conversion
###########################################################
# ============================================================
preprocess.generate_midsv(ARGS, is_control=True, is_sv=False)

###########################################################
# Detect sequence error reads
###########################################################
preprocess.detect_sequence_error_reads(ARGS, is_control=True)
preprocess.split_fastq_by_sequence_error(ARGS, is_control=True)

###########################################################
# Prepare data to `extract mutaion loci`
###########################################################
Expand Down Expand Up @@ -104,11 +122,12 @@ def execute_sample(arguments: dict):
# Merge fastq files
# ============================================================

fastx_handler.save_inputs_as_single_fastx(ARGS.tempdir, ARGS.path_sample, ARGS.sample_name)
fastx_handler.save_inputs_as_single_fastq(ARGS, is_control=False)

# Save subsetted fastq if the read number is too large (> 10,000 reads)
fastx_handler.save_subsetted_fastx(
Path(ARGS.tempdir, ARGS.sample_name, "fastq", f"{ARGS.sample_name}.fastq.gz"), num_reads=10_000
)
path_fastq = Path(ARGS.tempdir, ARGS.sample_name, "fastq", f"{ARGS.sample_name}.fastq.gz")
fastx_handler.overwrite_with_downsampled_fastq(path_fastq, num_reads=10_000)

# ============================================================
# Mapping with mappy
# ============================================================
Expand Down
3 changes: 3 additions & 0 deletions src/DAJIN2/core/preprocess/midsv_caller.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,9 @@ def generate_midsv(ARGS, is_control: bool = False, is_sv: bool = False) -> None:
name = ARGS.control_name if is_control else ARGS.sample_name

for allele, sequence in ARGS.fasta_alleles.items():
if not Path(ARGS.tempdir, name, "sam", allele).exists():
continue

path_midsv_directory = Path(ARGS.tempdir, name, "midsv", allele)
path_midsv_directory.mkdir(parents=True, exist_ok=True)

Expand Down

0 comments on commit 363aa8e

Please sign in to comment.