From 39b13d8cdd77a9467a93708a106651bdc070ebf7 Mon Sep 17 00:00:00 2001 From: Tony Liu Date: Mon, 18 Sep 2023 12:11:51 -0700 Subject: [PATCH 1/3] integrate fastqc, quast, & genomecov --- dev.sh | 5 ++- envs/base.yml | 6 ++- src/fabfos/external_qc.py | 82 +++++++++++++++++++++++++++++++++++++++ src/fabfos/fabfos.py | 59 ++++++++++++++++------------ src/fabfos/version.txt | 2 +- 5 files changed, 127 insertions(+), 27 deletions(-) create mode 100644 src/fabfos/external_qc.py diff --git a/dev.sh b/dev.sh index f9dcd22..fdd588a 100755 --- a/dev.sh +++ b/dev.sh @@ -133,7 +133,7 @@ case $1 in python -m $NAME \ --overwrite \ --threads 12 \ - --output ./no_miffed_test \ + --output ./manual_test \ --assembler megahit \ -i --reads ./beaver_cecum_2ndhits/EKL/Raw_Data/EKL_Cecum_ligninases_pool_secondary_hits_ss01.fastq \ -b ./ecoli_k12_mg1655.fasta \ @@ -142,6 +142,9 @@ case $1 in --ends-fw-flag "FW" \ --vector ./pcc1.fasta + # --reads ./beaver_cecum_2ndhits/EKL/Raw_Data/EKL_Cecum_ligninases_pool_secondary_hits_ss01.fastq \ + # --parity se \ + # --pool-size 20 # -i --reads ./beaver_cecum_2ndhits/EKL/Raw_Data/EKL_Cecum_ligninases_pool_secondary_hits_ss10.fastq \ diff --git a/envs/base.yml b/envs/base.yml index 430d942..118e35a 100644 --- a/envs/base.yml +++ b/envs/base.yml @@ -3,7 +3,7 @@ channels: - bioconda - conda-forge dependencies: - - python=3.11 + - python=3.10 - packaging=21.3 - bwa=0.7 - samtools=1.15 @@ -15,3 +15,7 @@ dependencies: - canu=2.2 - biopython=1.81 - vsearch=2.23 + - fastqc + - minimap2 + - bedtools + - quast diff --git a/src/fabfos/external_qc.py b/src/fabfos/external_qc.py new file mode 100644 index 0000000..c1e5f01 --- /dev/null +++ b/src/fabfos/external_qc.py @@ -0,0 +1,82 @@ +import os, sys +from pathlib import Path +import logging + +def _read_counts(ws: Path, read_file: Path): + #https://bioinformatics.stackexchange.com/questions/935/fast-way-to-count-number-of-reads-and-number-of-bases-in-a-fastq-file + read_sizes = ws.joinpath("temp.readcount.txt") + os.system(f"""\ + cat {read_file} \ + | awk 'NR % 4 == 2' \ + | wc -cl >{read_sizes} \ + """) + + num_reads, nucleotides = 0, 0 + with open(read_sizes) as f: + toks = f.readline()[:-1].strip() + if "\t" in toks: toks = toks.split("\t") + else: toks = [t for t in toks.split(" ") if len(t)>0] + num_reads, nucleotides = toks + os.unlink(read_sizes) + + return int(nucleotides), int(num_reads) + +# runs fastqc on reads +def Fastqc(ws: Path, reads: list[Path]): + logging.info("Running Fastqc...") + OUT = ws.joinpath("fastqc"); os.makedirs(OUT, exist_ok=True) + LOG = OUT.joinpath("log.txt") + os.system(f"""\ + fastqc -o {OUT} {" ".join(str(r) for r in reads)} 1>{LOG} 2>{LOG} + """) + logging.info("done.\n") + return OUT + +# gets read coverage of contigs and various assembly stats +def AssemblyStats(ws: Path, reads: list[Path], assembly: Path, cpus: int, paired_end=True): + logging.info("Calculating assembly statistics...") + LOG = "log.txt" + OUT = ws.joinpath("temp_assembly_stats"); os.makedirs(OUT, exist_ok=True) + READ_FILES = [r.absolute() for r in reads] + ASM = assembly.absolute() + nucleotides, read_count = 0, 0 + for r in READ_FILES: + _n, _c = _read_counts(OUT, r) + nucleotides+=_n + read_count+=_c + + pe_params = "-x sr" if paired_end else "" + + COV = f"./assembly_coverage.tsv" + COV_HEADER = "\t".join("contig, start, end, coverage".split(", ")) + os.system(f"""\ + cd {OUT} + BAM=./temp.coverage.bam + + # align reads to assembly + minimap2 -a {pe_params} --secondary=no {ASM} {' '.join([str(f) for f in READ_FILES])} 2>>{LOG} 
| samtools sort --threads {cpus} -o $BAM --write-index - + + # read coverage + echo "{COV_HEADER}">{COV} + bedtools genomecov -ibam $BAM -bg >>{COV} + mv {COV} ../ + samtools flagstat $BAM >./stats.txt + + # quast for asm stats + quast -t {cpus} -o ./quast {ASM} 1>>{LOG} + mv ./quast ../ + """) + # with open(OUT.joinpath(COV)) as f: + # with open(ws.joinpath(COV), "w") as out: + # out.write("\t".join("contig, start, end, coverage".split(", "))) + # out.writelines(f.readlines()) + # for l in f: + # # out.wr + # # con, s, e, cov = l.split("\t") + # # con = int(con.split("_")[-1]) + # # con = f"{con:04}" + # # out.write("\t".join([con, s, e, cov])) + # os.unlink(OUT.joinpath(COV)) + + logging.info("done.\n") + return OUT diff --git a/src/fabfos/fabfos.py b/src/fabfos/fabfos.py index 0781eb2..2668621 100755 --- a/src/fabfos/fabfos.py +++ b/src/fabfos/fabfos.py @@ -29,6 +29,7 @@ from pathlib import Path from packaging import version from .addons import EstimateFosmidPoolSize + from .external_qc import Fastqc, AssemblyStats except (ImportWarning, ModuleNotFoundError): import traceback @@ -1190,13 +1191,13 @@ def assemble_fosmids(sample: Sample, assembler: str, assembly_mode: str, k_min: :return: """ asm_param_stmt = "Assembling sequences using {}\n".format(assembler) - if assembly_mode != "NA": - asm_param_stmt += "Assembly mode is '{}'\n".format(assembly_mode) - asm_param_stmt += "Parameters:\n--k-min = {}\t--k-max = {}\t--k-step = {}".format(k_min, k_max, k_step) - if assembly_mode != "meta": - asm_param_stmt += "\t--min-count = {}\n".format(min_count) - else: - asm_param_stmt += "\n" + # if assembly_mode != "NA": + # asm_param_stmt += "Assembly mode is '{}'\n".format(assembly_mode) + # asm_param_stmt += "Parameters:\n--k-min = {}\t--k-max = {}\t--k-step = {}".format(k_min, k_max, k_step) + # if assembly_mode != "meta": + # asm_param_stmt += "\t--min-count = {}\n".format(min_count) + # else: + # asm_param_stmt += "\n" logging.info(asm_param_stmt) if assembler == "spades": @@ -1827,7 +1828,7 @@ def write_header(file, id: int, clone, mapping=None): for file_handle in _all, _none, _single, _complete: file_handle.close() - return + return _fpaths[0] def read_fastq(file_path): fq = SeqIO.parse(file_path, "fastq") @@ -2290,11 +2291,12 @@ def fabfos_main(sys_args): ghetto_sync_args(args, fos_father, sample) sample.gather_reads(args.reads, args.reverse, args.parity, executables, sample.output_dir, args.type) + Fastqc(out_path, [Path(r) for r in [sample.forward_reads, sample.reverse_reads] if r is not None]) qc_pe, qc_se = sample.qc_reads(args.background, args.parity, fos_father.adaptor_trim, executables, args.threads) + qced_reads = qc_pe if len(qc_pe) > 0 else [qc_se[0]] if args.pool_size is None: logging.info(f"Estimating fosmid pool size... 
") - qced_reads = qc_pe if len(qc_pe) > 0 else [qc_se[0]] size_estimate = EstimateFosmidPoolSize([Path(r) for r in qced_reads], Path(args.vector), Path(args.output).joinpath("temp_pool_size_estimate"), args.threads) if size_estimate is None: logging.error(f"failed to estimate pool size, defaulting to given estimate\n") @@ -2331,13 +2333,17 @@ def fabfos_main(sys_args): ends_mapping = parse_end_alignments(sample, fosmid_assembly, ends_stats) clone_map, multi_fosmid_map = assign_clones(ends_mapping, ends_stats, fosmid_assembly) clone_map = prune_and_scaffold_fosmids(sample, clone_map, multi_fosmid_map) - write_fosmid_assignments(sample, clone_map) + final_contig_file = write_fosmid_assignments(sample, clone_map) write_fosmid_end_failures(sample, ends_stats) logging.info(f"done.\n") else: # some function above does this if ends are provided and also adds end mappings to the headers - shutil.copy(sample.assembled_fosmids, Path(sample.output_dir).joinpath(f"{sample.id}_contigs.fasta")) + final_contig_file = Path(sample.output_dir).joinpath(f"{sample.id}_contigs.fasta") + shutil.copy(sample.assembled_fosmids, final_contig_file) + is_pe = args.parity != "se" + reads = qc_pe if is_pe else qc_se + AssemblyStats(out_path, [Path(r) for r in reads], final_contig_file, int(args.threads), paired_end=is_pe) # clean up, organize outputs logging.info(f"Final cleanup... ") @@ -2354,20 +2360,25 @@ def fabfos_main(sys_args): meta["n_reads_after_qc"] = sample.read_stats.num_filtered_reads os.unlink(tmp_cov) - # - asm stats - def _parse_k(k): - if k.endswith("kbp"): k = f"atleast_{k}" - return k.lower() - meta |= {f"asm_{_parse_k(k)}":v for k, v in assembly_stats.items()} + # # - asm stats + # def _parse_k(k): + # if k.endswith("kbp"): k = f"atleast_{k}" + # return k.lower() + # meta |= {f"asm_{_parse_k(k)}":v for k, v in assembly_stats.items()} - # - write meta to table - with open(ws.joinpath(f"{sample.id}_metadata.tsv"), "w") as f: - header, values = [], [] - for k, v in meta.items(): - header.append(str(k)) - values.append(str(v)) - for l in [header, values]: - f.write("\t".join(l)+"\n") + # # - write meta to table + # with open(ws.joinpath(f"{sample.id}_metadata.tsv"), "w") as f: + # header, values = [], [] + # for k, v in meta.items(): + # header.append(str(k)) + # values.append(str(v)) + # for l in [header, values]: + # f.write("\t".join(l)+"\n") + os.system(f"""\ + mv {ws}/fastqc {ws}/{sample_id}_fastqc + mv {ws}/quast {ws}/{sample_id}_quast + mv {ws}/assembly_coverage.tsv {ws}/{sample_id}_read_coverage.tsv + """) # - write end mapping to table end_map = ws.joinpath("end_seqs/endsMapped.tbl") diff --git a/src/fabfos/version.txt b/src/fabfos/version.txt index 6f165bc..f88cf52 100644 --- a/src/fabfos/version.txt +++ b/src/fabfos/version.txt @@ -1 +1 @@ -1.12.1 \ No newline at end of file +1.13.0 \ No newline at end of file From 1c6143b5e1d9333a606bcf7f1336f2977dc1ad2f Mon Sep 17 00:00:00 2001 From: Tony Liu Date: Mon, 18 Sep 2023 16:08:46 -0700 Subject: [PATCH 2/3] trim out vector backbone --- src/fabfos/addons.py | 54 +++++++++++++++++++++++++++++++++++++++ src/fabfos/external_qc.py | 2 +- src/fabfos/fabfos.py | 14 +++++++--- src/fabfos/version.txt | 2 +- 4 files changed, 66 insertions(+), 6 deletions(-) diff --git a/src/fabfos/addons.py b/src/fabfos/addons.py index 93b75a2..8b9b9b0 100644 --- a/src/fabfos/addons.py +++ b/src/fabfos/addons.py @@ -18,10 +18,64 @@ from Bio import SeqIO from pathlib import Path +import logging import subprocess import os +def TrimBackbone(ws: Path, backbone: Path, contigs: 
Path): + OUT = ws.joinpath("temp_trim_vector"); os.makedirs(OUT, exist_ok=True) + logging.info("Removing vector backbone from contigs... ") + contigs = contigs.absolute() + backbone = backbone.absolute() + db = "raw_contigs" + log = "log.txt" + mapping = "vector_mapping.tsv" + + os.system(f"""\ + cd {OUT} + makeblastdb -in {contigs} -dbtype nucl -out {db} 1>>{log} 2>&1 + blastn -db {db} -outfmt "6 sseqid pident length sstart send" -query {backbone} -perc_identity 85 -out ./{mapping} 1>>{log} 2>&1 + """) + + hits = {} + with open(OUT.joinpath(mapping)) as f: + for l in f: + id, _, length, start, end = l.split("\t") + hits[id] = hits.get(id, [])+[(int(start), int(end))] + + cleaned_contigs = OUT.joinpath(f"no_vector.fa") + with open(cleaned_contigs, "w") as f: + original = SeqIO.parse(contigs, "fasta") + for entry in original: + if entry.id in hits: + # mark detected vector sequeces with x + _seq = list(entry.seq) + for s, e in hits[entry.id]: + for i in range(s-1, e): + _seq[i] = "x" + seqs = [] + _curr = [] + # remove vector and split contigs if need be + for ch in _seq: + if ch == "x": + if len(_curr)>0: + seqs.append("".join(_curr)) + _curr.clear() + else: + _curr.append(ch) + seqs.append("".join(_curr)) + seqs = [s for s in seqs if len(s)>0] + else: + seqs = [str(entry.seq)] + + for i, seq in enumerate(seqs): + f.write(f">{entry.id}_{i:02} length={len(seq)}\n") + f.write(seq+"\n") + + logging.info("done\n") + return str(cleaned_contigs) + def EstimateFosmidPoolSize( reads: list[Path], backbone: Path, diff --git a/src/fabfos/external_qc.py b/src/fabfos/external_qc.py index c1e5f01..4508b25 100644 --- a/src/fabfos/external_qc.py +++ b/src/fabfos/external_qc.py @@ -54,7 +54,7 @@ def AssemblyStats(ws: Path, reads: list[Path], assembly: Path, cpus: int, paired BAM=./temp.coverage.bam # align reads to assembly - minimap2 -a {pe_params} --secondary=no {ASM} {' '.join([str(f) for f in READ_FILES])} 2>>{LOG} | samtools sort --threads {cpus} -o $BAM --write-index - + minimap2 -a {pe_params} --secondary=no {ASM} {' '.join([str(f) for f in READ_FILES])} 2>>{LOG} | samtools sort --threads {cpus} -o $BAM --write-index - 2>>{LOG} # read coverage echo "{COV_HEADER}">{COV} diff --git a/src/fabfos/fabfos.py b/src/fabfos/fabfos.py index 2668621..973ef4a 100755 --- a/src/fabfos/fabfos.py +++ b/src/fabfos/fabfos.py @@ -28,7 +28,7 @@ from Bio import SeqIO from pathlib import Path from packaging import version - from .addons import EstimateFosmidPoolSize + from .addons import EstimateFosmidPoolSize, TrimBackbone from .external_qc import Fastqc, AssemblyStats except (ImportWarning, ModuleNotFoundError): @@ -2290,12 +2290,14 @@ def fabfos_main(sys_args): sample = Sample(sample_id) ghetto_sync_args(args, fos_father, sample) + # read qc sample.gather_reads(args.reads, args.reverse, args.parity, executables, sample.output_dir, args.type) Fastqc(out_path, [Path(r) for r in [sample.forward_reads, sample.reverse_reads] if r is not None]) qc_pe, qc_se = sample.qc_reads(args.background, args.parity, fos_father.adaptor_trim, executables, args.threads) qced_reads = qc_pe if len(qc_pe) > 0 else [qc_se[0]] - if args.pool_size is None: + # pool size estimate + if args.pool_size is None and args.vector: logging.info(f"Estimating fosmid pool size... 
") size_estimate = EstimateFosmidPoolSize([Path(r) for r in qced_reads], Path(args.vector), Path(args.output).joinpath("temp_pool_size_estimate"), args.threads) if size_estimate is None: @@ -2322,20 +2324,24 @@ def fabfos_main(sys_args): if not Path(sample.assembled_fosmids).exists(): logging.error(f"assembly failed") sys.exit(1) + + # trim vector + if args.vector: + sample.assembled_fosmids = TrimBackbone(out_path, Path(args.vector), Path(sample.assembled_fosmids)) + fosmid_assembly = read_fasta(sample.assembled_fosmids) assembly_stats = get_assembly_stats(fosmid_assembly, sample.assembler) assembly_stats["est_n_fosmids"] = size_estimate # For mapping fosmid ends: if args.ends: - logging.info(f"Mapping end sequences... ") + logging.info(f"Mapping end sequences:\n") map_ends(executables, args.ends, sample) ends_mapping = parse_end_alignments(sample, fosmid_assembly, ends_stats) clone_map, multi_fosmid_map = assign_clones(ends_mapping, ends_stats, fosmid_assembly) clone_map = prune_and_scaffold_fosmids(sample, clone_map, multi_fosmid_map) final_contig_file = write_fosmid_assignments(sample, clone_map) write_fosmid_end_failures(sample, ends_stats) - logging.info(f"done.\n") else: # some function above does this if ends are provided and also adds end mappings to the headers final_contig_file = Path(sample.output_dir).joinpath(f"{sample.id}_contigs.fasta") diff --git a/src/fabfos/version.txt b/src/fabfos/version.txt index f88cf52..cd99d38 100644 --- a/src/fabfos/version.txt +++ b/src/fabfos/version.txt @@ -1 +1 @@ -1.13.0 \ No newline at end of file +1.14.0 \ No newline at end of file From c837b71199554a99d58e34a975331b7a71e6c72d Mon Sep 17 00:00:00 2001 From: Tony Liu Date: Thu, 21 Sep 2023 12:13:25 -0700 Subject: [PATCH 3/3] end mapping checked & filter min length --- dev.sh | 4 +- src/fabfos/addons.py | 8 +++ src/fabfos/cli.py | 2 + src/fabfos/fabfos.py | 107 ++++++++++++++++++++++++----------------- src/fabfos/version.txt | 2 +- 5 files changed, 75 insertions(+), 48 deletions(-) diff --git a/dev.sh b/dev.sh index fdd588a..8665c46 100755 --- a/dev.sh +++ b/dev.sh @@ -135,9 +135,9 @@ case $1 in --threads 12 \ --output ./manual_test \ --assembler megahit \ - -i --reads ./beaver_cecum_2ndhits/EKL/Raw_Data/EKL_Cecum_ligninases_pool_secondary_hits_ss01.fastq \ + -i --reads ./EKL_Cecum_ligninases_pool_secondary_hits_ss01.fastq \ -b ./ecoli_k12_mg1655.fasta \ - --ends ./beaver_cecum_2ndhits/endseqs.fasta \ + --ends ./endseqs_cec.fasta \ --ends-name-regex "\\w+_\\d+" \ --ends-fw-flag "FW" \ --vector ./pcc1.fasta diff --git a/src/fabfos/addons.py b/src/fabfos/addons.py index 8b9b9b0..c8ddb2a 100644 --- a/src/fabfos/addons.py +++ b/src/fabfos/addons.py @@ -76,6 +76,14 @@ def TrimBackbone(ws: Path, backbone: Path, contigs: Path): logging.info("done\n") return str(cleaned_contigs) +def FilterMinLength(contigs: Path, min_length: int): + logging.info(f"Filtering contigs to be at least {min_length}bp... ") + original = SeqIO.parse(contigs, "fasta") + filtered_contigs_path = contigs.parent.joinpath(f"contigs_{min_length}.fa") + SeqIO.write((e for e in original if len(e.seq)>=min_length), filtered_contigs_path, "fasta") + logging.info("done\n") + return filtered_contigs_path + def EstimateFosmidPoolSize( reads: list[Path], backbone: Path, diff --git a/src/fabfos/cli.py b/src/fabfos/cli.py index ac90ea2..be027d6 100644 --- a/src/fabfos/cli.py +++ b/src/fabfos/cli.py @@ -16,9 +16,11 @@ # copyright 2023 Tony Liu, Connor Morgan-Lang, Avery Noonan, # Zach Armstrong, and Steven J. 
Hallam + import os, sys from .fabfos import fabfos_main +# this is just an entry point def main(): try: fabfos_main(sys.argv[1:]) diff --git a/src/fabfos/fabfos.py b/src/fabfos/fabfos.py index 973ef4a..427992a 100755 --- a/src/fabfos/fabfos.py +++ b/src/fabfos/fabfos.py @@ -25,10 +25,11 @@ import subprocess import shutil import logging + # import time from Bio import SeqIO from pathlib import Path from packaging import version - from .addons import EstimateFosmidPoolSize, TrimBackbone + from .addons import EstimateFosmidPoolSize, TrimBackbone, FilterMinLength from .external_qc import Fastqc, AssemblyStats except (ImportWarning, ModuleNotFoundError): @@ -315,9 +316,11 @@ def assemble_fosmids(self, executables, num_threads): min_count, cov = determine_min_count(self.read_stats.num_reads_assembled, self.num_fosmids_estimate, k_max) with open(Path(self.output_dir).joinpath(f"estimated_coverage.txt"), "w") as f_cov: f_cov.write(f"{cov}\n") - assemble_fosmids(self, self.assembler, self.assembly_mode, k_min, k_max, min_count, executables, num_threads=num_threads) + _asm = assemble_fosmids(self, self.assembler, self.assembly_mode, k_min, k_max, min_count, executables, num_threads=num_threads) # clean_intermediates(self) self.assembled_fosmids = str(Path(self.output_dir).joinpath("temp_assembly").joinpath("final.contigs.fa")) + if Path(_asm).absolute() != Path(self.assembled_fosmids).absolute(): + os.system(f"mv {_asm} {self.assembled_fosmids}") return @@ -583,6 +586,9 @@ def get_options(sys_args: list): help="overwrite the output directory if it already exists") opts.add_argument("--ends-fw-flag", required=False, default=None, help='string that marks forward ends, ex. "_FW" would cause ABC_123_FW and ABC_123_RE to be assigned to forward and reverse respectively.') + opts.add_argument("-l", "--min-length", type=int, required=False, default=1000, + help="Assembled contigs shorter than this will be discarded [DEFAULT = 1000]") + misc.add_argument("-v", "--version", default=False, action="store_true", help="Print the FabFos version and exit.") misc.add_argument("--verbose", required=False, default=False, action="store_true", @@ -1227,7 +1233,7 @@ def assemble_fosmids(sample: Sample, assembler: str, assembly_mode: str, k_min: # sample.output_dir + os.sep + sample.id + "_scaffolds.fasta") # shutil.rmtree(sample.output_dir + "assembly" + os.sep) logging.info("Assembly done.\n") - return + return f_contigs def read_fastq_to_dict(fastq_file: str) -> dict: @@ -1613,7 +1619,6 @@ def get_exterior_positions(hits, clone): end = max(positions) return start, end - def assign_clones(ends_mapping, ends_stats: FosmidEnds, fosmid_assembly): """ Assigns fosmid-end sequences to assembled contigs in fosmid_assembly @@ -1774,7 +1779,7 @@ def prune_and_scaffold_fosmids(sample, clone_map, multi_fosmid_map): trailing_node = mate.contig # Scaffold the fosmids if suffix and prefix: - new_fosmid = FosmidClone(leading_node + ',' + trailing_node, prefix + suffix) + new_fosmid = FosmidClone(leading_node + ',' + trailing_node, prefix + "-" + suffix) new_fosmid.clone = clone new_fosmid.evidence = "twin-orphan multi-contig" new_fosmid.ends = "Both" @@ -1797,36 +1802,45 @@ def prune_and_scaffold_fosmids(sample, clone_map, multi_fosmid_map): return clone_map -def write_fosmid_assignments(sample, clone_map: list[FosmidClone]): +def write_fosmid_assignments(sample, clone_map: list[FosmidClone], min_length: int): ws = Path(sample.output_dir) - _fpaths = [ws.joinpath(f"{sample.id}_{f}.fasta") for f in [ - "fosmids_all_contigs", 
"fosmids_not_mapped", "fosmids_single_mapped", "fosmids_both_mapped", + _fpaths = [ws.joinpath(f"contigs_{f}.fasta") for f in [ + "not_mapped", "single_mapped", "both_mapped", f"less_than_{min_length}", ]] - _all, _none, _single, _complete = [open(p, "w") for p in _fpaths] + _none, _single, _complete, _short = [open(p, "w") for p in _fpaths] - def write_header(file, id: int, clone, mapping=None): + def write_header(file, id: int, clone, l, mapping=None): m = f" {mapping}" if mapping is not None else "" c = f"_{clone}" if clone != "None" else "" - file.write(f">{id:04}{c}{m}"+"\n") + file.write(f">{id:04}{c} length={l}{m}"+"\n") + clone_map = sorted(clone_map, key=lambda c: c.seq_length, reverse=True) for i, fosmid_clone in enumerate(clone_map): id = i+1 ends = {"F":"FW", "R":"RE"}.get(fosmid_clone.ends, fosmid_clone.ends) + contig_seq = fosmid_clone.sequence - write_header(_all, id, clone=str(fosmid_clone.clone), mapping=f"{fosmid_clone.evidence}|{ends}".replace(" ", "_").lower()) - fasta = { - "None": _none, - "Both": _complete, - "FW": _single, - "RE": _single, - }[ends] - write_header(fasta, id, fosmid_clone.clone, mapping=ends if ends in {"FW", "RE"} else None) - - for fa in [_all, fasta]: - fa.write(fosmid_clone.sequence) - fa.write("\n") - - for file_handle in _all, _none, _single, _complete: + if fosmid_clone.seq_length < min_length: + write_header(_short, id, clone=str(fosmid_clone.clone), l=len(contig_seq), mapping=f"{fosmid_clone.evidence}|{ends}".replace(" ", "_").lower()) + _short.write(contig_seq) + _short.write("\n") + else: + # write_header(_all, id, clone=str(fosmid_clone.clone), l=len(contig_seq), mapping=f"{fosmid_clone.evidence}|{ends}".replace(" ", "_").lower()) + fasta = { + "None": _none, + "Both": _complete, + "FW": _single, + "RE": _single, + }[ends] + write_header(fasta, id, clone=str(fosmid_clone.clone), l=len(contig_seq), mapping=ends if ends in {"FW", "RE"} else None) + + # for fa in [_all, fasta]: + for fa in [fasta]: + fa.write(contig_seq) + fa.write("\n") + + # for file_handle in _all, _none, _single, _complete: + for file_handle in _none, _single, _complete: file_handle.close() return _fpaths[0] @@ -2030,27 +2044,27 @@ def write_fosmid_end_failures(sample: Sample, ends_stats: FosmidEnds) -> None: return -def write_unique_fosmid_ends_to_bulk(args): - logging.info("Writing new fosmid end sequences to the legacy fosmid end file... ") - new_fosmid_ends = read_fasta(args.ends) - bulk_ends_file = args.output + "/FabFos_legacy_ends.fasta" - bulk_ends = read_fasta(bulk_ends_file) +# def write_unique_fosmid_ends_to_bulk(args): +# logging.info("Writing new fosmid end sequences to the legacy fosmid end file... 
") +# new_fosmid_ends = read_fasta(args.ends) +# bulk_ends_file = args.output + "/FabFos_legacy_ends.fasta" +# bulk_ends = read_fasta(bulk_ends_file) - try: - legacy_ends = open(bulk_ends_file, 'a') - except IOError: - raise IOError("Cannot open " + bulk_ends_file + " for appending!") +# try: +# legacy_ends = open(bulk_ends_file, 'a') +# except IOError: +# raise IOError("Cannot open " + bulk_ends_file + " for appending!") - for new_end in new_fosmid_ends: - if new_end not in bulk_ends.keys(): - if len(new_fosmid_ends[new_end]) > 100: - legacy_ends.write(new_end + "\n") - legacy_ends.write(new_fosmid_ends[new_end] + "\n") +# for new_end in new_fosmid_ends: +# if new_end not in bulk_ends.keys(): +# if len(new_fosmid_ends[new_end]) > 100: +# legacy_ends.write(new_end + "\n") +# legacy_ends.write(new_fosmid_ends[new_end] + "\n") - legacy_ends.close() - logging.info("done.\n") +# legacy_ends.close() +# logging.info("done.\n") - return +# return def align_nanopore_to_background(args, executables, sample): @@ -2205,7 +2219,7 @@ def determine_min_count(num_reads: int, num_fosmids: int, k_max: int): :return: """ approx_coverage = (num_reads * k_max) / (int(num_fosmids) * 40000) - sys.stdout.write("Approximate fosmid coverage = " + str(approx_coverage) + "\n") + sys.stdout.write("Estimated fosmid coverage = " + str(approx_coverage) + "\n") sys.stdout.flush() min_count = 10 dist_tail = approx_coverage / 100 @@ -2271,6 +2285,8 @@ def fabfos_main(sys_args): out_path = Path(args.output) if out_path.exists(): if args.overwrite: + # logging.info(f"cleaning output folder [{out_path}]\n") + # time.sleep(3) os.system(f"rm -r {out_path}") else: logging.error(f"output folder [{out_path}] already exists\n") @@ -2320,7 +2336,7 @@ def fabfos_main(sys_args): assemble_nanopore_reads(sample, executables["canu"], args.skip_correction, args.threads) else: sample.assemble_fosmids(executables, args.threads) - + if not Path(sample.assembled_fosmids).exists(): logging.error(f"assembly failed") sys.exit(1) @@ -2340,10 +2356,11 @@ def fabfos_main(sys_args): ends_mapping = parse_end_alignments(sample, fosmid_assembly, ends_stats) clone_map, multi_fosmid_map = assign_clones(ends_mapping, ends_stats, fosmid_assembly) clone_map = prune_and_scaffold_fosmids(sample, clone_map, multi_fosmid_map) - final_contig_file = write_fosmid_assignments(sample, clone_map) + final_contig_file = write_fosmid_assignments(sample, clone_map, int(args.min_length)) write_fosmid_end_failures(sample, ends_stats) else: # some function above does this if ends are provided and also adds end mappings to the headers + sample.assembled_fosmids = str(FilterMinLength(Path(sample.assembled_fosmids), int(args.min_length))) final_contig_file = Path(sample.output_dir).joinpath(f"{sample.id}_contigs.fasta") shutil.copy(sample.assembled_fosmids, final_contig_file) diff --git a/src/fabfos/version.txt b/src/fabfos/version.txt index cd99d38..d19d089 100644 --- a/src/fabfos/version.txt +++ b/src/fabfos/version.txt @@ -1 +1 @@ -1.14.0 \ No newline at end of file +1.15.0 \ No newline at end of file