From 1c6143b5e1d9333a606bcf7f1336f2977dc1ad2f Mon Sep 17 00:00:00 2001 From: Tony Liu Date: Mon, 18 Sep 2023 16:08:46 -0700 Subject: [PATCH] trim out vector backbone --- src/fabfos/addons.py | 54 +++++++++++++++++++++++++++++++++++++++ src/fabfos/external_qc.py | 2 +- src/fabfos/fabfos.py | 14 +++++++--- src/fabfos/version.txt | 2 +- 4 files changed, 66 insertions(+), 6 deletions(-) diff --git a/src/fabfos/addons.py b/src/fabfos/addons.py index 93b75a2..8b9b9b0 100644 --- a/src/fabfos/addons.py +++ b/src/fabfos/addons.py @@ -18,10 +18,64 @@ from Bio import SeqIO from pathlib import Path +import logging import subprocess import os +def TrimBackbone(ws: Path, backbone: Path, contigs: Path): + OUT = ws.joinpath("temp_trim_vector"); os.makedirs(OUT, exist_ok=True) + logging.info("Removing vector backbone from contigs... ") + contigs = contigs.absolute() + backbone = backbone.absolute() + db = "raw_contigs" + log = "log.txt" + mapping = "vector_mapping.tsv" + + os.system(f"""\ + cd {OUT} + makeblastdb -in {contigs} -dbtype nucl -out {db} 1>>{log} 2>&1 + blastn -db {db} -outfmt "6 sseqid pident length sstart send" -query {backbone} -perc_identity 85 -out ./{mapping} 1>>{log} 2>&1 + """) + + hits = {} + with open(OUT.joinpath(mapping)) as f: + for l in f: + id, _, length, start, end = l.split("\t") + hits[id] = hits.get(id, [])+[(int(start), int(end))] + + cleaned_contigs = OUT.joinpath(f"no_vector.fa") + with open(cleaned_contigs, "w") as f: + original = SeqIO.parse(contigs, "fasta") + for entry in original: + if entry.id in hits: + # mark detected vector sequeces with x + _seq = list(entry.seq) + for s, e in hits[entry.id]: + for i in range(s-1, e): + _seq[i] = "x" + seqs = [] + _curr = [] + # remove vector and split contigs if need be + for ch in _seq: + if ch == "x": + if len(_curr)>0: + seqs.append("".join(_curr)) + _curr.clear() + else: + _curr.append(ch) + seqs.append("".join(_curr)) + seqs = [s for s in seqs if len(s)>0] + else: + seqs = [str(entry.seq)] + + for i, seq in enumerate(seqs): + f.write(f">{entry.id}_{i:02} length={len(seq)}\n") + f.write(seq+"\n") + + logging.info("done\n") + return str(cleaned_contigs) + def EstimateFosmidPoolSize( reads: list[Path], backbone: Path, diff --git a/src/fabfos/external_qc.py b/src/fabfos/external_qc.py index c1e5f01..4508b25 100644 --- a/src/fabfos/external_qc.py +++ b/src/fabfos/external_qc.py @@ -54,7 +54,7 @@ def AssemblyStats(ws: Path, reads: list[Path], assembly: Path, cpus: int, paired BAM=./temp.coverage.bam # align reads to assembly - minimap2 -a {pe_params} --secondary=no {ASM} {' '.join([str(f) for f in READ_FILES])} 2>>{LOG} | samtools sort --threads {cpus} -o $BAM --write-index - + minimap2 -a {pe_params} --secondary=no {ASM} {' '.join([str(f) for f in READ_FILES])} 2>>{LOG} | samtools sort --threads {cpus} -o $BAM --write-index - 2>>{LOG} # read coverage echo "{COV_HEADER}">{COV} diff --git a/src/fabfos/fabfos.py b/src/fabfos/fabfos.py index 2668621..973ef4a 100755 --- a/src/fabfos/fabfos.py +++ b/src/fabfos/fabfos.py @@ -28,7 +28,7 @@ from Bio import SeqIO from pathlib import Path from packaging import version - from .addons import EstimateFosmidPoolSize + from .addons import EstimateFosmidPoolSize, TrimBackbone from .external_qc import Fastqc, AssemblyStats except (ImportWarning, ModuleNotFoundError): @@ -2290,12 +2290,14 @@ def fabfos_main(sys_args): sample = Sample(sample_id) ghetto_sync_args(args, fos_father, sample) + # read qc sample.gather_reads(args.reads, args.reverse, args.parity, executables, sample.output_dir, args.type) Fastqc(out_path, [Path(r) for r in [sample.forward_reads, sample.reverse_reads] if r is not None]) qc_pe, qc_se = sample.qc_reads(args.background, args.parity, fos_father.adaptor_trim, executables, args.threads) qced_reads = qc_pe if len(qc_pe) > 0 else [qc_se[0]] - if args.pool_size is None: + # pool size estimate + if args.pool_size is None and args.vector: logging.info(f"Estimating fosmid pool size... ") size_estimate = EstimateFosmidPoolSize([Path(r) for r in qced_reads], Path(args.vector), Path(args.output).joinpath("temp_pool_size_estimate"), args.threads) if size_estimate is None: @@ -2322,20 +2324,24 @@ def fabfos_main(sys_args): if not Path(sample.assembled_fosmids).exists(): logging.error(f"assembly failed") sys.exit(1) + + # trim vector + if args.vector: + sample.assembled_fosmids = TrimBackbone(out_path, Path(args.vector), Path(sample.assembled_fosmids)) + fosmid_assembly = read_fasta(sample.assembled_fosmids) assembly_stats = get_assembly_stats(fosmid_assembly, sample.assembler) assembly_stats["est_n_fosmids"] = size_estimate # For mapping fosmid ends: if args.ends: - logging.info(f"Mapping end sequences... ") + logging.info(f"Mapping end sequences:\n") map_ends(executables, args.ends, sample) ends_mapping = parse_end_alignments(sample, fosmid_assembly, ends_stats) clone_map, multi_fosmid_map = assign_clones(ends_mapping, ends_stats, fosmid_assembly) clone_map = prune_and_scaffold_fosmids(sample, clone_map, multi_fosmid_map) final_contig_file = write_fosmid_assignments(sample, clone_map) write_fosmid_end_failures(sample, ends_stats) - logging.info(f"done.\n") else: # some function above does this if ends are provided and also adds end mappings to the headers final_contig_file = Path(sample.output_dir).joinpath(f"{sample.id}_contigs.fasta") diff --git a/src/fabfos/version.txt b/src/fabfos/version.txt index f88cf52..cd99d38 100644 --- a/src/fabfos/version.txt +++ b/src/fabfos/version.txt @@ -1 +1 @@ -1.13.0 \ No newline at end of file +1.14.0 \ No newline at end of file