Skip to content

Commit

Permalink
trim out vector backbone
Browse files Browse the repository at this point in the history
  • Loading branch information
Tony-xy-Liu committed Sep 18, 2023
1 parent 39b13d8 commit 1c6143b
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 6 deletions.
54 changes: 54 additions & 0 deletions src/fabfos/addons.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,64 @@

from Bio import SeqIO
from pathlib import Path
import logging

import subprocess
import os

def TrimBackbone(ws: Path, backbone: Path, contigs: Path):
OUT = ws.joinpath("temp_trim_vector"); os.makedirs(OUT, exist_ok=True)
logging.info("Removing vector backbone from contigs... ")
contigs = contigs.absolute()
backbone = backbone.absolute()
db = "raw_contigs"
log = "log.txt"
mapping = "vector_mapping.tsv"

os.system(f"""\
cd {OUT}
makeblastdb -in {contigs} -dbtype nucl -out {db} 1>>{log} 2>&1
blastn -db {db} -outfmt "6 sseqid pident length sstart send" -query {backbone} -perc_identity 85 -out ./{mapping} 1>>{log} 2>&1
""")

hits = {}
with open(OUT.joinpath(mapping)) as f:
for l in f:
id, _, length, start, end = l.split("\t")
hits[id] = hits.get(id, [])+[(int(start), int(end))]

cleaned_contigs = OUT.joinpath(f"no_vector.fa")
with open(cleaned_contigs, "w") as f:
original = SeqIO.parse(contigs, "fasta")
for entry in original:
if entry.id in hits:
# mark detected vector sequeces with x
_seq = list(entry.seq)
for s, e in hits[entry.id]:
for i in range(s-1, e):
_seq[i] = "x"
seqs = []
_curr = []
# remove vector and split contigs if need be
for ch in _seq:
if ch == "x":
if len(_curr)>0:
seqs.append("".join(_curr))
_curr.clear()
else:
_curr.append(ch)
seqs.append("".join(_curr))
seqs = [s for s in seqs if len(s)>0]
else:
seqs = [str(entry.seq)]

for i, seq in enumerate(seqs):
f.write(f">{entry.id}_{i:02} length={len(seq)}\n")
f.write(seq+"\n")

logging.info("done\n")
return str(cleaned_contigs)

def EstimateFosmidPoolSize(
reads: list[Path],
backbone: Path,
Expand Down
2 changes: 1 addition & 1 deletion src/fabfos/external_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def AssemblyStats(ws: Path, reads: list[Path], assembly: Path, cpus: int, paired
BAM=./temp.coverage.bam
# align reads to assembly
minimap2 -a {pe_params} --secondary=no {ASM} {' '.join([str(f) for f in READ_FILES])} 2>>{LOG} | samtools sort --threads {cpus} -o $BAM --write-index -
minimap2 -a {pe_params} --secondary=no {ASM} {' '.join([str(f) for f in READ_FILES])} 2>>{LOG} | samtools sort --threads {cpus} -o $BAM --write-index - 2>>{LOG}
# read coverage
echo "{COV_HEADER}">{COV}
Expand Down
14 changes: 10 additions & 4 deletions src/fabfos/fabfos.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from Bio import SeqIO
from pathlib import Path
from packaging import version
from .addons import EstimateFosmidPoolSize
from .addons import EstimateFosmidPoolSize, TrimBackbone
from .external_qc import Fastqc, AssemblyStats

except (ImportWarning, ModuleNotFoundError):
Expand Down Expand Up @@ -2290,12 +2290,14 @@ def fabfos_main(sys_args):
sample = Sample(sample_id)
ghetto_sync_args(args, fos_father, sample)

# read qc
sample.gather_reads(args.reads, args.reverse, args.parity, executables, sample.output_dir, args.type)
Fastqc(out_path, [Path(r) for r in [sample.forward_reads, sample.reverse_reads] if r is not None])
qc_pe, qc_se = sample.qc_reads(args.background, args.parity, fos_father.adaptor_trim, executables, args.threads)
qced_reads = qc_pe if len(qc_pe) > 0 else [qc_se[0]]

if args.pool_size is None:
# pool size estimate
if args.pool_size is None and args.vector:
logging.info(f"Estimating fosmid pool size... ")
size_estimate = EstimateFosmidPoolSize([Path(r) for r in qced_reads], Path(args.vector), Path(args.output).joinpath("temp_pool_size_estimate"), args.threads)
if size_estimate is None:
Expand All @@ -2322,20 +2324,24 @@ def fabfos_main(sys_args):
if not Path(sample.assembled_fosmids).exists():
logging.error(f"assembly failed")
sys.exit(1)

# trim vector
if args.vector:
sample.assembled_fosmids = TrimBackbone(out_path, Path(args.vector), Path(sample.assembled_fosmids))

fosmid_assembly = read_fasta(sample.assembled_fosmids)
assembly_stats = get_assembly_stats(fosmid_assembly, sample.assembler)
assembly_stats["est_n_fosmids"] = size_estimate

# For mapping fosmid ends:
if args.ends:
logging.info(f"Mapping end sequences... ")
logging.info(f"Mapping end sequences:\n")
map_ends(executables, args.ends, sample)
ends_mapping = parse_end_alignments(sample, fosmid_assembly, ends_stats)
clone_map, multi_fosmid_map = assign_clones(ends_mapping, ends_stats, fosmid_assembly)
clone_map = prune_and_scaffold_fosmids(sample, clone_map, multi_fosmid_map)
final_contig_file = write_fosmid_assignments(sample, clone_map)
write_fosmid_end_failures(sample, ends_stats)
logging.info(f"done.\n")
else:
# some function above does this if ends are provided and also adds end mappings to the headers
final_contig_file = Path(sample.output_dir).joinpath(f"{sample.id}_contigs.fasta")
Expand Down
2 changes: 1 addition & 1 deletion src/fabfos/version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.13.0
1.14.0

0 comments on commit 1c6143b

Please sign in to comment.