Skip to content

Commit

Permalink
Merge pull request #4 from Tony-xy-Liu/fix_endmapping
Browse files Browse the repository at this point in the history
check implementation
  • Loading branch information
Tony-xy-Liu authored Sep 21, 2023
2 parents e5d4dcd + c837b71 commit 66d17e0
Show file tree
Hide file tree
Showing 7 changed files with 263 additions and 76 deletions.
9 changes: 6 additions & 3 deletions dev.sh
Original file line number Diff line number Diff line change
Expand Up @@ -133,15 +133,18 @@ case $1 in
python -m $NAME \
--overwrite \
--threads 12 \
--output ./no_miffed_test \
--output ./manual_test \
--assembler megahit \
-i --reads ./beaver_cecum_2ndhits/EKL/Raw_Data/EKL_Cecum_ligninases_pool_secondary_hits_ss01.fastq \
-i --reads ./EKL_Cecum_ligninases_pool_secondary_hits_ss01.fastq \
-b ./ecoli_k12_mg1655.fasta \
--ends ./beaver_cecum_2ndhits/endseqs.fasta \
--ends ./endseqs_cec.fasta \
--ends-name-regex "\\w+_\\d+" \
--ends-fw-flag "FW" \
--vector ./pcc1.fasta

# --reads ./beaver_cecum_2ndhits/EKL/Raw_Data/EKL_Cecum_ligninases_pool_secondary_hits_ss01.fastq \
# --parity se \

# --pool-size 20

# -i --reads ./beaver_cecum_2ndhits/EKL/Raw_Data/EKL_Cecum_ligninases_pool_secondary_hits_ss10.fastq \
Expand Down
6 changes: 5 additions & 1 deletion envs/base.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ channels:
- bioconda
- conda-forge
dependencies:
- python=3.11
- python=3.10
- packaging=21.3
- bwa=0.7
- samtools=1.15
Expand All @@ -15,3 +15,7 @@ dependencies:
- canu=2.2
- biopython=1.81
- vsearch=2.23
- fastqc
- minimap2
- bedtools
- quast
62 changes: 62 additions & 0 deletions src/fabfos/addons.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,72 @@

from Bio import SeqIO
from pathlib import Path
import logging

import subprocess
import os

def TrimBackbone(ws: Path, backbone: Path, contigs: Path):
OUT = ws.joinpath("temp_trim_vector"); os.makedirs(OUT, exist_ok=True)
logging.info("Removing vector backbone from contigs... ")
contigs = contigs.absolute()
backbone = backbone.absolute()
db = "raw_contigs"
log = "log.txt"
mapping = "vector_mapping.tsv"

os.system(f"""\
cd {OUT}
makeblastdb -in {contigs} -dbtype nucl -out {db} 1>>{log} 2>&1
blastn -db {db} -outfmt "6 sseqid pident length sstart send" -query {backbone} -perc_identity 85 -out ./{mapping} 1>>{log} 2>&1
""")

hits = {}
with open(OUT.joinpath(mapping)) as f:
for l in f:
id, _, length, start, end = l.split("\t")
hits[id] = hits.get(id, [])+[(int(start), int(end))]

cleaned_contigs = OUT.joinpath(f"no_vector.fa")
with open(cleaned_contigs, "w") as f:
original = SeqIO.parse(contigs, "fasta")
for entry in original:
if entry.id in hits:
# mark detected vector sequeces with x
_seq = list(entry.seq)
for s, e in hits[entry.id]:
for i in range(s-1, e):
_seq[i] = "x"
seqs = []
_curr = []
# remove vector and split contigs if need be
for ch in _seq:
if ch == "x":
if len(_curr)>0:
seqs.append("".join(_curr))
_curr.clear()
else:
_curr.append(ch)
seqs.append("".join(_curr))
seqs = [s for s in seqs if len(s)>0]
else:
seqs = [str(entry.seq)]

for i, seq in enumerate(seqs):
f.write(f">{entry.id}_{i:02} length={len(seq)}\n")
f.write(seq+"\n")

logging.info("done\n")
return str(cleaned_contigs)

def FilterMinLength(contigs: Path, min_length: int):
logging.info(f"Filtering contigs to be at least {min_length}bp... ")
original = SeqIO.parse(contigs, "fasta")
filtered_contigs_path = contigs.parent.joinpath(f"contigs_{min_length}.fa")
SeqIO.write((e for e in original if len(e.seq)>=min_length), filtered_contigs_path, "fasta")
logging.info("done\n")
return filtered_contigs_path

def EstimateFosmidPoolSize(
reads: list[Path],
backbone: Path,
Expand Down
2 changes: 2 additions & 0 deletions src/fabfos/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,11 @@
# copyright 2023 Tony Liu, Connor Morgan-Lang, Avery Noonan,
# Zach Armstrong, and Steven J. Hallam


import os, sys
from .fabfos import fabfos_main

# this is just an entry point
def main():
try:
fabfos_main(sys.argv[1:])
Expand Down
82 changes: 82 additions & 0 deletions src/fabfos/external_qc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import os, sys
from pathlib import Path
import logging

def _read_counts(ws: Path, read_file: Path):
#https://bioinformatics.stackexchange.com/questions/935/fast-way-to-count-number-of-reads-and-number-of-bases-in-a-fastq-file
read_sizes = ws.joinpath("temp.readcount.txt")
os.system(f"""\
cat {read_file} \
| awk 'NR % 4 == 2' \
| wc -cl >{read_sizes} \
""")

num_reads, nucleotides = 0, 0
with open(read_sizes) as f:
toks = f.readline()[:-1].strip()
if "\t" in toks: toks = toks.split("\t")
else: toks = [t for t in toks.split(" ") if len(t)>0]
num_reads, nucleotides = toks
os.unlink(read_sizes)

return int(nucleotides), int(num_reads)

# runs fastqc on reads
def Fastqc(ws: Path, reads: list[Path]):
logging.info("Running Fastqc...")
OUT = ws.joinpath("fastqc"); os.makedirs(OUT, exist_ok=True)
LOG = OUT.joinpath("log.txt")
os.system(f"""\
fastqc -o {OUT} {" ".join(str(r) for r in reads)} 1>{LOG} 2>{LOG}
""")
logging.info("done.\n")
return OUT

# gets read coverage of contigs and various assembly stats
def AssemblyStats(ws: Path, reads: list[Path], assembly: Path, cpus: int, paired_end=True):
logging.info("Calculating assembly statistics...")
LOG = "log.txt"
OUT = ws.joinpath("temp_assembly_stats"); os.makedirs(OUT, exist_ok=True)
READ_FILES = [r.absolute() for r in reads]
ASM = assembly.absolute()
nucleotides, read_count = 0, 0
for r in READ_FILES:
_n, _c = _read_counts(OUT, r)
nucleotides+=_n
read_count+=_c

pe_params = "-x sr" if paired_end else ""

COV = f"./assembly_coverage.tsv"
COV_HEADER = "\t".join("contig, start, end, coverage".split(", "))
os.system(f"""\
cd {OUT}
BAM=./temp.coverage.bam
# align reads to assembly
minimap2 -a {pe_params} --secondary=no {ASM} {' '.join([str(f) for f in READ_FILES])} 2>>{LOG} | samtools sort --threads {cpus} -o $BAM --write-index - 2>>{LOG}
# read coverage
echo "{COV_HEADER}">{COV}
bedtools genomecov -ibam $BAM -bg >>{COV}
mv {COV} ../
samtools flagstat $BAM >./stats.txt
# quast for asm stats
quast -t {cpus} -o ./quast {ASM} 1>>{LOG}
mv ./quast ../
""")
# with open(OUT.joinpath(COV)) as f:
# with open(ws.joinpath(COV), "w") as out:
# out.write("\t".join("contig, start, end, coverage".split(", ")))
# out.writelines(f.readlines())
# for l in f:
# # out.wr
# # con, s, e, cov = l.split("\t")
# # con = int(con.split("_")[-1])
# # con = f"{con:04}"
# # out.write("\t".join([con, s, e, cov]))
# os.unlink(OUT.joinpath(COV))

logging.info("done.\n")
return OUT
Loading

0 comments on commit 66d17e0

Please sign in to comment.