Skip to content

Commit

Permalink
allow user to set non-default JSON projects file
Browse files Browse the repository at this point in the history
  • Loading branch information
ArtPoon committed Feb 25, 2020
1 parent c01ec25 commit b9fdd20
Show file tree
Hide file tree
Showing 9 changed files with 67 additions and 30 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,7 @@
.DS_Store
*/.DS_Store
*.log
examples/
*.pdf
*.csv

6 changes: 4 additions & 2 deletions bin/micall
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,12 @@ def parseArgs():
help="<optional> Path to bowtie2 script.")
parser.add_argument('--bt2build', default='bowtie2-build-s',
help="<optional> Path to bowtie2-build script.")
parser.add_argument('--threads', '-p', type=int, default=4,
parser.add_argument('--threads', '-t', type=int, default=4,
help="Number of threads for bowtie2 (default 4)")

parser.add_argument('--projects', '-p', type=argparse.FileType('rb'), required=False,
help='<optional> Specify a custom projects JSON file.')

if len(sys.argv) == 1:
parser.print_help()
sys.exit()
Expand Down Expand Up @@ -200,7 +203,6 @@ if __name__ == '__main__':

print("Using {} version {}".format(bowtie2.path, bowtie2.version))

#print(args)
if args.outdir is None:
# default write outputs to same location as inputs
if args.fastq1:
Expand Down
45 changes: 32 additions & 13 deletions micall/core/aln2counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,25 +487,39 @@ def _create_consensus_writer(self, conseq_file):
def write_consensus_header(self, conseq_file):
self._create_consensus_writer(conseq_file).writeheader()

def write_consensus(self, conseq_file):
def write_consensus(self, conseq_file, min_coverage=100):
"""
Generate nucleotide consensus sequences at varying mixture
cutoffs, and write to user-specified file.
:param conseq_file: csv.DictWriter object
:param min_coverage: depth of coverage below which the nucleotide
will be reported in lower-case.
:return:
"""
conseq_writer = self._create_consensus_writer(conseq_file)

for mixture_cutoff in self.conseq_mixture_cutoffs:
consensus = ''
offset = None
for seed_amino in self.seed_aminos[0]:
if offset is None:
if not seed_amino.counts:
continue
offset = seed_amino.consensus_index*3
offset = seed_amino.consensus_index * 3

for seed_nuc in seed_amino.nucleotides:
consensus += seed_nuc.get_consensus(mixture_cutoff)
cnuc = seed_nuc.get_consensus(mixture_cutoff)
coverage = sum(seed_nuc.counts.values())
consensus += cnuc.upper() if (coverage >= min_coverage) else cnuc.lower()

if offset is not None:
conseq_writer.writerow(
{'region': self.seed,
'q-cutoff': self.qcut,
'consensus-percent-cutoff': format_cutoff(mixture_cutoff),
'offset': offset,
'sequence': consensus})
conseq_writer.writerow({
'region': self.seed,
'q-cutoff': self.qcut,
'consensus-percent-cutoff': format_cutoff(mixture_cutoff),
'offset': offset,
'sequence': consensus
})

def _create_nuc_variants_writer(self, nuc_variants_file):
return csv.DictWriter(nuc_variants_file,
Expand Down Expand Up @@ -813,7 +827,8 @@ def aln2counts(aligned_csv,
failed_align_csv=None,
nuc_variants_csv=None,
callback=None,
coverage_summary_csv=None):
coverage_summary_csv=None,
json=None):
"""
Analyze aligned reads for nucleotide and amino acid frequencies.
Generate consensus sequences.
Expand All @@ -827,11 +842,15 @@ def aln2counts(aligned_csv,
@param nuc_variants_csv: Open file handle to write the most frequent nucleotide sequence
variants.
@param callback: a function to report progress with three optional
parameters - callback(message, progress, max_progress)
@param coverage_summary_csv Open file handle to write coverage depth.
parameters - callback(message, progress, max_progress)
@param coverage_summary_csv: Open file handle to write coverage depth.
@param json: specify a custom JSON project file; None loads the default file.
"""
# load project information
projects = project_config.ProjectConfig.loadDefault()
if json is None:
projects = project_config.ProjectConfig.loadDefault()
else:
projects = project_config.ProjectConfig.load(json)

# initialize reporter classes
insert_writer = InsertionWriter(coord_ins_csv)
Expand Down
12 changes: 9 additions & 3 deletions micall/core/prelim_map.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#! /usr/bin/env python
#! /usr/bin/env python3

"""
Shipyard-style bowtie2
Expand Down Expand Up @@ -37,7 +37,7 @@ def prelim_map(fastq1, fastq2, prelim_csv,
bt2_path='bowtie2', bt2build_path='bowtie2-build-s',
nthreads=BOWTIE_THREADS, callback=None,
rdgopen=READ_GAP_OPEN, rfgopen=REF_GAP_OPEN, stderr=sys.stderr,
gzip=False, work_path='', keep=False):
gzip=False, work_path='', keep=False, json=None):
""" Run the preliminary mapping step.
@param fastq1: the file name for the forward reads in FASTQ format
Expand All @@ -52,6 +52,8 @@ def prelim_map(fastq1, fastq2, prelim_csv,
@param stderr: where to write the standard error output from bowtie2 calls.
@param gzip: if True, FASTQ files are compressed
@param work_path: optional path to store working files
@param keep: if False, delete temporary files
@param json: specify a custom JSON project file; None loads the default file.
"""

bowtie2 = Bowtie2(execname=bt2_path)
Expand Down Expand Up @@ -92,7 +94,11 @@ def prelim_map(fastq1, fastq2, prelim_csv,
max_progress=total_reads)

# generate initial reference files
projects = project_config.ProjectConfig.loadDefault()
if json is None:
projects = project_config.ProjectConfig.loadDefault()
else:
projects = project_config.ProjectConfig.load(json)

ref_path = os.path.join(work_path, 'micall.fasta')
with open(ref_path, 'w') as ref:
projects.writeSeedFasta(ref)
Expand Down
9 changes: 7 additions & 2 deletions micall/core/remap.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,7 @@ def remap(fastq1, fastq2, prelim_csv, remap_csv, remap_counts_csv=None,
bt2_path='bowtie2', bt2build_path='bowtie2-build-s',
nthreads=BOWTIE_THREADS, callback=None, count_threshold=10,
rdgopen=READ_GAP_OPEN, rfgopen=REF_GAP_OPEN, stderr=sys.stderr,
gzip=False, debug_file_prefix=None, keep=False):
gzip=False, debug_file_prefix=None, keep=False, json=None):
"""
Iterative re-map reads from raw paired FASTQ files to a reference sequence set that
is being updated as the consensus of the reads that were mapped to the last set.
Expand All @@ -404,6 +404,7 @@ def remap(fastq1, fastq2, prelim_csv, remap_csv, remap_counts_csv=None,
@param count_threshold: minimum number of reads that map to a region for it to be remapped
@param rdgopen: read gap open penalty
@param rfgopen: reference gap open penalty
@param json: specify a custom JSON project file; None loads the default file.
"""

reffile = os.path.join(work_path, 'temp.fasta')
Expand Down Expand Up @@ -441,7 +442,11 @@ def remap(fastq1, fastq2, prelim_csv, remap_csv, remap_counts_csv=None,
worker_pool = multiprocessing.Pool(processes=nthreads) if nthreads > 1 else None

# retrieve reference sequences used for preliminary mapping
projects = project_config.ProjectConfig.loadDefault()
if json is None:
projects = project_config.ProjectConfig.loadDefault()
else:
projects = project_config.ProjectConfig.load(json)

seeds = {}
for seed, vals in projects.config['regions'].items():
seqs = vals['reference']
Expand Down
16 changes: 8 additions & 8 deletions micall/tests/aln2counts_test.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import csv
import StringIO
from io import StringIO
import sys
import unittest

Expand Down Expand Up @@ -30,23 +30,23 @@ def add_override(self,
query,
aligned_query,
aligned_reference,
score=sys.maxint):
score=sys.maxsize):
self.overrides[(reference, query)] = (aligned_reference,
aligned_query,
score)


class SequenceReportTest(unittest.TestCase):
def setUp(self):
self.insertion_file = StringIO.StringIO()
self.insertion_file = StringIO()
insert_writer = InsertionWriter(
insert_file=self.insertion_file)
projects = project_config.ProjectConfig()

# Content of seed regions is irrelevant. For R-NO-COORD, there is
# no coordinate reference, so we use the seed reference for display, but
# only the length matters.
projects.load(StringIO.StringIO("""\
projects.load(StringIO("""\
{
"projects": {
"R1": {
Expand Down Expand Up @@ -148,11 +148,11 @@ def setUp(self):
self.report = StubbedSequenceReport(insert_writer,
projects,
conseq_mixture_cutoffs)
self.report_file = StringIO.StringIO()
self.report_file = StringIO()

def prepareReads(self, aligned_reads_text):
full_text = "refname,qcut,rank,count,offset,seq\n" + aligned_reads_text
dummy_file = StringIO.StringIO(full_text)
dummy_file = StringIO(full_text)
return csv.DictReader(dummy_file)

def testEmptyAminoReport(self):
Expand Down Expand Up @@ -643,7 +643,7 @@ def testMultipleCoordinateInsertionReport(self):
""" Two coordinate regions map the same seed region, the consensus
has an insertion relative to only one of them.
"""
self.report.projects.load(StringIO.StringIO("""\
self.report.projects.load(StringIO("""\
{
"projects": {
"R3": {
Expand Down Expand Up @@ -788,7 +788,7 @@ def testGoodAlignmentWithGiantSeed(self):
Even when the consensus maps to the end of the seed, it should still
only require a low alignment score.
"""
self.report.projects.load(StringIO.StringIO("""\
self.report.projects.load(StringIO("""\
{
"projects": {
"R3": {
Expand Down
2 changes: 1 addition & 1 deletion micall/tests/censor_fastq_test.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import StringIO
from io import StringIO
import unittest

from micall.core.censor_fastq import censor
Expand Down
2 changes: 1 addition & 1 deletion micall/tests/coverage_plots_test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from mock import patch, call
from unittest import TestCase
from StringIO import StringIO
from io import StringIO
from micall.utils.coverage_plots import coverage_plot
from micall.core.project_config import ProjectConfig

Expand Down
1 change: 1 addition & 0 deletions micall/utils/coverage.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ if (length(args) != 2) {
input.csv <- args[1]
out.prefix <- args[2]


df <- read.csv(input.csv)

# guess if this is a nuc or amino CSV
Expand Down

0 comments on commit b9fdd20

Please sign in to comment.