From b9fdd20d5c93b35888d5e259743acffd2b33b36f Mon Sep 17 00:00:00 2001 From: Art Poon Date: Mon, 24 Feb 2020 21:11:34 -0500 Subject: [PATCH] allow user to set non-default JSON projects file --- .gitignore | 4 +++ bin/micall | 6 ++-- micall/core/aln2counts.py | 45 ++++++++++++++++++++--------- micall/core/prelim_map.py | 12 ++++++-- micall/core/remap.py | 9 ++++-- micall/tests/aln2counts_test.py | 16 +++++----- micall/tests/censor_fastq_test.py | 2 +- micall/tests/coverage_plots_test.py | 2 +- micall/utils/coverage.R | 1 + 9 files changed, 67 insertions(+), 30 deletions(-) diff --git a/.gitignore b/.gitignore index 260965d72..240976a78 100755 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,7 @@ .DS_Store */.DS_Store *.log +examples/ +*.pdf +*.csv + diff --git a/bin/micall b/bin/micall index 8bba90351..15ebd664f 100755 --- a/bin/micall +++ b/bin/micall @@ -53,9 +53,12 @@ def parseArgs(): help=" Path to bowtie2 script.") parser.add_argument('--bt2build', default='bowtie2-build-s', help=" Path to bowtie2-build script.") - parser.add_argument('--threads', '-p', type=int, default=4, + parser.add_argument('--threads', '-t', type=int, default=4, help="Number of threads for bowtie2 (default 4)") + parser.add_argument('--projects', '-p', type=argparse.FileType('rb'), required=False, + help=' Specify a custom projects JSON file.') + if len(sys.argv) == 1: parser.print_help() sys.exit() @@ -200,7 +203,6 @@ if __name__ == '__main__': print("Using {} version {}".format(bowtie2.path, bowtie2.version)) - #print(args) if args.outdir is None: # default write outputs to same location as inputs if args.fastq1: diff --git a/micall/core/aln2counts.py b/micall/core/aln2counts.py index fc9625f53..1f900516f 100755 --- a/micall/core/aln2counts.py +++ b/micall/core/aln2counts.py @@ -487,8 +487,17 @@ def _create_consensus_writer(self, conseq_file): def write_consensus_header(self, conseq_file): self._create_consensus_writer(conseq_file).writeheader() - def write_consensus(self, conseq_file): + def write_consensus(self, conseq_file, min_coverage=100): + """ + Generate nucleotide consensus sequences at varying mixture + cutoffs, and write to user-specified file. + :param conseq_file: csv.DictWriter object + :param min_coverage: depth of coverage below which the nucleotide + will be reported in lower-case. + :return: + """ conseq_writer = self._create_consensus_writer(conseq_file) + for mixture_cutoff in self.conseq_mixture_cutoffs: consensus = '' offset = None @@ -496,16 +505,21 @@ def write_consensus(self, conseq_file): if offset is None: if not seed_amino.counts: continue - offset = seed_amino.consensus_index*3 + offset = seed_amino.consensus_index * 3 + for seed_nuc in seed_amino.nucleotides: - consensus += seed_nuc.get_consensus(mixture_cutoff) + cnuc = seed_nuc.get_consensus(mixture_cutoff) + coverage = sum(seed_nuc.counts.values()) + consensus += cnuc.upper() if (coverage >= min_coverage) else cnuc.lower() + if offset is not None: - conseq_writer.writerow( - {'region': self.seed, - 'q-cutoff': self.qcut, - 'consensus-percent-cutoff': format_cutoff(mixture_cutoff), - 'offset': offset, - 'sequence': consensus}) + conseq_writer.writerow({ + 'region': self.seed, + 'q-cutoff': self.qcut, + 'consensus-percent-cutoff': format_cutoff(mixture_cutoff), + 'offset': offset, + 'sequence': consensus + }) def _create_nuc_variants_writer(self, nuc_variants_file): return csv.DictWriter(nuc_variants_file, @@ -813,7 +827,8 @@ def aln2counts(aligned_csv, failed_align_csv=None, nuc_variants_csv=None, callback=None, - coverage_summary_csv=None): + coverage_summary_csv=None, + json=None): """ Analyze aligned reads for nucleotide and amino acid frequencies. Generate consensus sequences. @@ -827,11 +842,15 @@ def aln2counts(aligned_csv, @param nuc_variants_csv: Open file handle to write the most frequent nucleotide sequence variants. @param callback: a function to report progress with three optional - parameters - callback(message, progress, max_progress) - @param coverage_summary_csv Open file handle to write coverage depth. + parameters - callback(message, progress, max_progress) + @param coverage_summary_csv: Open file handle to write coverage depth. + @param json: specify a custom JSON project file; None loads the default file. """ # load project information - projects = project_config.ProjectConfig.loadDefault() + if json is None: + projects = project_config.ProjectConfig.loadDefault() + else: + projects = project_config.ProjectConfig.load(json) # initialize reporter classes insert_writer = InsertionWriter(coord_ins_csv) diff --git a/micall/core/prelim_map.py b/micall/core/prelim_map.py index 44a4201a9..19ee87e7d 100755 --- a/micall/core/prelim_map.py +++ b/micall/core/prelim_map.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python +#! /usr/bin/env python3 """ Shipyard-style bowtie2 @@ -37,7 +37,7 @@ def prelim_map(fastq1, fastq2, prelim_csv, bt2_path='bowtie2', bt2build_path='bowtie2-build-s', nthreads=BOWTIE_THREADS, callback=None, rdgopen=READ_GAP_OPEN, rfgopen=REF_GAP_OPEN, stderr=sys.stderr, - gzip=False, work_path='', keep=False): + gzip=False, work_path='', keep=False, json=None): """ Run the preliminary mapping step. @param fastq1: the file name for the forward reads in FASTQ format @@ -52,6 +52,8 @@ def prelim_map(fastq1, fastq2, prelim_csv, @param stderr: where to write the standard error output from bowtie2 calls. @param gzip: if True, FASTQ files are compressed @param work_path: optional path to store working files + @param keep: if False, delete temporary files + @param json: specify a custom JSON project file; None loads the default file. """ bowtie2 = Bowtie2(execname=bt2_path) @@ -92,7 +94,11 @@ def prelim_map(fastq1, fastq2, prelim_csv, max_progress=total_reads) # generate initial reference files - projects = project_config.ProjectConfig.loadDefault() + if json is None: + projects = project_config.ProjectConfig.loadDefault() + else: + projects = project_config.ProjectConfig.load(json) + ref_path = os.path.join(work_path, 'micall.fasta') with open(ref_path, 'w') as ref: projects.writeSeedFasta(ref) diff --git a/micall/core/remap.py b/micall/core/remap.py index dda2c7726..026709692 100644 --- a/micall/core/remap.py +++ b/micall/core/remap.py @@ -383,7 +383,7 @@ def remap(fastq1, fastq2, prelim_csv, remap_csv, remap_counts_csv=None, bt2_path='bowtie2', bt2build_path='bowtie2-build-s', nthreads=BOWTIE_THREADS, callback=None, count_threshold=10, rdgopen=READ_GAP_OPEN, rfgopen=REF_GAP_OPEN, stderr=sys.stderr, - gzip=False, debug_file_prefix=None, keep=False): + gzip=False, debug_file_prefix=None, keep=False, json=None): """ Iterative re-map reads from raw paired FASTQ files to a reference sequence set that is being updated as the consensus of the reads that were mapped to the last set. @@ -404,6 +404,7 @@ def remap(fastq1, fastq2, prelim_csv, remap_csv, remap_counts_csv=None, @param count_threshold: minimum number of reads that map to a region for it to be remapped @param rdgopen: read gap open penalty @param rfgopen: reference gap open penalty + @param json: specify a custom JSON project file; None loads the default file. """ reffile = os.path.join(work_path, 'temp.fasta') @@ -441,7 +442,11 @@ def remap(fastq1, fastq2, prelim_csv, remap_csv, remap_counts_csv=None, worker_pool = multiprocessing.Pool(processes=nthreads) if nthreads > 1 else None # retrieve reference sequences used for preliminary mapping - projects = project_config.ProjectConfig.loadDefault() + if json is None: + projects = project_config.ProjectConfig.loadDefault() + else: + projects = project_config.ProjectConfig.load(json) + seeds = {} for seed, vals in projects.config['regions'].items(): seqs = vals['reference'] diff --git a/micall/tests/aln2counts_test.py b/micall/tests/aln2counts_test.py index 0f7aaefe8..32b749bd3 100644 --- a/micall/tests/aln2counts_test.py +++ b/micall/tests/aln2counts_test.py @@ -1,5 +1,5 @@ import csv -import StringIO +from io import StringIO import sys import unittest @@ -30,7 +30,7 @@ def add_override(self, query, aligned_query, aligned_reference, - score=sys.maxint): + score=sys.maxsize): self.overrides[(reference, query)] = (aligned_reference, aligned_query, score) @@ -38,7 +38,7 @@ def add_override(self, class SequenceReportTest(unittest.TestCase): def setUp(self): - self.insertion_file = StringIO.StringIO() + self.insertion_file = StringIO() insert_writer = InsertionWriter( insert_file=self.insertion_file) projects = project_config.ProjectConfig() @@ -46,7 +46,7 @@ def setUp(self): # Content of seed regions is irrelevant. For R-NO-COORD, there is # no coordinate reference, so we use the seed reference for display, but # only the length matters. - projects.load(StringIO.StringIO("""\ + projects.load(StringIO("""\ { "projects": { "R1": { @@ -148,11 +148,11 @@ def setUp(self): self.report = StubbedSequenceReport(insert_writer, projects, conseq_mixture_cutoffs) - self.report_file = StringIO.StringIO() + self.report_file = StringIO() def prepareReads(self, aligned_reads_text): full_text = "refname,qcut,rank,count,offset,seq\n" + aligned_reads_text - dummy_file = StringIO.StringIO(full_text) + dummy_file = StringIO(full_text) return csv.DictReader(dummy_file) def testEmptyAminoReport(self): @@ -643,7 +643,7 @@ def testMultipleCoordinateInsertionReport(self): """ Two coordinate regions map the same seed region, the consensus has an insertion relative to only one of them. """ - self.report.projects.load(StringIO.StringIO("""\ + self.report.projects.load(StringIO("""\ { "projects": { "R3": { @@ -788,7 +788,7 @@ def testGoodAlignmentWithGiantSeed(self): Even when the consensus maps to the end of the seed, it should still only require a low alignment score. """ - self.report.projects.load(StringIO.StringIO("""\ + self.report.projects.load(StringIO("""\ { "projects": { "R3": { diff --git a/micall/tests/censor_fastq_test.py b/micall/tests/censor_fastq_test.py index 820bb1dcf..e4816a3bd 100644 --- a/micall/tests/censor_fastq_test.py +++ b/micall/tests/censor_fastq_test.py @@ -1,4 +1,4 @@ -import StringIO +from io import StringIO import unittest from micall.core.censor_fastq import censor diff --git a/micall/tests/coverage_plots_test.py b/micall/tests/coverage_plots_test.py index a5a578047..5c9da8e21 100644 --- a/micall/tests/coverage_plots_test.py +++ b/micall/tests/coverage_plots_test.py @@ -1,6 +1,6 @@ from mock import patch, call from unittest import TestCase -from StringIO import StringIO +from io import StringIO from micall.utils.coverage_plots import coverage_plot from micall.core.project_config import ProjectConfig diff --git a/micall/utils/coverage.R b/micall/utils/coverage.R index fad1b9ca9..806c2e5a3 100755 --- a/micall/utils/coverage.R +++ b/micall/utils/coverage.R @@ -6,6 +6,7 @@ if (length(args) != 2) { input.csv <- args[1] out.prefix <- args[2] + df <- read.csv(input.csv) # guess if this is a nuc or amino CSV