allow user to set non-default JSON projects file

PoonLab · Feb 25, 2020 · b9fdd20 · b9fdd20
1 parent c01ec25
commit b9fdd20
Show file tree

Hide file tree

Showing 9 changed files with 67 additions and 30 deletions.
diff --git a/.gitignore b/.gitignore
@@ -16,3 +16,7 @@
 .DS_Store
 */.DS_Store
 *.log
+examples/
+*.pdf
+*.csv
+
diff --git a/bin/micall b/bin/micall
@@ -53,9 +53,12 @@ def parseArgs():
                         help="<optional> Path to bowtie2 script.")
     parser.add_argument('--bt2build', default='bowtie2-build-s',
                         help="<optional> Path to bowtie2-build script.")
-    parser.add_argument('--threads', '-p', type=int, default=4,
+    parser.add_argument('--threads', '-t', type=int, default=4,
                         help="Number of threads for bowtie2 (default 4)")
 
+    parser.add_argument('--projects', '-p', type=argparse.FileType('rb'), required=False,
+                        help='<optional> Specify a custom projects JSON file.')
+
     if len(sys.argv) == 1:
         parser.print_help()
         sys.exit()
@@ -200,7 +203,6 @@ if __name__ == '__main__':
 
     print("Using {} version {}".format(bowtie2.path, bowtie2.version))
 
-    #print(args)
     if args.outdir is None:
         # default write outputs to same location as inputs
         if args.fastq1:

diff --git a/micall/core/aln2counts.py b/micall/core/aln2counts.py
@@ -487,25 +487,39 @@ def _create_consensus_writer(self, conseq_file):
     def write_consensus_header(self, conseq_file):
         self._create_consensus_writer(conseq_file).writeheader()
 
-    def write_consensus(self, conseq_file):
+    def write_consensus(self, conseq_file, min_coverage=100):
+        """
+        Generate nucleotide consensus sequences at varying mixture
+        cutoffs, and write to user-specified file.
+        :param conseq_file: csv.DictWriter object
+        :param min_coverage: depth of coverage below which the nucleotide
+        will be reported in lower-case.
+        :return:
+        """
         conseq_writer = self._create_consensus_writer(conseq_file)
+
         for mixture_cutoff in self.conseq_mixture_cutoffs:
             consensus = ''
             offset = None
             for seed_amino in self.seed_aminos[0]:
                 if offset is None:
                     if not seed_amino.counts:
                         continue
-                    offset = seed_amino.consensus_index*3
+                    offset = seed_amino.consensus_index * 3
+
                 for seed_nuc in seed_amino.nucleotides:
-                    consensus += seed_nuc.get_consensus(mixture_cutoff)
+                    cnuc = seed_nuc.get_consensus(mixture_cutoff)
+                    coverage = sum(seed_nuc.counts.values())
+                    consensus += cnuc.upper() if (coverage >= min_coverage) else cnuc.lower()
+
             if offset is not None:
-                conseq_writer.writerow(
-                    {'region': self.seed,
-                     'q-cutoff': self.qcut,
-                     'consensus-percent-cutoff': format_cutoff(mixture_cutoff),
-                     'offset': offset,
-                     'sequence': consensus})
+                conseq_writer.writerow({
+                    'region': self.seed,
+                    'q-cutoff': self.qcut,
+                    'consensus-percent-cutoff': format_cutoff(mixture_cutoff),
+                    'offset': offset,
+                    'sequence': consensus
+                })
 
     def _create_nuc_variants_writer(self, nuc_variants_file):
         return csv.DictWriter(nuc_variants_file,
@@ -813,7 +827,8 @@ def aln2counts(aligned_csv,
                failed_align_csv=None,
                nuc_variants_csv=None,
                callback=None,
-               coverage_summary_csv=None):
+               coverage_summary_csv=None,
+               json=None):
     """
     Analyze aligned reads for nucleotide and amino acid frequencies.
     Generate consensus sequences.
@@ -827,11 +842,15 @@ def aln2counts(aligned_csv,
     @param nuc_variants_csv:    Open file handle to write the most frequent nucleotide sequence
                                 variants.
     @param callback: a function to report progress with three optional
-        parameters - callback(message, progress, max_progress)
-    @param coverage_summary_csv Open file handle to write coverage depth.
+                     parameters - callback(message, progress, max_progress)
+    @param coverage_summary_csv:  Open file handle to write coverage depth.
+    @param json:  specify a custom JSON project file; None loads the default file.
     """
     # load project information
-    projects = project_config.ProjectConfig.loadDefault()
+    if json is None:
+        projects = project_config.ProjectConfig.loadDefault()
+    else:
+        projects = project_config.ProjectConfig.load(json)
 
     # initialize reporter classes
     insert_writer = InsertionWriter(coord_ins_csv)

diff --git a/micall/core/prelim_map.py b/micall/core/prelim_map.py
@@ -1,4 +1,4 @@
-#! /usr/bin/env python
+#! /usr/bin/env python3
 
 """
 Shipyard-style bowtie2
@@ -37,7 +37,7 @@ def prelim_map(fastq1, fastq2, prelim_csv,
                bt2_path='bowtie2', bt2build_path='bowtie2-build-s',
                nthreads=BOWTIE_THREADS, callback=None,
                rdgopen=READ_GAP_OPEN, rfgopen=REF_GAP_OPEN, stderr=sys.stderr,
-               gzip=False, work_path='', keep=False):
+               gzip=False, work_path='', keep=False, json=None):
     """ Run the preliminary mapping step.
 
     @param fastq1: the file name for the forward reads in FASTQ format
@@ -52,6 +52,8 @@ def prelim_map(fastq1, fastq2, prelim_csv,
     @param stderr: where to write the standard error output from bowtie2 calls.
     @param gzip: if True, FASTQ files are compressed
     @param work_path:  optional path to store working files
+    @param keep: if False, delete temporary files
+    @param json: specify a custom JSON project file; None loads the default file.
     """
 
     bowtie2 = Bowtie2(execname=bt2_path)
@@ -92,7 +94,11 @@ def prelim_map(fastq1, fastq2, prelim_csv,
                  max_progress=total_reads)
 
     # generate initial reference files
-    projects = project_config.ProjectConfig.loadDefault()
+    if json is None:
+        projects = project_config.ProjectConfig.loadDefault()
+    else:
+        projects = project_config.ProjectConfig.load(json)
+
     ref_path = os.path.join(work_path, 'micall.fasta')
     with open(ref_path, 'w') as ref:
         projects.writeSeedFasta(ref)

diff --git a/micall/core/remap.py b/micall/core/remap.py
@@ -383,7 +383,7 @@ def remap(fastq1, fastq2, prelim_csv, remap_csv, remap_counts_csv=None,
           bt2_path='bowtie2', bt2build_path='bowtie2-build-s',
           nthreads=BOWTIE_THREADS, callback=None, count_threshold=10,
           rdgopen=READ_GAP_OPEN, rfgopen=REF_GAP_OPEN, stderr=sys.stderr,
-          gzip=False, debug_file_prefix=None, keep=False):
+          gzip=False, debug_file_prefix=None, keep=False, json=None):
     """
     Iterative re-map reads from raw paired FASTQ files to a reference sequence set that
     is being updated as the consensus of the reads that were mapped to the last set.
@@ -404,6 +404,7 @@ def remap(fastq1, fastq2, prelim_csv, remap_csv, remap_counts_csv=None,
     @param count_threshold:  minimum number of reads that map to a region for it to be remapped
     @param rdgopen: read gap open penalty
     @param rfgopen: reference gap open penalty
+    @param json:  specify a custom JSON project file; None loads the default file.
     """
 
     reffile = os.path.join(work_path, 'temp.fasta')
@@ -441,7 +442,11 @@ def remap(fastq1, fastq2, prelim_csv, remap_csv, remap_counts_csv=None,
     worker_pool = multiprocessing.Pool(processes=nthreads) if nthreads > 1 else None
 
     # retrieve reference sequences used for preliminary mapping
-    projects = project_config.ProjectConfig.loadDefault()
+    if json is None:
+        projects = project_config.ProjectConfig.loadDefault()
+    else:
+        projects = project_config.ProjectConfig.load(json)
+
     seeds = {}
     for seed, vals in projects.config['regions'].items():
         seqs = vals['reference']

diff --git a/micall/tests/aln2counts_test.py b/micall/tests/aln2counts_test.py
@@ -1,5 +1,5 @@
 import csv
-import StringIO
+from io import StringIO
 import sys
 import unittest
 
@@ -30,23 +30,23 @@ def add_override(self,
                      query,
                      aligned_query,
                      aligned_reference,
-                     score=sys.maxint):
+                     score=sys.maxsize):
         self.overrides[(reference, query)] = (aligned_reference,
                                               aligned_query,
                                               score)
 
 
 class SequenceReportTest(unittest.TestCase):
     def setUp(self):
-        self.insertion_file = StringIO.StringIO()
+        self.insertion_file = StringIO()
         insert_writer = InsertionWriter(
             insert_file=self.insertion_file)
         projects = project_config.ProjectConfig()
 
         # Content of seed regions is irrelevant. For R-NO-COORD, there is
         # no coordinate reference, so we use the seed reference for display, but
         # only the length matters.
-        projects.load(StringIO.StringIO("""\
+        projects.load(StringIO("""\
 {
   "projects": {
     "R1": {
@@ -148,11 +148,11 @@ def setUp(self):
         self.report = StubbedSequenceReport(insert_writer,
                                             projects,
                                             conseq_mixture_cutoffs)
-        self.report_file = StringIO.StringIO()
+        self.report_file = StringIO()
 
     def prepareReads(self, aligned_reads_text):
         full_text = "refname,qcut,rank,count,offset,seq\n" + aligned_reads_text
-        dummy_file = StringIO.StringIO(full_text)
+        dummy_file = StringIO(full_text)
         return csv.DictReader(dummy_file)
 
     def testEmptyAminoReport(self):
@@ -643,7 +643,7 @@ def testMultipleCoordinateInsertionReport(self):
         """ Two coordinate regions map the same seed region, the consensus
         has an insertion relative to only one of them.
         """
-        self.report.projects.load(StringIO.StringIO("""\
+        self.report.projects.load(StringIO("""\
 {
   "projects": {
     "R3": {
@@ -788,7 +788,7 @@ def testGoodAlignmentWithGiantSeed(self):
         Even when the consensus maps to the end of the seed, it should still
         only require a low alignment score.
         """
-        self.report.projects.load(StringIO.StringIO("""\
+        self.report.projects.load(StringIO("""\
 {
   "projects": {
     "R3": {

diff --git a/micall/tests/censor_fastq_test.py b/micall/tests/censor_fastq_test.py
@@ -1,4 +1,4 @@
-import StringIO
+from io import StringIO
 import unittest
 
 from micall.core.censor_fastq import censor

diff --git a/micall/tests/coverage_plots_test.py b/micall/tests/coverage_plots_test.py
@@ -1,6 +1,6 @@
 from mock import patch, call
 from unittest import TestCase
-from StringIO import StringIO
+from io import StringIO
 from micall.utils.coverage_plots import coverage_plot
 from micall.core.project_config import ProjectConfig
 

diff --git a/micall/utils/coverage.R b/micall/utils/coverage.R
@@ -6,6 +6,7 @@ if (length(args) != 2) {
 input.csv <- args[1]
 out.prefix <- args[2]
 
+
 df <- read.csv(input.csv)
 
 # guess if this is a nuc or amino CSV