--longreads option added for FASTQ long reads inputs such as PacBio, ONT, etc. Gives better mapping results.
phac-nml · Nov 7, 2024 · 3963d92 · 3963d92
1 parent 6e38b40
commit 3963d92
Show file tree

Hide file tree

Showing 4 changed files with 14 additions and 5 deletions.
diff --git a/ectyper/commandLineOptions.py b/ectyper/commandLineOptions.py
@@ -61,6 +61,13 @@ def checkdbversion():
         nargs="+"
     )
 
+    parser.add_argument(
+        "--longreads",
+        action="store_true",
+        default=False,
+        help="Enable for raw long reads FASTQ inputs (ONT, PacBio, other sequencing platforms). [default %(default)s]"
+    )
+
     parser.add_argument(
         "--maxdirdepth",
         help="Maximum number of directories to descend when searching an input directory of files [default %(default)s levels]. Only works on path inputs not containing '*' wildcard",

diff --git a/ectyper/genomeFunctions.py b/ectyper/genomeFunctions.py
@@ -211,7 +211,7 @@ def create_bowtie_base(temp_dir, reference, cores):
     return bowtie_base
 
 
-def assemble_reads(reads, bowtie_base, combined_fasta, temp_dir, cores=1):
+def assemble_reads(reads, bowtie_base, combined_fasta, temp_dir, cores=1, longreads=False):
     """
     Assembles fastq reads to the specified reference file.
     :param reads: The fastq file to assemble
@@ -232,14 +232,15 @@ def assemble_reads(reads, bowtie_base, combined_fasta, temp_dir, cores=1):
     bowtie_run = [
         'bowtie2',
         '--threads',f'{cores}',
-        '--local',
         '--score-min L,1,-0.5',
         '--np 5',
         '--no-unal',
         '-x', bowtie_base,
         '-U', reads,
         '-S', sam_reads
     ]
+    if longreads == True: #for nanopore reads do local alignment as long reads are longer than references
+            bowtie_run.append('--local')
 
     subprocess_util.run_subprocess(bowtie_run)
 
@@ -377,7 +378,8 @@ def assemble_fastq(raw_files_dict, temp_dir, combined_fasta, bowtie_base, args):
                   bowtie_base=bowtie_base,
                   combined_fasta=combined_fasta,
                   temp_dir=temp_dir,
-                  cores=cores)
+                  cores=cores,
+                  longreads=args.longreads)
 
     all_fasta_files_dict = dict.fromkeys(raw_files_dict['fasta']) #add assembled genomes as new keys
     with Pool(processes=args.cores) as pool:

diff --git a/test/test_O_serotyping.py b/test/test_O_serotyping.py
@@ -138,7 +138,7 @@ def test_Ecoli_O17H18(caplog):
          rows = outfp.readlines()
     secondrow=rows[1:][0] #check only second row
     assert "Escherichia coli" in secondrow.split('\t')
-    assert "O17/O77/O44/O106\tH18\tO17/O77/O44/O106:H18\tWARNING MIXED O-TYPE" in secondrow
+    assert "O17/O44/O77/O106\tH18\tO17/O44/O77/O106:H18\tWARNING MIXED O-TYPE" in secondrow
 
 def test_download_refseq_mash(caplog, tmpdir):
     caplog.set_level(logging.DEBUG)

diff --git a/test/test_complex_inputs.py b/test/test_complex_inputs.py
@@ -91,7 +91,7 @@ def test_multiple_inputs(caplog):
         output_tsv_lines = fp.readlines()
     with open(output_blastn_antigens) as fp:
         output_blastn_antigens_lines  = fp.readlines()   
-    assert any([True if 'O17/O77/O44/O106:H18' in line else False for line in output_tsv_lines]), "No matches of 'O17/O77/O44/O106:H18' serotype"
+    assert any([True if 'O17/O44/O77/O106:H18' in line else False for line in output_tsv_lines]), "No matches of 'O17/O44/O77/O106:H18' serotype"
     assert any([True if 'O28/O42:H25' in line else False for line in output_tsv_lines]), "No matches of 'O28/O42:H25' serotype"
     assert any([True if 'EscherichiaO17H18' in line else False for line in output_blastn_antigens_lines]), "No matches of 'EscherichiaO17H18' in BLAST output"
     assert any([True if 'EscherichiaO28H5' in line else False for line in output_blastn_antigens_lines]), "No matches of 'EscherichiaO28H5' in BLAST output"