allow anchor selection directly in read_anchors(), change ft.loc_rang…

…e to ft.locs.range
rnajena · Nov 8, 2024 · 0f0f2b8 · 0f0f2b8
1 parent 49982b6
commit 0f0f2b8
Show file tree

Hide file tree

Showing 5 changed files with 24 additions and 20 deletions.
diff --git a/anchorna/cli.py b/anchorna/cli.py
@@ -19,7 +19,7 @@
 
 from sugar import read
 from anchorna.core import combine, cutout, find_my_anchors
-from anchorna.io import export_dialign, export_jalview, read_anchors, load_selected_anchors
+from anchorna.io import export_dialign, export_jalview, read_anchors
 from anchorna.util import _apply_mode
 
 
@@ -108,7 +108,7 @@ def _tutorial_seqs(subset=False):
     seqs = read(files('anchorna.tests.data').joinpath('pesti56.gff.zip'))
     if subset:
         seqs = seqs[18:28]
-        start, stop = zip(*[seq.fts.get('cds').loc_range for seq in seqs])
+        start, stop = zip(*[seq.fts.get('cds').locs.range for seq in seqs])
         start = max(start)
         stop = min(stop)
         stop = start + (stop-start) // 3 * 3 + 6
@@ -169,7 +169,7 @@ def _cmd_go(fname, fname_anchor, pbar=True, continue_with=None,
     anchors.write(fname_anchor)
 
 def _cmd_print(fname_anchor, verbose=False, mode='aa'):
-    anchors = load_selected_anchors(fname_anchor)
+    anchors = read_anchors(fname_anchor)
     if anchors.no_cds:
         mode = 'aa'
     try:
@@ -183,7 +183,7 @@ def _cmd_load(fname_anchor):
 def _cmd_export(fname_anchor, out, mode='aa', score_use_fluke=None, fmt='gff',
                 fname=None):
     assert mode in ('nt', 'cds', 'aa')
-    anchors = load_selected_anchors(fname_anchor)
+    anchors = read_anchors(fname_anchor)
     if anchors.no_cds:
         mode = 'aa'
     if fmt in ('jalview', 'dialign'):
@@ -240,17 +240,17 @@ def _cmd_view(fname_anchor, fname, mode='aa', align=None, score_use_fluke=None):
         subprocess.run(f'jalview {fname_seq} --features {fname_export}'.split())
 
 def _cmd_combine(fname_anchor, out):
-    lot_of_anchors = [load_selected_anchors(fn) for fn in fname_anchor]
+    lot_of_anchors = [read_anchors(fn) for fn in fname_anchor]
     anchors = combine(lot_of_anchors)
     anchors.write(out)
 
-def _cmd_cutout(fname, fname_anchor, pos1, pos2, out, fmt, mode='nt', score_use_fluke=None):
+def _cmd_cutout(fname, fname_anchor, pos1, pos2, out, fmt, mode='nt', score_use_fluke=None, gap=None):
     assert mode in ('nt', 'cds', 'aa')
     seqs = read(fname)
-    anchors = load_selected_anchors(fname_anchor)
+    anchors = read_anchors(fname_anchor)
     if anchors.no_cds:
         mode = 'aa'
-    seqs2 = cutout(seqs, anchors, pos1, pos2, mode=mode, score_use_fluke=score_use_fluke)
+    seqs2 = cutout(seqs, anchors, pos1, pos2, mode=mode, score_use_fluke=score_use_fluke, gap=gap)
     if out is None:
         print(seqs2.tofmtstr(fmt or 'fasta'))
     else:
@@ -342,7 +342,7 @@ def run_cmdline(cmd_args=None):
             'Each position has 3 parts ABC where B and C are optional. '
             'Part A: Is a number or number prepended with letter a to specify the anchor number, '
             'use special words "start" and "end" for start or end of sequence, '
-            'use special words "ATG" and "*" for start or stop codon of sequence (only allowed in mode "seq") '
+            'use special words "ATG" and "*" for start or stop codon of sequence (only allowed in mode "nt") '
             'Part B: One of the characters <, >, ^, for start, end or middle of word (anchor) specified in A, '
             'default is < for the first anchor and > for the second anchor, must be omitted for A=start or A=end. '
             'Part C: Additional character offset in the form +X or -X. '
@@ -400,6 +400,7 @@ def run_cmdline(cmd_args=None):
 
     p_cutout.add_argument('-o', '--out', help='output file name (by default prints fasta to stdout)')
     p_cutout.add_argument('--fmt', help='output format (default: autodetect from file extension)')
+    p_cutout.add_argument('--gap', help='specify gap character(s), allow gaps in sequences (default: no special handling for gaps)')
 
     p_print.add_argument('fname_anchor', help='anchor file name')
     p_print.add_argument('-v', '--verbose', help=msg, action='store_true')

diff --git a/anchorna/core.py b/anchorna/core.py
@@ -300,7 +300,7 @@ def _transform_cutout_index(A, B, C, id_, seq, mode):
     return i
 
 
-def cutout(seqs, anchors, pos1, pos2, mode='nt', score_use_fluke=None):
+def cutout(seqs, anchors, pos1, pos2, mode='nt', score_use_fluke=None, gap=None):
     """
     Cutout subsequences from pos1 to pos2 (i.e. between two anchors)
 
@@ -320,7 +320,7 @@ def cutout(seqs, anchors, pos1, pos2, mode='nt', score_use_fluke=None):
             continue
         i = _transform_cutout_index(la, lb, lc, id_, seqs[id_], mode)
         j = _transform_cutout_index(ra, rb, rc, id_, seqs[id_], mode)
-        seq2 = seqs[id_][i:j]
+        seq2 = seqs[id_].sl(gap=gap)[i:j]
         if mode == 'nt':
             seq2.meta.offset = i
             seq2.meta.pop('fts', None)

diff --git a/anchorna/io.py b/anchorna/io.py
@@ -51,7 +51,7 @@ def write_anchors(anchors, fname, mode=None):
         fts.write(fname, 'gff', header=header)
 
 
-def read_anchors(fname, check_header=True):
+def _read_anchors(fname, check_header=True):
     """
     Read anchors from GFF file
 
@@ -99,16 +99,19 @@ def _parse_selection(anchors, selection):
     return anchors2
 
 
-def load_selected_anchors(fname):
+def read_anchors(fname, check_header=True):
     """
-    Read anchors and select or remove some of them
+    Read anchors from GFF file
 
-    See ``anchorna combine -h``
+    Offsets are restored from comments.
+    Additionally, anchors can be selected and/or removed with a special syntax,
+    see ``anchorna combine -h``
     """
+    fname = str(fname)
     if '|' not in fname:
-        return read_anchors(fname)
+        return _read_anchors(fname, check_header=check_header)
     fname, selection = fname.split('|', 1)
-    anchors = read_anchors(fname.strip())
+    anchors = _read_anchors(fname.strip(), check_header=check_header)
     selection = selection.lower()
     if '|' not in selection:
         return _parse_selection(anchors, selection)

diff --git a/anchorna/tests/test_anchorna.py b/anchorna/tests/test_anchorna.py
@@ -15,7 +15,7 @@
 
 from anchorna import cutout, read_anchors
 from anchorna.cli import run_cmdline
-from anchorna.io import load_json, load_selected_anchors, write_json
+from anchorna.io import load_json, write_json
 
 
 _IDS = (  # Representative sequences of pesti virus
@@ -147,7 +147,7 @@ def test_anchorna_workflow_subset():
         assert '' == check(f'anchorna cutout anchors.gff a6> a10< -o {fname}')
         assert '' == check(f'anchorna go --fname {fname} --no-pbar anchors_cutout2.gff --search-range=1000')
         assert '' == check('anchorna combine anchors.gff||a7:a10 anchors_cutout2.gff -o anchors_combined2.gff')
-        assert read_anchors('anchors_combined2.gff') == load_selected_anchors('anchors.gff')
+        assert read_anchors('anchors_combined2.gff') == read_anchors('anchors.gff')
 
         # check --no-remove option and --continue-with option
         assert '' == check('anchorna go --no-remove --no-pbar anchors2.gff')

diff --git a/anchorna/util.py b/anchorna/util.py
@@ -341,7 +341,7 @@ def fts2anchors(fts, no_cds=False):
         if ft.type in ('anchor', 'fluke'):
             if not ft.meta.name.startswith(aname):
                 raise ValueError(f'Fluke with name {ft.meta.name} not part of anchor {aname}')
-            start, stop = ft.loc_range
+            start, stop = ft.locs.range
             fluke = Fluke(seqid=ft.seqid, score=ft.meta.score, start=start, stop=stop,
                           word=ft.meta._gff.word, poor=hasattr(ft.meta._gff, 'poor'),
                           median_score = float(ft.meta._gff.get('median_score', 1)))