From 6de341ccce378ace92a5d3391543fdee77e79142 Mon Sep 17 00:00:00 2001 From: Li Yao Date: Fri, 26 Apr 2024 16:01:05 -0400 Subject: [PATCH 1/4] Add support for csi index --- pybedtools/bedtool.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/pybedtools/bedtool.py b/pybedtools/bedtool.py index 1b8b491d..11ebcf72 100644 --- a/pybedtools/bedtool.py +++ b/pybedtools/bedtool.py @@ -700,7 +700,11 @@ def tabix_intervals(self, interval_or_string, check_coordinates=False): # tabix expects 1-based coords, but BEDTools works with # zero-based. pybedtools and pysam also work with zero-based. So we can # pass zero-based directly to the pysam tabix interface. - tbx = pysam.TabixFile(self.fn) + try: + tbx = pysam.TabixFile(self.fn) + except OSError: + # if the file is indexed using csi, we need to specify the path for index + tbx = pysam.TabixFile(self.fn, index=self.fn+".csi") # If an interval is passed, use its coordinates directly if isinstance(interval_or_string, Interval): @@ -749,10 +753,14 @@ def tabix_contigs(self): "-- please use the .tabix() method" ) - tbx = pysam.TabixFile(self.fn) + try: + tbx = pysam.TabixFile(self.fn) + except OSError: + # if the file is indexed using csi, we need to specify the path for index + tbx = pysam.TabixFile(self.fn, index=self.fn+".csi") return tbx.contigs - def tabix(self, in_place=True, force=False, is_sorted=False): + def tabix(self, in_place=True, force=False, is_sorted=False, use_csi=False): """ Prepare a BedTool for use with Tabix. @@ -773,6 +781,10 @@ def tabix(self, in_place=True, force=False, is_sorted=False): is_sorted : bool If True (default is False), then assume the file is already sorted so that BedTool.bgzip() doesn't have to do that work. + + use_csi : bool + If True (default is False), then generate csi instead of tbi index. + This can be useful when working with chromosomes larger than 512 Mbp, such as barley """ # Return quickly if nothing to do if self._tabixed() and not force: @@ -781,18 +793,18 @@ def tabix(self, in_place=True, force=False, is_sorted=False): # Make sure it's BGZIPed fn = self.bgzip(in_place=in_place, force=force) - pysam.tabix_index(fn, force=force, preset=self.file_type) + pysam.tabix_index(fn, force=force, preset=self.file_type, csi=use_csi) return BedTool(fn) def _tabixed(self): """ Verifies that we're working with a tabixed file: a string filename - pointing to a BGZIPed file with a .tbi file in the same dir. + pointing to a BGZIPed file with a .tbi or .csi file in the same dir. """ if ( isinstance(self.fn, str) and isBGZIP(self.fn) - and os.path.exists(self.fn + ".tbi") + and (os.path.exists(self.fn + ".tbi") or os.path.exists(self.fn + ".csi")) ): return True From 970175d666653563e7561c8f6f858cbc43bd8c8a Mon Sep 17 00:00:00 2001 From: liyao001 Date: Fri, 26 Apr 2024 23:26:05 -0400 Subject: [PATCH 2/4] Extended original test case for `tabix` by testing both tbi and csi indexes --- pybedtools/test/test_1.py | 55 ++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/pybedtools/test/test_1.py b/pybedtools/test/test_1.py index b9d67fe5..54292ee1 100644 --- a/pybedtools/test/test_1.py +++ b/pybedtools/test/test_1.py @@ -133,35 +133,36 @@ def test_tuple_creation(): def test_tabix(): - try: - a = pybedtools.example_bedtool("a.bed") - t = a.tabix(force=True) - assert t._tabixed() - results = t.tabix_intervals("chr1:99-200") - results = str(results) - print(results) - assert results == fix( - """ - chr1 1 100 feature1 0 + - chr1 100 200 feature2 0 + - chr1 150 500 feature3 0 -""" - ) + for idx_type in ("tbi", "csi"): + try: + a = pybedtools.example_bedtool("a.bed") + t = a.tabix(force=True, use_csi=True if idx_type == "csi" else False) + assert t._tabixed() + results = t.tabix_intervals("chr1:99-200") + results = str(results) + print(results) + assert results == fix( + """ + chr1 1 100 feature1 0 + + chr1 100 200 feature2 0 + + chr1 150 500 feature3 0 -""" + ) - assert str(t.tabix_intervals(a[2])) == fix( - """ - chr1 100 200 feature2 0 + - chr1 150 500 feature3 0 -""" - ) + assert str(t.tabix_intervals(a[2])) == fix( + """ + chr1 100 200 feature2 0 + + chr1 150 500 feature3 0 -""" + ) - finally: - # clean up - fns = [ - pybedtools.example_filename("a.bed.gz"), - pybedtools.example_filename("a.bed.gz.tbi"), - ] - for fn in fns: - if os.path.exists(fn): - os.unlink(fn) + finally: + # clean up + fns = [ + pybedtools.example_filename("a.bed.gz"), + pybedtools.example_filename("a.bed.gz." + idx_type), + ] + for fn in fns: + if os.path.exists(fn): + os.unlink(fn) def test_tabix_intervals(): From e61a315148c4870df84298a5f76f0d482d3a6af9 Mon Sep 17 00:00:00 2001 From: liyao001 Date: Fri, 26 Apr 2024 23:34:08 -0400 Subject: [PATCH 3/4] Extended original test case for `tabix` by testing both tbi and csi indexes (tabix_contigs) --- pybedtools/test/test_1.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pybedtools/test/test_1.py b/pybedtools/test/test_1.py index 54292ee1..20a68393 100644 --- a/pybedtools/test/test_1.py +++ b/pybedtools/test/test_1.py @@ -178,6 +178,12 @@ def test_tabix_intervals(): assert len(a.tabix_intervals("chr1")) == 1 +def test_tabix_contigs_csi(): + a = pybedtools.example_bedtool("a.bed") + a = a.tabix(force=True, use_csi=True) + assert a.tabix_contigs() == ["chr1"] + + # ---------------------------------------------------------------------------- # Streaming and non-file BedTool tests # ---------------------------------------------------------------------------- From 3a5ebc8d66214bfbeb9629b9b27a5b2dd0309217 Mon Sep 17 00:00:00 2001 From: Li Yao Date: Fri, 5 Jul 2024 22:44:15 -0400 Subject: [PATCH 4/4] Update a docstring to trigger github actions --- pybedtools/bedtool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pybedtools/bedtool.py b/pybedtools/bedtool.py index 11ebcf72..96ada4b0 100644 --- a/pybedtools/bedtool.py +++ b/pybedtools/bedtool.py @@ -783,7 +783,7 @@ def tabix(self, in_place=True, force=False, is_sorted=False, use_csi=False): so that BedTool.bgzip() doesn't have to do that work. use_csi : bool - If True (default is False), then generate csi instead of tbi index. + If True (default is False), then generate a csi instead of tbi index. This can be useful when working with chromosomes larger than 512 Mbp, such as barley """ # Return quickly if nothing to do