Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for csi index #407

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 18 additions & 6 deletions pybedtools/bedtool.py
Original file line number Diff line number Diff line change
Expand Up @@ -700,7 +700,11 @@ def tabix_intervals(self, interval_or_string, check_coordinates=False):
# tabix expects 1-based coords, but BEDTools works with
# zero-based. pybedtools and pysam also work with zero-based. So we can
# pass zero-based directly to the pysam tabix interface.
tbx = pysam.TabixFile(self.fn)
try:
tbx = pysam.TabixFile(self.fn)
except OSError:
# if the file is indexed using csi, we need to specify the path for index
tbx = pysam.TabixFile(self.fn, index=self.fn+".csi")

# If an interval is passed, use its coordinates directly
if isinstance(interval_or_string, Interval):
Expand Down Expand Up @@ -749,10 +753,14 @@ def tabix_contigs(self):
"-- please use the .tabix() method"
)

tbx = pysam.TabixFile(self.fn)
try:
tbx = pysam.TabixFile(self.fn)
except OSError:
# if the file is indexed using csi, we need to specify the path for index
tbx = pysam.TabixFile(self.fn, index=self.fn+".csi")
return tbx.contigs

def tabix(self, in_place=True, force=False, is_sorted=False):
def tabix(self, in_place=True, force=False, is_sorted=False, use_csi=False):
"""
Prepare a BedTool for use with Tabix.

Expand All @@ -773,6 +781,10 @@ def tabix(self, in_place=True, force=False, is_sorted=False):
is_sorted : bool
If True (default is False), then assume the file is already sorted
so that BedTool.bgzip() doesn't have to do that work.

use_csi : bool
If True (default is False), then generate a csi instead of tbi index.
This can be useful when working with chromosomes larger than 512 Mbp, such as barley
"""
# Return quickly if nothing to do
if self._tabixed() and not force:
Expand All @@ -781,18 +793,18 @@ def tabix(self, in_place=True, force=False, is_sorted=False):
# Make sure it's BGZIPed
fn = self.bgzip(in_place=in_place, force=force)

pysam.tabix_index(fn, force=force, preset=self.file_type)
pysam.tabix_index(fn, force=force, preset=self.file_type, csi=use_csi)
return BedTool(fn)

def _tabixed(self):
"""
Verifies that we're working with a tabixed file: a string filename
pointing to a BGZIPed file with a .tbi file in the same dir.
pointing to a BGZIPed file with a .tbi or .csi file in the same dir.
"""
if (
isinstance(self.fn, str)
and isBGZIP(self.fn)
and os.path.exists(self.fn + ".tbi")
and (os.path.exists(self.fn + ".tbi") or os.path.exists(self.fn + ".csi"))
):
return True

Expand Down
61 changes: 34 additions & 27 deletions pybedtools/test/test_1.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,35 +133,36 @@ def test_tuple_creation():


def test_tabix():
try:
a = pybedtools.example_bedtool("a.bed")
t = a.tabix(force=True)
assert t._tabixed()
results = t.tabix_intervals("chr1:99-200")
results = str(results)
print(results)
assert results == fix(
"""
chr1 1 100 feature1 0 +
chr1 100 200 feature2 0 +
chr1 150 500 feature3 0 -"""
)
for idx_type in ("tbi", "csi"):
try:
a = pybedtools.example_bedtool("a.bed")
t = a.tabix(force=True, use_csi=True if idx_type == "csi" else False)
assert t._tabixed()
results = t.tabix_intervals("chr1:99-200")
results = str(results)
print(results)
assert results == fix(
"""
chr1 1 100 feature1 0 +
chr1 100 200 feature2 0 +
chr1 150 500 feature3 0 -"""
)

assert str(t.tabix_intervals(a[2])) == fix(
"""
chr1 100 200 feature2 0 +
chr1 150 500 feature3 0 -"""
)
assert str(t.tabix_intervals(a[2])) == fix(
"""
chr1 100 200 feature2 0 +
chr1 150 500 feature3 0 -"""
)

finally:
# clean up
fns = [
pybedtools.example_filename("a.bed.gz"),
pybedtools.example_filename("a.bed.gz.tbi"),
]
for fn in fns:
if os.path.exists(fn):
os.unlink(fn)
finally:
# clean up
fns = [
pybedtools.example_filename("a.bed.gz"),
pybedtools.example_filename("a.bed.gz." + idx_type),
]
for fn in fns:
if os.path.exists(fn):
os.unlink(fn)


def test_tabix_intervals():
Expand All @@ -177,6 +178,12 @@ def test_tabix_intervals():
assert len(a.tabix_intervals("chr1")) == 1


def test_tabix_contigs_csi():
a = pybedtools.example_bedtool("a.bed")
a = a.tabix(force=True, use_csi=True)
assert a.tabix_contigs() == ["chr1"]


# ----------------------------------------------------------------------------
# Streaming and non-file BedTool tests
# ----------------------------------------------------------------------------
Expand Down