Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for polars dataframes #388

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ cython
matplotlib
numpydoc
pandas
polars
pyyaml
sphinx
pysam
1 change: 1 addition & 0 deletions docker/pbt-test-py2/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ RUN conda install -c daler \
numpydoc \
pip \
pandas \
polars \
pyyaml \
sphinx \
pysam
Expand Down
1 change: 1 addition & 0 deletions docker/pbt-test-py3/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ RUN conda install -c daler \
numpydoc \
pip \
pandas \
polars \
pyyaml \
sphinx \
pysam
Expand Down
42 changes: 42 additions & 0 deletions docs/source/dataframe.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
.. include:: includeme.rst

.. _dataframes:

Exporting to a dataframe
========================

If you want to export the results as a dataframe for further analysis, use
the :meth:`BedTool.to_dataframe` method to export to a pandas dataframe or the :meth:`BedTool.to_polars_dataframe` method to export to a polars dataframe. These methods also let you optionally specify column names for the dataframe instead of the default column names that pybedtools uses. You can use the same argument you would normally use when reading a file into a pandas (`names=`) or polars (`new_columns=`) dataframe. By default, pybedtools assumes that there is no header line in the bed file. If your bed file already has column names in the first row, set the `disable_auto_names` argument to `True` so that the first row is used as the header.

.. doctest::
:options: +NORMALIZE_WHITESPACE

>>> import pandas
>>> import polars
>>> a = pybedtools.example_bedtool('a.bed')
<BLANKLINE>

>>> pandas_df = a.to_dataframe()
>>> print(pandas_df)
chrom start end name score strand
0 chr1 1 100 feature1 0 +
1 chr1 100 200 feature2 0 +
2 chr1 150 500 feature3 0 -
3 chr1 900 950 feature4 0 +
<BLANKLINE>

>>> polars_df = a.to_polars_dataframe()
>>> print(polars_df)
——————————————————————————————————————————————————————
│ chrom ┆ start ┆ end ┆ name ┆ score ┆ strand │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ str ┆ i64 ┆ str │
══════════════════════════════════════════════════════
│ chr1 ┆ 1 ┆ 100 ┆ feature1 ┆ 0 ┆ + │
│ chr1 ┆ 100 ┆ 200 ┆ feature2 ┆ 0 ┆ + │
│ chr1 ┆ 150 ┆ 500 ┆ feature3 ┆ 0 ┆ - │
│ chr1 ┆ 900 ┆ 950 ┆ feature4 ┆ 0 ┆ + │
——————————————————————————————————————————————————————
<BLANKLINE>

You can also generate a :class:`BedTool` object from a pandas or polars dataframe using the :meth:`BedTool.from_dataframe` or :meth:`BedTool.from_polars_dataframe` method respectively.
1 change: 1 addition & 0 deletions docs/source/tutorial-contents.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Tutorial Contents
create-a-bedtool-tutorial
intersections
save-results
dataframe
default-arguments
piping
intervals
Expand Down
89 changes: 89 additions & 0 deletions pybedtools/bedtool.py
Original file line number Diff line number Diff line change
Expand Up @@ -644,6 +644,47 @@ def from_dataframe(
)
return BedTool(fn)

@classmethod
def from_polars_dataframe(
    cls,
    polars_df,
    outfile=None,
    separator="\t",
    has_header=False,
    **kwargs
):
    """
    Creates a BedTool from a polars.DataFrame.

    If `outfile` is None, a temporary file will be used. Otherwise it can
    be a specific filename or an open file handle. Additional kwargs will
    be passed to `polars.DataFrame.write_csv` and take precedence over the
    defaults derived from `separator` and `has_header`.

    The fields of the resulting BedTool will match the order of columns in
    the dataframe.

    Parameters
    ----------
    polars_df : polars.DataFrame
        Dataframe to write out, one feature per row.
    outfile : str, file-like, or None
        Destination. A file-like object must expose a `name` attribute so
        the resulting BedTool can point at the file on disk.
    separator : str
        Field separator used when writing; tab by default.
    has_header : bool
        Whether to write the column names as the first line. False by
        default.

    Raises
    ------
    ImportError
        If polars is not installed.
    ValueError
        If `outfile` is neither a string nor an object with a `name`
        attribute.
    """
    try:
        import polars  # noqa: F401 -- imported only to check availability
    except ImportError:
        raise ImportError("polars must be installed to use dataframes")
    import inspect

    if outfile is None:
        outfile = cls._tmp()

    # polars renamed write_csv's `has_header` keyword to `include_header`.
    # Pick whichever name the installed version advertises so the public
    # `has_header` argument keeps working on both sides of the rename.
    write_params = inspect.signature(polars_df.write_csv).parameters
    if "include_header" in write_params:
        header_kwarg = "include_header"
    else:
        header_kwarg = "has_header"

    default_kwargs = {"separator": separator, header_kwarg: has_header}
    default_kwargs.update(kwargs)
    polars_df.write_csv(outfile, **default_kwargs)

    if isinstance(outfile, six.string_types):
        fn = outfile
    else:
        try:
            fn = outfile.name
        except AttributeError:
            raise ValueError(
                "`outfile` is not a string and doesn't have a `name` attribute. "
                "Unable to determine filename."
            )
        # The BedTool re-opens the file by name, so make sure anything still
        # sitting in the handle's buffer has reached the disk first.
        flush = getattr(outfile, "flush", None)
        if callable(flush):
            flush()
    return BedTool(fn)

def split(self, func, *args, **kwargs):
"""
Split each feature using a user-defined function.
Expand Down Expand Up @@ -3715,6 +3756,54 @@ def to_dataframe(self, disable_auto_names=False, *args, **kwargs):
else:
return pandas.DataFrame()

def to_polars_dataframe(self, disable_auto_names=False, *args, **kwargs):
    """
    Create a polars.DataFrame, passing args and kwargs to polars.read_csv.

    The `separator` kwarg is given a tab `\\t` value by default; a
    `separator` supplied by the caller takes precedence.

    Parameters
    ----------
    disable_auto_names : bool
        By default, the created dataframe fills in column names
        automatically according to the detected filetype (e.g., "chrom",
        "start", "end" for a BED3 file). Set this argument to True to
        disable this behavior. Unless `has_header` is passed explicitly,
        disabling auto names also makes the first line of the file be read
        as the header.

    Returns
    -------
    polars.DataFrame
        An empty dataframe if the underlying file is missing or has zero
        size.

    Raises
    ------
    ValueError
        If this BedTool is BAM-backed or `self.fn` is not a filename.
    ImportError
        If polars is not installed.
    """
    # Complain if BAM or if not a file
    if self._isbam:
        raise ValueError("BAM not supported for converting to DataFrame")
    if not isinstance(self.fn, six.string_types):
        raise ValueError("use .saveas() to make sure self.fn is a file")

    try:
        import polars
    except ImportError:
        raise ImportError("polars must be installed to convert to polars.DataFrame")

    # Only fill in default names when the caller neither supplied their own
    # nor opted out of automatic naming.
    if kwargs.get("new_columns") is None and not disable_auto_names:
        try:
            n_fields = self.field_count()
            _names = settings._column_names[self.file_type][:n_fields]
            if len(_names) < n_fields:
                warn(
                    "Default names for filetype %s are:\n%s\nbut file has "
                    "%s fields; you can supply custom names with the "
                    "`new_columns` kwarg" % (self.file_type, _names, n_fields)
                )
                _names = None
        except KeyError:
            # No default names registered for this filetype.
            _names = None
        kwargs["new_columns"] = _names

    # An explicit `has_header` wins; otherwise the first line is treated as
    # a header exactly when automatic naming has been disabled.
    kwargs.setdefault("has_header", bool(disable_auto_names))
    # Passing the default through kwargs (rather than as a literal keyword in
    # the call) lets callers override it without a duplicate-keyword error.
    kwargs.setdefault("separator", "\t")

    if os.path.isfile(self.fn) and os.path.getsize(self.fn) > 0:
        return polars.read_csv(self.fn, *args, **kwargs)
    return polars.DataFrame()

def tail(self, lines=10, as_string=False):
"""
Like `head`, but prints last 10 lines of the file by default.
Expand Down
85 changes: 85 additions & 0 deletions pybedtools/test/test_1.py
Original file line number Diff line number Diff line change
Expand Up @@ -2041,3 +2041,88 @@ def test_new_head():
# however, printing should still complain:
with pytest.raises(pybedtools.cbedtools.MalformedBedLineError):
print(a)


def test_from_polars_dataframe():
    """
    Round-trip BedTool <-> polars.DataFrame and check automatic column names
    for BED and GFF input, including the warning raised when a file has more
    fields than the default names cover.
    """
    try:
        import polars
    except ImportError:
        pytest.xfail("polars not installed; skipping test")

    a = pybedtools.example_bedtool("a.bed")

    results = a.to_polars_dataframe()
    assert results[0, "name"] == "feature1"
    assert list(results.columns) == ["chrom", "start", "end", "name", "score", "strand"]
    assert results[3, "strand"] == "+"

    # reverse should work, too:
    df = a.to_polars_dataframe()
    a2 = pybedtools.BedTool.from_polars_dataframe(df)
    assert a2 == a

    # try converting only part of the dataframe to a BedTool
    a3 = pybedtools.BedTool.from_polars_dataframe(
        df.filter(polars.col("start") < 100).select(["chrom", "start", "end", "name"])
    )
    assert a3 == fix(
        """
    chr1 1 100 feature1
    """
    ), str(a3)

    d = pybedtools.example_bedtool("d.gff")
    results = d.to_polars_dataframe()
    assert list(results.columns) == [
        "seqname",
        "source",
        "feature",
        "start",
        "end",
        "score",
        "strand",
        "frame",
        "attributes",
    ]
    assert results[0, "seqname"] == "chr1"
    assert results[4, "attributes"] == "ID=rRNA1;"

    # get a gff file with too many fields...
    x = pybedtools.example_bedtool("c.gff")
    x = x.intersect(x, c=True)
    with warnings.catch_warnings(record=True) as w:
        # Record every warning regardless of ambient filters or of whether
        # the same warning has already been emitted earlier in the session.
        warnings.simplefilter("always")
        # trigger the warning
        x.to_polars_dataframe()

    # Only count the column-name warning so unrelated warnings (e.g. from
    # third-party libraries) cannot make this assertion flaky.
    name_warnings = [
        caught
        for caught in w
        if str(caught.message).startswith("Default names for filetype")
    ]
    assert len(name_warnings) == 1
    assert issubclass(name_warnings[-1].category, UserWarning)

    names = [
        "seqname",
        "source",
        "feature",
        "start",
        "end",
        "score",
        "strand",
        "frame",
        "attributes",
        "count",
    ]
    results = x.to_polars_dataframe(new_columns=names)
    assert list(results.columns) == names
    assert results[0, "seqname"] == "chr1"
    assert results[13, "count"] == 3

1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
numpy
pandas
polars
pysam
six