daler · DevangThakkar · May 11, 2023
diff --git a/dev-requirements.txt b/dev-requirements.txt
@@ -2,6 +2,7 @@ cython
 matplotlib
 numpydoc
 pandas
+polars
 pyyaml
 sphinx
 pysam
diff --git a/docker/pbt-test-py2/Dockerfile b/docker/pbt-test-py2/Dockerfile
@@ -30,6 +30,7 @@ RUN conda install -c daler \
     numpydoc \
     pip \
     pandas \
+    polars \
     pyyaml \
     sphinx \
     pysam

diff --git a/docker/pbt-test-py3/Dockerfile b/docker/pbt-test-py3/Dockerfile
@@ -30,6 +30,7 @@ RUN conda install -c daler \
     numpydoc \
     pip \
     pandas \
+    polars \
     pyyaml \
     sphinx \
     pysam

diff --git a/docs/source/dataframe.rst b/docs/source/dataframe.rst
@@ -0,0 +1,42 @@
+.. include:: includeme.rst
+
+.. _dataframe:
+
+Exporting to a dataframe
+========================
+
+If you want to export the results as a dataframe for further analysis, use
+the :meth:`BedTool.to_dataframe` method to export to a pandas dataframe or the
+:meth:`BedTool.to_polars_dataframe` method to export to a polars dataframe.
+Both methods also let you optionally specify column names for the dataframe
+instead of the default column names that pybedtools uses. You can use the same
+arguments you would normally use while reading a file into a pandas
+(``names=``) or polars (``new_columns=``) dataframe. By default, pybedtools
+assumes that there is no header line in the bed file. If your bed file already
+has names in the first row, you can set the ``disable_auto_names`` argument to
+``True``.
+
+.. doctest::
+    :options: +NORMALIZE_WHITESPACE
+
+    >>> import pandas
+    >>> import polars
+    >>> a = pybedtools.example_bedtool('a.bed')
+    <BLANKLINE>
+
+    >>> pandas_df = a.to_dataframe()
+    >>> print(pandas_df)
+        chrom   start    end        name   score  strand
+    0    chr1       1    100    feature1       0       +
+    1    chr1     100    200    feature2       0       +
+    2    chr1     150    500    feature3       0       -
+    3    chr1     900    950    feature4       0       +
+    <BLANKLINE>
+
+    >>> polars_df = a.to_polars_dataframe()
+    >>> print(polars_df)
+    shape: (4, 6)
+    ┌───────┬───────┬─────┬──────────┬───────┬────────┐
+    │ chrom ┆ start ┆ end ┆ name     ┆ score ┆ strand │
+    │ ---   ┆ ---   ┆ --- ┆ ---      ┆ ---   ┆ ---    │
+    │ str   ┆ i64   ┆ i64 ┆ str      ┆ i64   ┆ str    │
+    ╞═══════╪═══════╪═════╪══════════╪═══════╪════════╡
+    │ chr1  ┆ 1     ┆ 100 ┆ feature1 ┆ 0     ┆ +      │
+    │ chr1  ┆ 100   ┆ 200 ┆ feature2 ┆ 0     ┆ +      │
+    │ chr1  ┆ 150   ┆ 500 ┆ feature3 ┆ 0     ┆ -      │
+    │ chr1  ┆ 900   ┆ 950 ┆ feature4 ┆ 0     ┆ +      │
+    └───────┴───────┴─────┴──────────┴───────┴────────┘
+    <BLANKLINE>
+
+You can also generate a :class:`BedTool` object from a pandas or polars
+dataframe using the :meth:`BedTool.from_dataframe` or
+:meth:`BedTool.from_polars_dataframe` method, respectively.
diff --git a/docs/source/tutorial-contents.rst b/docs/source/tutorial-contents.rst
@@ -11,6 +11,7 @@ Tutorial Contents
     create-a-bedtool-tutorial
     intersections
     save-results
+    dataframe
     default-arguments
     piping
     intervals

diff --git a/pybedtools/bedtool.py b/pybedtools/bedtool.py
@@ -644,6 +644,47 @@ def from_dataframe(
                 )
         return BedTool(fn)
 
+    @classmethod
+    def from_polars_dataframe(
+        self,
+        polars_df,
+        outfile=None,
+        separator="\t",
+        has_header=False,
+        **kwargs
+    ):
+        """
+        Creates a BedTool from a polars.DataFrame.
+
+        If `outfile` is None, a temporary file will be used. Otherwise it can
+        be a specific filename or an open file handle. Additional kwargs will
+        be passed to `polars.DataFrame.write_csv`.
+
+        The fields of the resulting BedTool will match the order of columns in
+        the dataframe.
+        """
+        try:
+            import polars
+        except ImportError:
+            raise ImportError("polars must be installed to use dataframes")
+        if outfile is None:
+            outfile = self._tmp()
+        default_kwargs = dict(separator=separator, has_header=has_header)
+        default_kwargs.update(kwargs)
+        polars_df.write_csv(outfile, **default_kwargs)
+
+        if isinstance(outfile, six.string_types):
+            fn = outfile
+        else:
+            try:
+                fn = outfile.name
+            except AttributeError:
+                raise ValueError(
+                    "`outfile` is not a string and doesn't have a `name` attribute. "
+                    "Unable to determine filename."
+                )
+        return BedTool(fn)
+
     def split(self, func, *args, **kwargs):
         """
         Split each feature using a user-defined function.
@@ -3715,6 +3756,54 @@ def to_dataframe(self, disable_auto_names=False, *args, **kwargs):
         else:
             return pandas.DataFrame()
 
+    def to_polars_dataframe(self, disable_auto_names=False, *args, **kwargs):
+        """
+        Create a polars.DataFrame, passing args and kwargs to polars.read_csv
+        The separator kwarg `separator` is given a tab `\\t` value by default.
+
+        Parameters
+        ----------
+        disable_auto_names : bool
+            By default, the created dataframe fills in column names
+            automatically according to the detected filetype (e.g., "chrom",
+            "start", "end" for a BED3 file). Set this argument to True to
+            disable this behavior.
+        """
+        # Complain if BAM or if not a file
+        if self._isbam:
+            raise ValueError("BAM not supported for converting to DataFrame")
+        if not isinstance(self.fn, six.string_types):
+            raise ValueError("use .saveas() to make sure self.fn is a file")
+
+        try:
+            import polars
+        except ImportError:
+            raise ImportError("polars must be installed to convert to polars.DataFrame")
+        # Otherwise we're good:
+        names = kwargs.get("new_columns", None)
+        if names is None and not disable_auto_names:
+            try:
+                _names = settings._column_names[self.file_type][: self.field_count()]
+                if len(_names) < self.field_count():
+                    warn(
+                        "Default names for filetype %s are:\n%s\nbut file has "
+                        "%s fields; you can supply custom names with the "
+                        "`names` kwarg" % (self.file_type, _names, self.field_count())
+                    )
+                    _names = None
+            except KeyError:
+                _names = None
+            kwargs["new_columns"] = _names
+
+        has_header = kwargs.get("has_header", False)
+        if disable_auto_names:
+            has_header = True
+        kwargs["has_header"] = has_header
+        if os.path.isfile(self.fn) and os.path.getsize(self.fn) > 0:
+            return polars.read_csv(self.fn, *args, separator="\t", **kwargs)
+        else:
+            return polars.DataFrame()
+
     def tail(self, lines=10, as_string=False):
         """
         Like `head`, but prints last 10 lines of the file by default.

diff --git a/pybedtools/test/test_1.py b/pybedtools/test/test_1.py
@@ -2041,3 +2041,88 @@ def test_new_head():
     # however, printing should still complain:
     with pytest.raises(pybedtools.cbedtools.MalformedBedLineError):
         print(a)
+
+
+def test_from_polars_dataframe():
+    try:
+        import polars
+    except ImportError:
+        pytest.xfail("polars not installed; skipping test")
+
+    a = pybedtools.example_bedtool("a.bed")
+
+    results = a.to_polars_dataframe()
+    assert results[0, "name"] == "feature1"
+    assert list(results.columns) == ["chrom", "start", "end", "name", "score", "strand"]
+    assert results[3, "strand"] == "+"
+
+    # reverse should work, too:
+    df = a.to_polars_dataframe()
+    a2 = pybedtools.BedTool.from_polars_dataframe(df)
+    assert a2 == a
+
+    # try converting only part of the dataframe to a BedTool
+    a3 = pybedtools.BedTool.from_polars_dataframe(
+        df.filter(polars.col("start") < 100).select(["chrom", "start", "end", "name"])
+    )
+    assert a3 == fix(
+        """
+        chr1    1   100 feature1
+        """
+    ), str(a3)
+
+    d = pybedtools.example_bedtool("d.gff")
+    results = d.to_polars_dataframe()
+    assert list(results.columns) == [
+        "seqname",
+        "source",
+        "feature",
+        "start",
+        "end",
+        "score",
+        "strand",
+        "frame",
+        "attributes",
+    ]
+    assert results[0, "seqname"] == "chr1"
+    assert results[4, "attributes"] == "ID=rRNA1;"
+
+    # get a gff file with too many fields...
+    x = pybedtools.example_bedtool("c.gff")
+    x = x.intersect(x, c=True)
+    with warnings.catch_warnings(record=True) as w:
+        # trigger the warning
+        x.to_polars_dataframe()
+        # assert a few things
+        assert len(w) == 1
+        assert issubclass(w[-1].category, UserWarning)
+        assert str(w[-1].message).startswith("Default names for filetype")
+
+    names = [
+        "seqname",
+        "source",
+        "feature",
+        "start",
+        "end",
+        "score",
+        "strand",
+        "frame",
+        "attributes",
+        "count",
+    ]
+    results = x.to_polars_dataframe(new_columns=names)
+    assert list(results.columns) == [
+        "seqname",
+        "source",
+        "feature",
+        "start",
+        "end",
+        "score",
+        "strand",
+        "frame",
+        "attributes",
+        "count",
+    ]
+    assert results[0, "seqname"] == "chr1"
+    assert results[13, "count"] == 3
+
diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,5 @@
 numpy
 pandas
+polars
 pysam
 six
-Original file line number
+Diff line change
@@ Expand Up / @@ -2,6 +2,7 @@ cython @@
     matplotlib
     numpydoc
     pandas
+    polars
     pyyaml
     sphinx
     pysam