docs(python): Add links to read_excel "engine_options" and "read_options" docstring
pola-rs · Jan 10, 2025 · 3462240 · 3462240
1 parent 17556e4
commit 3462240
Show file tree

Hide file tree

Showing 3 changed files with 35 additions and 47 deletions.
diff --git a/py-polars/polars/_typing.py b/py-polars/polars/_typing.py
@@ -235,7 +235,7 @@ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
 BufferInfo: TypeAlias = tuple[int, int, int]
 
 # type alias for supported spreadsheet engines
-ExcelSpreadsheetEngine: TypeAlias = Literal["xlsx2csv", "openpyxl", "calamine"]
+ExcelSpreadsheetEngine: TypeAlias = Literal["calamine", "openpyxl", "xlsx2csv"]
 
 
 class SeriesBuffers(TypedDict):

diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py
@@ -255,14 +255,12 @@ def read_excel(
     """
     Read Excel spreadsheet data into a DataFrame.
 
+    .. versionadded:: 1.20
+        Support loading data from named table objects with `table_name` parameter.
     .. versionadded:: 1.18
         Support loading data from a list (or glob pattern) of multiple workbooks.
     .. versionchanged:: 1.0
         Default engine is now "calamine" (was "xlsx2csv").
-    .. versionadded:: 0.20.6
-        Added "calamine" fastexcel engine for Excel Workbooks (.xlsx, .xlsb, .xls).
-    .. versionadded:: 0.19.3
-        Added "openpyxl" engine, and added `schema_overrides` parameter.
 
     Parameters
     ----------
@@ -283,34 +281,34 @@ def read_excel(
         the workbook, so additionally specifying a sheet id or name is optional;
         if one of those parameters *is* specified, an error will be raised if
         the named table is not found in that particular sheet.
-    engine : {'calamine', 'xlsx2csv', 'openpyxl'}
+    engine : {'calamine', 'openpyxl', 'xlsx2csv'}
         Library used to parse the spreadsheet file; defaults to "calamine".
 
         * "calamine": this engine can be used for reading all major types of Excel
           Workbook (`.xlsx`, `.xlsb`, `.xls`) and is *dramatically* faster than the
           other options, using the `fastexcel` module to bind the Calamine parser.
-        * "xlsx2csv": converts the data to an in-memory CSV before using the native
-          polars `read_csv` method to parse the result. You can pass `engine_options`
-          and `read_options` to refine the conversion.
         * "openpyxl": this engine is significantly slower than `xlsx2csv` but supports
           additional automatic type inference; potentially useful if you are otherwise
           unable to parse your sheet with the `xlsx2csv` engine in conjunction with the
           `schema_overrides` parameter.
+        * "xlsx2csv": converts the data to an in-memory CSV before using the native
+          polars `read_csv` method to parse the result. You can pass `engine_options`
+          and `read_options` to refine the conversion.
     engine_options
         Additional options passed to the underlying engine's primary parsing
         constructor (given below), if supported:
 
         * "calamine": n/a (can only provide `read_options`)
-        * "xlsx2csv": `Xlsx2csv`
-        * "openpyxl": `load_workbook`
+        * "openpyxl": `load_workbook <https://openpyxl.readthedocs.io/en/stable/api/openpyxl.reader.excel.html#openpyxl.reader.excel.load_workbook>`_
+        * "xlsx2csv": `Xlsx2csv <https://github.com/dilshod/xlsx2csv/blob/f35734aa453d65102198a77e7b8cd04928e6b3a2/xlsx2csv.py#L157>`_
     read_options
         Options passed to the underlying engine method that reads the sheet data.
         Where supported, this allows for additional control over parsing. The
         specific read methods associated with each engine are:
 
-        * "calamine": `ExcelReader.load_sheet_by_name`
-        * "xlsx2csv": `pl.read_csv`
+        * "calamine": `ExcelReader.load_sheet_by_name <https://fastexcel.toucantoco.dev/fastexcel.html#ExcelReader.load_sheet_by_name>`_
         * "openpyxl": n/a (can only provide `engine_options`)
+        * "xlsx2csv": see :func:`read_csv`
     has_header
         Indicate if the first row of the table data is a header or not. If False,
         column names will be autogenerated in the following format: `column_x`, with

diff --git a/py-polars/tests/unit/io/test_spreadsheet.py b/py-polars/tests/unit/io/test_spreadsheet.py
@@ -283,7 +283,7 @@ def test_read_excel_all_sheets(
 
 @pytest.mark.parametrize(
     "engine",
-    ["xlsx2csv", "calamine", "openpyxl"],
+    ["calamine", "openpyxl", "xlsx2csv"],
 )
 def test_read_excel_basic_datatypes(engine: ExcelSpreadsheetEngine) -> None:
     df = pl.DataFrame(
@@ -471,7 +471,7 @@ def test_read_mixed_dtype_columns(
     )
 
 
-@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"])
+@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
 def test_write_excel_bytes(engine: ExcelSpreadsheetEngine) -> None:
     df = pl.DataFrame({"colx": [1.5, -2, 0], "coly": ["a", None, "c"]})
 
@@ -634,7 +634,7 @@ def test_unsupported_binary_workbook(path_xlsb: Path) -> None:
         pl.read_excel(path_xlsb, engine="openpyxl")
 
 
-@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"])
+@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
 def test_read_excel_all_sheets_with_sheet_name(path_xlsx: Path, engine: str) -> None:
     with pytest.raises(
         ValueError,
@@ -793,7 +793,7 @@ def test_excel_round_trip(write_params: dict[str, Any]) -> None:
         assert_frame_equal(df, xldf)
 
 
-@pytest.mark.parametrize("engine", ["xlsx2csv", "calamine"])
+@pytest.mark.parametrize("engine", ["calamine", "xlsx2csv"])
 def test_excel_write_column_and_row_totals(engine: ExcelSpreadsheetEngine) -> None:
     df = pl.DataFrame(
         {
@@ -828,7 +828,7 @@ def test_excel_write_column_and_row_totals(engine: ExcelSpreadsheetEngine) -> No
         assert xldf.row(-1) == (None, 0.0, 0.0, 0, 0, None, 0.0, 0)
 
 
-@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"])
+@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
 def test_excel_write_compound_types(engine: ExcelSpreadsheetEngine) -> None:
     df = pl.DataFrame(
         {"x": [[1, 2], [3, 4], [5, 6]], "y": ["a", "b", "c"], "z": [9, 8, 7]}
@@ -925,7 +925,7 @@ def test_excel_write_to_file_object(
         assert_frame_equal(df, pl.read_excel(src, engine=engine))
 
 
-@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"])
+@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
 def test_excel_read_no_headers(engine: ExcelSpreadsheetEngine) -> None:
     df = pl.DataFrame(
         {"colx": [1, 2, 3], "coly": ["aaa", "bbb", "ccc"], "colz": [0.5, 0.0, -1.0]}
@@ -938,7 +938,7 @@ def test_excel_read_no_headers(engine: ExcelSpreadsheetEngine) -> None:
     assert_frame_equal(df, expected)
 
 
-@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"])
+@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
 def test_excel_write_sparklines(engine: ExcelSpreadsheetEngine) -> None:
     from xlsxwriter import Workbook
 
@@ -1217,7 +1217,7 @@ def test_excel_mixed_calamine_float_data(io_files_path: Path) -> None:
     )
 
 
-@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"])
+@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
 @pytest.mark.may_fail_auto_streaming  # read->scan_csv dispatch, _read_spreadsheet_xlsx2csv needs to be changed not to call `_reorder_columns` on the df
 def test_excel_type_inference_with_nulls(engine: ExcelSpreadsheetEngine) -> None:
     df = pl.DataFrame(
@@ -1255,36 +1255,26 @@ def test_excel_type_inference_with_nulls(engine: ExcelSpreadsheetEngine) -> None
         assert_frame_equal(df.select(reversed_cols), read_df)
 
 
-def test_drop_empty_rows(path_empty_rows_excel: Path) -> None:
-    df1 = pl.read_excel(source=path_empty_rows_excel, engine="xlsx2csv")
+@pytest.mark.parametrize("engine", ["calamine", "openpyxl", "xlsx2csv"])
+def test_drop_empty_rows(
+    path_empty_rows_excel: Path, engine: ExcelSpreadsheetEngine
+) -> None:
+    df1 = pl.read_excel(
+        source=path_empty_rows_excel,
+        engine=engine,
+    )  # check default `drop_empty_rows` behaviour (same result as True)
     assert df1.shape == (8, 4)
+
     df2 = pl.read_excel(
-        source=path_empty_rows_excel, engine="xlsx2csv", drop_empty_rows=True
+        source=path_empty_rows_excel,
+        engine=engine,
+        drop_empty_rows=True,
     )
     assert df2.shape == (8, 4)
+
     df3 = pl.read_excel(
-        source=path_empty_rows_excel, engine="xlsx2csv", drop_empty_rows=False
+        source=path_empty_rows_excel,
+        engine=engine,
+        drop_empty_rows=False,
     )
     assert df3.shape == (10, 4)
-
-    df4 = pl.read_excel(source=path_empty_rows_excel, engine="openpyxl")
-    assert df4.shape == (8, 4)
-    df5 = pl.read_excel(
-        source=path_empty_rows_excel, engine="openpyxl", drop_empty_rows=True
-    )
-    assert df5.shape == (8, 4)
-    df6 = pl.read_excel(
-        source=path_empty_rows_excel, engine="openpyxl", drop_empty_rows=False
-    )
-    assert df6.shape == (10, 4)
-
-    df7 = pl.read_excel(source=path_empty_rows_excel, engine="calamine")
-    assert df7.shape == (8, 4)
-    df8 = pl.read_excel(
-        source=path_empty_rows_excel, engine="calamine", drop_empty_rows=True
-    )
-    assert df8.shape == (8, 4)
-    df9 = pl.read_excel(
-        source=path_empty_rows_excel, engine="calamine", drop_empty_rows=False
-    )
-    assert df9.shape == (10, 4)