feat: changing column enabler from decorator to function
cenzwong committed Oct 17, 2024
1 parent ea99b1e commit e899d7b
Showing 13 changed files with 185 additions and 72 deletions.
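At a high level, the diff below replaces the pyspark_column_or_name_enabler decorator with an explicit column_or_name_enabler helper that each function calls and unpacks itself. A minimal sketch of the two styles, using hypothetical upper_old/upper_new wrappers rather than any function actually in the library:

from pyspark.sql import Column
from pyspark.sql import functions as F

from pysparky import decorator
from pysparky.enabler import column_or_name_enabler


# Old style (removed in this commit): the decorator resolves the named
# argument to a Column before the function body runs.
@decorator.pyspark_column_or_name_enabler("column_or_name")
def upper_old(column_or_name) -> Column:
    return F.upper(column_or_name)


# New style (introduced in this commit): the function resolves its own
# argument through the helper and unpacks the returned tuple.
def upper_new(column_or_name) -> Column:
    (column,) = column_or_name_enabler(column_or_name)
    return F.upper(column)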
Binary file modified dist/pysparky-0.1.0-py3-none-any.whl
Binary file not shown.
Binary file modified dist/pysparky-0.1.0.tar.gz
Binary file not shown.
105 changes: 79 additions & 26 deletions example/dev.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@@ -14,7 +14,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 14,
"metadata": {},
"outputs": [
{
@@ -23,25 +23,6 @@
"text": [
"3.5.2\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"24/10/03 15:42:41 WARN Utils: Your hostname, codespaces-0aafae resolves to a loopback address: 127.0.0.1; using 10.0.1.110 instead (on interface eth0)\n",
"24/10/03 15:42:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n",
"Setting default log level to \"WARN\".\n",
"To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
"24/10/03 15:42:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
"24/10/03 15:42:44 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"24/10/03 15:42:56 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors\n"
]
}
],
"source": [
@@ -60,13 +41,62 @@
"spark = SparkSession.builder.getOrCreate()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from pysparky import functions as F_"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Column<'trim(regexp_replace(hi, \\s+, , 1))'>"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"F_.single_space_and_trim(\"hi\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Column<'CASE WHEN (hi = 1) THEN Ture END'>"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"F_.when_mapping(\"hi\", {1: \"Ture\"})"
]
},
{
"cell_type": "code",
"execution_count": 3,
@@ -128,11 +158,7 @@
"se.convert_1d_list_to_dataframe(\n",
" spark, my_list, [\"ID1\", \"ID2\", \"ID3\", \"ID4\"], axis=\"row\"\n",
").show()\n",
"se.convert_1d_list_to_dataframe(spark, my_list, \"ID\", axis=\"column\").show()\n",
"spark.convert_1d_list_to_dataframe(\n",
" my_list, [\"ID1\", \"ID2\", \"ID3\", \"ID4\"], axis=\"row\"\n",
").show()\n",
"spark.convert_1d_list_to_dataframe(my_list, \"ID\", axis=\"column\").show()"
"se.convert_1d_list_to_dataframe(spark, my_list, \"ID\", axis=\"column\").show()"
]
},
{
@@ -201,6 +227,33 @@
"result == [(3, 4)]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(Column<'hello'>,)"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hello_columns"
]
},
{
"cell_type": "code",
"execution_count": null,
1 change: 0 additions & 1 deletion pysparky.egg-info/PKG-INFO
@@ -35,7 +35,6 @@ python -m build

# TODO
- Change pytest test case
- Build mkdocs -> to make it standard to ingest to MkDocs
- Build wheels for PyPi

# Reference:
3 changes: 3 additions & 0 deletions pysparky.egg-info/SOURCES.txt
@@ -4,11 +4,13 @@ pyproject.toml
pysparky/__init__.py
pysparky/debug.py
pysparky/decorator.py
pysparky/enabler.py
pysparky/quality.py
pysparky/reader_options.py
pysparky/schema_ext.py
pysparky/spark_ext.py
pysparky/transformation_ext.py
pysparky/typing.py
pysparky/utils.py
pysparky.egg-info/PKG-INFO
pysparky.egg-info/SOURCES.txt
@@ -21,6 +23,7 @@ pysparky/functions/general.py
pysparky/functions/math_.py
tests/test_debug.py
tests/test_decorator.py
tests/test_enabler.py
tests/test_quality.py
tests/test_schema_ext.py
tests/test_spark_ext.py
25 changes: 25 additions & 0 deletions pysparky/enabler.py
@@ -0,0 +1,25 @@
from pyspark.sql import Column
from pyspark.sql import functions as F

from pysparky.typing import ColumnOrName


def column_or_name_enabler(*columns: ColumnOrName) -> tuple[Column, ...]:
"""
Enables PySpark functions to accept either column names (as strings) or Column objects.
Parameters:
columns (ColumnOrName): Column names (as strings) or Column objects to be converted.
Returns:
tuple[Column]: A tuple of Column objects.
Example:
>>> column_or_name_enabler("col1", "col2", F.col("col3"))
(Column<b'col1'>, Column<b'col2'>, Column<b'col3'>)
"""
return tuple(
map(
lambda column: F.col(column) if isinstance(column, str) else column, columns
)
)
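A quick usage sketch of the new helper (the column names are placeholders):

from pyspark.sql import functions as F

from pysparky.enabler import column_or_name_enabler

# Strings and Column objects are normalized to Column objects in one pass.
cols = column_or_name_enabler("col1", "col2", F.col("col3"))

# Single-argument call sites unpack the one-element tuple, which is the
# pattern adopted throughout functions/general.py in this commit.
(column,) = column_or_name_enabler("col1")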
11 changes: 6 additions & 5 deletions pysparky/functions/conditions.py
@@ -1,17 +1,18 @@
from functools import reduce
from operator import and_, or_
from typing import Union

from pyspark.sql import Column
from pyspark.sql import functions as F

from pysparky.typing import ColumnOrName

def condition_and(*conditions: Union[Column, str]) -> Column:

def condition_and(*conditions: ColumnOrName) -> Column:
"""
Combines multiple conditions using logical AND.
Args:
*conditions (Union[Column, str]): Multiple PySpark Column objects or SQL expression strings representing conditions.
*conditions (ColumnOrName): Multiple PySpark Column objects or SQL expression strings representing conditions.
Returns:
Column: A single PySpark Column object representing the combined condition.
@@ -29,12 +30,12 @@ def condition_and(*conditions: Union[Column, str]) -> Column:
return reduce(and_, parsed_conditions, F.lit(True))


def condition_or(*conditions: Union[Column, str]) -> Column:
def condition_or(*conditions: ColumnOrName) -> Column:
"""
Combines multiple conditions using logical OR.
Args:
*conditions (Union[Column, str]): Multiple PySpark Column objects or SQL expression strings representing conditions.
*conditions (ColumnOrName): Multiple PySpark Column objects or SQL expression strings representing conditions.
Returns:
Column: A single PySpark Column object representing the combined condition.
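As an illustration (the import path is assumed from the file layout, and the sample data is made up), the two combinators can be used directly in a filter:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

from pysparky.functions.conditions import condition_and, condition_or

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "val"])

# Per the docstrings, conditions may be Column objects or SQL expression strings.
df.filter(condition_and(F.col("id") > 0, "val = 'a'")).show()
df.filter(condition_or(F.col("id") > 1, "val = 'a'")).show()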
36 changes: 20 additions & 16 deletions pysparky/functions/general.py
@@ -7,6 +7,8 @@
from pyspark.sql import functions as F

from pysparky import decorator, utils
from pysparky.enabler import column_or_name_enabler
from pysparky.typing import ColumnOrName


@decorator.extension_enabler(Column)
@@ -67,7 +69,7 @@ def chain(self, func, *args, **kwargs) -> Column:
@decorator.extension_enabler(Column)
@decorator.pyspark_column_or_name_enabler("column_or_name")
def startswiths(
column_or_name: str | Column, list_of_strings: list[str]
column_or_name: ColumnOrName, list_of_strings: list[str]
) -> pyspark.sql.Column:
"""
Creates a PySpark Column expression to check if the given column starts with any string in the list.
@@ -88,9 +90,8 @@ def startswiths(


@decorator.extension_enabler(Column)
@decorator.pyspark_column_or_name_enabler("column_or_name")
def replace_strings_to_none(
column_or_name: str | Column,
column_or_name: ColumnOrName,
list_of_null_string: list[str],
customize_output: Any = None,
) -> pyspark.sql.Column:
@@ -104,14 +105,13 @@ def replace_strings_to_none(
Column: A Spark DataFrame column with the values replaced.
"""

return F.when(column_or_name.isin(list_of_null_string), customize_output).otherwise(
column_or_name
)
(column,) = column_or_name_enabler(column_or_name)

return F.when(column.isin(list_of_null_string), customize_output).otherwise(column)
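A hedged usage sketch of the rewritten function (import path assumed from the file layout; sample values made up). Passing a plain column name now works because the helper call above resolves it to a Column:

from pyspark.sql import SparkSession

from pysparky.functions.general import replace_strings_to_none

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("NA",), ("x",), ("null",)], ["val"])

# "NA" and "null" become NULL; every other value passes through unchanged.
df.select(replace_strings_to_none("val", ["NA", "null"]).alias("val")).show()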


@decorator.extension_enabler(Column)
@decorator.pyspark_column_or_name_enabler("column_or_name")
def single_space_and_trim(column_or_name: str | Column) -> Column:
def single_space_and_trim(column_or_name: ColumnOrName) -> Column:
"""
Replaces multiple white spaces with a single space and trims the column.
@@ -126,8 +126,7 @@ def single_space_and_trim(column_or_name: str | Column) -> Column:


@decorator.extension_enabler(Column)
@decorator.pyspark_column_or_name_enabler("column_or_name")
def get_value_from_map(column_or_name: str | Column, dict_: dict) -> Column:
def get_value_from_map(column_or_name: ColumnOrName, dict_: dict) -> Column:
"""
Retrieves a value from a map (dictionary) using a key derived from a specified column in a DataFrame.
@@ -153,12 +152,13 @@ def get_value_from_map(column_or_name: str | Column, dict_: dict) -> Column:
| 2| b|
+----------+-----+
"""
return utils.create_map_from_dict(dict_)[column_or_name]
(column,) = column_or_name_enabler(column_or_name)

return utils.create_map_from_dict(dict_)[column]


@decorator.extension_enabler(Column)
@decorator.pyspark_column_or_name_enabler("column_or_name")
def when_mapping(column_or_name: Column, dict_: dict) -> Column:
def when_mapping(column_or_name: ColumnOrName, dict_: dict) -> Column:
"""
Applies a series of conditional mappings to a PySpark Column based on a dictionary of conditions and values.
@@ -169,7 +169,11 @@ def when_mapping(column_or_name: Column, dict_: dict) -> Column:
Returns:
Column: A new PySpark Column with the conditional mappings applied.
"""
result_column = F # initiate as an functions
for condition, value in dict_.items():
result_column = result_column.when(column_or_name == condition, value)
(column,) = column_or_name_enabler(column_or_name)

def reducer(result_column: Column, condition_value: tuple[Any, Any]) -> Column:
condition, value = condition_value
return result_column.when(column == condition, value)

result_column: Column = functools.reduce(reducer, dict_.items(), F) # type: ignore
return result_column
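To see what the reduce builds, a small sketch (the "status" column and mapping values are illustrative; import path assumed from the file layout):

from pyspark.sql import functions as F

from pysparky.functions.general import when_mapping

# The reduce over dict_.items() chains one F.when per entry, so this call ...
mapped = when_mapping("status", {1: "new", 2: "done"})

# ... is equivalent to writing the chain out by hand:
manual = F.when(F.col("status") == 1, "new").when(F.col("status") == 2, "done")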