feat: remove schema check for union

PySparky · Oct 4, 2024 · 2c14afc · 2c14afc
1 parent 4bbf790
commit 2c14afc
Show file tree

Hide file tree

Showing 5 changed files with 16 additions and 25 deletions.
diff --git a/dist/pysparky-0.1.0-py3-none-any.whl b/dist/pysparky-0.1.0-py3-none-any.whl
diff --git a/dist/pysparky-0.1.0.tar.gz b/dist/pysparky-0.1.0.tar.gz
diff --git a/pysparky/utils.py b/pysparky/utils.py
@@ -75,8 +75,4 @@ def union_dataframes(*dataframes: DataFrame | list[DataFrame]) -> DataFrame:
     if isinstance(dataframes[0], list):
         dataframes = dataframes[0]
 
-    # Check if all DataFrames have the same schema
-    if not all(sdf.schema == dataframes[0].schema for sdf in dataframes):
-        raise ValueError("All DataFrames must have the same schema")
-
-    return reduce(DataFrame.union, dataframes)
+    return reduce(lambda df1, df2: df1.union(df2), dataframes)
diff --git a/run_pytests.py b/run_pytests.py
@@ -0,0 +1,15 @@
+import sys
+
+import pytest
+
+sys.path.append(".")
+
+sys.dont_write_bytecode = True
+
+args = ["--verbose", "-p", "no:cacheprovider"]
+# args += ["-k", "test_dataframe_transform"] # uncomment for specific test
+
+result = pytest.main(args)
+
+print(f"{result=}")
+assert result == pytest.ExitCode.OK, "Test run was not successful."
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -119,26 +119,6 @@ def test_union_dataframes(spark):
     assert result_data == expected_result
 
 
-def test_union_different_schema(spark):
-    schema1 = T.StructType(
-        [
-            T.StructField("name", T.StringType(), True),
-            T.StructField("age", T.IntegerType(), True),
-        ]
-    )
-    schema2 = T.StructType(
-        [
-            T.StructField("name", T.StringType(), True),
-            T.StructField("salary", T.IntegerType(), True),
-        ]
-    )
-    df1 = spark.createDataFrame([("Alice", 30)], schema1)
-    df2 = spark.createDataFrame([("Bob", 50000)], schema2)
-
-    with pytest.raises(ValueError, match="All DataFrames must have the same schema"):
-        utils.union_dataframes(df1, df2)
-
-
 def test_union_list_dataframes(spark):
     data1 = {"id": [1, 2, 3], "value": [10, 20, 30]}
     data2 = {"id": [4, 5, 6], "value": [40, 50, 60]}