fix: performance adjustments, migrate #378

Merged: 5 commits, Oct 8, 2024
20 changes: 18 additions & 2 deletions butterfree/migrations/database_migration/cassandra_migration.py
@@ -78,6 +78,9 @@ def _get_alter_table_add_query(self, columns: List[Diff], table_name: str) -> str:
     def _get_alter_column_type_query(self, column: Diff, table_name: str) -> str:
         """Creates CQL statement to alter columns' types.
 
+        In Cassandra 3.4.x to 3.11.x, altering a column's type is not allowed.
+        This method creates a temporary column to work around that restriction.
+
         Args:
             column: Diff object with ALTER_TYPE kind.
             table_name: table name.
@@ -86,10 +89,23 @@ def _get_alter_column_type_query(self, column: Diff, table_name: str) -> str:
             Alter column type query.
 
         """
-        parsed_columns = self._get_parsed_columns([column])
+        temp_column_name = f"{column.column}_temp"
+
+        add_temp_column_query = (
+            f"ALTER TABLE {table_name} ADD {temp_column_name} {column.value};"
+        )
+        copy_data_to_temp_query = (
+            f"UPDATE {table_name} SET {temp_column_name} = {column.column};"
+        )
+
+        drop_old_column_query = f"ALTER TABLE {table_name} DROP {column.column};"
+        rename_temp_column_query = (
+            f"ALTER TABLE {table_name} RENAME {temp_column_name} TO {column.column};"
+        )
 
         return (
-            f"ALTER TABLE {table_name} ALTER {parsed_columns.replace(' ', ' TYPE ')};"
+            f"{add_temp_column_query} {copy_data_to_temp_query} "
+            f"{drop_old_column_query} {rename_temp_column_query};"
         )
 
     @staticmethod
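For reference, the four CQL statements the new method chains together, shown as a minimal standalone sketch. `SampleDiff`, the table name, and the example column are hypothetical stand-ins, not butterfree's actual `Diff` class or schema:

```python
# Standalone sketch of the temp-column type change above. SampleDiff is a
# hypothetical stand-in for butterfree's Diff; only .column and .value are used.
from typing import NamedTuple


class SampleDiff(NamedTuple):
    column: str  # column whose type is changing
    value: str   # target CQL type


def alter_column_type_queries(column: SampleDiff, table_name: str) -> str:
    temp = f"{column.column}_temp"
    return (
        f"ALTER TABLE {table_name} ADD {temp} {column.value}; "
        f"UPDATE {table_name} SET {temp} = {column.column}; "
        f"ALTER TABLE {table_name} DROP {column.column}; "
        f"ALTER TABLE {table_name} RENAME {temp} TO {column.column};"
    )


print(alter_column_type_queries(SampleDiff("user_score", "double"), "feature_table"))
# ALTER TABLE feature_table ADD user_score_temp double;
# UPDATE feature_table SET user_score_temp = user_score;
# ALTER TABLE feature_table DROP user_score;
# ALTER TABLE feature_table RENAME user_score_temp TO user_score;
```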
27 changes: 15 additions & 12 deletions butterfree/transform/aggregated_feature_set.py
@@ -576,14 +576,16 @@ def construct(
 
         pre_hook_df = self.run_pre_hooks(dataframe)
 
-        output_df = reduce(
-            lambda df, feature: feature.transform(df),
-            self.keys + [self.timestamp],
-            pre_hook_df,
-        )
+        output_df = pre_hook_df
+        for feature in self.keys + [self.timestamp]:
+            output_df = feature.transform(output_df)
+
+        output_df = self.incremental_strategy.filter_with_incremental_strategy(
+            dataframe=output_df, start_date=start_date, end_date=end_date
+        )
 
         if self._windows and end_date is not None:
-            # run aggregations for each window
+            # Run aggregations for each window
             agg_list = [
                 self._aggregate(
                     dataframe=output_df,
@@ -603,13 +605,12 @@ def construct(
 
             # keeping this logic to maintain the same behavior for already implemented
             # feature sets
-
             if self._windows[0].slide == "1 day":
                 base_df = self._get_base_dataframe(
                     client=client, dataframe=output_df, end_date=end_date
                 )
 
-                # left join each aggregation result to our base dataframe
+                # Left join each aggregation result to our base dataframe
                 output_df = reduce(
                     lambda left, right: self._dataframe_join(
                         left,
@@ -635,19 +636,21 @@ def construct(
         else:
             output_df = self._aggregate(output_df, features=self.features)
 
-        output_df = self.incremental_strategy.filter_with_incremental_strategy(
-            dataframe=output_df, start_date=start_date, end_date=end_date
-        )
-
         output_df = output_df.select(*self.columns).replace(  # type: ignore
             float("nan"), None
         )
 
         if not output_df.isStreaming and self.deduplicate_rows:
             output_df = self._filter_duplicated_rows(output_df)
 
         post_hook_df = self.run_post_hooks(output_df)
 
+        # Eager evaluation, only if needed and manageable
         if not output_df.isStreaming and self.eager_evaluation:
-            post_hook_df.cache().count()
+            # Small dataframes only
+            if output_df.count() < 1_000_000:
+                post_hook_df.cache().count()
+            else:
+                post_hook_df.cache()  # Cache without materialization for large volumes
 
         return post_hook_df
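The eager-evaluation hunk above is the main performance adjustment in this file. In Spark, `cache()` is lazy, it only marks the DataFrame for storage, while chaining `.count()` is an action that materializes the cached data immediately. A minimal PySpark sketch of that trade-off, using the same 1,000,000-row cutoff as the diff; the `eager_cache` helper name is illustrative:

```python
# Illustrative sketch of the conditional eager-evaluation pattern from the diff.
# cache() only registers the DataFrame for caching; count() is an action that
# forces the plan to run, materializing the cache right away.
from pyspark.sql import DataFrame

ROW_THRESHOLD = 1_000_000  # cutoff used in the diff above


def eager_cache(df: DataFrame) -> DataFrame:
    if df.count() < ROW_THRESHOLD:
        # Small result: pay the materialization cost now so later reads are hot.
        df.cache().count()
    else:
        # Large result: mark for caching; the first downstream action fills it.
        df.cache()
    return df
```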
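One more note on the first hunk of `aggregated_feature_set.py`: swapping `reduce` for a plain `for` loop is behavior-preserving, while moving `filter_with_incremental_strategy` ahead of the window aggregations is the substantive change, since each aggregation then scans an already-bounded slice of the data. A hedged PySpark sketch of that filter-early idea, with hypothetical column names rather than butterfree's schema:

```python
# Illustrative filter-early pattern: bound the data by the incremental window
# before aggregating, so every aggregation scans less data. Column names are
# hypothetical; butterfree wires this through its IncrementalStrategy instead.
from pyspark.sql import DataFrame, functions as F


def aggregate_incremental(df: DataFrame, start_date: str, end_date: str) -> DataFrame:
    # Restrict to the incremental window first...
    bounded = df.filter(F.col("timestamp").between(start_date, end_date))
    # ...then aggregate over the smaller, bounded slice.
    return bounded.groupBy("id").agg(F.avg("feature_value").alias("feature_value__avg"))
```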