Merge pull request #13 from wd60622/expose-df-timestamp-and-widen

methods on dataframe
wd60622 · Sep 12, 2023 · e4d4380 · e4d4380
2 parents 397658b + f7f27b1
commit e4d4380
Show file tree

Hide file tree

Showing 4 changed files with 109 additions and 7 deletions.
diff --git a/latent_calendar/extensions.py b/latent_calendar/extensions.py
@@ -101,7 +101,8 @@
 )
 from latent_calendar.transformers import (
     create_raw_to_vocab_transformer,
-    CalandarTimestampFeatures,
+    create_timestamp_feature_pipeline,
+    LongToWide,
 )
 
 
@@ -112,11 +113,18 @@ class SeriesAccessor:
     def __init__(self, pandas_obj: pd.Series):
         self._obj = pandas_obj
 
-    def timestamp_features(self) -> pd.DataFrame:
+    def timestamp_features(
+        self, discretize: bool = True, minutes: int = 60, create_vocab: bool = True
+    ) -> pd.DataFrame:
         """Create day of week and proportion into day columns.
 
         Exposed as a method on Series for convenience.
 
+        Args:
+            discretize: Whether to discretize the hour column.
+            minutes: The number of minutes to discretize by. Ignored if `discretize` is False.
+            create_vocab: Whether to create the vocab column.
+
         Returns:
             DataFrame with features
 
@@ -148,7 +156,12 @@ def timestamp_features(self) -> pd.DataFrame:
 
         """
         name = self._obj.name or "timestamp"
-        transformer = CalandarTimestampFeatures(timestamp_col=name)
+        transformer = create_timestamp_feature_pipeline(
+            timestamp_col=name,
+            discretize=discretize,
+            minutes=minutes,
+            create_vocab=create_vocab,
+        )
 
         return transformer.fit_transform(self._obj.rename(name).to_frame())
 
@@ -257,13 +270,68 @@ def normalize(self, kind: str) -> pd.DataFrame:
 
         raise ValueError(f"kind must be one of ['max', 'probs'], got {kind}")
 
+    def timestamp_features(
+        self,
+        column: str,
+        discretize: bool = True,
+        minutes: int = 60,
+        create_vocab: bool = True,
+    ) -> pd.DataFrame:
+        """Create day of week and proportion into day columns for event level DataFrame
+
+        Exposed as a method on DataFrame for convenience. Use `cal.aggregate_events` instead to create the wide format DataFrame.
+
+        Args:
+            column: The name of the timestamp column.
+            discretize: Whether to discretize the hour column.
+            minutes: The number of minutes to discretize by. Ignored if `discretize` is False.
+            create_vocab: Whether to create the vocab column.
+
+        Returns:
+            DataFrame with features added
+
+        """
+        transformer = create_timestamp_feature_pipeline(
+            timestamp_col=column,
+            discretize=discretize,
+            create_vocab=create_vocab,
+            minutes=minutes,
+        )
+
+        return transformer.fit_transform(self._obj)
+
+    def widen(
+        self, column: str, as_int: bool = True, minutes: int = 60
+    ) -> pd.DataFrame:
+        """Transform an aggregated DataFrame to wide calendar format.
+
+        Wrapper around `LongToWide` transformer to transform to wide format.
+
+        Args:
+            column: column to widen
+            as_int: whether to cast the column to int
+            minutes: number of minutes to discretize by
+
+        Returns:
+            DataFrame in wide format
+
+        """
+        if not isinstance(self._obj.index, pd.MultiIndex):
+            raise ValueError(
+                "DataFrame is expected to have a MultiIndex with the last column as the vocab."
+            )
+
+        transformer = LongToWide(col=column, as_int=as_int, minutes=minutes)
+
+        return transformer.fit_transform(self._obj)
+
     def aggregate_events(
         self,
         by: Union[str, List[str]],
         timestamp_col: str,
         minutes: int = 60,
     ) -> pd.DataFrame:
-        """Transform DataFrame to wide format with groups as index.
+        """Transform event level DataFrame to wide format with groups as index.
 
         Wrapper around `create_raw_to_vocab_transformer` to transform to wide format.
 

diff --git a/latent_calendar/transformers.py b/latent_calendar/transformers.py
@@ -150,14 +150,16 @@ def get_feature_names_out(self, input_features=None):
 
 def create_timestamp_feature_pipeline(
     timestamp_col: str,
+    discretize: bool = True,
     minutes: int = 60,
     create_vocab: bool = True,
 ) -> Pipeline:
     """Create a pipeline that creates features from the timestamp column.
 
     Args:
         timestamp_col: The name of the timestamp column.
-        minutes: The number of minutes to discretize by.
+        discretize: Whether to discretize the hour column.
+        minutes: The number of minutes to discretize by. Ignored if discretize is False.
         create_vocab: Whether to create the vocab column.
 
     Returns:
@@ -177,15 +179,22 @@ def create_timestamp_feature_pipeline(
         ```
 
     """
+    if create_vocab and not discretize:
+        raise ValueError("Cannot create vocab without discretizing.")
+
     vocab_col = "hour"
     transformers = [
         (
             "timestamp_features",
             CalandarTimestampFeatures(timestamp_col=timestamp_col),
         ),
-        ("binning", HourDiscretizer(col=vocab_col, minutes=minutes)),
     ]
 
+    if discretize:
+        transformers.append(
+            ("binning", HourDiscretizer(col=vocab_col, minutes=minutes))
+        )
+
     if create_vocab:
         transformers.append(
             ("vocab_creation", VocabTransformer(hour_col=vocab_col)),

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "latent-calendar"
-version = "0.0.11"
+version = "0.0.12"
 description = "Analyzing and modeling weekly calendar distributions using latent components"
 authors = ["Will Dean <[email protected]>"]
 readme = "README.md"

diff --git a/tests/test_extensions.py b/tests/test_extensions.py
@@ -120,6 +120,31 @@ def test_long_dataframe_extensions(df_long) -> None:
     df_long.cal.plot_across_column("timestamp", "group")
 
 
+@pytest.fixture
+def df_agg(df_long) -> pd.DataFrame:
+    return (
+        df_long.cal.timestamp_features("timestamp")
+        .groupby(["group", "vocab"])
+        .size()
+        .rename("num_events")
+        .to_frame()
+    )
+
+
+def test_agg_dataframe_extensions(df_agg) -> None:
+    assert hasattr(df_agg, "cal")
+
+    df_wide = df_agg.cal.widen("num_events")
+    assert df_wide.shape == (3, TIME_SLOTS)
+
+    with pytest.raises(ValueError):
+        df_agg.reset_index(0).cal.widen("num_events")
+
+    df_false_order = df_agg.reorder_levels([1, 0]).cal.widen("num_events")
+    assert isinstance(df_false_order, pd.DataFrame)
+    assert df_false_order.sum().sum() == 0
+
+
 @pytest.fixture
 def df_wide() -> pd.DataFrame:
     nrows = 25