Skip to content

Commit

Permalink
Merge pull request #13 from wd60622/expose-df-timestamp-and-widen
Browse files Browse the repository at this point in the history
methods on dataframe
  • Loading branch information
wd60622 authored Sep 12, 2023
2 parents 397658b + f7f27b1 commit e4d4380
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 7 deletions.
76 changes: 72 additions & 4 deletions latent_calendar/extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,8 @@
)
from latent_calendar.transformers import (
create_raw_to_vocab_transformer,
CalandarTimestampFeatures,
create_timestamp_feature_pipeline,
LongToWide,
)


Expand All @@ -112,11 +113,18 @@ class SeriesAccessor:
def __init__(self, pandas_obj: pd.Series):
self._obj = pandas_obj

def timestamp_features(self) -> pd.DataFrame:
def timestamp_features(
self, discretize: bool = True, minutes: int = 60, create_vocab: bool = True
) -> pd.DataFrame:
"""Create day of week and proportion into day columns.
Exposed as a method on Series for convenience.
Args:
discretize: Whether to discretize the hour column.
minutes: The number of minutes to discretize by. Ignored if `discretize` is False.
create_vocab: Whether to create the vocab column.
Returns:
DataFrame with features
Expand Down Expand Up @@ -148,7 +156,12 @@ def timestamp_features(self) -> pd.DataFrame:
"""
name = self._obj.name or "timestamp"
transformer = CalandarTimestampFeatures(timestamp_col=name)
transformer = create_timestamp_feature_pipeline(
timestamp_col=name,
discretize=discretize,
minutes=minutes,
create_vocab=create_vocab,
)

return transformer.fit_transform(self._obj.rename(name).to_frame())

Expand Down Expand Up @@ -257,13 +270,68 @@ def normalize(self, kind: str) -> pd.DataFrame:

raise ValueError(f"kind must be one of ['max', 'probs'], got {kind}")

def timestamp_features(
    self,
    column: str,
    discretize: bool = True,
    minutes: int = 60,
    create_vocab: bool = True,
) -> pd.DataFrame:
    """Create day of week and proportion into day columns for event level DataFrame.

    Exposed as a method on DataFrame for convenience. Use `cal.aggregate_events`
    instead to create the wide format DataFrame.

    Args:
        column: The name of the timestamp column.
        discretize: Whether to discretize the hour column.
        minutes: The number of minutes to discretize by. Ignored if `discretize` is False.
        create_vocab: Whether to create the vocab column.

    Returns:
        DataFrame with features added

    Raises:
        ValueError: if `create_vocab` is True while `discretize` is False
            (raised by `create_timestamp_feature_pipeline`).

    """
    # All feature options are forwarded unchanged to the pipeline factory.
    pipeline_options = {
        "timestamp_col": column,
        "discretize": discretize,
        "create_vocab": create_vocab,
        "minutes": minutes,
    }
    feature_pipeline = create_timestamp_feature_pipeline(**pipeline_options)

    return feature_pipeline.fit_transform(self._obj)

def widen(
    self, column: str, as_int: bool = True, minutes: int = 60
) -> pd.DataFrame:
    """Transform an aggregated DataFrame to wide calendar format.

    Wrapper around `LongToWide` transformer to transform to wide format.

    Args:
        column: column to widen
        as_int: whether to cast the column to int
        minutes: number of minutes, forwarded to `LongToWide`
            (presumably the width of each time slot -- TODO confirm)

    Returns:
        DataFrame in wide format

    Raises:
        ValueError: if the index of the DataFrame is not a `pd.MultiIndex`.

    """
    frame = self._obj

    # Only the index type is validated here; the order of the levels is not checked.
    has_multi_index = isinstance(frame.index, pd.MultiIndex)
    if not has_multi_index:
        raise ValueError(
            "DataFrame is expected to have a MultiIndex with the last column as the vocab."
        )

    long_to_wide = LongToWide(col=column, as_int=as_int, minutes=minutes)
    return long_to_wide.fit_transform(frame)

def aggregate_events(
self,
by: Union[str, List[str]],
timestamp_col: str,
minutes: int = 60,
) -> pd.DataFrame:
"""Transform DataFrame to wide format with groups as index.
"""Transform event level DataFrame to wide format with groups as index.
Wrapper around `create_raw_to_vocab_transformer` to transform to wide format.
Expand Down
13 changes: 11 additions & 2 deletions latent_calendar/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,14 +150,16 @@ def get_feature_names_out(self, input_features=None):

def create_timestamp_feature_pipeline(
timestamp_col: str,
discretize: bool = True,
minutes: int = 60,
create_vocab: bool = True,
) -> Pipeline:
"""Create a pipeline that creates features from the timestamp column.
Args:
timestamp_col: The name of the timestamp column.
minutes: The number of minutes to discretize by.
discretize: Whether to discretize the hour column.
minutes: The number of minutes to discretize by. Ignored if discretize is False.
create_vocab: Whether to create the vocab column.
Returns:
Expand All @@ -177,15 +179,22 @@ def create_timestamp_feature_pipeline(
```
"""
if create_vocab and not discretize:
raise ValueError("Cannot create vocab without discretizing.")

vocab_col = "hour"
transformers = [
(
"timestamp_features",
CalandarTimestampFeatures(timestamp_col=timestamp_col),
),
("binning", HourDiscretizer(col=vocab_col, minutes=minutes)),
]

if discretize:
transformers.append(
("binning", HourDiscretizer(col=vocab_col, minutes=minutes))
)

if create_vocab:
transformers.append(
("vocab_creation", VocabTransformer(hour_col=vocab_col)),
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "latent-calendar"
version = "0.0.11"
version = "0.0.12"
description = "Analyzing and modeling weekly calendar distributions using latent components"
authors = ["Will Dean <[email protected]>"]
readme = "README.md"
Expand Down
25 changes: 25 additions & 0 deletions tests/test_extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,31 @@ def test_long_dataframe_extensions(df_long) -> None:
df_long.cal.plot_across_column("timestamp", "group")


@pytest.fixture
def df_agg(df_long) -> pd.DataFrame:
    """Long-format event counts indexed by (group, vocab) in a `num_events` column."""
    events = df_long.cal.timestamp_features("timestamp")
    # One row per (group, vocab) pair holding the number of events in that pair.
    counts = events.groupby(["group", "vocab"]).size()
    return counts.rename("num_events").to_frame()


def test_agg_dataframe_extensions(df_agg) -> None:
    """Exercise `cal.widen` on an aggregated (group, vocab) DataFrame."""
    assert hasattr(df_agg, "cal")

    # Happy path: one row per group, one column per time slot.
    wide = df_agg.cal.widen("num_events")
    assert wide.shape == (3, TIME_SLOTS)

    # Dropping one index level leaves a plain Index, which `widen` rejects.
    single_level = df_agg.reset_index(0)
    with pytest.raises(ValueError):
        single_level.cal.widen("num_events")

    # Swapping the level order does not raise; the result is a frame of zeros.
    swapped = df_agg.reorder_levels([1, 0])
    widened_swapped = swapped.cal.widen("num_events")
    assert isinstance(widened_swapped, pd.DataFrame)
    assert widened_swapped.sum().sum() == 0


@pytest.fixture
def df_wide() -> pd.DataFrame:
nrows = 25
Expand Down

0 comments on commit e4d4380

Please sign in to comment.