Skip to content

Commit

Permalink
test: Further testing for the parser helpers
Browse files Browse the repository at this point in the history
  • Loading branch information
pesap committed Oct 23, 2024
1 parent fe667d7 commit bc9e9b1
Show file tree
Hide file tree
Showing 2 changed files with 161 additions and 22 deletions.
74 changes: 53 additions & 21 deletions src/r2x/parser/parser_helpers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Set of helper functions for parsers."""
# ruff: noqa

from datetime import timedelta
from typing import Any
import polars as pl
import numpy as np
Expand Down Expand Up @@ -94,13 +95,21 @@ def handle_leap_year_adjustment(data_file: pl.DataFrame) -> pl.DataFrame:
Examples
--------
>>> df = pl.DataFrame({"date": ["2020-02-28"], "value": [1]})
>>> handle_leap_year_adjustment(df)
"""
feb_28 = data_file.slice(1392, 24)
before_feb_29 = data_file.slice(0, 1416)
after_feb_29 = data_file.slice(1416, len(data_file) - 1440)
return pl.concat([before_feb_29, feb_28, after_feb_29])
if len(data_file) != 8760:
raise ValueError("Data must contain 8760 rows for a non-leap year.")

# Get the index positions for February 28th
feb_28_start_index = 1392 # Start of February 28th (hour 0)
feb_28_end_index = 1416 # End of February 28th (hour 24)

# Slice February 28th data
feb_28_data = data_file[feb_28_start_index:feb_28_end_index]

# Append a copy of February 28th to the end of the frame to stand in for February 29th
adjusted_data = pl.concat([data_file, feb_28_data])

return adjusted_data


def fill_missing_timestamps(data_file: pl.DataFrame, hourly_time_index: pl.DataFrame) -> pl.DataFrame:
Expand All @@ -118,24 +127,47 @@ def fill_missing_timestamps(data_file: pl.DataFrame, hourly_time_index: pl.DataF
pl.DataFrame
DataFrame with missing timestamps filled.
Raises
------
ValueError
If the required columns are missing from the data_file.
Examples
--------
>>> df = pl.DataFrame({"year": [2020], "month": [2], "day": [28], "hour": [0], "value": [1]})
>>> hourly_index = pl.DataFrame({"datetime": pl.date_range("2020-01-01", "2020-12-31", freq="1H")})
>>> fill_missing_timestamps(df, hourly_index)
>>> import polars as pl
>>> from datetime import datetime
>>> df = pl.DataFrame(
... {"year": [2020, 2020], "month": [1, 1], "day": [1, 1], "hour": [0, 1], "value": [1, 2]}
... )
>>> hourly_time_index = pl.datetime_range(
... datetime(2020, 1, 1), datetime(2020, 1, 2), interval="1h", eager=True, closed="left"
... ).to_frame("datetime")
>>> fill_missing_timestamps(df, hourly_time_index)
"""
if "hour" in data_file.columns:
data_file = data_file.with_columns(
pl.datetime(pl.col("year"), pl.col("month"), pl.col("day"), pl.col("hour"))
)
if "day" not in data_file.columns:
data_file = data_file.with_columns(pl.datetime(pl.col("year"), pl.col("month"), 1)) # First day
else:
data_file = data_file.with_columns(pl.datetime(pl.col("year"), pl.col("month"), pl.col("day")))

upsample_data = hourly_time_index.join(data_file, on="datetime", how="left")
upsample_data = upsample_data.fill_null(strategy="forward")
return upsample_data
# Match case based on available columns
match data_file.columns:
# Case when "year", "month", "day", and "hour" are all present
case ["year", "month", "day", "hour", *_]:
data_file = data_file.with_columns(
pl.datetime(pl.col("year"), pl.col("month"), pl.col("day"), pl.col("hour"))
)
upsample_data = hourly_time_index.join(data_file, on="datetime", how="left")
return upsample_data.fill_null(strategy="forward")

# Case when "year", "month", and "day" are present but "hour" is missing
case ["year", "month", "day", *_]:
data_file = data_file.with_columns(pl.datetime(pl.col("year"), pl.col("month"), pl.col("day")))
upsample_data = hourly_time_index.join(data_file, on="datetime", how="left")
return upsample_data.fill_null(strategy="forward")

# Case when "day" is missing, but "year" and "month" are present
case ["year", "month", *_] if "day" not in data_file.columns:
data_file = data_file.with_columns(pl.datetime(pl.col("year"), pl.col("month"), 1)) # First day
upsample_data = hourly_time_index.join(data_file, on="datetime", how="left")
return upsample_data.fill_null(strategy="forward")

case _:
raise ValueError("The data_file must have at least 'year' and 'month' columns.")


def resample_data_to_hourly(data_file: pl.DataFrame) -> pl.DataFrame:
Expand Down
109 changes: 108 additions & 1 deletion tests/test_parser_helper.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
import pytest
import polars as pl
from datetime import datetime

from r2x.parser.parser_helpers import field_filter, prepare_ext_field, resample_data_to_hourly
from r2x.parser.parser_helpers import (
field_filter,
fill_missing_timestamps,
prepare_ext_field,
reconcile_timeseries,
resample_data_to_hourly,
)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -98,3 +105,103 @@ def test_resample_data_to_hourly():
# Check the result length and values
assert len(result_2) == 2 # Expecting 2 hourly values
assert result_2["value"].to_list() == [1.0, 3.0] # Expected filled values


@pytest.fixture
def hourly_leap_year():
    """Return a one-column ``datetime`` frame holding every hour of 2020 (a leap year)."""
    first_hour = datetime(2020, 1, 1)
    next_year = datetime(2021, 1, 1)
    # closed="left" keeps the start and drops the end, so only 2020 timestamps remain.
    hours = pl.datetime_range(first_hour, next_year, interval="1h", closed="left", eager=True)
    return hours.to_frame("datetime")


@pytest.fixture
def hourly_non_leap_year():
    """Return a one-column ``datetime`` frame holding every hour of 2021 (a non-leap year)."""
    span = (datetime(2021, 1, 1), datetime(2022, 1, 1))
    # The end bound is excluded (closed="left"), so the range stops at the last hour of 2021.
    index = pl.datetime_range(*span, interval="1h", eager=True, closed="left")
    return index.to_frame("datetime")


def test_fill_missing_timestamps():
    """Check that missing hours are joined in from the index and forward filled.

    Also checks that a frame lacking the ``month`` column is rejected with
    ``ValueError``.
    """
    year = 2020
    data_file = pl.DataFrame(
        {
            "year": [year, year],
            "month": [1, 1],
            "day": [1, 1],
            "hour": [0, 1],  # Only hours 0 and 1 are present; hours 2-23 are missing
            "value": [1, 3],
        }
    )

    # One full day of hourly timestamps (end bound excluded)
    hourly_time_index = pl.datetime_range(
        datetime(year, 1, 1), datetime(year, 1, 2), interval="1h", eager=True, closed="left"
    ).to_frame("datetime")

    result = fill_missing_timestamps(data_file, hourly_time_index)

    # One row per hour of the day
    assert len(result) == 24
    # Hours 0 and 1 keep their own values; every later hour repeats the last known value
    assert result["value"].to_list() == [1] + [3] * 23

    # "year" alone is not enough to build a timestamp
    incomplete = pl.DataFrame({"year": [2020], "value": [1]})
    with pytest.raises(ValueError):
        fill_missing_timestamps(incomplete, hourly_time_index)


def test_reconcile_timeseries_non_leap_year(hourly_non_leap_year, hourly_leap_year):
    """Leap-year data reconciled against a non-leap index must lose its Feb 29 rows."""
    stamp = pl.col("datetime")
    n_hours = hourly_leap_year.height
    # Split each timestamp into calendar columns and tag every row with its position
    data_file = hourly_leap_year.with_columns(
        year=stamp.dt.year(),
        month=stamp.dt.month(),
        day=stamp.dt.day(),
        hour=stamp.dt.hour(),
        value=pl.arange(0, n_hours),
    )

    result = reconcile_timeseries(data_file, hourly_non_leap_year)

    # Rows 1416-1439 are Feb 29, so the surviving values skip that range
    kept_values = [*range(1416), *range(1440, 8784)]
    assert result.height == 8760
    assert (result["value"] == kept_values).all()


def test_reconcile_timeseries_leap_year(hourly_non_leap_year, hourly_leap_year):
    """Non-leap-length data reconciled against a leap-year index must grow to 8784 rows.

    Only the resulting row count is checked. The calendar columns below are
    placeholders (every row says Feb 28, and ``hour`` simply counts rows), not
    real dates. ``hourly_non_leap_year`` is requested but not used here.
    """
    n_hours = 8760  # Length of a non-leap year in hours
    data_file = pl.DataFrame(
        {
            "year": [2021] * n_hours,
            "month": [2] * n_hours,
            "day": [28] * n_hours,
            "hour": list(range(n_hours)),
            "value": list(range(n_hours)),
        }
    )

    # The fixture already provides the 2020 hourly index (8784 hours), so reuse it
    result = reconcile_timeseries(data_file, hourly_leap_year)

    # 24 extra rows must have been added for Feb 29
    assert result.height == 8784


def test_reconcile_timeseries_raises_assertion():
    """An empty hourly index must make ``reconcile_timeseries`` raise ``AssertionError``."""
    # Any non-empty data will do; the empty index is what triggers the failure
    single_row = pl.DataFrame({"year": [2021], "month": [2], "day": [28], "hour": [0], "value": [1]})
    empty_index = pl.DataFrame()

    with pytest.raises(AssertionError):
        reconcile_timeseries(single_row, empty_index)

0 comments on commit bc9e9b1

Please sign in to comment.