test: Further testing for the parser helpers

NREL · Oct 23, 2024 · bc9e9b1 · bc9e9b1
1 parent fe667d7
commit bc9e9b1
Show file tree

Hide file tree

Showing 2 changed files with 161 additions and 22 deletions.
diff --git a/src/r2x/parser/parser_helpers.py b/src/r2x/parser/parser_helpers.py
@@ -1,6 +1,7 @@
 """Set of helper functions for parsers."""
 # ruff: noqa
 
+from datetime import timedelta
 from typing import Any
 import polars as pl
 import numpy as np
@@ -94,13 +95,21 @@ def handle_leap_year_adjustment(data_file: pl.DataFrame) -> pl.DataFrame:
 
     Examples
     --------
-    >>> df = pl.DataFrame({"date": ["2020-02-28"], "value": [1]})
-    >>> handle_leap_year_adjustment(df)
     """
-    feb_28 = data_file.slice(1392, 24)
-    before_feb_29 = data_file.slice(0, 1416)
-    after_feb_29 = data_file.slice(1416, len(data_file) - 1440)
-    return pl.concat([before_feb_29, feb_28, after_feb_29])
+    if len(data_file) != 8760:
+        raise ValueError("Data must contain 8760 rows for a non-leap year.")
+
+    # Get the index positions for February 28th
+    feb_28_start_index = 1392  # Start of February 28th (hour 0)
+    feb_28_end_index = 1416  # End of February 28th (hour 24)
+
+    # Slice February 28th data
+    feb_28_data = data_file[feb_28_start_index:feb_28_end_index]
+
+    # Append a copy of the 24 February 28th rows to the end of the frame to stand in for February 29th
+    adjusted_data = pl.concat([data_file, feb_28_data])
+
+    return adjusted_data
 
 
 def fill_missing_timestamps(data_file: pl.DataFrame, hourly_time_index: pl.DataFrame) -> pl.DataFrame:
@@ -118,24 +127,47 @@ def fill_missing_timestamps(data_file: pl.DataFrame, hourly_time_index: pl.DataF
     pl.DataFrame
         DataFrame with missing timestamps filled.
 
+    Raises
+    ------
+    ValueError
+        If the required columns are missing from the data_file.
+
     Examples
     --------
-    >>> df = pl.DataFrame({"year": [2020], "month": [2], "day": [28], "hour": [0], "value": [1]})
-    >>> hourly_index = pl.DataFrame({"datetime": pl.date_range("2020-01-01", "2020-12-31", freq="1H")})
-    >>> fill_missing_timestamps(df, hourly_index)
+    >>> import polars as pl
+    >>> from datetime import datetime
+    >>> df = pl.DataFrame(
+    ...     {"year": [2020, 2020], "month": [1, 1], "day": [1, 1], "hour": [0, 1], "value": [1, 2]}
+    ... )
+    >>> hourly_time_index = pl.datetime_range(
+    ...     datetime(2020, 1, 1), datetime(2020, 1, 2), interval="1h", eager=True, closed="left"
+    ... ).to_frame("datetime")
+    >>> fill_missing_timestamps(df, hourly_time_index)
     """
-    if "hour" in data_file.columns:
-        data_file = data_file.with_columns(
-            pl.datetime(pl.col("year"), pl.col("month"), pl.col("day"), pl.col("hour"))
-        )
-    if "day" not in data_file.columns:
-        data_file = data_file.with_columns(pl.datetime(pl.col("year"), pl.col("month"), 1))  # First day
-    else:
-        data_file = data_file.with_columns(pl.datetime(pl.col("year"), pl.col("month"), pl.col("day")))
-
-    upsample_data = hourly_time_index.join(data_file, on="datetime", how="left")
-    upsample_data = upsample_data.fill_null(strategy="forward")
-    return upsample_data
+    # Match case based on available columns
+    match data_file.columns:
+        # Case when the columns start with "year", "month", "day", "hour" in exactly that order
+        case ["year", "month", "day", "hour", *_]:
+            data_file = data_file.with_columns(
+                pl.datetime(pl.col("year"), pl.col("month"), pl.col("day"), pl.col("hour"))
+            )
+            upsample_data = hourly_time_index.join(data_file, on="datetime", how="left")
+            return upsample_data.fill_null(strategy="forward")
+
+        # Case when the columns start with "year", "month", "day" but "hour" is not the fourth column
+        case ["year", "month", "day", *_]:
+            data_file = data_file.with_columns(pl.datetime(pl.col("year"), pl.col("month"), pl.col("day")))
+            upsample_data = hourly_time_index.join(data_file, on="datetime", how="left")
+            return upsample_data.fill_null(strategy="forward")
+
+        # Case when the columns start with "year", "month" and there is no "day" column at all
+        case ["year", "month", *_] if "day" not in data_file.columns:
+            data_file = data_file.with_columns(pl.datetime(pl.col("year"), pl.col("month"), 1))  # First day
+            upsample_data = hourly_time_index.join(data_file, on="datetime", how="left")
+            return upsample_data.fill_null(strategy="forward")
+
+        case _:
+            raise ValueError("The data_file must have at least 'year' and 'month' columns.")
 
 
 def resample_data_to_hourly(data_file: pl.DataFrame) -> pl.DataFrame:

diff --git a/tests/test_parser_helper.py b/tests/test_parser_helper.py
@@ -1,7 +1,14 @@
 import pytest
 import polars as pl
+from datetime import datetime
 
-from r2x.parser.parser_helpers import field_filter, prepare_ext_field, resample_data_to_hourly
+from r2x.parser.parser_helpers import (
+    field_filter,
+    fill_missing_timestamps,
+    prepare_ext_field,
+    reconcile_timeseries,
+    resample_data_to_hourly,
+)
 
 
 @pytest.mark.parametrize(
@@ -98,3 +105,103 @@ def test_resample_data_to_hourly():
     # Check the result length and values
     assert len(result_2) == 2  # Expecting 2 hourly values
     assert result_2["value"].to_list() == [1.0, 3.0]  # Expected filled values
+
+
+@pytest.fixture
+def hourly_leap_year():
+    year = 2020
+    return pl.datetime_range(
+        datetime(year, 1, 1), datetime(year + 1, 1, 1), interval="1h", eager=True, closed="left"
+    ).to_frame("datetime")
+
+
+@pytest.fixture
+def hourly_non_leap_year():
+    year = 2021
+    return pl.datetime_range(
+        datetime(year, 1, 1), datetime(year + 1, 1, 1), interval="1h", eager=True, closed="left"
+    ).to_frame("datetime")
+
+
+def test_fill_missing_timestamps():
+    """Test filling missing timestamps and forward filling nulls."""
+    year = 2020
+    data_file = pl.DataFrame(
+        {
+            "year": [2020, 2020],
+            "month": [1, 1],
+            "day": [1, 1],
+            "hour": [0, 1],  # Only hours 0 and 1 are present; hours 2-23 are missing
+            "value": [1, 3],
+        }
+    )
+
+    hourly_time_index = pl.datetime_range(
+        datetime(year, 1, 1), datetime(year, 1, 2), interval="1h", eager=True, closed="left"
+    ).to_frame("datetime")
+
+    # Call the function
+    result = fill_missing_timestamps(data_file, hourly_time_index)
+
+    # Assert that the result contains 24 rows (for each hour of the day)
+    assert len(result) == 24
+
+    data_file = pl.DataFrame({"year": [2020], "value": [1]})
+    with pytest.raises(ValueError):
+        _ = fill_missing_timestamps(data_file, hourly_time_index)
+
+
+def test_reconcile_timeseries_non_leap_year(hourly_non_leap_year, hourly_leap_year):
+    # Extract year, month, day, and hour from the datetime column
+    data_file = hourly_leap_year.with_columns(
+        [
+            pl.col("datetime").dt.year().alias("year"),
+            pl.col("datetime").dt.month().alias("month"),
+            pl.col("datetime").dt.day().alias("day"),
+            pl.col("datetime").dt.hour().alias("hour"),
+            (pl.arange(0, hourly_leap_year.height)).alias("value"),  # Sequential values
+        ]
+    )
+
+    # Adjust data
+    result = reconcile_timeseries(data_file, hourly_non_leap_year)
+
+    # Expected result should drop the Feb 29 rows (indices 1416 through 1439)
+    assert result.height == 8760
+    assert (result["value"] == list(range(1416)) + list(range(1440, 8784))).all()
+
+
+def test_reconcile_timeseries_leap_year(hourly_non_leap_year, hourly_leap_year):
+    # Data file with non-leap year length (8760 hours), leap year hourly_time_index
+    data_file = pl.DataFrame(
+        {
+            "year": [2021] * 8760,
+            "month": [2] * 8760,
+            "day": [28] * 8760,
+            "hour": list(range(8760)),
+            "value": list(range(8760)),
+        }
+    )
+
+    # Leap year hourly_time_index (8784 hours)
+    hourly_time_index = pl.datetime_range(
+        datetime(2020, 1, 1), datetime(2021, 1, 1), interval="1h", eager=True, closed="left"
+    ).to_frame("datetime")
+
+    # Adjust data
+    result = reconcile_timeseries(data_file, hourly_time_index)
+
+    # Check that the result has added Feb 29th data
+    assert result.height == 8784
+
+
+def test_reconcile_timeseries_raises_assertion():
+    # Empty hourly_time_index
+    hourly_time_index = pl.DataFrame()
+
+    # Data file with arbitrary data
+    data_file = pl.DataFrame({"year": [2021], "month": [2], "day": [28], "hour": [0], "value": [1]})
+
+    # Check that AssertionError is raised
+    with pytest.raises(AssertionError):
+        reconcile_timeseries(data_file, hourly_time_index)