From b4d35c8d2a01477e530e609514211a088ebc3dbd Mon Sep 17 00:00:00 2001
From: Jonah Gilbert <jonahmgilbert1@gmail.com>
Date: Thu, 15 Jun 2023 11:39:47 -0500
Subject: [PATCH 1/7] Input damages validation + tests

---
 src/dscim/preprocessing/input_damages.py |  47 ++++++++
 src/dscim/preprocessing/preprocessing.py |   2 +
 tests/test_input_damages.py              | 130 +++++++++++++++++++++++
 3 files changed, 179 insertions(+)

diff --git a/src/dscim/preprocessing/input_damages.py b/src/dscim/preprocessing/input_damages.py
index 80f7b2f5..09023d35 100644
--- a/src/dscim/preprocessing/input_damages.py
+++ b/src/dscim/preprocessing/input_damages.py
@@ -133,6 +133,7 @@ def concatenate_damage_output(damage_dir, basename, save_path):
             data[v] = data[v].astype("unicode")
 
     data.to_zarr(save_path, mode="w")
+    validate_damages("energy", save_path)
 
 
 def calculate_labor_impacts(input_path, file_prefix, variable, val_type):
@@ -431,6 +432,8 @@ def process_batch(g):
         store=save_path, mode="a", consolidated=True
     )
 
+    validate_damages("agriculture", save_path)
+
 
 def read_energy_files(df, seed="TINV_clim_price014_total_energy_fulladapt-histclim"):
     """Read energy CSV files and trasnform them to Xarray objects
@@ -818,6 +821,7 @@ def prep(
         for v in data.values():
             v.close()
         damages.close()
+        validate_damages("mortality", f"{outpath}/impacts-darwin-montecarlo-damages-v{mortality_version}.zarr")
 
 
 def coastal_inputs(
@@ -853,6 +857,7 @@ def coastal_inputs(
                 consolidated=True,
                 mode="w",
             )
+            validate_damages("coastal", f"{path}/coastal_damages_{version}-{adapt_type}-{vsl_valuation}.zarr")
     else:
         print(
             "vsl_valuation is not a dimension of the input dataset, subset adapt_type only"
@@ -863,3 +868,45 @@ def coastal_inputs(
             consolidated=True,
             mode="w",
         )
+        
+
+
+def validate_damages(sector, path):
+    inputs = xr.open_zarr(path)
+    inputs.close()
+    
+    # No repeated batch labels
+    batches_expected = np.sort([ 'batch' + str(i) for i in np.arange(0,15)])
+    batches_actual = np.sort(inputs.batch.values)
+    assert np.array_equal(batches_expected, batches_actual), f"Batches in the {sector} input damages zarr are not 0-14."
+
+    # Input damages have rcp 4.5 and rcp 8.5
+    rcps_expected = np.sort([ 'rcp' + str(i) for i in [45,85]])
+    rcps_actual = np.sort(inputs.rcp.values)
+    assert np.array_equal(rcps_expected, rcps_actual), f"RCPs in the {sector} input damages zarr are not rcp45 and rcp85."
+
+    # max batches and no repeated batches
+    regions = inputs.dims['region']
+    ssps = inputs.dims['ssp']
+    if "coastal" in sector:
+        dims = ['ssp','model','slr','batch','year','region']
+        chunk_sizes = [1,1,1,15,10,regions]
+        total_sizes = [ssps,2,10,15,90,regions]
+    else:
+        dims = ['ssp', 'rcp','model','gcm','batch','year','region']
+        chunk_sizes = [1,1,1,1,15,10,regions]
+        total_sizes = [ssps,2,2,33,15,90,regions]
+
+
+    chunk_len = np.arange(0,len(chunk_sizes))
+    chunks = [(chunk_sizes[i],) * int(total_sizes[i]/chunk_sizes[i]) for i in chunk_len]
+    dims_expected = dict(zip(dims, total_sizes))
+    chunks_expected = dict(zip(dims, chunks))  
+
+
+    assert(dims_expected == dict(inputs.dims))
+    for i in list(inputs.keys()):
+        assert chunks_expected['batch'] == dict(inputs[i].chunksizes)['batch'], f"Chunksize for batches need to equal 15 for the {sector} input damages."
+        if chunks_expected != dict(inputs[i].chunksizes):
+            warnings.warn("Non fatal: chunk sizes are different from expected.")
+
diff --git a/src/dscim/preprocessing/preprocessing.py b/src/dscim/preprocessing/preprocessing.py
index a253b389..1dd3dd89 100644
--- a/src/dscim/preprocessing/preprocessing.py
+++ b/src/dscim/preprocessing/preprocessing.py
@@ -96,6 +96,8 @@ def reduce_damages(
     delta = params["delta"]
     outpath = f"{c['paths']['reduced_damages_library']}/{sector}"
 
+    validate_damages(sector, damages)
+
     with xr.open_zarr(damages, chunks=None)[histclim] as ds:
         with xr.open_zarr(socioec, chunks=None) as gdppc:
             assert (
diff --git a/tests/test_input_damages.py b/tests/test_input_damages.py
index e1278339..46779983 100644
--- a/tests/test_input_damages.py
+++ b/tests/test_input_damages.py
@@ -1,4 +1,5 @@
 import os
+import warnings
 import numpy as np
 import xarray as xr
 import pandas as pd
@@ -23,6 +24,7 @@
     calculate_energy_damages,
     prep_mortality_damages,
     coastal_inputs,
+    validate_damages,
 )
 
 logger = logging.getLogger(__name__)
@@ -1356,3 +1358,131 @@ def test_error_coastal_inputs(
         str(excinfo.value)
         == "vsl_valuation is a coordinate in the input dataset but is set to None. Please provide a value for vsl_valuation by which to subset the input dataset."
     )
+
+def create_dummy_input_zarr(path, sector, file_type):
+    # Create dummy input data
+    batch_values = np.sort(['batch' + str(i) for i in np.arange(0, 15)])
+    rcp_values = np.sort(['rcp' + str(i) for i in [45, 85]])
+    ssp_values = np.arange(0, 2)
+    model_values = np.arange(0, 2)
+    if sector == "coastal":
+        slr_values = np.arange(0, 10)
+    else:
+        slr_values = np.arange(0, 33)
+    year_values = np.arange(0, 90)
+    region_values = np.arange(0, 5)
+
+    if file_type == "wrong_rcps":
+        # Create input data with wrong rcps
+        rcp_values = np.sort(['rcp' + str(i) for i in [45, 65, 85]])
+    elif file_type == "wrong_batches":
+        # Create input data with wrong batches
+        batch_values = np.sort(['batch' + str(i) for i in np.arange(0, 14)] + ["batch1",])
+        
+    if sector == "coastal":
+        data = np.ones((len(ssp_values), len(model_values), len(slr_values), len(batch_values), len(year_values), len(region_values)))
+    else:
+        data = np.ones((len(ssp_values), len(rcp_values), len(model_values), len(slr_values), len(batch_values), len(year_values), len(region_values)))
+
+    # Create xarray dataset
+    if "coastal" in sector:
+        dims = ['ssp', 'model', 'slr', 'batch', 'year', 'region']
+        coords={
+                "ssp": (["ssp"], ssp_values),
+                "model": (["model"], model_values),
+                "slr": (["slr"], slr_values),
+                "batch": (["batch"], batch_values),
+                "year": (["year"], year_values),
+                "region": (["region"], region_values),
+            }
+        chunkies = {
+                "ssp": 1,
+                "rcp": 1,
+                "model": 1,
+                "gcm": 1,
+                "batch": -1 if file_type != "wrong_chunk_sizes" else 5,
+                "year": 10,
+                "region": 5,
+            }
+    else:
+        dims = ['ssp', 'rcp', 'model', 'gcm', 'batch', 'year', 'region']
+        coords={
+                "ssp": (["ssp"], ssp_values),
+                "rcp": (["rcp"], rcp_values),
+                "model": (["model"], model_values),
+                "gcm": (["gcm"], slr_values),
+                "batch": (["batch"], batch_values),
+                "year": (["year"], year_values),
+                "region": (["region"], region_values),
+            }
+        chunkies = {
+                "ssp": 1,
+                "rcp": 1,
+                "model": 1,
+                "gcm": 1,
+                "batch": -1 if file_type != "wrong_chunk_sizes" else 5,
+                "year": 10,
+                "region": 5,
+            }
+
+    
+    ds = xr.Dataset(
+            {
+                "data": (
+                    dims,
+                    data,
+                ),
+            },
+            coords = coords,
+        ).chunk(chunkies)
+
+    # Save xarray dataset as Zarr
+    ds.to_zarr(path, mode='w')
+
+
+@pytest.mark.parametrize("sector", ["mortality", "coastal"])
+def test_validate_damages_correct(tmp_path, sector):
+    path = str(tmp_path / f"damages_correct_{sector}.zarr")
+    file_type = "correct"
+    create_dummy_input_zarr(path, sector, file_type)
+    validate_damages(sector, path)  # No assertion error should be raised
+
+def test_validate_damages_incorrect_batches(tmp_path):
+    sector = "mortality"
+    path = str(tmp_path / f"damages_incorrect_batches_{sector}.zarr")
+    file_type = "wrong_batches"
+    create_dummy_input_zarr(path, sector, file_type)
+    with pytest.raises(AssertionError) as e_info:
+        validate_damages(sector, path)
+    assert str(e_info.value) == f"Batches in the {sector} input damages zarr are not 0-14."
+
+def test_validate_damages_incorrect_rcps(tmp_path):
+    sector = "mortality"
+    path = str(tmp_path / f"damages_incorrect_rcps_{sector}.zarr")
+    file_type = "wrong_rcps"
+    create_dummy_input_zarr(path, sector, file_type)
+    with pytest.raises(AssertionError) as e_info:
+        validate_damages(sector, path)
+    assert str(e_info.value) == f"RCPs in the {sector} input damages zarr are not rcp45 and rcp85."
+
+
+@pytest.mark.parametrize("sector", ["mortality", "coastal"])
+def test_validate_damages_incorrect_chunk_sizes(tmp_path, sector):
+    path = str(tmp_path / f"damages_incorrect_chunk_sizes_{sector}.zarr")
+    file_type = "wrong_chunk_sizes"
+    create_dummy_input_zarr(path, sector, file_type)
+    with pytest.raises(AssertionError) as e_info:
+        validate_damages(sector, path)
+    assert str(e_info.value) == f"Batches in the {sector} input damages zarr are not 0-14."
+
+
+
+@pytest.mark.parametrize("sector", ["mortality", "coastal"])
+def test_validate_damages_incorrect_region_chunk_sizes(tmp_path, sector):
+    path = str(tmp_path / f"damages_incorrect_region_chunk_sizes_{sector}.zarr")
+    file_type = "wrong_region_chunk_sizes"
+    create_dummy_input_zarr(path, sector, file_type)
+    with pytest.warns(UserWarning) as warnings_info:
+        validate_damages(sector, path)
+    assert len(warnings_info) == 1
+    assert str(warnings_info[0].message) == "Non fatal: chunk sizes are different from expected."

From 3400f5179a1af7189a96b8d91458f9bc9cf45fa8 Mon Sep 17 00:00:00 2001
From: Jonah Gilbert <jonahmgilbert1@gmail.com>
Date: Thu, 15 Jun 2023 11:48:43 -0500
Subject: [PATCH 2/7] black

---
 src/dscim/preprocessing/input_damages.py |  58 +++++----
 tests/test_input_damages.py              | 151 ++++++++++++++---------
 2 files changed, 127 insertions(+), 82 deletions(-)

diff --git a/src/dscim/preprocessing/input_damages.py b/src/dscim/preprocessing/input_damages.py
index 09023d35..c4459fb8 100644
--- a/src/dscim/preprocessing/input_damages.py
+++ b/src/dscim/preprocessing/input_damages.py
@@ -821,7 +821,10 @@ def prep(
         for v in data.values():
             v.close()
         damages.close()
-        validate_damages("mortality", f"{outpath}/impacts-darwin-montecarlo-damages-v{mortality_version}.zarr")
+        validate_damages(
+            "mortality",
+            f"{outpath}/impacts-darwin-montecarlo-damages-v{mortality_version}.zarr",
+        )
 
 
 def coastal_inputs(
@@ -857,7 +860,10 @@ def coastal_inputs(
                 consolidated=True,
                 mode="w",
             )
-            validate_damages("coastal", f"{path}/coastal_damages_{version}-{adapt_type}-{vsl_valuation}.zarr")
+            validate_damages(
+                "coastal",
+                f"{path}/coastal_damages_{version}-{adapt_type}-{vsl_valuation}.zarr",
+            )
     else:
         print(
             "vsl_valuation is not a dimension of the input dataset, subset adapt_type only"
@@ -868,45 +874,49 @@ def coastal_inputs(
             consolidated=True,
             mode="w",
         )
-        
 
 
 def validate_damages(sector, path):
     inputs = xr.open_zarr(path)
     inputs.close()
-    
+
     # No repeated batch labels
-    batches_expected = np.sort([ 'batch' + str(i) for i in np.arange(0,15)])
+    batches_expected = np.sort(["batch" + str(i) for i in np.arange(0, 15)])
     batches_actual = np.sort(inputs.batch.values)
-    assert np.array_equal(batches_expected, batches_actual), f"Batches in the {sector} input damages zarr are not 0-14."
+    assert np.array_equal(
+        batches_expected, batches_actual
+    ), f"Batches in the {sector} input damages zarr are not 0-14."
 
     # Input damages have rcp 4.5 and rcp 8.5
-    rcps_expected = np.sort([ 'rcp' + str(i) for i in [45,85]])
+    rcps_expected = np.sort(["rcp" + str(i) for i in [45, 85]])
     rcps_actual = np.sort(inputs.rcp.values)
-    assert np.array_equal(rcps_expected, rcps_actual), f"RCPs in the {sector} input damages zarr are not rcp45 and rcp85."
+    assert np.array_equal(
+        rcps_expected, rcps_actual
+    ), f"RCPs in the {sector} input damages zarr are not rcp45 and rcp85."
 
     # max batches and no repeated batches
-    regions = inputs.dims['region']
-    ssps = inputs.dims['ssp']
+    regions = inputs.dims["region"]
+    ssps = inputs.dims["ssp"]
     if "coastal" in sector:
-        dims = ['ssp','model','slr','batch','year','region']
-        chunk_sizes = [1,1,1,15,10,regions]
-        total_sizes = [ssps,2,10,15,90,regions]
+        dims = ["ssp", "model", "slr", "batch", "year", "region"]
+        chunk_sizes = [1, 1, 1, 15, 10, regions]
+        total_sizes = [ssps, 2, 10, 15, 90, regions]
     else:
-        dims = ['ssp', 'rcp','model','gcm','batch','year','region']
-        chunk_sizes = [1,1,1,1,15,10,regions]
-        total_sizes = [ssps,2,2,33,15,90,regions]
-
+        dims = ["ssp", "rcp", "model", "gcm", "batch", "year", "region"]
+        chunk_sizes = [1, 1, 1, 1, 15, 10, regions]
+        total_sizes = [ssps, 2, 2, 33, 15, 90, regions]
 
-    chunk_len = np.arange(0,len(chunk_sizes))
-    chunks = [(chunk_sizes[i],) * int(total_sizes[i]/chunk_sizes[i]) for i in chunk_len]
+    chunk_len = np.arange(0, len(chunk_sizes))
+    chunks = [
+        (chunk_sizes[i],) * int(total_sizes[i] / chunk_sizes[i]) for i in chunk_len
+    ]
     dims_expected = dict(zip(dims, total_sizes))
-    chunks_expected = dict(zip(dims, chunks))  
-
+    chunks_expected = dict(zip(dims, chunks))
 
-    assert(dims_expected == dict(inputs.dims))
+    assert dims_expected == dict(inputs.dims)
     for i in list(inputs.keys()):
-        assert chunks_expected['batch'] == dict(inputs[i].chunksizes)['batch'], f"Chunksize for batches need to equal 15 for the {sector} input damages."
+        assert (
+            chunks_expected["batch"] == dict(inputs[i].chunksizes)["batch"]
+        ), f"Chunksize for batches need to equal 15 for the {sector} input damages."
         if chunks_expected != dict(inputs[i].chunksizes):
             warnings.warn("Non fatal: chunk sizes are different from expected.")
-
diff --git a/tests/test_input_damages.py b/tests/test_input_damages.py
index 46779983..b2a8f419 100644
--- a/tests/test_input_damages.py
+++ b/tests/test_input_damages.py
@@ -1359,10 +1359,11 @@ def test_error_coastal_inputs(
         == "vsl_valuation is a coordinate in the input dataset but is set to None. Please provide a value for vsl_valuation by which to subset the input dataset."
     )
 
+
 def create_dummy_input_zarr(path, sector, file_type):
     # Create dummy input data
-    batch_values = np.sort(['batch' + str(i) for i in np.arange(0, 15)])
-    rcp_values = np.sort(['rcp' + str(i) for i in [45, 85]])
+    batch_values = np.sort(["batch" + str(i) for i in np.arange(0, 15)])
+    rcp_values = np.sort(["rcp" + str(i) for i in [45, 85]])
     ssp_values = np.arange(0, 2)
     model_values = np.arange(0, 2)
     if sector == "coastal":
@@ -1374,70 +1375,93 @@ def create_dummy_input_zarr(path, sector, file_type):
 
     if file_type == "wrong_rcps":
         # Create input data with wrong rcps
-        rcp_values = np.sort(['rcp' + str(i) for i in [45, 65, 85]])
+        rcp_values = np.sort(["rcp" + str(i) for i in [45, 65, 85]])
     elif file_type == "wrong_batches":
         # Create input data with wrong batches
-        batch_values = np.sort(['batch' + str(i) for i in np.arange(0, 14)] + ["batch1",])
-        
+        batch_values = np.sort(
+            ["batch" + str(i) for i in np.arange(0, 14)]
+            + [
+                "batch1",
+            ]
+        )
+
     if sector == "coastal":
-        data = np.ones((len(ssp_values), len(model_values), len(slr_values), len(batch_values), len(year_values), len(region_values)))
+        data = np.ones(
+            (
+                len(ssp_values),
+                len(model_values),
+                len(slr_values),
+                len(batch_values),
+                len(year_values),
+                len(region_values),
+            )
+        )
     else:
-        data = np.ones((len(ssp_values), len(rcp_values), len(model_values), len(slr_values), len(batch_values), len(year_values), len(region_values)))
+        data = np.ones(
+            (
+                len(ssp_values),
+                len(rcp_values),
+                len(model_values),
+                len(slr_values),
+                len(batch_values),
+                len(year_values),
+                len(region_values),
+            )
+        )
 
     # Create xarray dataset
     if "coastal" in sector:
-        dims = ['ssp', 'model', 'slr', 'batch', 'year', 'region']
-        coords={
-                "ssp": (["ssp"], ssp_values),
-                "model": (["model"], model_values),
-                "slr": (["slr"], slr_values),
-                "batch": (["batch"], batch_values),
-                "year": (["year"], year_values),
-                "region": (["region"], region_values),
-            }
+        dims = ["ssp", "model", "slr", "batch", "year", "region"]
+        coords = {
+            "ssp": (["ssp"], ssp_values),
+            "model": (["model"], model_values),
+            "slr": (["slr"], slr_values),
+            "batch": (["batch"], batch_values),
+            "year": (["year"], year_values),
+            "region": (["region"], region_values),
+        }
         chunkies = {
-                "ssp": 1,
-                "rcp": 1,
-                "model": 1,
-                "gcm": 1,
-                "batch": -1 if file_type != "wrong_chunk_sizes" else 5,
-                "year": 10,
-                "region": 5,
-            }
+            "ssp": 1,
+            "rcp": 1,
+            "model": 1,
+            "gcm": 1,
+            "batch": -1 if file_type != "wrong_chunk_sizes" else 5,
+            "year": 10,
+            "region": 5,
+        }
     else:
-        dims = ['ssp', 'rcp', 'model', 'gcm', 'batch', 'year', 'region']
-        coords={
-                "ssp": (["ssp"], ssp_values),
-                "rcp": (["rcp"], rcp_values),
-                "model": (["model"], model_values),
-                "gcm": (["gcm"], slr_values),
-                "batch": (["batch"], batch_values),
-                "year": (["year"], year_values),
-                "region": (["region"], region_values),
-            }
+        dims = ["ssp", "rcp", "model", "gcm", "batch", "year", "region"]
+        coords = {
+            "ssp": (["ssp"], ssp_values),
+            "rcp": (["rcp"], rcp_values),
+            "model": (["model"], model_values),
+            "gcm": (["gcm"], slr_values),
+            "batch": (["batch"], batch_values),
+            "year": (["year"], year_values),
+            "region": (["region"], region_values),
+        }
         chunkies = {
-                "ssp": 1,
-                "rcp": 1,
-                "model": 1,
-                "gcm": 1,
-                "batch": -1 if file_type != "wrong_chunk_sizes" else 5,
-                "year": 10,
-                "region": 5,
-            }
-
-    
+            "ssp": 1,
+            "rcp": 1,
+            "model": 1,
+            "gcm": 1,
+            "batch": -1 if file_type != "wrong_chunk_sizes" else 5,
+            "year": 10,
+            "region": 5,
+        }
+
     ds = xr.Dataset(
-            {
-                "data": (
-                    dims,
-                    data,
-                ),
-            },
-            coords = coords,
-        ).chunk(chunkies)
+        {
+            "data": (
+                dims,
+                data,
+            ),
+        },
+        coords=coords,
+    ).chunk(chunkies)
 
     # Save xarray dataset as Zarr
-    ds.to_zarr(path, mode='w')
+    ds.to_zarr(path, mode="w")
 
 
 @pytest.mark.parametrize("sector", ["mortality", "coastal"])
@@ -1447,6 +1471,7 @@ def test_validate_damages_correct(tmp_path, sector):
     create_dummy_input_zarr(path, sector, file_type)
     validate_damages(sector, path)  # No assertion error should be raised
 
+
 def test_validate_damages_incorrect_batches(tmp_path):
     sector = "mortality"
     path = str(tmp_path / f"damages_incorrect_batches_{sector}.zarr")
@@ -1454,7 +1479,10 @@ def test_validate_damages_incorrect_batches(tmp_path):
     create_dummy_input_zarr(path, sector, file_type)
     with pytest.raises(AssertionError) as e_info:
         validate_damages(sector, path)
-    assert str(e_info.value) == f"Batches in the {sector} input damages zarr are not 0-14."
+    assert (
+        str(e_info.value) == f"Batches in the {sector} input damages zarr are not 0-14."
+    )
+
 
 def test_validate_damages_incorrect_rcps(tmp_path):
     sector = "mortality"
@@ -1463,7 +1491,10 @@ def test_validate_damages_incorrect_rcps(tmp_path):
     create_dummy_input_zarr(path, sector, file_type)
     with pytest.raises(AssertionError) as e_info:
         validate_damages(sector, path)
-    assert str(e_info.value) == f"RCPs in the {sector} input damages zarr are not rcp45 and rcp85."
+    assert (
+        str(e_info.value)
+        == f"RCPs in the {sector} input damages zarr are not rcp45 and rcp85."
+    )
 
 
 @pytest.mark.parametrize("sector", ["mortality", "coastal"])
@@ -1473,8 +1504,9 @@ def test_validate_damages_incorrect_chunk_sizes(tmp_path, sector):
     create_dummy_input_zarr(path, sector, file_type)
     with pytest.raises(AssertionError) as e_info:
         validate_damages(sector, path)
-    assert str(e_info.value) == f"Batches in the {sector} input damages zarr are not 0-14."
-
+    assert (
+        str(e_info.value) == f"Batches in the {sector} input damages zarr are not 0-14."
+    )
 
 
 @pytest.mark.parametrize("sector", ["mortality", "coastal"])
@@ -1485,4 +1517,7 @@ def test_validate_damages_incorrect_region_chunk_sizes(tmp_path, sector):
     with pytest.warns(UserWarning) as warnings_info:
         validate_damages(sector, path)
     assert len(warnings_info) == 1
-    assert str(warnings_info[0].message) == "Non fatal: chunk sizes are different from expected."
+    assert (
+        str(warnings_info[0].message)
+        == "Non fatal: chunk sizes are different from expected."
+    )

From 84e3cdbede73b6315e83ed25aaf474ba04e51beb Mon Sep 17 00:00:00 2001
From: Jonah Gilbert <jonahmgilbert1@gmail.com>
Date: Thu, 15 Jun 2023 11:52:21 -0500
Subject: [PATCH 3/7] Add missing import

---
 src/dscim/preprocessing/preprocessing.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/dscim/preprocessing/preprocessing.py b/src/dscim/preprocessing/preprocessing.py
index 1dd3dd89..94165f9c 100644
--- a/src/dscim/preprocessing/preprocessing.py
+++ b/src/dscim/preprocessing/preprocessing.py
@@ -11,6 +11,7 @@
 import xarray as xr
 from dask.distributed import Client, progress
 from dscim.utils.functions import ce_func, mean_func
+from dscim.preprocessing.input_damages import validate_damages 
 import yaml
 import time
 import argparse

From 5cd68f22cd73494de804308a983a315881bf95ef Mon Sep 17 00:00:00 2001
From: Jonah Gilbert <jonahmgilbert1@gmail.com>
Date: Thu, 15 Jun 2023 13:11:48 -0500
Subject: [PATCH 4/7] Attempt to monkeypatch validation function

---
 CHANGELOG.md                             |  4 ++++
 src/dscim/preprocessing/input_damages.py | 11 +++++-----
 src/dscim/preprocessing/preprocessing.py |  2 +-
 tests/test_input_damages.py              | 27 +++++++++++++++++-------
 tests/test_preprocessing.py              |  3 +++
 5 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 18dc91ac..f8ce06a3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## Unreleased
+### Added
+- Function to validate input damages. ([PR #85](https://github.com/ClimateImpactLab/dscim/pull/85), [@JMGilbert](https://github.com/JMGilbert))
+
 ## [0.4.0] - Unreleased
 ### Added
 - Functions to concatenate input damages across batches. ([PR #83](https://github.com/ClimateImpactLab/dscim/pull/83), [@davidrzhdu](https://github.com/davidrzhdu))
diff --git a/src/dscim/preprocessing/input_damages.py b/src/dscim/preprocessing/input_damages.py
index c4459fb8..7314458d 100644
--- a/src/dscim/preprocessing/input_damages.py
+++ b/src/dscim/preprocessing/input_damages.py
@@ -888,11 +888,12 @@ def validate_damages(sector, path):
     ), f"Batches in the {sector} input damages zarr are not 0-14."
 
     # Input damages have rcp 4.5 and rcp 8.5
-    rcps_expected = np.sort(["rcp" + str(i) for i in [45, 85]])
-    rcps_actual = np.sort(inputs.rcp.values)
-    assert np.array_equal(
-        rcps_expected, rcps_actual
-    ), f"RCPs in the {sector} input damages zarr are not rcp45 and rcp85."
+    if "coastal" not in sector:
+        rcps_expected = np.sort(["rcp" + str(i) for i in [45, 85]])
+        rcps_actual = np.sort(inputs.rcp.values)
+        assert np.array_equal(
+            rcps_expected, rcps_actual
+        ), f"RCPs in the {sector} input damages zarr are not rcp45 and rcp85."
 
     # max batches and no repeated batches
     regions = inputs.dims["region"]
diff --git a/src/dscim/preprocessing/preprocessing.py b/src/dscim/preprocessing/preprocessing.py
index 94165f9c..4ff89c86 100644
--- a/src/dscim/preprocessing/preprocessing.py
+++ b/src/dscim/preprocessing/preprocessing.py
@@ -11,7 +11,7 @@
 import xarray as xr
 from dask.distributed import Client, progress
 from dscim.utils.functions import ce_func, mean_func
-from dscim.preprocessing.input_damages import validate_damages 
+from dscim.preprocessing.input_damages import validate_damages
 import yaml
 import time
 import argparse
diff --git a/tests/test_input_damages.py b/tests/test_input_damages.py
index b2a8f419..8a648ce5 100644
--- a/tests/test_input_damages.py
+++ b/tests/test_input_damages.py
@@ -79,6 +79,8 @@ def test_concatenate_damage_output(tmp_path):
     """
     Test that concatenate_damage_output correctly concatenates damages across batches and saves to a single zarr file
     """
+    monkeypatch.setattr(dscim.preprocessing.input_damages, "validate_damages", 1 + 1)
+
     d = os.path.join(tmp_path, "concatenate_in")
     if not os.path.exists(d):
         os.makedirs(d)
@@ -435,6 +437,9 @@ def test_compute_ag_damages(
     """
     Test that compute_ag_damages correctly reshapes ag estimate runs for use in integration system and saves to zarr file
     """
+
+    monkeypatch.setattr(dscim.preprocessing.input_damages, "validate_damages", 1 + 1)
+
     rcp = ["rcp45", "rcp85"]
     gcm = ["ACCESS1-0", "GFDL-CM3"]
     model = ["low", "high"]
@@ -1006,6 +1011,9 @@ def test_prep_mortality_damages(
     """
     Test that prep_mortality_damages correctly reshapes different versions of mortality estimate runs for use in integration system and saves to zarr file
     """
+
+    monkeypatch.setattr(dscim.preprocessing.input_damages, "validate_damages", 1 + 1)
+
     for b in ["6", "9"]:
         ds_in = xr.Dataset(
             {
@@ -1157,6 +1165,9 @@ def test_coastal_inputs(
     """
     Test that coastal_inputs correctly reshapes different versions of coastal results for use in integration system and saves to zarr file (v0.21 and v0.22 have exactly the same structure, so testing either one should be sufficient)
     """
+
+    monkeypatch.setattr(dscim.preprocessing.input_damages, "validate_damages", 1 + 1)
+
     if version_test == "v0.21":
         ds_in = xr.Dataset(
             {
@@ -1371,7 +1382,7 @@ def create_dummy_input_zarr(path, sector, file_type):
     else:
         slr_values = np.arange(0, 33)
     year_values = np.arange(0, 90)
-    region_values = np.arange(0, 5)
+    region_values = np.arange(0, 6)
 
     if file_type == "wrong_rcps":
         # Create input data with wrong rcps
@@ -1422,12 +1433,11 @@ def create_dummy_input_zarr(path, sector, file_type):
         }
         chunkies = {
             "ssp": 1,
-            "rcp": 1,
             "model": 1,
-            "gcm": 1,
-            "batch": -1 if file_type != "wrong_chunk_sizes" else 5,
+            "slr": 1,
+            "batch": 5 if file_type == "wrong_chunk_sizes" else -1,
             "year": 10,
-            "region": 5,
+            "region": 3 if file_type == "wrong_region_chunk_sizes" else 6,
         }
     else:
         dims = ["ssp", "rcp", "model", "gcm", "batch", "year", "region"]
@@ -1445,9 +1455,9 @@ def create_dummy_input_zarr(path, sector, file_type):
             "rcp": 1,
             "model": 1,
             "gcm": 1,
-            "batch": -1 if file_type != "wrong_chunk_sizes" else 5,
+            "batch": 5 if file_type == "wrong_chunk_sizes" else -1,
             "year": 10,
-            "region": 5,
+            "region": 3 if file_type == "wrong_region_chunk_sizes" else 6,
         }
 
     ds = xr.Dataset(
@@ -1505,7 +1515,8 @@ def test_validate_damages_incorrect_chunk_sizes(tmp_path, sector):
     with pytest.raises(AssertionError) as e_info:
         validate_damages(sector, path)
     assert (
-        str(e_info.value) == f"Batches in the {sector} input damages zarr are not 0-14."
+        str(e_info.value)
+        == f"Chunksize for batches need to equal 15 for the {sector} input damages."
     )
 
 
diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py
index e17acc36..aadccfa0 100644
--- a/tests/test_preprocessing.py
+++ b/tests/test_preprocessing.py
@@ -292,6 +292,9 @@ def test_reduce_damages(tmp_path, recipe, eta):
     """
     Test that reduce_damages returns a Zarr file with damages reduced according to the expected file structure
     """
+
+    monkeypatch.setattr(dscim.preprocessing.input_damages, "validate_damages", 1 + 1)
+
     d = tmp_path / "reduction"
     d.mkdir()
     dummy_sector1_dir = d / "dummy_sector1"

From b33343bc4be6f03ed6335bfd93e8bf850666c0d2 Mon Sep 17 00:00:00 2001
From: Jonah Gilbert <jonahmgilbert1@gmail.com>
Date: Thu, 15 Jun 2023 13:16:11 -0500
Subject: [PATCH 5/7] Fix monkeypatch

---
 tests/test_input_damages.py | 5 ++++-
 tests/test_preprocessing.py | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/test_input_damages.py b/tests/test_input_damages.py
index 8a648ce5..3bc5a634 100644
--- a/tests/test_input_damages.py
+++ b/tests/test_input_damages.py
@@ -75,7 +75,7 @@ def test_parse_projection_filesys(tmp_path):
     pd.testing.assert_frame_equal(df_out_expected, df_out_actual)
 
 
-def test_concatenate_damage_output(tmp_path):
+def test_concatenate_damage_output(tmp_path, monkeypatch):
     """
     Test that concatenate_damage_output correctly concatenates damages across batches and saves to a single zarr file
     """
@@ -433,6 +433,7 @@ def test_calculate_labor_damages(
 def test_compute_ag_damages(
     tmp_path,
     econvars_fixture,
+    monkeypatch,
 ):
     """
     Test that compute_ag_damages correctly reshapes ag estimate runs for use in integration system and saves to zarr file
@@ -1007,6 +1008,7 @@ def test_prep_mortality_damages(
     tmp_path,
     version_test,
     econvars_fixture,
+    monkeypatch,
 ):
     """
     Test that prep_mortality_damages correctly reshapes different versions of mortality estimate runs for use in integration system and saves to zarr file
@@ -1161,6 +1163,7 @@ def test_error_prep_mortality_damages(tmp_path):
 def test_coastal_inputs(
     tmp_path,
     version_test,
+    monkeypatch,
 ):
     """
     Test that coastal_inputs correctly reshapes different versions of coastal results for use in integration system and saves to zarr file (v0.21 and v0.22 have exactly the same structure, so testing either one should be sufficient)
diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py
index aadccfa0..3ac9a1db 100644
--- a/tests/test_preprocessing.py
+++ b/tests/test_preprocessing.py
@@ -288,7 +288,7 @@ def test_reduce_damages_error_eta():
         ("risk_aversion", 10),
     ],
 )
-def test_reduce_damages(tmp_path, recipe, eta):
+def test_reduce_damages(tmp_path, recipe, eta, monkeypatch):
     """
     Test that reduce_damages returns a Zarr file with damages reduced according to the expected file structure
     """

From 3bbdfd1e31e1470a4ff1984cfa5e17f42b24a45a Mon Sep 17 00:00:00 2001
From: Jonah Gilbert <jonahmgilbert1@gmail.com>
Date: Thu, 15 Jun 2023 13:22:17 -0500
Subject: [PATCH 6/7] Fix flake8

---
 tests/test_input_damages.py | 9 +++++----
 tests/test_preprocessing.py | 3 ++-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/tests/test_input_damages.py b/tests/test_input_damages.py
index 3bc5a634..04a5d848 100644
--- a/tests/test_input_damages.py
+++ b/tests/test_input_damages.py
@@ -26,6 +26,7 @@
     coastal_inputs,
     validate_damages,
 )
+import dscim
 
 logger = logging.getLogger(__name__)
 
@@ -79,7 +80,7 @@ def test_concatenate_damage_output(tmp_path, monkeypatch):
     """
     Test that concatenate_damage_output correctly concatenates damages across batches and saves to a single zarr file
     """
-    monkeypatch.setattr(dscim.preprocessing.input_damages, "validate_damages", 1 + 1)
+    monkeypatch.setattr(dscim.preprocessing.input_damages, "validate_damages", True)
 
     d = os.path.join(tmp_path, "concatenate_in")
     if not os.path.exists(d):
@@ -439,7 +440,7 @@ def test_compute_ag_damages(
     Test that compute_ag_damages correctly reshapes ag estimate runs for use in integration system and saves to zarr file
     """
 
-    monkeypatch.setattr(dscim.preprocessing.input_damages, "validate_damages", 1 + 1)
+    monkeypatch.setattr(dscim.preprocessing.input_damages, "validate_damages", True)
 
     rcp = ["rcp45", "rcp85"]
     gcm = ["ACCESS1-0", "GFDL-CM3"]
@@ -1014,7 +1015,7 @@ def test_prep_mortality_damages(
     Test that prep_mortality_damages correctly reshapes different versions of mortality estimate runs for use in integration system and saves to zarr file
     """
 
-    monkeypatch.setattr(dscim.preprocessing.input_damages, "validate_damages", 1 + 1)
+    monkeypatch.setattr(dscim.preprocessing.input_damages, "validate_damages", True)
 
     for b in ["6", "9"]:
         ds_in = xr.Dataset(
@@ -1169,7 +1170,7 @@ def test_coastal_inputs(
     Test that coastal_inputs correctly reshapes different versions of coastal results for use in integration system and saves to zarr file (v0.21 and v0.22 have exactly the same structure, so testing either one should be sufficient)
     """
 
-    monkeypatch.setattr(dscim.preprocessing.input_damages, "validate_damages", 1 + 1)
+    monkeypatch.setattr(dscim.preprocessing.input_damages, "validate_damages", True)
 
     if version_test == "v0.21":
         ds_in = xr.Dataset(
diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py
index 3ac9a1db..3fabaa36 100644
--- a/tests/test_preprocessing.py
+++ b/tests/test_preprocessing.py
@@ -12,6 +12,7 @@
     reduce_damages,
     ce_from_chunk,
 )
+import dscim
 from pathlib import Path
 import yaml
 
@@ -293,7 +294,7 @@ def test_reduce_damages(tmp_path, recipe, eta, monkeypatch):
     Test that reduce_damages returns a Zarr file with damages reduced according to the expected file structure
     """
 
-    monkeypatch.setattr(dscim.preprocessing.input_damages, "validate_damages", 1 + 1)
+    monkeypatch.setattr(dscim.preprocessing.input_damages, "validate_damages", True)
 
     d = tmp_path / "reduction"
     d.mkdir()

From 172a518d0f05b8b423bd3f738f9a42e4c3315721 Mon Sep 17 00:00:00 2001
From: Jonah Gilbert <jonahmgilbert1@gmail.com>
Date: Thu, 15 Jun 2023 14:23:30 -0500
Subject: [PATCH 7/7] Fix monkeypatch

---
 tests/test_input_damages.py | 16 ++++++++++++----
 tests/test_preprocessing.py | 12 +++++++++---
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/tests/test_input_damages.py b/tests/test_input_damages.py
index 04a5d848..51c8b1a6 100644
--- a/tests/test_input_damages.py
+++ b/tests/test_input_damages.py
@@ -80,7 +80,9 @@ def test_concatenate_damage_output(tmp_path, monkeypatch):
     """
     Test that concatenate_damage_output correctly concatenates damages across batches and saves to a single zarr file
     """
-    monkeypatch.setattr(dscim.preprocessing.input_damages, "validate_damages", True)
+    monkeypatch.setattr(
+        "dscim.preprocessing.input_damages.validate_damages", lambda *args: True
+    )
 
     d = os.path.join(tmp_path, "concatenate_in")
     if not os.path.exists(d):
@@ -440,7 +442,9 @@ def test_compute_ag_damages(
     Test that compute_ag_damages correctly reshapes ag estimate runs for use in integration system and saves to zarr file
     """
 
-    monkeypatch.setattr(dscim.preprocessing.input_damages, "validate_damages", True)
+    monkeypatch.setattr(
+        "dscim.preprocessing.input_damages.validate_damages", lambda *args: True
+    )
 
     rcp = ["rcp45", "rcp85"]
     gcm = ["ACCESS1-0", "GFDL-CM3"]
@@ -1015,7 +1019,9 @@ def test_prep_mortality_damages(
     Test that prep_mortality_damages correctly reshapes different versions of mortality estimate runs for use in integration system and saves to zarr file
     """
 
-    monkeypatch.setattr(dscim.preprocessing.input_damages, "validate_damages", True)
+    monkeypatch.setattr(
+        "dscim.preprocessing.input_damages.validate_damages", lambda *args: True
+    )
 
     for b in ["6", "9"]:
         ds_in = xr.Dataset(
@@ -1170,7 +1176,9 @@ def test_coastal_inputs(
     Test that coastal_inputs correctly reshapes different versions of coastal results for use in integration system and saves to zarr file (v0.21 and v0.22 have exactly the same structure, so testing either one should be sufficient)
     """
 
-    monkeypatch.setattr(dscim.preprocessing.input_damages, "validate_damages", True)
+    monkeypatch.setattr(
+        "dscim.preprocessing.input_damages.validate_damages", lambda *args: True
+    )
 
     if version_test == "v0.21":
         ds_in = xr.Dataset(
diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py
index 3fabaa36..1f30b5a7 100644
--- a/tests/test_preprocessing.py
+++ b/tests/test_preprocessing.py
@@ -12,7 +12,6 @@
     reduce_damages,
     ce_from_chunk,
 )
-import dscim
 from pathlib import Path
 import yaml
 
@@ -294,7 +293,9 @@ def test_reduce_damages(tmp_path, recipe, eta, monkeypatch):
     Test that reduce_damages returns a Zarr file with damages reduced according to the expected file structure
     """
 
-    monkeypatch.setattr(dscim.preprocessing.input_damages, "validate_damages", True)
+    monkeypatch.setattr(
+        "dscim.preprocessing.preprocessing.validate_damages", lambda *args: True
+    )
 
     d = tmp_path / "reduction"
     d.mkdir()
@@ -427,10 +428,15 @@ def test_reduce_damages(tmp_path, recipe, eta, monkeypatch):
     )
 
 
-def test_reduce_damages_batchsize_error(tmp_path):
+def test_reduce_damages_batchsize_error(tmp_path, monkeypatch):
     """
     Test that reduce_damages with batchsize not equal to 15 returns an error
     """
+
+    monkeypatch.setattr(
+        "dscim.preprocessing.preprocessing.validate_damages", lambda *args: True
+    )
+
     d = tmp_path / "reduction"
     d.mkdir()
     dummy_sector1_dir = d / "dummy_sector1"