From e24a3395c17d0751bab87cd7532e5e05a4706275 Mon Sep 17 00:00:00 2001 From: Shreya Keshive Date: Thu, 25 Jul 2024 16:16:38 +0000 Subject: [PATCH 1/5] Fix field name --- .../main.py | 12 ++--- .../main_test.py | 46 +++++++++---------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/cloud_functions/climateiq_spatialize_chunk_predictions_cf/main.py b/cloud_functions/climateiq_spatialize_chunk_predictions_cf/main.py index 991a237..f01babf 100644 --- a/cloud_functions/climateiq_spatialize_chunk_predictions_cf/main.py +++ b/cloud_functions/climateiq_spatialize_chunk_predictions_cf/main.py @@ -70,7 +70,7 @@ def spatialize_chunk_predictions(cloud_event: http.CloudEvent) -> None: except ValueError as ve: # Any raised ValueErrors are non-retriable so return instead of throwing an # exception (which would trigger retries) - print(ve) + print(f"Error for {object_name}: {ve}") return storage_client = gcs_client.Client() @@ -178,12 +178,12 @@ def _get_study_area_metadata( not study_area_metadata or "cell_size" not in study_area_metadata or "crs" not in study_area_metadata - or "row_count" not in study_area_metadata - or "col_count" not in study_area_metadata + or "chunk_x_count" not in study_area_metadata + or "chunk_y_count" not in study_area_metadata ): raise ValueError( f'Study area "{study_area_name}" is missing one or more required ' - "fields: cell_size, crs, row_count, col_count" + "fields: cell_size, crs, chunk_x_count, chunk_y_count" ) return study_area_metadata, chunks_ref @@ -470,8 +470,8 @@ def _aggregate_h3_predictions( if ( neighbor_x < 0 or neighbor_y < 0 - or neighbor_x >= study_area_metadata["col_count"] - or neighbor_y >= study_area_metadata["row_count"] + or neighbor_x >= study_area_metadata["chunk_x_count"] + or neighbor_y >= study_area_metadata["chunk_y_count"] ): # Chunk is outside the study area boundary. continue diff --git a/cloud_functions/climateiq_spatialize_chunk_predictions_cf/main_test.py b/cloud_functions/climateiq_spatialize_chunk_predictions_cf/main_test.py index 0e8015c..5e32d52 100644 --- a/cloud_functions/climateiq_spatialize_chunk_predictions_cf/main_test.py +++ b/cloud_functions/climateiq_spatialize_chunk_predictions_cf/main_test.py @@ -137,8 +137,8 @@ def test_spatialize_chunk_predictions_invalid_study_area( study_area_metadata: Dict[str, Any] = { "name": "study_area_name", "crs": "EPSG:32618", - "row_count": 2, - "col_count": 3, + "chunk_y_count": 2, + "chunk_x_count": 3, } # Missing "cell_size" required field chunks_metadata: List[Dict[str, Any]] = [ { @@ -164,7 +164,7 @@ def test_spatialize_chunk_predictions_invalid_study_area( assert ( 'Study area "study-area-name" is missing one or more required ' - "fields: cell_size, crs, row_count, col_count" in output.getvalue() + "fields: cell_size, crs, chunk_x_count, chunk_y_count" in output.getvalue() ) @@ -200,8 +200,8 @@ def test_spatialize_chunk_predictions_missing_chunk( "name": "study_area_name", "cell_size": 10, "crs": "EPSG:32618", - "row_count": 2, - "col_count": 3, + "chunk_y_count": 2, + "chunk_x_count": 3, } chunks_metadata: List[Dict[str, Any]] = [ { @@ -261,8 +261,8 @@ def test_spatialize_chunk_predictions_invalid_chunk( "name": "study_area_name", "cell_size": 10, "crs": "EPSG:32618", - "row_count": 2, - "col_count": 3, + "chunk_y_count": 2, + "chunk_x_count": 3, } chunks_metadata: List[Dict[str, Any]] = [ { @@ -321,8 +321,8 @@ def test_spatialize_chunk_predictions_missing_predictions( "name": "study_area_name", "cell_size": 10, "crs": "EPSG:32618", - "row_count": 2, - "col_count": 3, + "chunk_y_count": 2, + "chunk_x_count": 3, } chunks_metadata: List[Dict[str, Any]] = [ { @@ -387,8 +387,8 @@ def test_spatialize_chunk_predictions_too_many_predictions( "name": "study_area_name", "cell_size": 10, "crs": "EPSG:32618", - "row_count": 2, - "col_count": 3, + "chunk_y_count": 2, + "chunk_x_count": 3, } chunks_metadata: List[Dict[str, Any]] = [ { @@ -449,8 +449,8 @@ def test_spatialize_chunk_predictions_missing_expected_neighbor_chunk( "name": "study_area_name", "cell_size": 10, "crs": "EPSG:32618", - "row_count": 2, - "col_count": 3, + "chunk_y_count": 2, + "chunk_x_count": 3, } chunks_metadata: List[Dict[str, Any]] = [ { @@ -514,8 +514,8 @@ def test_spatialize_chunk_predictions_invalid_neighbor_chunk( "name": "study_area_name", "cell_size": 10, "crs": "EPSG:32618", - "row_count": 2, - "col_count": 3, + "chunk_y_count": 2, + "chunk_x_count": 3, } chunks_metadata: List[Dict[str, Any]] = [ { @@ -588,8 +588,8 @@ def test_spatialize_chunk_predictions_neighbor_chunk_missing_predictions( "name": "study_area_name", "cell_size": 10, "crs": "EPSG:32618", - "row_count": 2, - "col_count": 3, + "chunk_y_count": 2, + "chunk_x_count": 3, } chunks_metadata: List[Dict[str, Any]] = [ { @@ -680,8 +680,8 @@ def test_spatialize_chunk_predictions_h3_centroids_within_chunk( "name": "study_area_name", "cell_size": 10, "crs": "EPSG:32618", - "row_count": 2, - "col_count": 3, + "chunk_y_count": 2, + "chunk_x_count": 3, } chunks_metadata: List[Dict[str, Any]] = [ { @@ -788,8 +788,8 @@ def test_spatialize_chunk_predictions_h3_centroids_outside_chunk( "name": "study_area_name", "cell_size": 5, "crs": "EPSG:32618", - "row_count": 2, - "col_count": 3, + "chunk_y_count": 2, + "chunk_x_count": 3, } chunks_metadata: List[Dict[str, Any]] = [ { @@ -950,8 +950,8 @@ def test_spatialize_chunk_predictions_overlapping_neighbors( "name": "study_area_name", "cell_size": 3, "crs": "EPSG:32618", - "row_count": 2, - "col_count": 3, + "chunk_y_count": 2, + "chunk_x_count": 3, } chunks_metadata: List[Dict[str, Any]] = [ { From 167f423e2b14ed57a3bc36067b82147a82cc6cab Mon Sep 17 00:00:00 2001 From: Shreya Keshive Date: Thu, 25 Jul 2024 16:40:31 +0000 Subject: [PATCH 2/5] Reduce number of passes through spatialized_predictions --- .../main.py | 69 +++++++++++-------- 1 file changed, 42 insertions(+), 27 deletions(-) diff --git a/cloud_functions/climateiq_spatialize_chunk_predictions_cf/main.py b/cloud_functions/climateiq_spatialize_chunk_predictions_cf/main.py index f01babf..f3d377d 100644 --- a/cloud_functions/climateiq_spatialize_chunk_predictions_cf/main.py +++ b/cloud_functions/climateiq_spatialize_chunk_predictions_cf/main.py @@ -289,7 +289,7 @@ def _build_spatialized_model_predictions( ) -def _add_h3_index_details(cell: pd.Series) -> pd.Series: +def _add_h3_index_details(cell: pd.Series, chunk_boundary: Any) -> pd.Series: """Projects the cell centroid to a H3 index. Args: @@ -300,17 +300,41 @@ def _add_h3_index_details(cell: pd.Series) -> pd.Series: """ h3_index = h3.geo_to_h3(cell["lat"], cell["lon"], H3_LEVEL) centroid_lat, centroid_lon = h3.h3_to_geo(h3_index) - boundary_xy = h3.h3_to_geo_boundary(h3_index, True) + boundary_xy = geometry.Polygon(h3.h3_to_geo_boundary(h3_index, True)) + is_boundary_cell = not boundary_xy.within(chunk_boundary) + + # Filter out any rows where the projected H3 centroid falls outside of the + # chunk boundary. + if not chunk_boundary.contains(geometry.Point(centroid_lon, centroid_lat)): + h3_index = None + centroid_lat = None + centroid_lon = None + boundary_xy = None + is_boundary_cell = None + return pd.Series( { "h3_index": h3_index, "h3_centroid_lat": centroid_lat, "h3_centroid_lon": centroid_lon, - "h3_boundary": geometry.Polygon(boundary_xy), + "h3_boundary": boundary_xy, + "is_boundary_cell": is_boundary_cell, } ) +def _add_h3_index(cell: pd.Series) -> pd.Series: + """Projects the cell centroid to a H3 index. + + Args: + cell: A cell row containing the lat and lon of the cell centroid. + + Returns: + A Series containing the H3 index of the projected cell centroid. + """ + return pd.Series({"h3_index": h3.geo_to_h3(cell["lat"], cell["lon"], H3_LEVEL)}) + + def _get_chunk_boundary(study_area_metadata: dict, chunk_metadata: dict): """Calculates the boundary points of the chunk. @@ -376,29 +400,24 @@ def _calculate_h3_indexes( missing required fields. """ # Calculate H3 information for each cell. - spatialized_predictions[ - ["h3_index", "h3_centroid_lat", "h3_centroid_lon", "h3_boundary"] - ] = spatialized_predictions.apply(_add_h3_index_details, axis=1) - - # Filter out any rows where the projected H3 centroid falls outside of the - # chunk boundary. chunk_boundary = _get_chunk_boundary(study_area_metadata, chunk_metadata) - spatialized_predictions = spatialized_predictions[ - spatialized_predictions.apply( - lambda row: chunk_boundary.contains( - geometry.Point(row["h3_centroid_lon"], row["h3_centroid_lat"]) - ), - axis=1, - ) - ] + spatialized_predictions[ + [ + "h3_index", + "h3_centroid_lat", + "h3_centroid_lon", + "h3_boundary", + "is_boundary_cell", + ] + ] = spatialized_predictions.apply( + lambda row: _add_h3_index_details(row, chunk_boundary), axis=1 + ) + spatialized_predictions = spatialized_predictions.dropna(subset=["h3_index"]) # Extract rows where the projected H3 cell is not fully contained within the chunk # so we can aggregate prediction values across chunk boundaries. boundary_h3_cells = spatialized_predictions[ - spatialized_predictions.apply( - lambda row: not row["h3_boundary"].within(chunk_boundary), - axis=1, - ) + spatialized_predictions["is_boundary_cell"] ]["h3_boundary"].unique() return _aggregate_h3_predictions( @@ -524,12 +543,8 @@ def _aggregate_h3_predictions( neighbor_chunk_predictions, ) ) - # TODO: Optionally only calculate the h3_index if calculating other - # metadata is expensive - neighbor_chunk_spatialized_predictions[ - ["h3_index", "h3_centroid_lat", "h3_centroid_lon", "h3_boundary"] - ] = neighbor_chunk_spatialized_predictions.apply( - _add_h3_index_details, axis=1 + neighbor_chunk_spatialized_predictions[["h3_index"]] = ( + neighbor_chunk_spatialized_predictions.apply(_add_h3_index, axis=1) ) neighbor_chunk_spatialized_predictions = ( neighbor_chunk_spatialized_predictions[ From 23ba2b39aec4ce3c232d456d3160f59909819468 Mon Sep 17 00:00:00 2001 From: Shreya Keshive Date: Thu, 25 Jul 2024 16:55:22 +0000 Subject: [PATCH 3/5] mypy fix --- .../climateiq_spatialize_chunk_predictions_cf/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cloud_functions/climateiq_spatialize_chunk_predictions_cf/main.py b/cloud_functions/climateiq_spatialize_chunk_predictions_cf/main.py index f3d377d..6c1109d 100644 --- a/cloud_functions/climateiq_spatialize_chunk_predictions_cf/main.py +++ b/cloud_functions/climateiq_spatialize_chunk_predictions_cf/main.py @@ -310,7 +310,7 @@ def _add_h3_index_details(cell: pd.Series, chunk_boundary: Any) -> pd.Series: centroid_lat = None centroid_lon = None boundary_xy = None - is_boundary_cell = None + is_boundary_cell = False return pd.Series( { @@ -412,7 +412,7 @@ def _calculate_h3_indexes( ] = spatialized_predictions.apply( lambda row: _add_h3_index_details(row, chunk_boundary), axis=1 ) - spatialized_predictions = spatialized_predictions.dropna(subset=["h3_index"]) + spatialized_predictions = spatialized_predictions.dropna(how='any') # Extract rows where the projected H3 cell is not fully contained within the chunk # so we can aggregate prediction values across chunk boundaries. From 91985d59c1c4a8ffbe20dce02d6e235396e77953 Mon Sep 17 00:00:00 2001 From: Shreya Keshive Date: Thu, 25 Jul 2024 16:57:42 +0000 Subject: [PATCH 4/5] format --- .../climateiq_spatialize_chunk_predictions_cf/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud_functions/climateiq_spatialize_chunk_predictions_cf/main.py b/cloud_functions/climateiq_spatialize_chunk_predictions_cf/main.py index 6c1109d..d1692ef 100644 --- a/cloud_functions/climateiq_spatialize_chunk_predictions_cf/main.py +++ b/cloud_functions/climateiq_spatialize_chunk_predictions_cf/main.py @@ -412,7 +412,7 @@ def _calculate_h3_indexes( ] = spatialized_predictions.apply( lambda row: _add_h3_index_details(row, chunk_boundary), axis=1 ) - spatialized_predictions = spatialized_predictions.dropna(how='any') + spatialized_predictions = spatialized_predictions.dropna(how="any") # Extract rows where the projected H3 cell is not fully contained within the chunk # so we can aggregate prediction values across chunk boundaries. From b188337f1c4ecb925d2788fb509ea3f61e62f7f3 Mon Sep 17 00:00:00 2001 From: Shreya Keshive Date: Thu, 25 Jul 2024 17:13:16 +0000 Subject: [PATCH 5/5] update comments --- .../climateiq_spatialize_chunk_predictions_cf/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cloud_functions/climateiq_spatialize_chunk_predictions_cf/main.py b/cloud_functions/climateiq_spatialize_chunk_predictions_cf/main.py index d1692ef..981613c 100644 --- a/cloud_functions/climateiq_spatialize_chunk_predictions_cf/main.py +++ b/cloud_functions/climateiq_spatialize_chunk_predictions_cf/main.py @@ -290,10 +290,11 @@ def _build_spatialized_model_predictions( def _add_h3_index_details(cell: pd.Series, chunk_boundary: Any) -> pd.Series: - """Projects the cell centroid to a H3 index. + """Projects the cell centroid to a H3 index and adds H3 details. Args: cell: A cell row containing the lat and lon of the cell centroid. + chunk_boundary: A shapely.Polygon representing the chunk. Returns: A Series containing H3 information for the projected cell centroid.