From 51a40c93e4035a0d5b2581352e3c75100e20eeaa Mon Sep 17 00:00:00 2001
From: sylmak <163595985+sylmak@users.noreply.github.com>
Date: Mon, 5 Aug 2024 18:20:32 -0400
Subject: [PATCH] Change trigger CF to Cloud Run Job. (#48)

---
 .../Dockerfile   | 34 ++++++++++
 .../main.py      | 68 ++++++++++---------
 .../main_test.py | 37 ++--------
 3 files changed, 77 insertions(+), 62 deletions(-)
 create mode 100644 cloud_functions/climateiq_trigger_export_pipeline_cf/Dockerfile

diff --git a/cloud_functions/climateiq_trigger_export_pipeline_cf/Dockerfile b/cloud_functions/climateiq_trigger_export_pipeline_cf/Dockerfile
new file mode 100644
index 0000000..26a76fa
--- /dev/null
+++ b/cloud_functions/climateiq_trigger_export_pipeline_cf/Dockerfile
@@ -0,0 +1,34 @@
+# Copyright 2020 Google, LLC.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Use the official Python image.
+# https://hub.docker.com/_/python
+FROM python:3.12-slim
+
+# Allow statements and log messages to immediately appear in the Cloud Run logs
+ENV PYTHONUNBUFFERED True
+
+# Copy application dependency manifests to the container image.
+# Copying this separately prevents re-running pip install on every code change.
+COPY requirements.txt ./
+
+# Install production dependencies.
+RUN pip install -r requirements.txt
+
+ENV PYTHONUNBUFFERED True
+
+# Copy local code to the container image.
+ENV APP_HOME /app
+WORKDIR $APP_HOME
+COPY . ./
diff --git a/cloud_functions/climateiq_trigger_export_pipeline_cf/main.py b/cloud_functions/climateiq_trigger_export_pipeline_cf/main.py
index e9bd81a..0a49de8 100644
--- a/cloud_functions/climateiq_trigger_export_pipeline_cf/main.py
+++ b/cloud_functions/climateiq_trigger_export_pipeline_cf/main.py
@@ -2,10 +2,9 @@
 import pathlib
 import json
 import os
+import sys
 import time
 
-from cloudevents import http
-import functions_framework
 from google.cloud import tasks_v2
 from google.cloud.storage import client as gcs_client, retry
 
@@ -24,6 +23,10 @@
 SPATIALIZE_CF_QUEUE = "spatialize-chunk-predictions-queue"
 
 
+def _write_structured_log(message: str, severity: str = "INFO"):
+    print(json.dumps(dict(message=message, severity=severity)), flush=True)
+
+
 def _write_file(line: str, output_filename: str, storage_client: gcs_client.Client):
     output_blob = storage_client.bucket(OUTPUT_BUCKET_NAME).blob(output_filename)
     # Specify retry here due to bug:
@@ -31,8 +34,7 @@ def _write_file(line: str, output_filename: str, storage_client: gcs_client.Clie
     output_blob.upload_from_string(line, retry=retry.DEFAULT_RETRY)
 
 
-@functions_framework.cloud_event
-def trigger_export_pipeline(cloud_event: http.CloudEvent) -> None:
+def trigger_export_pipeline(object_name: str) -> None:
     """Triggered by writes to the "climateiq-predictions" bucket.
 
     Splits predictions into one file per chunk and kicks off
@@ -41,16 +43,13 @@ def trigger_export_pipeline(cloud_event: http.CloudEvent) -> None:
     Note: This function only runs once all output prediction files are written.
     Additionally, the climateiq_spatialize_chunk_predictions cloud function is
     only triggered once all prediction files per chunk are written since data
-    from neighboring chunks is required for spatializiation.
+    from neighboring chunks is required for spatialization.
 
     Args:
-        cloud_event: The CloudEvent representing the storage event.
+        object_name: The name of the blob written to the bucket.
     """
     start = time.time()
 
-    data = cloud_event.data
-    object_name = data["name"]
-
     # Extract components from the object name and determine the total number of
     # output prediction files.
     expected_format = (
@@ -99,6 +98,8 @@ def trigger_export_pipeline(cloud_event: http.CloudEvent) -> None:
         # written yet.
         return
 
+    _write_structured_log(f"[{object_name}] Starting process.", "DEBUG")
+
     # Split predictions into one file per chunk and output to GCS.
     output_filenames = []
     write_futures = []
@@ -117,17 +118,17 @@ def trigger_export_pipeline(cloud_event: http.CloudEvent) -> None:
         )
         write_futures.append(future)
 
-    futures.wait(write_futures)
-    print(
-        json.dumps(
-            dict(
-                severity="DEBUG",
-                message=(
-                    f"[{object_name}] Created {len(output_filenames)} files in "
-                    f"{time.time() - start} s."
-                ),
-            )
-        )
+    futures.wait(write_futures, return_when=futures.FIRST_EXCEPTION)
+    # If any exceptions were raised in a thread, calling result() will raise it here.
+    for future in write_futures:
+        future.result()
+
+    _write_structured_log(
+        (
+            f"[{object_name}] Created {len(output_filenames)} files in "
+            f"{time.time() - start} s."
+        ),
+        "DEBUG",
     )
 
     # Once all output files have been written, push tasks to Task Queue.
@@ -160,16 +161,19 @@ def trigger_export_pipeline(cloud_event: http.CloudEvent) -> None:
                 ),
             )
         )
-    futures.wait(queue_futures)
-
-    print(
-        json.dumps(
-            dict(
-                severity="DEBUG",
-                message=(
-                    f"[{object_name}] Triggered export pipeline for "
-                    f"{len(output_filenames)} chunks."
-                ),
-            )
-        )
+    futures.wait(queue_futures, return_when=futures.FIRST_EXCEPTION)
+    # If any exceptions were raised in a thread, calling result() will raise it here.
+    for future in queue_futures:
+        future.result()
+
+    _write_structured_log(
+        (
+            f"[{object_name}] Triggered export pipeline for "
+            f"{len(output_filenames)} chunks."
+        ),
+        "DEBUG",
     )
+
+
+if __name__ == "__main__":
+    trigger_export_pipeline(sys.argv[1])
diff --git a/cloud_functions/climateiq_trigger_export_pipeline_cf/main_test.py b/cloud_functions/climateiq_trigger_export_pipeline_cf/main_test.py
index 153dc3f..b9cca53 100644
--- a/cloud_functions/climateiq_trigger_export_pipeline_cf/main_test.py
+++ b/cloud_functions/climateiq_trigger_export_pipeline_cf/main_test.py
@@ -1,43 +1,20 @@
 import json
 from unittest import mock
 
-from cloudevents import http
 from google.cloud import storage, tasks_v2
 from google.cloud.storage import blob, client as gcs_client
 import pytest
 
 import main
 
 
-def _create_pubsub_event() -> http.CloudEvent:
-    attributes = {
-        "type": "google.cloud.storage.object.v1.finalized",
-        "source": "source",
-    }
-    data = {
-        "bucket": "climateiq-predictions",
-        "name": "id1/flood/v1.0/manhattan/extreme/prediction.results-3-of-5",
-    }
-    return http.CloudEvent(attributes, data)
-
-
 def test_trigger_export_pipeline_invalid_object_name():
-    attributes = {
-        "type": "google.cloud.storage.object.v1.finalized",
-        "source": "source",
-    }
-    data = {
-        "bucket": "climateiq-predictions",
-        "name": "invalid_name",  # Invalid object name
-    }
-    event = http.CloudEvent(attributes, data)
-
     expected_error = (
         "Invalid object name format. 
Expected format: '//" "///prediction.results-" "-of-{number_of_files_generated}'\nActual name: 'invalid_name'" ) with pytest.raises(ValueError, match=expected_error): - main.trigger_export_pipeline(event) + main.trigger_export_pipeline("invalid_name") @mock.patch.object(tasks_v2, "CloudTasksClient", autospec=True) @@ -45,8 +22,6 @@ def test_trigger_export_pipeline_invalid_object_name(): def test_trigger_export_pipeline_missing_prediction_files( mock_storage_client, mock_tasks_client ): - event = _create_pubsub_event() - # Missing predictions for chunks 2 and 4. input_blobs = [ storage.Blob( @@ -64,7 +39,9 @@ def test_trigger_export_pipeline_missing_prediction_files( ] mock_storage_client().list_blobs.return_value = input_blobs - main.trigger_export_pipeline(event) + main.trigger_export_pipeline( + "id1/flood/v1.0/manhattan/extreme/prediction.results-3-of-5" + ) mock_tasks_client().create_task.assert_not_called() @@ -72,8 +49,6 @@ def test_trigger_export_pipeline_missing_prediction_files( @mock.patch.object(tasks_v2, "CloudTasksClient", autospec=True) @mock.patch.object(gcs_client, "Client", autospec=True) def test_trigger_export_pipeline(mock_storage_client, mock_tasks_client): - event = _create_pubsub_event() - # Input blobs setup def create_mock_input_blob(name, start_chunk_id): chunk_id = (start_chunk_id - 1) * 2 + 1 @@ -108,7 +83,9 @@ def create_mock_input_blob(name, start_chunk_id): lambda name: mock_output_blobs.setdefault(name, mock.MagicMock()) ) - main.trigger_export_pipeline(event) + main.trigger_export_pipeline( + "id1/flood/v1.0/manhattan/extreme/prediction.results-3-of-5" + ) # Confirm output blobs written for i in range(1, 11):
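For reference, a minimal usage sketch (not part of the patch, and assuming GCP credentials plus the buckets and Cloud Tasks queue referenced in main.py are available): the new Cloud Run Job entrypoint takes the GCS object name as its single argument, so it can be exercised locally the same way the __main__ block runs it in the container.

    # Hypothetical local run, equivalent to `python main.py OBJECT_NAME`,
    # mirroring the if __name__ == "__main__" block added in main.py.
    import main

    main.trigger_export_pipeline(
        "id1/flood/v1.0/manhattan/extreme/prediction.results-3-of-5"
    )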