diff --git a/tools/README.md b/tools/README.md
index d40691456..9e7d81f38 100644
--- a/tools/README.md
+++ b/tools/README.md
@@ -1,7 +1,7 @@
 # Tools
 
-This folder contains single use scripts which have been used to assist in running argo workflows.
-The scripts are stored in this folder if it is thought they may become useful again in the future.
+This folder contains single-use scripts which have been used to assist in various Argo tasks.
+Scripts should be stored in this folder if they may be useful again in the future.
 
-## generate-argo-commands-imagery.py
+## generate-argo-cli-commands.py
 
@@ -9,8 +9,40 @@ The scripts are stored in this folder if it is thought they may become useful ag
 **Related Jira Tickets:** [TDE-632](https://toitutewhenua.atlassian.net/jira/software/c/projects/TDE/boards/768/backlog?atlOrigin=eyJpIjoiNjVkNmMyNmNmNGJlNDIzOGI2YmIyMzViNzVkNDUwZjEiLCJwIjoiaiJ9); [TDE-631](https://toitutewhenua.atlassian.net/browse/TDE-631?atlOrigin=eyJpIjoiNDI5OGE5MGY5ZmUxNGUyNzkwZjdlYTcxOTg5ZmQ0MGUiLCJwIjoiaiJ9)
 
-**Description:** This script was generated to allow for the processing of numerous imagery datasets using the argo cli.
+**Description:**
+This script generates the parameter files and `argo submit` commands needed to process numerous imagery datasets with the Argo CLI.
 
-**Additional Resources/links:**
+**Setup:**
 
-- [CSV](https://linzsrm.sharepoint.com/:x:/r/sites/Topography/_layouts/15/Doc.aspx?sourcedoc=%7B508567E2-EF88-458B-9115-0FC719CAA540%7D&file=imagery-standardising-parameters-bulk-process.xlsx&action=default&mobileredirect=true)
+Download the [parameters csv](https://linzsrm.sharepoint.com/:x:/r/sites/Topography/_layouts/15/Doc.aspx?sourcedoc=%7B508567E2-EF88-458B-9115-0FC719CAA540%7D&file=imagery-standardising-parameters-bulk-process.xlsx&action=default&mobileredirect=true) from SharePoint and store it as `imagery-standardising-parameters-bulk-process.csv` in `./tools/`.
+  _nb: you may have to convert the file from xlsx to csv first; this can be done with one of the many [online](https://cloudconvert.com/xlsx-to-csv) converters, or locally as in the sketch below._
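+
+A minimal local-conversion sketch (assuming `pandas` and `openpyxl` are installed; the file names match the ones used above):
+
+```python
+import pandas as pd
+
+# read the first sheet of the downloaded spreadsheet and write it back out as csv
+df = pd.read_excel("imagery-standardising-parameters-bulk-process.xlsx", sheet_name=0)
+df.to_csv("imagery-standardising-parameters-bulk-process.csv", index=False)
+```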
+
+**Instructions:**
+
+1. Update the `SOURCE` variable in `generate-argo-cli-commands.py`
+2. Run:
+
+```bash
+cd ./tools
+python3 generate-argo-cli-commands.py > log.txt
+```
+
+**Output:**
+
+- **region-year-scale.yaml:** workflow parameters, one file per dataset (see the example below)
+- **standardise-publish.sh:** bash script to 'deploy' argo workflows
+- **standardise-publish-import.sh:** bash script to 'deploy' argo workflows that also require a basemaps import
+- **log.txt:** important logs about skipped datasets, captured from the command above
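+
+For reference, a generated parameters file looks roughly like this (hypothetical values; the keys mirror what the script assembles from each CSV row):
+
+```yaml
+"source": "s3://linz-raster-data-store/hypothetical-region/"
+"target": "s3://linz-imagery/hypothetical-region/hypothetical-region_2022_0.3m/rgb/2193/"
+"title": "Hypothetical Region 0.3m Urban Aerial Photos (2022)"
+"description": "Orthophotography within the Hypothetical Region taken in 2022."
+"start-datetime": "2022-01-01"
+"end-datetime": "2022-06-30"
+"licensor": "Hypothetical Regional Council"
+"licensor-list": ""
+"producer": "Aerial Surveys"
+"scale": "500"
+```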
+
+**Submitting:**
+
+`standardise-publish.sh` is ready to go as generated; just run:
+
+```bash
+sh standardise-publish.sh
+```
+
+If created, `standardise-publish-import.sh` requires you to first uncomment some lines in `standardising-publish-import.yaml`, then run:
+
+```bash
+sh standardise-publish-import.sh
+```
diff --git a/tools/generate-argo-cli-commands.py b/tools/generate-argo-cli-commands.py
index c4f33cb5f..1807bed3e 100644
--- a/tools/generate-argo-cli-commands.py
+++ b/tools/generate-argo-cli-commands.py
@@ -1,14 +1,16 @@
 import csv
-from typing import List, Optional
+from typing import Dict, List, Optional
 
 import yaml
 from linz_logger import get_log
 
-# nb: CHANGE if working from a different source
-# SOURCE = "s3://linz-data-lake-raster-prod/"
+# #######################################
+# USER PARAMETERS:
 SOURCE = "s3://linz-raster-data-store/"
-
 PARAMETERS_CSV = "./imagery-standardising-parameters-bulk-process.csv"
+# #######################################
+
+# read in enums from workflow template
 with open("../workflows/imagery/standardising.yaml", "r") as f:
     workflow = yaml.load(f, Loader=yaml.loader.SafeLoader)
     for parameter in workflow["spec"]["arguments"]["parameters"]:
@@ -19,9 +21,6 @@
         if parameter["name"] == "scale":
             SCALES = parameter["enum"]
 
-spi_list = []
-sp_list = []
-
 
 def _format_date(date: str) -> str:
     fd_lst = date.split("/")
@@ -57,137 +56,159 @@ def _validate_licensor(licensor: str) -> Optional[str]:
     return None
 
 
-def _validate_producer(producer: str) -> Optional[str]:
+def _add_licensor(row: List[str], index: Dict[str, int]) -> Dict[str, str]:
+    licensor = _validate_licensor(row[index["licensor"]])
+    if not licensor:
+        get_log().warning(
+            "skipped: invalid licensor",
+            licensor=row[index["licensor"]],
+            source=row[index["source"]],
+            title=row[index["title"]],
+        )
+        return {}
+    elif ";" in licensor:
+        return {"licensor-list": licensor, "licensor": ""}
+    else:
+        return {"licensor": licensor, "licensor-list": ""}
+
+
+def _get_valid_producer(producer: str) -> Dict[str, str]:
     if producer in PRODUCERS:
-        return producer
+        return {"producer": producer}
     elif producer == "NZ Aerial Mapping Ltd":
-        return "NZ Aerial Mapping"
+        return {"producer": "NZ Aerial Mapping"}
     elif producer == "Aerial Surveys Ltd" or producer == "Aerial Surveys Limited":
-        return "Aerial Surveys"
+        return {"producer": "Aerial Surveys"}
     elif producer == "AAM NZ Limited":
-        return "AAM NZ"
+        return {"producer": "AAM NZ"}
     elif producer == "Landpro Ltd":
-        return "Landpro"
+        return {"producer": "Landpro"}
     elif producer == "UAV Mapping NZ Ltd":
-        return "UAV Mapping NZ"
-    return None
+        return {"producer": "UAV Mapping NZ"}
+    return {}
 
 
-def _validate_scale(scale: str) -> Optional[str]:
+def _get_valid_scale(scale: str) -> Dict[str, str]:
     if scale in SCALES:
-        return scale
-    return None
+        return {"scale": scale}
+    return {}
+
+
+def _index_csv(header: List[str]) -> Dict[str, int]:
+    ind = {}
+    ind["comment"] = header.index("Comment")
+    ind["source"] = header.index("source")
+    ind["target"] = header.index("target")
+    ind["scale"] = header.index("scale")
+    ind["title"] = header.index("Title")
+    ind["licensor"] = header.index("licensor(s)")
+    ind["producer"] = header.index("producer(s)")
+    ind["description"] = header.index("description")
+    ind["startdate"] = header.index("start_datetime")
+    ind["enddate"] = header.index("end_datetime")
+    ind["basemaps"] = header.index("basemaps s3 path")
+    return ind
+
+
+def _add_bm_params(target: str, row: List[str], index: Dict[str, int]) -> Dict[str, str]:
+    get_log().info(
+        "basemaps import required",
+        source=row[index["source"]],
+        title=row[index["title"]],
+    )
+    return {
+        "category": "Urban Aerial Photos",
+        "name": target.rstrip("/rgb/2193/").split("/")[-1],
+        "tile-matrix": "NZTM2000Quad/WebMercatorQuad",
+        "blend": "20",
+        "aligned-level": "6",
+        "create-pull-request": "true",
+    }
+
+
+def _validate_params(params: Dict[str, str], row: List[str], index: Dict[str, int]) -> bool:
+    # the helpers above return {} on invalid input, so check for missing keys
+    if "licensor" not in params:
+        # _add_licensor has already logged the warning
+        return False
+    if "scale" not in params:
+        get_log().warning(
+            "skipped: invalid scale",
+            scale=row[index["scale"]],
+            source=row[index["source"]],
+            title=row[index["title"]],
+        )
+        return False
+    if "producer" not in params:
+        get_log().warning(
+            "skipped: invalid producer",
+            producer=row[index["producer"]],
+            source=row[index["source"]],
+            title=row[index["title"]],
+        )
+        return False
+    return True
+
+
+def _write_params(params: Dict[str, str], file: str) -> None:
+    with open(f"./{file}.yaml", "w", encoding="utf-8") as output:
+        yaml.dump(
+            params,
+            output,
+            default_flow_style=False,
+            default_style='"',
+            sort_keys=False,
+            allow_unicode=True,
+            width=1000,
+        )
+
 
 def main() -> None:
+    spi_list = []
+    sp_list = []
+
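+    # {0} is the parameter file passed via -f, {1} completes the --generate-name
+    # prefix; both are filled with the formatted dataset name, giving e.g.
+    # (hypothetical) "ispi-hypothetical-region-2022-0-3m-"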
+    command = "argo submit ~/dev/topo-workflows/workflows/imagery/standardising-publish-import.yaml -n argo -f ./{0}.yaml --generate-name ispi-{1}-\n"
+
     with open(PARAMETERS_CSV, "r") as csv_file:
         reader = csv.reader(csv_file)
         header = next(reader)
-
-        ind_comment = header.index("Comment")
-        ind_source = header.index("source")
-        ind_target = header.index("target")
-        ind_scale = header.index("scale")
-        ind_title = header.index("Title")
-        ind_licensor = header.index("licensor(s)")
-        ind_producer = header.index("producer(s)")
-        ind_description = header.index("description")
-        ind_startdate = header.index("start_datetime")
-        ind_enddate = header.index("end_datetime")
-        ind_basemaps = header.index("basemaps s3 path")
-
-        command = "argo submit ~/dev/topo-workflows/workflows/imagery/standardising-publish-import.yaml -n argo -f ./{0}.yaml --generate-name ispi-{1}-\n"
+        index = _index_csv(header)
 
         for row in reader:
-            if not row[ind_source].startswith(SOURCE):
+            if not row[index["source"]].startswith(SOURCE):
                 continue
 
-            if row[ind_comment] != "":
+            if row[index["comment"]] != "":
                 get_log().warning(
                     "skipped: comment",
-                    comment=row[ind_comment],
-                    source=row[ind_source],
-                    title=row[ind_title],
+                    comment=row[index["comment"]],
+                    source=row[index["source"]],
+                    title=row[index["title"]],
                 )
                 continue
 
+            # nb: rstrip strips a trailing *set of characters*, not a literal suffix
+            file_name = row[index["target"]].rstrip("/rgb/2193/").split("/")[-1]
+            formatted_file_name = file_name.replace("_", "-").replace(".", "-")
+
             params = {
-                "source": row[ind_source].rstrip("/") + "/",
-                "target": row[ind_target],
-                "scale": _validate_scale(row[ind_scale]),
-                "title": row[ind_title],
-                "description": row[ind_description],
-                "producer": _validate_producer(row[ind_producer]),
-                "start-datetime": _format_date(row[ind_startdate]),
-                "end-datetime": _format_date(row[ind_enddate]),
+                "source": row[index["source"]].rstrip("/") + "/",
+                "target": row[index["target"]],
+                "title": row[index["title"]],
+                "description": row[index["description"]],
+                "start-datetime": _format_date(row[index["startdate"]]),
+                "end-datetime": _format_date(row[index["enddate"]]),
             }
 
-            licensor = _validate_licensor(row[ind_licensor])
-            if licensor and ";" in licensor:
-                params["licensor-list"] = licensor
-                params["licensor"] = ""
-            else:
-                params["licensor"] = licensor
-                params["licensor-list"] = ""
+            params = {**params, **_add_licensor(row, index)}
+            params = {**params, **_get_valid_producer(row[index["producer"]])}
+            params = {**params, **_get_valid_scale(row[index["scale"]])}
 
-            if not params["licensor"] and params["licensor-list"] == "":
-                get_log().warning(
-                    "skipped: invalid licensor",
-                    licensor=row[ind_licensor],
-                    source=row[ind_source],
-                    title=row[ind_title],
-                )
-                continue
-
-            if not params["producer"]:
-                get_log().warning(
-                    "skipped: invalid producer",
-                    producer=row[ind_producer],
-                    source=row[ind_source],
-                    title=row[ind_title],
-                )
+            if not _validate_params(params, row, index):
                 continue
 
-            if not params["scale"]:
-                get_log().warning(
-                    "skipped: invalid scale",
-                    scale=f"{row[ind_scale]}",
-                    source=row[ind_source],
-                    title=row[ind_title],
-                )
-                continue
-
-            file_name = row[ind_target].rstrip("/rgb/2193/").split("/")[-1]
-            formatted_file_name = file_name.replace("_", "-").replace(".", "-")
-
-            if row[ind_basemaps] == "":
-                get_log().info(
-                    "basemaps import required",
-                    source=row[ind_source],
-                    title=row[ind_title],
-                )
-                bm_params = {
-                    "category": "Urban Aerial Photos",
-                    "name": params["target"].rstrip("/rgb/2193/").split("/")[-1],
-                    "tile-matrix": "NZTM2000Quad/WebMercatorQuad",
-                    "blend": "20",
-                    "aligned-level": "6",
-                    "create-pull-request": "true"
-                }
-                params = {**params, **bm_params}
+            if row[index["basemaps"]] == "":
+                params = {**params, **_add_bm_params(params["target"], row, index)}
                 spi_list.append(command.format(formatted_file_name, formatted_file_name))
             else:
                 sp_list.append(command.format(formatted_file_name, formatted_file_name))
 
-            with open(f"./{formatted_file_name}.yaml", "w", encoding="utf-8") as output:
-                yaml.dump(
-                    params,
-                    output,
-                    default_flow_style=False,
-                    default_style='"',
-                    sort_keys=False,
-                    allow_unicode=True,
-                    width=1000,
-                )
+            _write_params(params, formatted_file_name)
 
     with open("standardise-publish.sh", "w") as script:
         script.write("#!/bin/bash\n\n")
@@ -197,4 +218,5 @@ def main() -> None:
         script.write("#!/bin/bash\n\n")
         script.writelines(spi_list)
 
+
 main()
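+
+# A generated standardise-publish.sh ends up as one submit command per dataset,
+# e.g. (hypothetical dataset name):
+#
+#   #!/bin/bash
+#
+#   argo submit ~/dev/topo-workflows/workflows/imagery/standardising-publish-import.yaml -n argo -f ./hypothetical-region-2022-0-3m.yaml --generate-name ispi-hypothetical-region-2022-0-3m-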