Merge pull request #722 from CliMA/ln/up-slack-report

Longrun output summary
CliMA · Apr 10, 2024 · 26bbd0b · 26bbd0b
2 parents f6a43c2 + 79594b1
commit 26bbd0b
Show file tree

Hide file tree

Showing 4 changed files with 62 additions and 137 deletions.
diff --git a/.buildkite/longruns/pipeline.yml b/.buildkite/longruns/pipeline.yml
@@ -13,7 +13,6 @@ env:
   SLURM_KILL_BAD_EXIT: 1
 
   CONFIG_PATH: "config/longrun_configs"
-  PERF_CONFIG_PATH: "config/perf_configs"
 
 timeout_in_minutes: 1440
 
@@ -34,11 +33,6 @@ steps:
       - "julia --project=artifacts -e 'using Pkg; Pkg.status()'"
       - "julia --project=artifacts artifacts/download_artifacts.jl"
 
-      - echo "--- Instantiate perf env"
-      - "julia --project=perf/ -e 'using Pkg; Pkg.instantiate(;verbose=true)'"
-      - "julia --project=perf/ -e 'using Pkg; Pkg.precompile()'"
-      - "julia --project=perf/ -e 'using Pkg; Pkg.status()'"
-
     agents:
       slurm_cpus_per_task: 8
     env:
@@ -75,6 +69,7 @@ steps:
       slurm_mem: 20GB
       slurm_gpus: 1
       modules: common
+    soft_fail: true
 
   - group: "Coupler integration and conservation tests"
 
@@ -91,6 +86,7 @@ steps:
           slurm_ntasks_per_node: 1
           slurm_nodes: 1
           slurm_mem_per_cpu: 16G
+        soft_fail: true
 
       - label: "Slabplanet_aqua: couple"
         key: "slabplanet_aqua_atmos_sf_couple" # SF at each Atmos stage, coupling, prescribed SST from coupler - results identical to the above run confirm that 1) initial conditions in Atmos are unchanged compared to the slab, and 2) coupling does not introduce variability when the surface is constant
@@ -102,6 +98,7 @@ steps:
           slurm_ntasks_per_node: 1
           slurm_nodes: 1
           slurm_mem_per_cpu: 16G
+        soft_fail: true
 
       - label: "Slabplanet_aqua: coupler fluxes"
         key: "slabplanet_aqua_coupler_sf" # SF at each coupler timestep, constant ocean - comparing to the above runs, this tests the sensitivity of less frequent flux calculation
@@ -113,6 +110,7 @@ steps:
           slurm_ntasks_per_node: 1
           slurm_nodes: 1
           slurm_mem_per_cpu: 16G
+        soft_fail: true
 
       - label: "Slabplanet_aqua: coupler fluxes, evolving ocean"
         key: "slabplanet_aqua_coupler_sf_evolve_ocn" # SF at each coupler timestep, evolving ocean - comparing to the above run, tests the sensitivity of evolving ocean
@@ -124,6 +122,7 @@ steps:
           slurm_ntasks_per_node: 1
           slurm_nodes: 1
           slurm_mem_per_cpu: 16G
+        soft_fail: true
 
       - label: "Slabplanet_terra: coupler fluxes, evolving bucket"
         key: "slabplanet_terra" # SF at each coupler timestep, evolving bucket - comparing to the above run, this tests the sensitivity to an evolving bucket
@@ -135,6 +134,7 @@ steps:
           slurm_ntasks_per_node: 1
           slurm_nodes: 1
           slurm_mem_per_cpu: 16G
+        soft_fail: true
 
       - label: "Slabplanet: coupler fluxes, evolving ocean and land"
         key: "slabplanet_coupler_sf_evolve_ocn"
@@ -146,8 +146,9 @@ steps:
           slurm_ntasks_per_node: 1
           slurm_nodes: 1
           slurm_mem_per_cpu: 16G
+        soft_fail: true
 
-  - group: "Current target tests: idealized surface"
+  - group: "Current target tests: idealized surfaces"
 
     steps:
 
@@ -162,6 +163,7 @@ steps:
           slurm_ntasks_per_node: 16
           slurm_nodes: 4
           slurm_mem_per_cpu: 16G
+        soft_fail: true
 
       - label: "TARGET IDEALIZED: new target aqua - fixed ocean T, nocouple, atmos flux calc"
         key: "slabplanet_aqua_target_nocouple"
@@ -174,6 +176,7 @@ steps:
           slurm_ntasks_per_node: 16
           slurm_nodes: 4
           slurm_mem_per_cpu: 16G
+        soft_fail: true
 
       - label: "TARGET IDEALIZED: new target aqua - fixed ocean T, coupler flux calc"
         key: "slabplanet_aqua_target"
@@ -186,6 +189,7 @@ steps:
           slurm_ntasks_per_node: 16
           slurm_nodes: 4
           slurm_mem_per_cpu: 16G
+        soft_fail: true
 
       - label: "TARGET IDEALIZED: new target aqua - evolving slab ocean T"
         key: "slabplanet_aqua_target_evolve_ocn"
@@ -198,6 +202,7 @@ steps:
           slurm_ntasks_per_node: 16
           slurm_nodes: 4
           slurm_mem_per_cpu: 16G
+        soft_fail: true
 
       - label: "TARGET IDEALIZED: new target slab - fixed ocean T, bucket"
         key: "slabplanet_target"
@@ -210,6 +215,7 @@ steps:
           slurm_ntasks_per_node: 16
           slurm_nodes: 4
           slurm_mem_per_cpu: 16G
+        soft_fail: true
 
       - label: "TARGET IDEALIZED: new target slab - evolving slab ocean T, bucket"
         key: "slabplanet_target_evolve_ocn"
@@ -222,9 +228,10 @@ steps:
           slurm_ntasks_per_node: 16
           slurm_nodes: 4
           slurm_mem_per_cpu: 16G
+        soft_fail: true
 
 
-  - group: "Current target tests: AMIP surface"
+  - group: "Current target tests: AMIP surfaces"
 
     steps:
 
@@ -239,6 +246,7 @@ steps:
           slurm_ntasks_per_node: 16
           slurm_nodes: 4
           slurm_mem_per_cpu: 16G
+        soft_fail: true
 
       - label: "MPI AMIP FINE: new target amip: topo"
         key: "amip_target_topo"
@@ -251,6 +259,7 @@ steps:
           slurm_ntasks_per_node: 16
           slurm_nodes: 4
           slurm_mem_per_cpu: 16G
+        soft_fail: true
 
       - label: "MPI AMIP FINE: new target amip: topo + diagedmf"
         key: "amip_target_topo_diagedmf"
@@ -263,6 +272,7 @@ steps:
           slurm_ntasks_per_node: 16
           slurm_nodes: 4
           slurm_mem_per_cpu: 20G
+        soft_fail: true
 
   - group: "Current target tests on GPU: AMIP surface"
 
@@ -275,6 +285,7 @@ steps:
         agents:
           slurm_gpus: 1
           slurm_mem: 16GB
+        soft_fail: true
 
       - label: "GPU AMIP FINE: new target amip: topo + diagedmf"
         key: "gpu_amip_target_topo_diagedmf"
@@ -283,105 +294,13 @@ steps:
         agents:
           slurm_gpus: 1
           slurm_mem: 16GB
+        soft_fail: true
 
-  - group: "Other AMIP targets"
-
-    steps:
-
-      # DYAMOND AMIP: 1 day (convection resolving)
-      - label: "MPI AMIP SUPERFINE: dyamond_target"
-        key: "dyamond_target"
-        command: "srun julia --color=yes --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --config_file $CONFIG_PATH/dyamond_target.yml"
-        artifact_paths: "experiments/AMIP/output/amip/dyamond_target_artifacts/*"
-        env:
-          CLIMACORE_DISTRIBUTED: "MPI"
-          BUILD_HISTORY_HANDLE: ""
-        agents:
-          slurm_ntasks_per_node: 16
-          slurm_nodes: 4
-          slurm_mem_per_cpu: 16G
-
-      # mid-resolution AMIP: MPI performance scaling (10 days)
-      - label: "MPI AMIP FINE: n64"
-        key: "mpi_amip_fine_n64"
-        command: "srun julia --color=yes --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --config_file $CONFIG_PATH/amip_n64_shortrun.yml"
-        artifact_paths: "experiments/AMIP/output/amip/amip_n64_shortrun_artifacts/*"
-        env:
-          CLIMACORE_DISTRIBUTED: "MPI"
-          BUILD_HISTORY_HANDLE: ""
-        agents:
-          slurm_ntasks_per_node: 16
-          slurm_nodes: 4
-          slurm_mem_per_cpu: 16G
-
-      - label: "MPI AMIP FINE: n32"
-        key: "mpi_amip_fine_n32"
-        command: "srun julia --color=yes --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --config_file $CONFIG_PATH/amip_n32_shortrun.yml"
-        artifact_paths: "experiments/AMIP/output/amip/amip_n32_shortrun_artifacts/*"
-        env:
-          CLIMACORE_DISTRIBUTED: "MPI"
-          BUILD_HISTORY_HANDLE: ""
-        agents:
-          slurm_ntasks_per_node: 8
-          slurm_nodes: 4
-          slurm_mem_per_cpu: 16G
-
-      - label: "MPI AMIP FINE: n8"
-        key: "mpi_amip_fine_n8"
-        command: "srun julia --color=yes --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --config_file $CONFIG_PATH/amip_n8_shortrun.yml"
-        artifact_paths: "experiments/AMIP/output/amip/amip_n8_shortrun_artifacts/*"
-        env:
-          CLIMACORE_DISTRIBUTED: "MPI"
-          BUILD_HISTORY_HANDLE: ""
-        agents:
-          slurm_ntasks_per_node: 8
-          slurm_nodes: 1
-          slurm_mem_per_cpu: 16G
-
-      - label: "MPI AMIP FINE: n2" # 10d take 21h, so reducing to 1d
-        key: "mpi_amip_fine_n2"
-        command: "srun julia --color=yes --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --config_file $CONFIG_PATH/amip_n2_shortrun.yml"
-        artifact_paths: "experiments/AMIP/output/amip/amip_n2_shortrun_artifacts/*"
-        env:
-          CLIMACORE_DISTRIBUTED: "MPI"
-          BUILD_HISTORY_HANDLE: ""
-        agents:
-          slurm_ntasks_per_node: 2
-          slurm_nodes: 1
-          slurm_mem_per_cpu: 16G
-
-      - label: "MPI AMIP FINE: n1" # also reported by longruns with a flame graph; 10d take 21h, so reducing to 1d
-        key: "mpi_amip_fine_n1"
-        command: "julia --color=yes --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --config_file $CONFIG_PATH/amip_n1_shortrun.yml"
-        artifact_paths: "experiments/AMIP/output/amip/amip_n1_shortrun_artifacts/*"
-        env:
-          BUILD_HISTORY_HANDLE: ""
-        agents:
-          slurm_ntasks_per_node: 1
-          slurm_nodes: 1
-          slurm_mem_per_cpu: 16G
-
-      - label: "MPI AMIP FINE: n1 no couple" # sim time = Δt_cpl (~ benchmarking with standalone models)
-        key: "mpi_amip_fine_n1_nocouple"
-        command: "julia --color=yes --project=experiments/AMIP/ experiments/AMIP/coupler_driver.jl --config_file $CONFIG_PATH/amip_n1_shortrun_nocouple.yml"
-        artifact_paths: "experiments/AMIP/output/amip/amip_n1_shortrun_nocouple_artifacts/*"
-        env:
-          BUILD_HISTORY_HANDLE: ""
-        agents:
-          slurm_ntasks_per_node: 1
-          slurm_nodes: 1
-          slurm_mem_per_cpu: 16G
+  - wait
 
-      # mpi_amip_fine_n1 flame graph report (NB: arguments passed from the ci pipeline.yml)
-      - label: ":rocket: performance: flame graph diff: perf_target_amip_n1_shortrun"
-        command: "julia --color=yes --project=perf perf/flame_diff.jl --config_file $PERF_CONFIG_PATH/perf_diff_target_amip_n1_shortrun.yml"
-        artifact_paths: "perf/output/perf_diff_target_amip_n1_shortrun/*"
-        agents:
-          slurm_ntasks_per_node: 1
-          slurm_nodes: 1
-          slurm_mem_per_cpu: 16G
+  - group: "Job analysis and reporting"
 
-      - wait
+    steps:
 
       # plot job performance history
       - label: ":chart_with_downwards_trend: build history"
@@ -394,21 +313,20 @@ steps:
 
       - label: ":envelope: Slack report: build_history"
         command:
-          - slack-upload -c "#coupler-report" -f build_history.html -m html -n build_history -x "Overall job performance"
+          - |
+            slack-upload -c "#coupler-report" -f build_history.html -m html -n build_history -x ":rocket: Interactive overall job performance history (download the attached file and view in browser) :rocket:"
 
       - label: ":envelope: Slack report: Slabplanet"
         command:
-          - slack-upload -c "#coupler-report" -f experiments/AMIP/output/slabplanet/slabplanet_coupler_sf_evolve_ocn_artifacts/total_energy_log_bucket.png -m png -n slab_coarse_log -x "Slabplanet energy conservation (log error)"
           - slack-upload -c "#coupler-report" -f experiments/AMIP/output/slabplanet/slabplanet_coupler_sf_evolve_ocn_artifacts/total_energy_bucket.png -m png -n slab_coarse -x "Slabplanet energy conservation"
-          - slack-upload -c "#coupler-report" -f experiments/AMIP/output/slabplanet/slabplanet_coupler_sf_evolve_ocn_artifacts/total_water_log_bucket.png -m png -n slab_coarse_w_log -x "Slabplanet water conservation (log error)"
           - slack-upload -c "#coupler-report" -f experiments/AMIP/output/slabplanet/slabplanet_coupler_sf_evolve_ocn_artifacts/total_water_bucket.png -m png -n slab_coarse_w -x "Slabplanet water conservation"
 
       - label: ":envelope: Slack report: target AMIP"
         command:
-          - slack-upload -c "#coupler-report" -f experiments/AMIP/output/amip/amip_target_artifacts/amip_paperplots.png -m png -n amip_fine -x "AMIP Target Longrun"
-          - slack-upload -c "#coupler-report" -f experiments/AMIP/output/amip/amip_target_artifacts/biases.png -m png -n amip_fine -x "AMIP Target Longrun"
+          - slack-upload -c "#coupler-report" -f experiments/AMIP/output/amip/amip_target_topo_diagedmf_artifacts/amip_ncep.png -m png -n amip_fine -x "Target AMIP v NCEP Last Month Mean"
+          - |
+            find experiments/AMIP/output/amip/amip_target_topo_diagedmf_artifacts/ -type f -name 'bias*.png' -print0 | while IFS= read -r -d '' file; do
+              slack-upload -c "#coupler-report" -f "$$file" -m png -n "$$(basename "$$file" .png)" -x "$$(basename "$$file" .png)"
+            done
+
 
-      - label: ":envelope: Slack report: Flame Diff"
-        command:
-          - slack-upload -c "#coupler-report" -f perf/output/perf_diff_target_amip_n1_shortrun/flame_diff.html -m png -n amip_fine_flamegraphdiff -x "AMIP Longrun FlameGraphDiff"
-          - slack-upload -c "#coupler-report" -f perf/output/perf_diff_target_amip_n1_shortrun/flame_diff_self_count.html -m png -n amip_fine_flamegraphdiffself -x "AMIP Longrun FlameGraphDiffSelf"
diff --git a/experiments/AMIP/coupler_driver.jl b/experiments/AMIP/coupler_driver.jl
@@ -114,7 +114,7 @@ config_dict = merge(parsed_args, config_dict)
 config_dict_atmos = get_atmos_config(config_dict)
 
 ## merge dictionaries of command line arguments, coupler dictionary and component model dictionaries
-## (if there are common keys, the last dictorionary in the `merge` arguments takes precedence)
+## (if there are common keys, the last dictionary in the `merge` arguments takes precedence)
 config_dict = merge(config_dict_atmos, config_dict)
 
 ## read in some parsed command line arguments, required by this script
@@ -826,7 +826,7 @@ if ClimaComms.iamroot(comms_ctx)
             tubulent_energy_fluxes = (; clims = (-250, 250), units = "W/m^2"),
             q_liq_ice = (; clims = (0, 10), units = "g/kg"),
         )
-        amip_data = amip_paperplots(
+        amip_data, fig_amip = amip_paperplots(
             post_spec,
             plot_spec,
             COUPLER_OUTPUT_DIR,
@@ -847,31 +847,38 @@ if ClimaComms.iamroot(comms_ctx)
             tubulent_energy_fluxes = (:horizontal_slice,),
         )
         ncep_plot_spec = plot_spec
-        ncep_data = ncep_paperplots(
+        ncep_data, fig_ncep = ncep_paperplots(
             ncep_post_spec,
             ncep_plot_spec,
             COUPLER_OUTPUT_DIR,
             output_dir = COUPLER_ARTIFACTS_DIR,
             month_date = cs.dates.date[1],
         ) ## plot data that correspond to the model's last save_hdf5 call (i.e., last month)
 
-        # Compare against observations
+        ## combined plots
+        plot_combined = Plots.plot(fig_amip, fig_ncep, layout = (2, 1), size = (1400, 1800))
+        Plots.png(joinpath(COUPLER_ARTIFACTS_DIR, "amip_ncep.png"))
+
+        ## Compare against observations
         if t_end > 84600
             @info "Error against observations"
+            output_dates = cs.dates.date0[] .+ Second.(atmos_sim.integrator.sol.t)
+
             include("user_io/leaderboard.jl")
             compare_vars = ["pr"]
             function plot_biases(dates, output_name)
                 output_path = joinpath(COUPLER_ARTIFACTS_DIR, "bias_$(output_name).png")
                 Leaderboard.plot_biases(atmos_sim.integrator.p.output_dir, compare_vars, dates; output_path)
             end
-            plot_biases(cs.dates.date, "total")
+            plot_biases(output_dates, "total")
 
-            MAM, JJA, SON, DJF = Leaderboard.split_by_season(cs.dates.date)
+            ## collect all days between cs.dates.date0 and cs.dates.date
+            MAM, JJA, SON, DJF = Leaderboard.split_by_season(output_dates)
 
-            !isempty(MAM) && plot_biases(cs.dates.date, "MAM")
-            !isempty(JJA) && plot_biases(cs.dates.date, "JJA")
-            !isempty(SON) && plot_biases(cs.dates.date, "SON")
-            !isempty(DJF) && plot_biases(cs.dates.date, "DJF")
+            !isempty(MAM) && plot_biases(MAM, "MAM")
+            !isempty(JJA) && plot_biases(JJA, "JJA")
+            !isempty(SON) && plot_biases(SON, "SON")
+            !isempty(DJF) && plot_biases(DJF, "DJF")
         end
     end
 

diff --git a/experiments/AMIP/user_io/amip_visualizer.jl b/experiments/AMIP/user_io/amip_visualizer.jl
@@ -57,18 +57,15 @@ function amip_paperplots(
     end
 
     # combine plots and save figure
-    save_fig = Plots.plot(
-        all_plots...,
-        size = (1500, 1200),
-        right_margin = 3Plots.mm,
-        left_margin = 3Plots.mm,
-        bottom_margin = 3Plots.mm,
-        top_margin = 3Plots.mm,
-    )
+    layout = @layout([A{0.05h}; [B C D; E F G; H I J]])
+    title =
+        Plots.plot(plot_title = "AMIP Monthly Mean Fields", grid = false, showaxis = false, bottom_margin = -50Plots.px)
+    save_fig =
+        Plots.plot(title, all_plots..., size = (1500, 1200), layout = layout, titlefont = font(12), margin = 2Plots.mm)
 
     Plots.png(save_fig, joinpath(output_dir, fig_name * ".png"))
 
-    return all_data
+    return all_data, save_fig
 end
 
 """