Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stabilize long runs #464

Closed
wants to merge 12 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
304 changes: 165 additions & 139 deletions .buildkite/longruns/pipeline.yml

Large diffs are not rendered by default.

202 changes: 202 additions & 0 deletions .buildkite/longruns/pipeline_orig.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
# Pipeline-level agent settings (queue, Slurm wall time, environment modules).
# Individual steps add their own slurm_* keys in their own `agents` maps.
agents:
  queue: central
  # Quoted: an unquoted 24:00:00 can be read as a base-60 integer by YAML 1.1 parsers.
  slurm_time: "24:00:00"
  modules: julia/1.9.3 cuda/12.2 ucx/1.14.1_cuda-12.2 openmpi/4.1.5_cuda-12.2 hdf5/1.12.2-ompi415 nsight-systems/2023.2.1

# Pipeline-level environment variables.
# Numeric-looking values are quoted because environment variables are strings.
env:
  JULIA_LOAD_PATH: "${JULIA_LOAD_PATH}:${BUILDKITE_BUILD_CHECKOUT_PATH}/.buildkite"
  OPENBLAS_NUM_THREADS: "1"
  JULIA_NVTX_CALLBACKS: gc
  OMPI_MCA_opal_warn_on_missing_libcuda: "0"
  JULIA_MAX_NUM_PRECOMPILE_FILES: "100"
  GKSwstype: "100"

  # Directories of the run configuration files passed via --config_file in the step commands.
  CONFIG_PATH: "config/longrun_configs"
  PERF_CONFIG_PATH: "config/perf_configs"

# 1440 minutes = 24 hours, the same span as the slurm_time value in the agents settings.
# NOTE(review): Buildkite documents timeout_in_minutes as a step attribute — confirm it is honored at pipeline level.
timeout_in_minutes: 1440

steps:
# Bootstrap step: configures MPI, instantiates and precompiles the AMIP,
# artifacts and perf Julia projects, and runs the artifact download script.
- label: "init :computer:"
  key: "init_cpu_env"
  command:
  - echo "--- Configure MPI"
  - julia -e 'using Pkg; Pkg.add("MPIPreferences"); using MPIPreferences; use_system_binary()'

  - echo "--- Instantiate AMIP env"
  - "julia --project=experiments/AMIP/modular/ -e 'using Pkg; Pkg.instantiate(;verbose=true)'"
  - "julia --project=experiments/AMIP/modular/ -e 'using Pkg; Pkg.precompile()'"
  - "julia --project=experiments/AMIP/modular/ -e 'using Pkg; Pkg.status()'"

  - echo "--- Download artifacts"
  - "julia --project=artifacts -e 'using Pkg; Pkg.instantiate(;verbose=true)'"
  - "julia --project=artifacts -e 'using Pkg; Pkg.precompile()'"
  - "julia --project=artifacts -e 'using Pkg; Pkg.status()'"
  - "julia --project=artifacts artifacts/download_artifacts.jl"

  - echo "--- Instantiate perf env"
  - "julia --project=perf/ -e 'using Pkg; Pkg.instantiate(;verbose=true)'"
  - "julia --project=perf/ -e 'using Pkg; Pkg.precompile()'"
  - "julia --project=perf/ -e 'using Pkg; Pkg.status()'"

  agents:
    slurm_cpus_per_task: 8
  env:
    # Same count as slurm_cpus_per_task above.
    JULIA_NUM_PRECOMPILE_TASKS: "8"
    # Step-level value; the pipeline-level env sets this variable to 100.
    JULIA_MAX_NUM_PRECOMPILE_FILES: "50"

- wait

# Long-run simulation jobs plus the flame-graph comparison.
# NOTE(review): nesting was reconstructed from flattened text; every step down to
# the flame graph report is assumed to belong to this group — confirm.
- group: "Targeted resolution coupled AMIP long runs"

  steps:

  - label: "Slabplanet: default"
    key: "slabplanet_default_longrun"
    command: "julia --color=yes --project=experiments/AMIP/modular/ experiments/AMIP/modular/coupler_driver_modular.jl --config_file $CONFIG_PATH/slabplanet_default_longrun.yml"
    artifact_paths: "experiments/AMIP/modular/output/slabplanet/slabplanet_default_longrun_artifacts/*"
    env:
      BUILD_HISTORY_HANDLE: ""
    agents:
      slurm_ntasks_per_node: 1
      slurm_nodes: 1
      slurm_mem_per_cpu: 16G

  # DYAMOND AMIP: 1 day (convection resolving)
  # 16 tasks per node x 4 nodes = 64 MPI tasks
  - label: "MPI AMIP SUPERFINE: dyamond_target"
    key: "dyamond_target"
    command: "mpiexec julia --color=yes --project=experiments/AMIP/modular/ experiments/AMIP/modular/coupler_driver_modular.jl --config_file $CONFIG_PATH/dyamond_target.yml"
    artifact_paths: "experiments/AMIP/modular/output/amip/dyamond_target_artifacts/*"
    env:
      CLIMACORE_DISTRIBUTED: "MPI"
      BUILD_HISTORY_HANDLE: ""
    agents:
      slurm_ntasks_per_node: 16
      slurm_nodes: 4
      slurm_mem_per_cpu: 16G

  # mid-resolution AMIP: longrun (140 days)
  - label: "MPI AMIP FINE: target longrun"
    key: "amip_longrun_target"
    command: "mpiexec julia --color=yes --project=experiments/AMIP/modular/ experiments/AMIP/modular/coupler_driver_modular.jl --config_file $CONFIG_PATH/amip_longrun_target.yml"
    artifact_paths: "experiments/AMIP/modular/output/amip/amip_longrun_target_artifacts/*"
    env:
      CLIMACORE_DISTRIBUTED: "MPI"
      BUILD_HISTORY_HANDLE: ""
    agents:
      slurm_ntasks_per_node: 16
      slurm_nodes: 4
      slurm_mem_per_cpu: 16G

  # mid-resolution AMIP: MPI performance scaling (10 days)
  # The nN suffix matches slurm_ntasks_per_node x slurm_nodes for each step below.
  - label: "MPI AMIP FINE: n64"
    key: "mpi_amip_fine_n64"
    command: "mpiexec julia --color=yes --project=experiments/AMIP/modular/ experiments/AMIP/modular/coupler_driver_modular.jl --config_file $CONFIG_PATH/amip_n64_shortrun.yml"
    artifact_paths: "experiments/AMIP/modular/output/amip/amip_n64_shortrun_artifacts/*"
    env:
      CLIMACORE_DISTRIBUTED: "MPI"
      BUILD_HISTORY_HANDLE: ""
    agents:
      slurm_ntasks_per_node: 16
      slurm_nodes: 4
      slurm_mem_per_cpu: 16G

  - label: "MPI AMIP FINE: n32"
    key: "mpi_amip_fine_n32"
    command: "mpiexec julia --color=yes --project=experiments/AMIP/modular/ experiments/AMIP/modular/coupler_driver_modular.jl --config_file $CONFIG_PATH/amip_n32_shortrun.yml"
    artifact_paths: "experiments/AMIP/modular/output/amip/amip_n32_shortrun_artifacts/*"
    env:
      CLIMACORE_DISTRIBUTED: "MPI"
      BUILD_HISTORY_HANDLE: ""
    agents:
      slurm_ntasks_per_node: 8
      slurm_nodes: 4
      slurm_mem_per_cpu: 16G

  - label: "MPI AMIP FINE: n8"
    key: "mpi_amip_fine_n8"
    command: "mpiexec julia --color=yes --project=experiments/AMIP/modular/ experiments/AMIP/modular/coupler_driver_modular.jl --config_file $CONFIG_PATH/amip_n8_shortrun.yml"
    artifact_paths: "experiments/AMIP/modular/output/amip/amip_n8_shortrun_artifacts/*"
    env:
      CLIMACORE_DISTRIBUTED: "MPI"
      BUILD_HISTORY_HANDLE: ""
    agents:
      slurm_ntasks_per_node: 8
      slurm_nodes: 1
      slurm_mem_per_cpu: 16G

  # A 10-day run takes 21 h, so this case was reduced to 1 day.
  - label: "MPI AMIP FINE: n2"
    key: "mpi_amip_fine_n2"
    command: "mpiexec julia --color=yes --project=experiments/AMIP/modular/ experiments/AMIP/modular/coupler_driver_modular.jl --config_file $CONFIG_PATH/amip_n2_shortrun.yml"
    artifact_paths: "experiments/AMIP/modular/output/amip/amip_n2_shortrun_artifacts/*"
    env:
      CLIMACORE_DISTRIBUTED: "MPI"
      BUILD_HISTORY_HANDLE: ""
    agents:
      slurm_ntasks_per_node: 2
      slurm_nodes: 1
      slurm_mem_per_cpu: 16G

  # Also reported by longruns with a flame graph.
  # A 10-day run takes 21 h, so this case was reduced to 1 day.
  # Launched without mpiexec and without CLIMACORE_DISTRIBUTED (single task).
  - label: "MPI AMIP FINE: n1"
    key: "mpi_amip_fine_n1"
    command: "julia --color=yes --project=experiments/AMIP/modular/ experiments/AMIP/modular/coupler_driver_modular.jl --config_file $CONFIG_PATH/amip_n1_shortrun.yml"
    artifact_paths: "experiments/AMIP/modular/output/amip/amip_n1_shortrun_artifacts/*"
    env:
      BUILD_HISTORY_HANDLE: ""
    agents:
      slurm_ntasks_per_node: 1
      slurm_nodes: 1
      slurm_mem_per_cpu: 16G

  # sim time = Δt_cpl (~ benchmarking with standalone models)
  - label: "MPI AMIP FINE: n1 no couple"
    key: "mpi_amip_fine_n1_nocouple"
    command: "julia --color=yes --project=experiments/AMIP/modular/ experiments/AMIP/modular/coupler_driver_modular.jl --config_file $CONFIG_PATH/amip_n1_shortrun_nocouple.yml"
    artifact_paths: "experiments/AMIP/modular/output/amip/amip_n1_shortrun_nocouple_artifacts/*"
    env:
      BUILD_HISTORY_HANDLE: ""
    agents:
      slurm_ntasks_per_node: 1
      slurm_nodes: 1
      slurm_mem_per_cpu: 16G

  # mpi_amip_fine_n1 flame graph report (NB: arguments passed from the ci pipeline.yml)
  - label: ":rocket: performance: flame graph diff: perf_target_amip_n1_shortrun"
    command: "julia --color=yes --project=perf perf/flame_diff.jl --config_file $PERF_CONFIG_PATH/perf_diff_target_amip_n1_shortrun.yml"
    artifact_paths: "perf/output/perf_diff_target_amip_n1_shortrun/*"
    agents:
      slurm_ntasks_per_node: 1
      slurm_nodes: 1
      slurm_mem_per_cpu: 16G

- wait

# Plot job performance history; the generated build_history.html is kept as an artifact.
- label: ":chart_with_downwards_trend: build history"
  command:
  - build_history main  # name of branch to plot
  artifact_paths:
  - "build_history.html"

- wait

# Upload build_history.html to the #coupler-report Slack channel.
- label: ":envelope: Slack report: build_history"
  command:
  - slack-upload -c "#coupler-report" -f build_history.html -m html -n build_history -x "Overall job performance"

# Upload the four Slabplanet conservation plots (energy and water, linear and
# log error) from the slabplanet_default_longrun artifacts directory.
- label: ":envelope: Slack report: Slabplanet"
  command:
  - slack-upload -c "#coupler-report" -f experiments/AMIP/modular/output/slabplanet/slabplanet_default_longrun_artifacts/total_energy_log_bucket.png -m png -n slab_coarse_log -x "Slabplanet energy conservation (log error)"
  - slack-upload -c "#coupler-report" -f experiments/AMIP/modular/output/slabplanet/slabplanet_default_longrun_artifacts/total_energy_bucket.png -m png -n slab_coarse -x "Slabplanet energy conservation"
  - slack-upload -c "#coupler-report" -f experiments/AMIP/modular/output/slabplanet/slabplanet_default_longrun_artifacts/total_water_log_bucket.png -m png -n slab_coarse_w_log -x "Slabplanet water conservation (log error)"
  - slack-upload -c "#coupler-report" -f experiments/AMIP/modular/output/slabplanet/slabplanet_default_longrun_artifacts/total_water_bucket.png -m png -n slab_coarse_w -x "Slabplanet water conservation"

# Upload the summary plot from the amip_longrun_target artifacts directory.
- label: ":envelope: Slack report: target AMIP"
  command:
  - slack-upload -c "#coupler-report" -f experiments/AMIP/modular/output/amip/amip_longrun_target_artifacts/amip_paperplots.png -m png -n amip_fine -x "AMIP Target Longrun"

# Upload the two flame-graph diff reports produced under perf/output.
# Both files are .html, so they are sent with `-m html` (previously `-m png`),
# matching how build_history.html is uploaded in this pipeline.
- label: ":envelope: Slack report: Flame Diff"
  command:
  - slack-upload -c "#coupler-report" -f perf/output/perf_diff_target_amip_n1_shortrun/flame_diff.html -m html -n amip_fine_flamegraphdiff -x "AMIP Longrun FlameGraphDiff"
  - slack-upload -c "#coupler-report" -f perf/output/perf_diff_target_amip_n1_shortrun/flame_diff_self_count.html -m html -n amip_fine_flamegraphdiffself -x "AMIP Longrun FlameGraphDiffSelf"
12 changes: 9 additions & 3 deletions config/longrun_configs/amip_longrun_target.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
run_name: "amip_longrun_target"
anim: true
dt_cpl: 100  # was 150; now the same number as dt ("100secs")
energy_check: false
mode_name: "amip"
mono_surface: false
Expand All @@ -11,12 +11,18 @@ precip_model: "0M"
z_elem: 35
dz_bottom: 50
h_elem: 12
kappa_4: 4e16  # was 3e16
# rayleigh_sponge and alpha_rayleigh_uh are defined once here; a second,
# identical pair at the end of the file was a duplicate-key error and is removed.
rayleigh_sponge: true
alpha_rayleigh_uh: 0
dt: "100secs"
t_end: "140days"  # back at 140 days; had been cut to 100 days to avoid instability (#460)
job_id: "amip_longrun_target"
dt_save_to_sol: "5days"
dt_save_to_disk: "1days"
apply_limiter: false
topography: "Earth"
topo_smoothing: true
use_reference_state: false
surface_setup: PrescribedSurface
23 changes: 23 additions & 0 deletions config/longrun_configs/amip_longrun_target_orig.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Run identity
run_name: "amip_longrun_target"
job_id: "amip_longrun_target"
mode_name: "amip"

# Time control and output cadence
dt: "100secs"
dt_cpl: 150
# NOTE(review): the earlier TODO said this was decreased from 140 days to avoid
# instability (#460), but the value here is the full 140 days — confirm intent.
t_end: "140days"
dt_save_to_sol: "5days"
dt_save_to_disk: "1days"

# Run-level switches
anim: true
energy_check: false
mono_surface: false
apply_limiter: false
surface_setup: "PrescribedSurface"

# Remaining model options (their semantics are defined by the consuming driver, not visible here)
vert_diff: "true"
moist: "equil"
rad: "clearsky"
precip_model: "0M"
z_elem: 35
dz_bottom: 50
h_elem: 12
kappa_4: 3e16
rayleigh_sponge: true
alpha_rayleigh_uh: 0
1 change: 1 addition & 0 deletions config/longrun_configs/amip_n1_shortrun.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@ dt_save_to_sol: "100days"
mono_surface: false
apply_limiter: false
precip_model: "0M"
surface_setup: PrescribedSurface
1 change: 1 addition & 0 deletions config/longrun_configs/amip_n1_shortrun_nocouple.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@ dt_save_to_sol: "100days"
mono_surface: false
apply_limiter: false
precip_model: "0M"
surface_setup: PrescribedSurface
1 change: 1 addition & 0 deletions config/longrun_configs/amip_n2_shortrun.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@ dt_save_to_sol: "100days"
mono_surface: false
apply_limiter: false
precip_model: "0M"
surface_setup: PrescribedSurface
1 change: 1 addition & 0 deletions config/longrun_configs/amip_n32_shortrun.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@ dt_save_to_sol: "100days"
mono_surface: false
apply_limiter: false
precip_model: "0M"
surface_setup: PrescribedSurface
1 change: 1 addition & 0 deletions config/longrun_configs/amip_n64_shortrun.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@ dt_save_to_sol: "100days"
mono_surface: false
apply_limiter: false
precip_model: "0M"
surface_setup: PrescribedSurface
1 change: 1 addition & 0 deletions config/longrun_configs/amip_n8_shortrun.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@ dt_save_to_sol: "100days"
mono_surface: false
apply_limiter: false
precip_model: "0M"
surface_setup: PrescribedSurface
4 changes: 4 additions & 0 deletions config/longrun_configs/dyamond_target.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,7 @@ turb_flux_partition: "CombinedStateFluxes"
atmos_config_file: "config/longrun_configs/longrun_aquaplanet_dyamond.yml"
atmos_toml_file: "toml/longrun_aquaplanet_dyamond.toml"
monthly_checkpoint: false
topography: "Earth"
topo_smoothing: true
use_reference_state: false
surface_setup: PrescribedSurface
17 changes: 17 additions & 0 deletions config/longrun_configs/new_target.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Run identity
run_name: "new_target"
job_id: "new_target"
mode_name: "amip"

# Time control, output and restart cadence
dt_cpl: 150
t_end: "300days"
dt_save_to_sol: "20days"
dt_save_to_disk: "10days"
dt_save_restart: "5days"
monthly_checkpoint: false
hourly_checkpoint: true

# Run-level switches
anim: true
energy_check: false
mono_surface: false
turb_flux_partition: "CombinedStateFluxes"
surface_setup: "PrescribedSurface"

# Atmosphere configuration files referenced by this run
atmos_config_file: "config/longrun_configs/longrun_aquaplanet_rhoe_equilmoist_nz63_0M_55km_rs35km_clearsky_tvinsolation.yml"
atmos_toml_file: "toml/longrun_aquaplanet_rhoe_equilmoist_nz63_0M_55km_rs35km_clearsky.toml"
17 changes: 17 additions & 0 deletions config/longrun_configs/new_target_nocouple.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Run identity
run_name: "new_target_nocouple"
job_id: "new_target_nocouple"
mode_name: "amip"

# Time control, output and restart cadence
# 25920000 = 300 * 86400, i.e. the whole 300-day t_end if dt_cpl is in seconds — TODO confirm units.
dt_cpl: 25920000
t_end: "300days"
dt_save_to_sol: "20days"
dt_save_to_disk: "10days"
dt_save_restart: "5days"
monthly_checkpoint: false
hourly_checkpoint: true

# Run-level switches
anim: true
energy_check: false
mono_surface: false
turb_flux_partition: "CombinedStateFluxes"
surface_setup: "DefaultMoninObukhov"

# Atmosphere configuration files referenced by this run
atmos_config_file: "config/longrun_configs/longrun_aquaplanet_rhoe_equilmoist_nz63_0M_55km_rs35km_clearsky_tvinsolation.yml"
atmos_toml_file: "toml/longrun_aquaplanet_rhoe_equilmoist_nz63_0M_55km_rs35km_clearsky.toml"
1 change: 1 addition & 0 deletions config/longrun_configs/slabplanet_default_longrun.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ precip_model: "0M"
anim: true
apply_limiter: false
job_id: "slabplanet_default_longrun"
surface_setup: PrescribedSurface
Loading