Skip to content

Commit

Permalink
Add tests for restarted simulation
Browse files Browse the repository at this point in the history
This commit adds a test to ensure that restarted simulations are
identical to non-restarted ones.

This is accomplished by running a simulation for three steps and
comparing it to two other cases:
1. A simulation restarted from the checkpoint produced at step 3
2. A simulation restarted from the checkpoint produced at step 2 and
   then advanced one more step, so that it also ends at step 3

Co-authored-by: Gabriele Bozzola <[email protected]>
  • Loading branch information
Akshay Sridhar and Sbozzolo committed Sep 17, 2024
1 parent e0482c5 commit 04be7c1
Show file tree
Hide file tree
Showing 3 changed files with 143 additions and 1 deletion.
11 changes: 11 additions & 0 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,17 @@ steps:
#retry:
# automatic: true

# Runs test/restart.jl through srun with the MPI comms context and CUDA device
# selected via environment variables (2 Slurm tasks, 1 GPU per task, slurm_mem 16GB).
- label: ":computer: MPI GPU test restart"
command: >
srun julia --color=yes --project=test test/restart.jl
env:
CLIMACOMMS_CONTEXT: "MPI"
CLIMACOMMS_DEVICE: "CUDA"
agents:
slurm_gpus_per_task: 1
slurm_ntasks: 2
slurm_mem: 16GB

- label: ":computer: MPI no lim aquaplanet (ρe) equilmoist clearsky radiation"
command: >
srun julia --color=yes --project=examples examples/hybrid/driver.jl
Expand Down
130 changes: 130 additions & 0 deletions test/restart.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import ClimaAtmos as CA
import ClimaComms
using Test

# Element-wise relative difference |a - b| / |a|.
# When the reference magnitude |a| is not strictly positive (zero or NaN), the
# result is zero for an exact match and Inf otherwise, so that such a mismatch
# can never slip under a finite tolerance.
function _relative_difference(a, b)
    abs_diff = abs(a - b)
    scale = abs(a)
    scale > 0 && return abs_diff / scale
    return abs_diff == 0 ? zero(abs_diff) : oftype(abs_diff, Inf)
end

"""
    compare(one, two; rtol = 100 * eps(eltype(parent(one))))

Record a `@test` asserting that the data backing `one` and `two` agree
element-wise to within the relative tolerance `rtol`.

Both arguments are unwrapped with `parent` and copied into plain `Array`s
before being compared. The relative difference is measured against the
entries of `one`; where an entry of `one` is zero, only an exactly equal
entry in `two` is accepted.

# Keywords
- `rtol`: largest admissible relative difference (strict upper bound).
  Defaults to 100 machine epsilons of the element type of `one`.

# Throws
- `DimensionMismatch` if the two underlying arrays differ in size. Without
  this check, broadcasting would silently compare arrays whose shapes merely
  happen to be broadcast-compatible.

# Returns
The result of the `@test` evaluation.
"""
function compare(one, two; rtol = 100 * eps(eltype(parent(one))))
    arr1 = Array(parent(one))
    arr2 = Array(parent(two))

    size(arr1) == size(arr2) || throw(
        DimensionMismatch(
            "cannot compare arrays of size $(size(arr1)) and $(size(arr2))",
        ),
    )

    # `init` makes the reduction well defined for empty inputs (two empty
    # arrays are trivially identical). `max` propagates NaN, so a NaN in
    # either input still makes the comparison below fail.
    T = float(promote_type(eltype(arr1), eltype(arr2)))
    max_relative_diff =
        mapreduce(_relative_difference, max, arr1, arr2; init = zero(T))

    # Check if the max relative difference is within tolerance
    @test max_relative_diff < rtol
end

@testset "Test restarts across configuration combinations" begin
    ### Test Description
    # Generate a simulation with some complexity of
    # config arguments. Some config combinations are
    # incompatible so we do not sweep over all possible
    # combinations.

    # Use a 1-second timestep, run for 3 seconds, and save the
    # simulation state to disk every second. The test expects the
    # checkpoints "day0.2.hdf5" and "day0.3.hdf5" to exist afterwards.
    # Verify that the state of a restarted simulation matches the
    # reference (i.e. restarts result in the same simulation states
    # as if one were to advance the timestepper uninterrupted).

    # TODO: Restart and diagnostic behaviour needs to be
    # clearly defined when config files have different
    # settings (or when tendency computations conflict with
    # dt or t_end parsed args)

    # Multi-iterator `for`: the first iterator is the outermost loop,
    # the last one varies fastest.
    for configuration in ["sphere", "column"],
        moisture in ["equil"],
        turb_conv in ["diagnostic_edmfx", "prognostic_edmfx"],
        precip in ["0M", "1M"]

        # NOTE(review): mktempdir() is called by every process; when this
        # file is run under MPI each rank presumably gets its own
        # directory — confirm this is intended.
        mktempdir() do output_loc
            job_id = "restart_$(configuration)_$(moisture)_$(turb_conv)_$(precip)"
            test_dict = Dict(
                "check_nan_every" => 3,
                "log_progress" => false,
                "moist" => moisture,
                "precip_model" => precip,
                "config" => configuration,
                "turbconv" => turb_conv,
                "perturb_initstate" => false,
                "dt" => "1secs",
                "t_end" => "3secs",
                "dt_save_state_to_disk" => "1secs",
                "enable_diagnostics" => false,
                "output_dir" => joinpath(output_loc, job_id),
            )

            @info "output_dir: $(test_dict["output_dir"])"

            # Reference run: advanced uninterrupted to t_end.
            config = CA.AtmosConfig(test_dict, job_id = job_id)
            simulation = CA.get_simulation(config)
            CA.solve_atmos!(simulation)

            # Check re-importing the same state
            restart_dir = simulation.output_dir
            @test isfile(joinpath(restart_dir, "day0.3.hdf5"))

            # Same configuration, with restart detection switched on.
            # This simulation is only constructed, never advanced, so its
            # state is expected to equal the final reference state.
            config_should_be_same = CA.AtmosConfig(
                merge(test_dict, Dict("detect_restart_file" => true)),
                job_id = job_id,
            )

            simulation_restarted = CA.get_simulation(config_should_be_same)
            @info "Check file-read from checkpoint data"
            @info "Checking integrator.u.c"
            compare(
                simulation.integrator.u.c,
                simulation_restarted.integrator.u.c,
            )
            @info "Checking integrator.u.f"
            compare(
                simulation.integrator.u.f,
                simulation_restarted.integrator.u.f,
            )

            # Check re-importing from previous state and advancing one step
            restart_file = joinpath(restart_dir, "day0.2.hdf5")
            @test isfile(restart_file)
            @info "Restart from specific file"
            config2 = CA.AtmosConfig(
                merge(test_dict, Dict("restart_file" => restart_file)),
                job_id = job_id,
            )

            simulation_restarted2 = CA.get_simulation(config2)
            @info "Advancing restarted simulation"
            CA.solve_atmos!(simulation_restarted2)
            @info "Restarted simulation complete"
            @info "Checking integrator.u.c"
            compare(
                simulation.integrator.u.c,
                simulation_restarted2.integrator.u.c,
            )
            @info "Checking integrator.u.f"
            compare(
                simulation.integrator.u.f,
                simulation_restarted2.integrator.u.f,
            )
        end
    end
end
3 changes: 2 additions & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ using Test
@safetestset "Sponge interface tests" begin @time include("parameterized_tendencies/sponge/rayleigh_sponge.jl") end
@safetestset "Precipitation interface tests" begin @time include("parameterized_tendencies/microphysics/precipitation.jl") end
@safetestset "Model getters" begin @time include("solver/model_getters.jl") end
@safetestset "Topography tests" begin @time include("topography.jl") end
@safetestset "Topography tests" begin @time include("topography.jl") end
@safetestset "Restart" begin @time include("restart.jl") end

#! format: on

Expand Down

0 comments on commit 04be7c1

Please sign in to comment.