diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index fae7612d9ad..1fbad333a77 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -447,6 +447,17 @@ steps:
         #retry:
         #  automatic: true
 
+      - label: ":computer: MPI GPU test restart"
+        command: >
+          srun julia --color=yes --project=test test/restart.jl
+        env:
+          CLIMACOMMS_CONTEXT: "MPI"
+          CLIMACOMMS_DEVICE: "CUDA"
+        agents:
+          slurm_gpus_per_task: 1
+          slurm_ntasks: 2
+          slurm_mem: 16GB
+
       - label: ":computer: MPI no lim aquaplanet (ρe) equilmoist clearsky radiation"
         command: >
           srun julia --color=yes --project=examples examples/hybrid/driver.jl
diff --git a/test/restart.jl b/test/restart.jl
new file mode 100644
index 00000000000..319419b59ab
--- /dev/null
+++ b/test/restart.jl
@@ -0,0 +1,148 @@
+import ClimaAtmos as CA
+import ClimaComms
+using Test
+
+"""
+    compare(one, two)
+
+Test that `one` and `two` agree element-wise within a relative tolerance.
+
+Both arguments are unwrapped with `parent` and copied into `Array`s. The
+relative difference `|a - b| / |a|` of every element pair must stay below
+`100eps(eltype(a))`. Where the reference value `a` is zero, the difference
+counts as zero when `b` is zero too and as `Inf` otherwise.
+"""
+function compare(one, two)
+    arr1 = Array(parent(one))
+    arr2 = Array(parent(two))
+
+    # Element-wise relative difference. Branching per element avoids the
+    # 0/0 division and keeps the result in the arrays' own float type
+    # instead of mixing Float64 literals into e.g. a Float32 comparison.
+    relative_diff = broadcast(arr1, arr2) do a, b
+        diff = abs(a - b)
+        denominator = abs(a)
+        if denominator > 0
+            diff / denominator
+        else
+            # Zero reference (or NaN): only an exact match is acceptable.
+            iszero(diff) ? zero(float(diff)) : oftype(float(diff), Inf)
+        end
+    end
+
+    # Check if the max relative difference is within tolerance
+    @test maximum(relative_diff) < 100eps(eltype(arr1))
+end
+
+@testset "Test restarts across configuration combinations" begin
+    ### Test Description
+    # Generate a simulation with some complexity of
+    # config arguments. Some config combinations are
+    # incompatible so we do not sweep over all possible
+    # iterations.
+
+    # Modify the timestep to 1-second increments.
+    # Save simulation state at each timestep,
+    # and generate a restart file at 0secs, 2secs simulation time.
+    # Verify objects read in using ClimaCore.InputOutput functions
+    # are identical (i.e. restarts result
+    # in the same simulation states as if one were to advance
+    # the timestepper uninterrupted.)
+
+    # TODO: Restart and diagnostic behaviour needs to be
+    # clearly defined when config files have different
+    # settings (or when tendency computations conflict with
+    # dt or t_end parsed args)
+
+    # NOTE(review): when run under MPI every rank calls `mktempdir`
+    # on its own, so ranks may end up with different output
+    # directories -- confirm this is what the MPI job expects.
+
+    for configuration in ["sphere", "column"]
+        for moisture in ["equil"]
+            for turb_conv in ["diagnostic_edmfx", "prognostic_edmfx"]
+                for precip in ["0M", "1M"]
+
+                    mktempdir() do output_loc
+                        job_id = "restart_$(configuration)_$(moisture)_$(turb_conv)_$(precip)"
+                        test_dict = Dict(
+                            "check_nan_every" => 3,
+                            "log_progress" => false,
+                            "moist" => moisture,
+                            "precip_model" => precip,
+                            "config" => configuration,
+                            "turbconv" => turb_conv,
+                            "perturb_initstate" => false,
+                            "dt" => "1secs",
+                            "t_end" => "3secs",
+                            "dt_save_state_to_disk" => "1secs",
+                            "enable_diagnostics" => false,
+                            "output_dir" => joinpath(output_loc, job_id),
+                        )
+
+                        @info "output_dir: $(test_dict["output_dir"])"
+
+                        config = CA.AtmosConfig(test_dict, job_id = job_id)
+
+                        simulation = CA.get_simulation(config)
+                        CA.solve_atmos!(simulation)
+
+                        # Check re-importing the same state
+                        restart_dir = simulation.output_dir
+                        @test isfile(joinpath(restart_dir, "day0.3.hdf5"))
+
+                        config_should_be_same = CA.AtmosConfig(
+                            merge(
+                                test_dict,
+                                Dict("detect_restart_file" => true),
+                            ),
+                            job_id = job_id,
+                        )
+
+                        simulation_restarted =
+                            CA.get_simulation(config_should_be_same)
+                        @info "Check file-read from checkpoint data"
+                        @info "Checking integrator.u.c"
+                        compare(
+                            simulation.integrator.u.c,
+                            simulation_restarted.integrator.u.c,
+                        )
+                        @info "Checking integrator.u.f"
+                        compare(
+                            simulation.integrator.u.f,
+                            simulation_restarted.integrator.u.f,
+                        )
+
+                        # Check re-importing from previous state and advancing one step
+                        restart_file =
+                            joinpath(simulation.output_dir, "day0.2.hdf5")
+                        @test isfile(restart_file)
+                        @info "Restart from specific file"
+                        config2 = CA.AtmosConfig(
+                            merge(
+                                test_dict,
+                                Dict("restart_file" => restart_file),
+                            ),
+                            job_id = job_id,
+                        )
+
+                        simulation_restarted2 = CA.get_simulation(config2)
+                        @info "Advancing restarted simulation"
+                        CA.solve_atmos!(simulation_restarted2)
+                        @info "Restarted simulation complete"
+                        @info "Checking integrator.u.c"
+                        compare(
+                            simulation.integrator.u.c,
+                            simulation_restarted2.integrator.u.c,
+                        )
+                        @info "Checking integrator.u.f"
+                        compare(
+                            simulation.integrator.u.f,
+                            simulation_restarted2.integrator.u.f,
+                        )
+                    end
+                end
+            end
+        end
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index db0c16c1c21..9214a187bcb 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -19,7 +19,8 @@ using Test
 @safetestset "Sponge interface tests" begin @time include("parameterized_tendencies/sponge/rayleigh_sponge.jl") end
 @safetestset "Precipitation interface tests" begin @time include("parameterized_tendencies/microphysics/precipitation.jl") end
 @safetestset "Model getters" begin @time include("solver/model_getters.jl") end
-@safetestset "Topography tests" begin @time include("topography.jl") end
+@safetestset "Topography tests" begin @time include("topography.jl") end
+@safetestset "Restart" begin @time include("restart.jl") end
 
 #! format: on
 