From a0af80292d5be370122f7baa563edbf91b8d65b1 Mon Sep 17 00:00:00 2001 From: Charles Kawczynski Date: Thu, 26 Dec 2024 17:08:52 -0500 Subject: [PATCH] Apply more reproducibility test infra fixes --- reproducibility_tests/move_output.jl | 7 +- reproducibility_tests/ref_counter.jl | 5 +- .../reproducibility_tools.jl | 15 +- .../reproducibility_utils.jl | 237 ++++++++++++------ reproducibility_tests/test_mse.jl | 11 +- test/unit_reproducibility_infra.jl | 85 +++---- 6 files changed, 215 insertions(+), 145 deletions(-) diff --git a/reproducibility_tests/move_output.jl b/reproducibility_tests/move_output.jl index 641f1e3a19..6bdd931e52 100644 --- a/reproducibility_tests/move_output.jl +++ b/reproducibility_tests/move_output.jl @@ -17,7 +17,12 @@ move_data_to_save_dir(; if buildkite_ci && in_merge_queue folders = get_reference_dirs_to_delete(; root_dir = cluster_data_prefix) - debug_reproducibility() && @warn "Repro: deleting folders $folders" + bins = compute_bins(folders) + if !isempty(folders) + msg = prod(x -> " $x\n", folders) + @warn "Repro: deleting folders:\n$msg" + end + @warn "Deleted folder bins:\n $(string_bins(bins))" for f in folders rm(f; recursive = true, force = true) end diff --git a/reproducibility_tests/ref_counter.jl b/reproducibility_tests/ref_counter.jl index acc62d2d3c..736015e973 100644 --- a/reproducibility_tests/ref_counter.jl +++ b/reproducibility_tests/ref_counter.jl @@ -1,4 +1,4 @@ -192 +193 # **README** # @@ -21,6 +21,9 @@ #= +193 +- More reproducibility infrastructure fixes. + 192 - Reproducibility infrastructure fixes. diff --git a/reproducibility_tests/reproducibility_tools.jl b/reproducibility_tests/reproducibility_tools.jl index c064e88e2f..0ecb8844b5 100644 --- a/reproducibility_tests/reproducibility_tools.jl +++ b/reproducibility_tests/reproducibility_tools.jl @@ -27,16 +27,6 @@ function error_if_dissimilar_dicts(dicts, dict) end end -function all_files_in_dir(dir) - all_files = String[] - for (root, dirs, files) in walkdir(dir) - for file in files - push!(all_files, joinpath(root, file)) - end - end - return all_files -end - function no_comparison_error(dirs, non_existent_files) msg = "\n\n" msg *= "Pull request author:\n" @@ -59,7 +49,10 @@ function no_comparison_error(dirs, non_existent_files) msg *= "for how to merge this PR." msg *= "\n\n" for dir in dirs - msg *= "Files in dirs: $(all_files_in_dir(dir))\n" + msg *= "Files in dir $dir\n" + for file in all_files_in_dir(dir) + msg *= " $file\n" + end end error(msg) end diff --git a/reproducibility_tests/reproducibility_utils.jl b/reproducibility_tests/reproducibility_utils.jl index 122a162725..8db107c62a 100644 --- a/reproducibility_tests/reproducibility_utils.jl +++ b/reproducibility_tests/reproducibility_utils.jl @@ -13,20 +13,16 @@ Consider the following set of reproducibility directories, prefixed by "reference counters", which allow users to compare against other reproducible states in that column. -Note that reference counter changes can "rewind"(which may happen in the case of -reverted commits). In such cases, we do consider the rewound state as an -entirely new state, in order to fully preserve the history (to some depth). +Note that the reference counter must increment in the case of reverted commits, +resulting in an entirely new state, in order to fully preserve the history +(to some depth). -An important consequence of this requires precise terminology to avoid ambiguous -descriptions. - -For example, "comparable references per reference counter" is not well defined, -because the reference counter can be reverted. So, let's introduce the concept -of a "bin", which can be defined as a collection of directories created in a -period with the same reference counter. Folders created before and after that -bin have a different reference counter. Also, `n_bins == n_reference_changes + -1`(modulo the edge case of when there are no bins) because, if the reference -counter doesn't change, new results are put into the same bin. +Here, we introduce the concept of a "bin", which can be defined as a collection +of directories created in a period with the same reference counter. Folders +created before and after that bin have a different reference counter. Also, +`n_bins == n_reference_changes + 1`(modulo the edge case of when there are no +bins) because, if the reference counter doesn't change, new results are put +into the same bin. ``` comparable states @@ -34,28 +30,78 @@ comparable states | | | bin 1 bin 2 bin 3 bin 4 bin 5 bin 6 bin 7 | | | - | 02_49f92 04_36ebe 05_beb8a 06_4d837 05_8c311 08_45875 10_bc1e0 | - | 04_d6e48 06_d6d73 08_1cc58 | - v 04_4c042 v newest + | 02_49f92 03_36ebe 04_beb8a 05_4d837 06_8c311 07_45875 08_bc1e0 | + | 03_d6e48 05_d6d73 07_1cc58 | + v 03_4c042 v newest ``` + +# File states + +Reproducibility tests inherently rely on comparing multiple states, which means +that our reproducibility testing infrastructure is _stateful_. During our +continuous integration testing (CI), files are generated, moved, and zipped. To help +assist our understanding and reasoning, we let's assume that there are two states: + +## state 1: end of simulation, folder structure + + - `job_id/output_dir/` + - `job_id/output_dir/reproducibility_bundle/` + - `job_id/output_dir/reproducibility_bundle/ref_counter.jl` + - `job_id/output_dir/reproducibility_bundle/prog_state.hdf5` + +## state 2: data is saved for future reference + + - `commit_hash/job_id/output_dir/` + - `commit_hash/job_id/output_dir/reproducibility_bundle/` + - `commit_hash/job_id/output_dir/reproducibility_bundle/ref_counter.jl` + - `commit_hash/job_id/output_dir/reproducibility_bundle/prog_state.hdf5` + + - `commit_hash/reproducibility_bundle/ref_counter.jl` + - `commit_hash/reproducibility_bundle/job_id/` + - `commit_hash/reproducibility_bundle/job_id/prog_state.hdf5` + +In other words, we strip out `output_dir/`, and swap `job_id` and +`reproducibility_bundle`. This is done for two reasons: + + - The ref_counter is job-independent, hence the swap + - The `output_dir/` is redundant to the purpose of the commit hash folder + ################################################################################ =# -# debug_reproducibility() = false +# debug_reproducibility() = true debug_reproducibility() = get(ENV, "BUILDKITE_PIPELINE_SLUG", nothing) == "climaatmos-ci" import Dates import OrderedCollections +function string_all_files_in_dir(dir) + msg = "Files in dir $dir\n" + for file in all_files_in_dir(dir) + msg *= " $file\n" + end + return msg +end + +function all_files_in_dir(dir) + all_files = String[] + for (root, dirs, files) in walkdir(dir; follow_symlinks = true) + for file in files + push!(all_files, joinpath(root, file)) + end + end + return all_files +end + read_ref_counter(file) = parse(Int, first(readlines(file))) """ sorted_dirs_with_matched_files(; dir = pwd(), filename) Return an array of subdirectories of `dir` (defaults to the current working -directory) sorted by modification time (oldest to newest). Return an empty -vector if no subdirectories are found. +directory) sorted by the reference counters contained in the folders. Return an +empty vector if no subdirectories are found. This function recurses through `dir`, and finds all directories that have the file `filename`. @@ -75,27 +121,10 @@ function sorted_dirs_with_matched_files(; isempty(matched_dirs) && return String[] # sort by timestamp sorted_dirs = - sort(matched_dirs; by = f -> Dates.unix2datetime(stat(f).mtime)) - return sorted_dirs -end - -""" - sorted_dataset_folder(; dir=pwd()) - -Return an array of subdirectories of `dir` (defaults to the current working -directory) sorted by modification time (oldest to newest). Return an empty -vector if no subdirectories are found. -""" -function sorted_dataset_folder(; dir = pwd()) - matching_dirs = filter(isdir, readdir(dir; join = true)) - isempty(matching_dirs) && return String[] - # sort by timestamp - sorted_dirs = - sort(matching_dirs; by = f -> Dates.unix2datetime(stat(f).mtime)) + sort(matched_dirs; by = f -> read_ref_counter(joinpath(f, filename))) return sorted_dirs end - """ ref_counters_per_dir(dirs) @@ -200,17 +229,17 @@ comparable states v 04_4c042 v newest ``` """ -compute_bins( +function compute_bins( root_dir::String = "/central/scratch/esm/slurm-buildkite/climaatmos-main"; filename = "ref_counter.jl", -) = compute_bins( - reverse( - sorted_dirs_with_matched_files(; - dir = root_dir, - filename = "ref_counter.jl", - ), - ), ) + dirs = sorted_dirs_with_matched_files(; + dir = root_dir, + filename = "ref_counter.jl", + ) + return compute_bins(reverse(dirs)) +end + function compute_bins(sorted_dirs::Vector{String}) @assert isempty(invalid_reference_folders(sorted_dirs)) bins = Vector{String}[] @@ -241,11 +270,31 @@ function compute_bins(sorted_dirs::Vector{String}) return bins end +print_bins(bins) = print_bins(stdout, bins) +print_bins(io::IO, bins) = println(io, string_bins(bins)) + +""" + string_bins(bins) + +Return a string summarizing the given bins. +""" +function string_bins(bins) + msg = "Bins:\n" + for (i, bin) in enumerate(bins) + msg *= " Bin $i:\n" + for (j, state) in enumerate(bin) + ref_counter = read_ref_counter(joinpath(state, "ref_counter.jl")) + msg *= " (State $j, ref_counter): ($state, $ref_counter)\n" + end + end + return msg +end + """ get_reference_dirs_to_delete(; root_dir, - keep_n_comparable_states = 5, - keep_n_bins_back = 7, + keep_n_comparable_states = 100, + keep_n_bins_back = 100, ) Return a list of folders to delete. @@ -262,9 +311,9 @@ keep_n_comparable_states | | | bin 1 bin 2 bin 3 bin 4 bin 5 bin 6 bin 7 | | | - | 02_49f92 04_36ebe 05_beb8a 06_4d837 05_8c311 08_45875 10_bc1e0 | - | 04_d6e48 06_d6d73 08_1cc58 | - v 04_4c042 v newest + | 02_49f92 03_36ebe 04_beb8a 05_4d837 06_8c311 07_45875 08_bc1e0 | + | 03_d6e48 05_d6d73 07_1cc58 | + v 03_4c042 v newest ``` With these folders, and given a reference counter of 10, we'll see the following @@ -274,35 +323,18 @@ behavior: get_reference_dirs_to_delete(; keep_n_comparable_states = 4, keep_n_bins_back = 3 - ) -> [02_49f92, 04_36ebe, 04_d6e48, 04_4c042] + ) -> [02_49f92, 03_36ebe, 03_d6e48, 03_4c042] get_reference_dirs_to_delete(; keep_n_comparable_states = 1, keep_n_bins_back = 5 - ) -> [02_49f92, 04_d6e48, 04_4c042, 06_d6d73, 08_1cc58] + ) -> [02_49f92, 03_d6e48, 03_4c042, 05_d6d73, 07_1cc58] ``` - -Note: - `keep_n_references_back` is sorted _chronologically_, in order to correctly - operate in the case of reverted pull requests. In other words, the above - references may look like this: - -``` -keep_n_comparable_states - | <---- keep_n_bins_back | oldest - | | - | bin 1 bin 2 bin 3 bin 4 bin 5 bin 6 bin 7 | - | | - | 02_49f92 04_36ebe 05_beb8a 06_4d837 05_8c311 08_45875 10_bc1e0 | - | 04_d6e48 06_d6d73 08_1cc58 | - v 04_4c042 v newest -``` - """ function get_reference_dirs_to_delete(; root_dir, - keep_n_comparable_states = 5, - keep_n_bins_back = 7, + keep_n_comparable_states = 100, + keep_n_bins_back = 100, filename = "ref_counter.jl", ) dirs = sorted_dirs_with_matched_files(; dir = root_dir, filename) @@ -333,7 +365,7 @@ Return a hash from the contents of all Julia files found recursively in `dir` """ function source_checksum(dir = pwd()) jl_files = String[] - for (root, dirs, files) in walkdir(dir) + for (root, dirs, files) in walkdir(dir; follow_symlinks = true) for file in files endswith(file, ".jl") && push!(jl_files, joinpath(root, file)) end @@ -368,26 +400,48 @@ function source_has_changed(; end end +rm_folder(path; strip_folder) = + joinpath(filter(x -> !occursin(strip_folder, x), splitpath(path))...) + """ move_data_to_save_dir(; dest_root = "/central/scratch/esm/slurm-buildkite/climaatmos-main", - buildkite_ci = get(ENV, "BUILDKITE_PIPELINE_SLUG", nothing) == "climaatmos-ci", + buildkite_ci = get(ENV, "BUILDKITE_PIPELINE_SLUG", nothing) == + "climaatmos-ci", commit = get(ENV, "BUILDKITE_COMMIT", nothing), branch = get(ENV, "BUILDKITE_BRANCH", nothing), in_merge_queue = startswith(branch, "gh-readonly-queue/main/"), dirs_src, + strip_folder = Pair("output_active", ""), ref_counter_file_PR = joinpath(@__DIR__, "ref_counter.jl"), + ref_counter_PR = read_ref_counter(ref_counter_file_PR), + skip = get(ENV, "BUILDKITE_PIPELINE_SLUG", nothing) != "climaatmos-ci", + n_hash_characters = 7, + repro_folder = "reproducibility_bundle", ) -Moves data from directories `dirs_src[i]` to `dest_root/commit_sha/basename -(dirs_src[i])`, given some conditions are met. In particular, data movement -will occur when this function is called: +Moves data in the following way: + +for job_id in dest_src + `job_id/out/repro/ref_counter.jl` -> `commit_hash/repro/ref_counter.jl` + `job_id/out/repro/` -> `commit_hash/repro/job_id/` + `job_id/out/repro/prog_state.hdf5` -> `commit_hash/repro/job_id/prog_state.hdf5` +end + +Note that files not in the `repro` folder are not moved. + +In other words, we strip out `out/`, and swap `job_id` and `repro`. This is done +for two reasons: + + - The ref_counter is job-independent, hence the swap + - The `out/` is redundant to the purpose of the commit hash folder + +Data movement will occur when this function is called: - on a job run in buildkite - when in the merge queue - when on the main branch if the `source_checksum` is different from the source code in the latest comparable reference - """ function move_data_to_save_dir(; dest_root = "/central/scratch/esm/slurm-buildkite/climaatmos-main", @@ -397,6 +451,7 @@ function move_data_to_save_dir(; branch = get(ENV, "BUILDKITE_BRANCH", nothing), in_merge_queue = startswith(branch, "gh-readonly-queue/main/"), dirs_src, + strip_folder = "output_active", ref_counter_file_PR = joinpath(@__DIR__, "ref_counter.jl"), ref_counter_PR = read_ref_counter(ref_counter_file_PR), skip = get(ENV, "BUILDKITE_PIPELINE_SLUG", nothing) != "climaatmos-ci", @@ -426,12 +481,36 @@ function move_data_to_save_dir(; # can compare against multiple references for src in dirs_src dst = joinpath(dest_repro, basename(src)) + debug_reproducibility() && @info "Repro: moving $src to $dst" mv(src, dst; force = true) - debug_reproducibility() && - @info "Reproducibility: File $src moved to $dst" + end + for dst in all_files_in_dir(dest_repro) + dst_new = rm_folder(dst; strip_folder) + if debug_reproducibility() + @show isfile(dst) + @show dst + @show dst_new + end + if dst ≠ dst_new + mkpath(dirname(dst_new)) + debug_reproducibility() && + @info "Repro: re-moving $dst to $dst_new" + mv(dst, dst_new; force = true) + end end ref_counter_file_main = joinpath(dest_repro, "ref_counter.jl") + debug_reproducibility() && + @info "Repro: moving $ref_counter_file_PR to $ref_counter_file_main" mv(ref_counter_file_PR, ref_counter_file_main; force = true) + if debug_reproducibility() + println("####################### SRC") + for src in dirs_src + @info(string_all_files_in_dir(src)) + end + println("####################### DST") + @info(string_all_files_in_dir(dest_repro)) + println("#######################") + end else if debug_reproducibility() @warn "Repro: skipping data movement" diff --git a/reproducibility_tests/test_mse.jl b/reproducibility_tests/test_mse.jl index be9f7d01b0..7e586b2059 100644 --- a/reproducibility_tests/test_mse.jl +++ b/reproducibility_tests/test_mse.jl @@ -42,7 +42,16 @@ if isempty(computed_mse_filenames) read_ref_counter(joinpath(newest_saved_dir, "ref_counter.jl")) ref_counter_PR = read_ref_counter(joinpath(@__DIR__, "ref_counter.jl")) - @assert ref_counter_PR == newest_saved_ref_counter + 1 "Reference counter must be incremented by 1. ref_counter_PR=$ref_counter_PR, newest_saved_ref_counter=$newest_saved_ref_counter" + if ref_counter_PR ≠ newest_saved_ref_counter + 1 + if debug_reproducibility() + @info " ref_counter_PR=$ref_counter_PR, newest_saved_ref_counter=$newest_saved_ref_counter\n" + @info "newest_saved_dir: $newest_saved_dir\n" + @info "newest_saved_dir_legacy: $newest_saved_dir_legacy\n" + @info "newest_saved_dir_new: $newest_saved_dir_new\n" + print_bins(bins) + end + error("Reference counter must be incremented by 1.") + end end else msg = "There were comparable references, but no computed mse files exist." diff --git a/test/unit_reproducibility_infra.jl b/test/unit_reproducibility_infra.jl index 3eee46222a..145f98fac8 100644 --- a/test/unit_reproducibility_infra.jl +++ b/test/unit_reproducibility_infra.jl @@ -5,6 +5,7 @@ using Revise; include("test/unit_reproducibility_infra.jl") using Test import Dates import Logging +import ClimaUtilities.OutputPathGenerator # this also includes reproducibility_utils.jl include(joinpath("..", "reproducibility_tests/reproducibility_tools.jl")) @@ -147,7 +148,7 @@ end @test dirs == [d6, d5] # d6 is most recent end - # reverted commits examples + # folders modified out of chronological order examples make_and_cd() do dir d1 = make_ref_file_counter(1, dir, "d1") d2 = make_ref_file_counter(2, dir, "d2") @@ -161,7 +162,7 @@ end ref_counter_PR = 3, skip = false, ) - @test dirs == [d6] + @test dirs == [d6, d3] end # appending to p7 now, confusingly, removes p3: @@ -186,13 +187,17 @@ end @testset "Reproducibility infrastructure: validate_reference_folders" begin # No dirs at all make_and_cd() do dir - @test invalid_reference_folders(sorted_dataset_folder(; dir)) == [] + @test invalid_reference_folders( + sorted_dirs_with_matched_files(; dir), + ) == [] end # 1 dir without ref counter make_and_cd() do dir p1 = make_dir(dir, "d1") - @test invalid_reference_folders(sorted_dataset_folder(; dir)) == [p1] + @test invalid_reference_folders( + sorted_dirs_with_matched_files(; dir), + ) == [] end # mix @@ -203,8 +208,9 @@ end r2 = make_dir(dir, "r2") d3 = make_ref_file_counter(3, dir, "d3") r3 = make_dir(dir, "r3") - @test invalid_reference_folders(sorted_dataset_folder(; dir)) == - [r1, r2, r3] + @test invalid_reference_folders( + sorted_dirs_with_matched_files(; dir), + ) == [] end end @@ -237,9 +243,13 @@ end d6 = make_ref_file_counter(5, dir, "d6") d7 = make_ref_file_counter(6, dir, "d7") @test compute_bins(dir) == [[d7], [d6, d5], [d4, d3], [d2], [d1]] + @test occursin( + "(State 1, ref_counter):", + string_bins(compute_bins(dir)), + ) end - # simulating reverted PR + # simulating folders modified out of chronological order make_and_cd() do dir d1 = make_ref_file_counter(1, dir, "d1") d2 = make_ref_file_counter(2, dir, "d2") @@ -248,7 +258,7 @@ end d5 = make_ref_file_counter(3, dir, "d5") d6 = make_ref_file_counter(4, dir, "d6") d7 = make_ref_file_counter(5, dir, "d7") - @test compute_bins(dir) == [[d7], [d6], [d5], [d4], [d3], [d2], [d1]] + @test compute_bins(dir) == [[d7], [d6, d4], [d5, d3], [d2], [d1]] end end @@ -310,45 +320,6 @@ end ) @test dirs == reverse([d01, d02, d03, d04, d05, d06, d07]) end - - #= - # Reverted commits example, consider: - - keep_n_comparable_states - | <---- keep_n_bins_back | oldest - | | - | B01 B02 B03 B01 B02 B03 B04 | - | | - | d01 d02 d05 d06 d08 d09 d11 | - | d03 d07 d10 | - v d04 v newest - =# - - make_and_cd() do dir - d01 = make_ref_file_counter(1, dir, "01") - d02 = make_ref_file_counter(2, dir, "02") - d03 = make_ref_file_counter(2, dir, "03") - d04 = make_ref_file_counter(2, dir, "04") - d05 = make_ref_file_counter(3, dir, "05") - d06 = make_ref_file_counter(1, dir, "06") - d07 = make_ref_file_counter(1, dir, "07") - d08 = make_ref_file_counter(2, dir, "08") - d09 = make_ref_file_counter(3, dir, "09") - d10 = make_ref_file_counter(3, dir, "10") - d11 = make_ref_file_counter(4, dir, "11") - dirs = get_reference_dirs_to_delete(; - root_dir = dir, - keep_n_comparable_states = 1, - keep_n_bins_back = 5, - ) - @test dirs == reverse([d01, d02, d03, d04, d06, d09]) - dirs = get_reference_dirs_to_delete(; - root_dir = dir, - keep_n_comparable_states = 4, - keep_n_bins_back = 3, - ) - @test dirs == reverse([d01, d02, d03, d04, d05, d06, d07]) - end end function make_file_with_contents(dir, filename, contents) @@ -703,6 +674,15 @@ MSEs[\"d3\"][b] = 3 end end +@testset "all_files_in_dir with generate_output_path" begin + make_and_cd() do dir + # Tests that symlink directories are not + # returned in all_files_in_dir + output_dir = OutputPathGenerator.generate_output_path(dir) + @test all_files_in_dir(dir) == String[] + end +end + using ClimaComms using ClimaCore: Spaces, Fields, Grids, InputOutput using ClimaCore @@ -1255,16 +1235,16 @@ if pkgversion(ClimaCore) ≥ v"0.14.18" make_file_with_contents(computed_dir, "file_z.jl", "abc") ref_counter_file_dir = make_ref_file_counter(3, computed_dir, "repro_bundle") - job_id_1 = joinpath(computed_dir, "job_id_1") - job_id_2 = joinpath(computed_dir, "job_id_2") + job_id_1 = joinpath(computed_dir, "repro_bundle", "job_id_1") + job_id_2 = joinpath(computed_dir, "repro_bundle", "job_id_2") put_data_file( - job_id_1, + joinpath(job_id_1, "output_active"), fv, comms_ctx; filename = "ref_prog_state.hdf5", ) put_data_file( - job_id_2, + joinpath(job_id_2, "output_active"), fv, comms_ctx; filename = "ref_prog_state.hdf5", @@ -1273,8 +1253,8 @@ if pkgversion(ClimaCore) ≥ v"0.14.18" @test source_checksum(hash2) == source_checksum(computed_dir) repro_folder = "repro_bundle" - repro_dir = joinpath(save_dir, "hash_new", repro_folder) move_data_to_save_dir(; + strip_folder = "output_active", dest_root = save_dir, buildkite_ci = true, commit = "hash_new", @@ -1290,6 +1270,7 @@ if pkgversion(ClimaCore) ≥ v"0.14.18" ref_counter_PR = 3, skip = false, ) + repro_dir = joinpath(save_dir, "hash_new", "repro_bundle") @test isfile(joinpath(repro_dir, "job_id_1", "ref_prog_state.hdf5")) @test isfile(joinpath(repro_dir, "job_id_2", "ref_prog_state.hdf5")) @test isfile(joinpath(repro_dir, "ref_counter.jl"))