From 36931a14da0ad70590d2c61bb5aa6d9e63bd38ad Mon Sep 17 00:00:00 2001
From: Andrew Cutler
Date: Fri, 14 Apr 2023 16:43:11 +1000
Subject: [PATCH] Add support for loading from a fully qualified object path or enumerating a dump given a partial timestamp in the path

---
 commands/common.sh | 108 +++++++++++++++++++++++++++++++++++++++++++++
 commands/load      |  28 +++++-------
 commands/load.md   |   7 +--
 tests/test.sh      |   5 ++-
 4 files changed, 124 insertions(+), 24 deletions(-)

diff --git a/commands/common.sh b/commands/common.sh
index 8365f93..d923404 100644
--- a/commands/common.sh
+++ b/commands/common.sh
@@ -63,6 +63,7 @@ get_storage_commands() {
       fetch_cmd=( "gsutil" "cp" )
       storage_type="gs"
       gsutil_auth
+      find_object="find_object_gs"
       ;;
     s3://*)
       echo ">> Storage type: aws s3"
@@ -72,6 +73,7 @@ get_storage_commands() {
       ls_cmd=( "aws" "s3" "ls" "${AWS_S3_ADDITIONAL_ARGS}" )
       fetch_cmd=( "aws" "s3" "cp" "${AWS_S3_ADDITIONAL_ARGS}" )
       storage_type="s3"
+      find_object="find_object_s3"
       ;;
     file://*|/*|./*)
       echo ">> Storage type: file"
@@ -80,6 +82,7 @@ get_storage_commands() {
       fetch_cmd=( "cat" )
       source="${source#file:\/\/}"
       storage_type="file"
+      find_object="find_object_file"
       ;;
     *)
       echoerr "Unknown storage type"
@@ -136,3 +139,108 @@ gsutil_auth() {
       "service_account = default" > /etc/boto.cfg
   fi
 }
+
+# helper functions (each takes an object path as ${1} and echoes its result on stdout)
+function get_filename_from_object_path() {
+  # Echoes the last component of ${1}: the text after the final "/" or space
+  echo "${1}" | sed -E -e 's/.*[\/ ]([^\/]*)$/\1/'
+}
+
+function get_basename_from_object_path() {
+  # Echoes "<scheme>://<bucket>/" for file://, s3:// and gs:// paths, adding the trailing "/" when ${1} is a bare bucket root; other input is echoed unchanged
+  echo "${1}" | sed -E 's#^(file|s3|gs)://([^/]+)(/.*)?$#\1://\2/#'
+}
+
+function get_timestamp_from_object_path() {
+  # Echoes the 2-14 digit timestamp of ${1}; it must be a whole path segment, so digits that merely start a bucket or file name are ignored. Echoes nothing when absent
+  echo "${1}" | sed -n -E 's#.*/([0-9]{2,14})(/.*)?$#\1#p; t; q;'
+}
+
+function check_object_exists() { # ${1}: object path; succeeds when listing it with ls_cmd prints anything
+  if [[ -n "${1:-}" && $(eval "${ls_cmd[@]}" "${1}") ]]; then
+    return 0
+  else
+    echoerr "Error file not found"
+    return 1
+  fi
+}
+
+function find_object_gs {
+  # Resolve a source path (${1}) and an optional database name (${2}) to one gs:// object path, echoed on stdout
+  # the following are all valid
+  # gs://mybucket/20230413000003/my_database.sql.lz4
+  # gs://mybucket/20230413000003/ my_database
+  # gs://mybucket/ my_database
+  # gs://mybucket/20230413 my_database
+
+  source="${1}"
+  database="${2:-}"
+  timestamp="$(get_timestamp_from_object_path "${source}")"
+  base="$(get_basename_from_object_path "${source}")"
+
+  if [[ "${timestamp}" == "" ]]; then
+    # no timestamp in the path, find the latest (source may or may not end in "/")
+    timestamp="$(eval "${ls_cmd[@]}" "${source}" | sed -E -e '/[0-9]{14}/!d' -e 's/.*([0-9]{14})\/$/\1/' | sort | tail -n1)"
+    full_path="$(eval "${ls_cmd[@]}" "${source%/}/${timestamp}/" | grep "/${database}[\.\-]")"
+  else
+    # has timestamp, either fully qualified, or needs expanding
+    if [[ $source =~ [0-9]{14}/${database} ]]; then
+      # should be complete path
+      full_path="${source}"
+    elif [[ $source =~ [0-9]{14} ]]; then
+      # complete timestamp
+      full_path="$(eval "${ls_cmd[@]}" "${source}" | grep "/${database}[\.\-]")"
+    else
+      # partial timestamp. search for matching object path
+      full_path="$(eval "${ls_cmd[@]}" "${base}${timestamp}*/" | grep "/${database}[\.\-]")"
+    fi
+  fi
+  check_object_exists "${full_path}" || exit 1
+  echo "${full_path}"
+}
+
+
+function find_object_s3 {
+  # Resolve a source path (${1}) and an optional database name (${2}) to one s3:// object path, echoed on stdout
+  # the following are all valid
+  # s3://mybucket/20230413000003/my_database.sql.lz4
+  # s3://mybucket/20230413000003/ my_database
+  # s3://mybucket/ my_database
+  # s3://mybucket/20230413 my_database
+
+  source="${1}"
+  database="${2:-}"
+  timestamp="$(get_timestamp_from_object_path "${source}")"
+  base="$(get_basename_from_object_path "${source}")"
+
+  if [[ "${timestamp}" == "" ]]; then
+    # no timestamp in the path, find the latest
+    timestamp="$(eval "${ls_cmd[@]}" "${base}" | sed -E -e '/[0-9]{14}/!d' -e 's/.*([0-9]{14})\/$/\1/' | sort | tail -n1)"
+    file="$(eval "${ls_cmd[@]}" "${base}${timestamp}/" | sed -E -e 's/.*[\/ ]([^\/]*)$/\1/' | grep "^${database}[\.\-]")"
+    full_path="${base}${timestamp}/${file}"
+  else
+    # has timestamp, either fully qualified, or needs expanding
+    if [[ $source =~ [0-9]{14}/${database} ]]; then
+      # should be complete path
+      full_path="${source}"
+    elif [[ $source =~ [0-9]{14} ]]; then
+      # complete timestamp; force one trailing "/" so the listing shows the directory contents, not just its PRE entry
+      file="$(eval "${ls_cmd[@]}" "${source%/}/" | sed -E -e 's/.*[\/ ]([^\/]*)$/\1/' | grep "^${database}[\.\-]")"
+      full_path="${source%/}/${file}"
+    else
+      # partial timestamp. search for the single directory whose name starts with it
+      timestamp="$(eval "${ls_cmd[@]}" "${base}" | sed -E -e '/[0-9]{14}/!d' -e 's/.*([0-9]{14})\/$/\1/' | grep "^${timestamp}")"
+      timestamp_count=$(wc -l <<<"${timestamp}")
+      [[ "${timestamp_count}" -gt 1 ]] && { echoerr "Error too many items found. Timestamp is not distinct."; exit 1; }
+      file="$(eval "${ls_cmd[@]}" "${base}${timestamp}/" | sed -E -e 's/.*[\/ ]([^\/]*)$/\1/' | grep "^${database}[\.\-]")"
+      full_path="${base}${timestamp}/${file}"
+    fi
+  fi
+  check_object_exists "${full_path}" || exit 1
+  echo "${full_path}"
+}
+
+function find_object_file {
+  echoerr "find_object_file not implemented"
+  exit 1
+}
diff --git a/commands/load b/commands/load
index 7a4ef06..e860709 100755
--- a/commands/load
+++ b/commands/load
@@ -208,30 +208,24 @@ case "${#args[@]}" in
     ;;
 esac
 
-echo "Source: ${source}"
-echo "SRC DB: ${src_database}"
-echo "DEST DB: ${dest_database}"
+echo "Source: ${source}"
+echo "Source Database: ${src_database}"
+echo "Destination Database: ${dest_database}"
 
 # Set the umask, umask defaults to 0077 to keep files private during db dumping
 umask "${umask:-0077}"
 
-date="$(date --utc "+${date_format:-%Y%m%d%H%M%S}")"
-
 get_storage_commands "${source}"
-
-# Find latest
-latest="$(eval "${ls_cmd[@]}" "${source}" | sed -E -e '/[0-9]{14}/!d' -e 's/.*([0-9]{14})\/$/\1/' | sort | tail -n1)"
-echo "Latest: ${latest}"
-
-file="$(eval "${ls_cmd[@]}" "${source}/${latest}/" | sed -E -e 's/.*[\/ ]([^\/]*)$/\1/' | grep "^${src_database}[\.\-]" || true)"
-if [[ -z "${file}" ]]; then
-  echo "No save found for database in ${latest}"
+file_path="$("${find_object}" "${source}" "${src_database}")"
+if [[ -z "${file_path}" ]]; then
+  echo "No save found for database ${src_database} in ${source}"
   exit 1
+else
+  echo "Filepath: ${file_path}"
 fi
-echo "File: ${file}"
 
 if [[ "${compression:-auto}" == "auto" ]]; then
-  case "${file##*.}" in
+  case "${file_path##*.}" in
     "lz4")
       compression="lz4"
       ;;
@@ -268,9 +262,9 @@ wait_mariadb "${host}" "${port:-3306}"
 /usr/bin/mysql "${connection[@]}" -rs -e "DROP DATABASE IF EXISTS ${dest_database}; CREATE DATABASE ${dest_database}"
 
 if [[ "${#sed_cmd[@]}" -gt 0 ]]; then
-  eval "${fetch_cmd[@]}" "${source}/${latest}/${file}" "-" | "${decompression_cmd[@]}" | sed -E "${sed_cmd[@]}" | /usr/bin/mysql "${connection_no_db[@]}" "${dest_database}"
+  eval "${fetch_cmd[@]}" "${file_path}" "-" | "${decompression_cmd[@]}" | sed -E "${sed_cmd[@]}" | /usr/bin/mysql "${connection_no_db[@]}" "${dest_database}"
 else
-  eval "${fetch_cmd[@]}" "${source}/${latest}/${file}" "-" | "${decompression_cmd[@]}" | /usr/bin/mysql "${connection_no_db[@]}" "${dest_database}"
+  eval "${fetch_cmd[@]}" "${file_path}" "-" | "${decompression_cmd[@]}" | /usr/bin/mysql "${connection_no_db[@]}" "${dest_database}"
 fi
 
 echo ">> Finished."
diff --git a/commands/load.md b/commands/load.md
index a9e0493..1d87722 100644
--- a/commands/load.md
+++ b/commands/load.md
@@ -8,8 +8,5 @@ Command to load a sql dump from object storage (or filesystem) to a destination
 * DROP and CREATE destination database
 * Support for sed filters
 * gsutil auth helper
-
-## Limitations
-
-* Source must be a directory named with the date ie 20200813000000 (must be 14 chars)
-* Only the latest is loadable
+* Source can either be a bucket root with timestamped directories named with the date ie 20200813000000 (must be 14 chars),
+or a path to the dump to restore, or a date stamped path (or partial datestamp), eg `s3://mybucket`, or `s3://mybucket/20200813000000`, or `s3://mybucket/20200813000000/my_database.sql.lz4`
diff --git a/tests/test.sh b/tests/test.sh
index 694e74d..d7b4577 100755
--- a/tests/test.sh
+++ b/tests/test.sh
@@ -65,8 +65,9 @@ docker run -d --name mariadb -p 3306:3306 -e MYSQL_ROOT_PASSWORD=password ${MARI
 docker run -d --name minio -p 9000:9000 ${MINIO_IMAGE}:${MINIO_TAG} server /data > /dev/null
 docker run --rm -i --link minio -e MC_HOST_minio=http://minioadmin:minioadmin@minio:9000 minio/mc:latest --quiet mb minio/backup
 docker run -i --name ${TEST_NAME}-save --link mariadb --link minio -e AWS_ACCESS_KEY_ID=minioadmin -e AWS_SECRET_ACCESS_KEY=minioadmin -e AWS_S3_ADDITIONAL_ARGS="--endpoint-url http://minio:9000" $TEST_CONTAINER save --host mariadb --password password s3://backup
-docker run -i --name ${TEST_NAME}-load --link mariadb --link minio -e AWS_ACCESS_KEY_ID=minioadmin -e AWS_SECRET_ACCESS_KEY=minioadmin -e AWS_S3_ADDITIONAL_ARGS="--endpoint-url http://minio:9000" $TEST_CONTAINER load --host mariadb --password password s3://backup/ mysql newdb
-cleanup mariadb minio ${TEST_NAME}-save ${TEST_NAME}-load
+docker run -i --name ${TEST_NAME}-load1 --link mariadb --link minio -e AWS_ACCESS_KEY_ID=minioadmin -e AWS_SECRET_ACCESS_KEY=minioadmin -e AWS_S3_ADDITIONAL_ARGS="--endpoint-url http://minio:9000" $TEST_CONTAINER load --host mariadb --password password s3://backup/ mysql newdb
+docker run -i --name ${TEST_NAME}-load2 --link mariadb --link minio -e AWS_ACCESS_KEY_ID=minioadmin -e AWS_SECRET_ACCESS_KEY=minioadmin -e AWS_S3_ADDITIONAL_ARGS="--endpoint-url http://minio:9000" $TEST_CONTAINER load --host mariadb --password password s3://backup/202 mysql newdb
+cleanup mariadb minio ${TEST_NAME}-save ${TEST_NAME}-load1 ${TEST_NAME}-load2
 
 echo "=> Test mysql command"
 docker run -d --name mariadb -p 3306:3306 -e MYSQL_ROOT_PASSWORD=password ${MARIADB_IMAGE}:${MARIADB_TAG} > /dev/null