diff --git a/clients/README.md b/clients/README.md
index 74e40fd..25cf16d 100644
--- a/clients/README.md
+++ b/clients/README.md
@@ -228,6 +228,7 @@ Note that `webwrite` handles all HTTP statuses internally, and is unable to filt
 The Julia integration can be installed using Julia's builtin package manager Pkg
 
 ```
+import Pkg
 Pkg.add("UMBridge")
 ```
 
@@ -519,3 +520,46 @@ The remaining arguments are the stopping tolerance (`1e-5`) and a vectorization
 We can notice the total number of model evaluations as `cum#evals=449` in the printout of `greedy2_cross`, and the Tensor-Train rank `max_rank=3`. A neat reduction compared to 1089 points in the full 2D grid!
 
 [Full example sources here.](https://github.com/UM-Bridge/umbridge/blob/main/clients/matlab/ttClient.m)
+
+
+## Sparse Grids Matlab Kit client
+
+The Sparse Grids Matlab Kit (SGMK) provides a Matlab implementation of sparse grids. It can be used for approximating high-dimensional functions and, in particular, for surrogate-model-based uncertainty quantification; for more info, see the [SGMK website](https://sites.google.com/view/sparse-grids-kit).
+
+The SGMK integrates in a very straightforward way with the Matlab UM-Bridge client (it literally takes one line!); see e.g. the script [sgmkClient.m](https://github.com/UM-Bridge/umbridge/blob/main/clients/matlab/sgmkClient.m) in the folder [clients/matlab](https://github.com/UM-Bridge/umbridge/blob/main/clients/matlab).
+
+The script begins by checking whether the SGMK is already on the Matlab path and, if not, downloads it from the GitHub repo [here](https://github.com/lorenzo-tamellini/sparse-grids-matlab-kit) and adds it to the path:
+```matlab
+check_sgmk()
+```
+The goal of this simple script is to use the SGMK as a high-dimensional quadrature tool to compute the integral of the posterior density function (pdf) defined in the benchmark **analytic-gaussian-mixture**. The pdf in the benchmark is actually not normalized, so the integral should be around 3.
+
+To this end, create a model as before:
+```matlab
+uri = 'http://localhost:4243';
+model = HTTPModel(uri, 'posterior','webwrite');
+```
+then simply wrap `model.evaluate()` in an `@-function` and **you're done**!
+```matlab
+f = @(y) model.evaluate(y);
+```
+The script then goes on to create a sparse grid and evaluate `f` at the sparse-grid points:
+```matlab
+N=2;
+w=7;
+domain = [-5.5 -5;
+          5 5.5];
+knots = {@(n) knots_CC(n,domain(1,1),domain(2,1),'nonprob'), @(n) knots_CC(n,domain(1,2),domain(2,2),'nonprob')};
+S = create_sparse_grid(N,w,knots,@lev2knots_doubling);
+Sr = reduce_sparse_grid(S);
+f_evals = evaluate_on_sparse_grid(f,Sr);
+```
+and finally computes the integral from the values of `f` just obtained. Note that the values returned by the container and stored in `f_evals` are actually the log-posterior, so we need to take their exponential before computing the integral:
+```matlab
+Ev = quadrature_on_sparse_grid(exp(f_evals),Sr)
+```
+which indeed returns `Ev = 2.9948`, i.e., close to 3 as expected. The script ends by plotting the sparse-grid interpolants of `f` and of `exp(f)`.
+
+[Full example sources here.](https://github.com/UM-Bridge/umbridge/blob/main/clients/matlab/sgmkClient.m)
+
+
diff --git a/clients/matlab/sgmkClient.m b/clients/matlab/sgmkClient.m
index 301e89f..6debb77 100644
--- a/clients/matlab/sgmkClient.m
+++ b/clients/matlab/sgmkClient.m
@@ -1,20 +1,27 @@
 clear
-
-% Analytic-Banana benchmark. Use the sparse grids matlab kit as a high-dimensional quadrature tool to compute the
-% integral of the posterior density defined in the benchmark. The problem is a bit challenging so even a poor result is
-% ok, this is just for testing the client. The pdfs in the benchmark are not normalized so the integral should be
+% Analytic-gaussian-mixture. Use the Sparse Grids Matlab Kit as a high-dimensional quadrature tool to compute the
+% integral of the posterior density function (pdf) defined in the benchmark. The problem is a bit challenging so even a poor result is
+% ok, this is just for testing the client. The pdf in the benchmark is not normalized so the integral should be
 % around 3
 
-% add the Sparse Grids Matlab Kit and umbridge to your path
+% add the Sparse Grids Matlab Kit to your path
 check_sgmk()
+% run these commands too if you don't have umbridge in your path
+% currdir = pwd;
+% cd ../../matlab/
+% addpath(genpath(pwd))
+% cd(currdir)
+
+% also, start the Analytic-gaussian-mixture container
+% sudo docker run -it -p 4243:4243 linusseelinger/benchmark-analytic-gaussian-mixture
 
 % uri of the service running the server
 uri = 'http://localhost:4243';
 
 % HTTPModel is an object provided by the UM-Bridge Matlab client.
 % model = HTTPModel(uri, 'posterior');
-model = HTTPModel(uri, 'posterior','webwrite');
+model = HTTPModel(uri, 'posterior','webwrite'); % let's use webwrite, it's much faster
 
 % model.evaluate(y) sends a request to the server to evaluate the model at y. Wrap it in an @-function:
 f = @(y) model.evaluate(y);
 
@@ -26,17 +33,23 @@
 % as a column, so that
 % D = [a c ;
 %      b d ];
-domain = [-5 -5;
-          5 5];
-knots={@(n) knots_CC(n,domain(1,1),domain(2,1),'nonprob'), @(n) knots_CC(n,domain(1,2),domain(2,2),'nonprob')};
-S=create_sparse_grid(N,w,knots,@lev2knots_doubling);
-Sr=reduce_sparse_grid(S);
-f_evals=evaluate_on_sparse_grid(f,Sr);
-
-% from here on, do whatever UQ analysis you want with the values contained in f_evals
+domain = [-5.5 -5;
+          5 5.5];
+knots = {@(n) knots_CC(n,domain(1,1),domain(2,1),'nonprob'), @(n) knots_CC(n,domain(1,2),domain(2,2),'nonprob')};
+S = create_sparse_grid(N,w,knots,@lev2knots_doubling);
+Sr = reduce_sparse_grid(S);
+f_evals = evaluate_on_sparse_grid(f,Sr);
+
+% from here on, do whatever UQ analysis you want with the values contained in f_evals. Here we just check that
+% the pdf integrates to 3 (the benchmark is not normalized). Note that the values returned by the container
+% and stored in f_evals are actually the log-posterior, so we need to take their exponential before computing the integral
+
+Ev = quadrature_on_sparse_grid(exp(f_evals),Sr)
+
+
+% We also plot the sparse grids interpolant of the function in the benchmark
 figure
 plot_sparse_grids_interpolant(S,Sr,domain,exp(f_evals),'nb_plot_pts',80)
 figure
 plot_sparse_grids_interpolant(S,Sr,domain,f_evals,'with_f_values')
-
-Ev = quadrature_on_sparse_grid(exp(f_evals),Sr)
\ No newline at end of file
diff --git a/hpc/LoadBalancer.cpp b/hpc/LoadBalancer.cpp
index d268dab..99c604f 100644
--- a/hpc/LoadBalancer.cpp
+++ b/hpc/LoadBalancer.cpp
@@ -16,17 +16,43 @@ void create_directory_if_not_existing(std::string directory) {
     }
 }
 
+void clear_url(std::string directory) {
+    // Nothing to clean up if the directory does not exist yet (e.g. on a first run).
+    if (!std::filesystem::exists(directory)) {
+        return;
+    }
+    for (auto& file : std::filesystem::directory_iterator(directory)) {
+        if (std::regex_match(file.path().filename().string(), std::regex("url-\\d+\\.txt"))) {
+            std::filesystem::remove(file);
+        }
+    }
+}
+
 std::string get_hostname() {
     char hostname[HOST_NAME_MAX];
     gethostname(hostname, HOST_NAME_MAX);
     return std::string(hostname);
 }
 
+void launch_hq_with_alloc_queue() {
+    std::system("hq server stop &> /dev/null");
+
+    std::system("hq server start &");
+    sleep(1); // Workaround: give the HQ server enough time to start.
+
+    // Create HQ allocation queue
+    std::system("hq_scripts/allocation_queue.sh");
+}
+
+const std::vector<std::string> get_model_names() {
+    HyperQueueJob hq_job("", false); // Don't start a client.
+
+    return umbridge::SupportedModels(hq_job.server_url);
+}
+
 int main(int argc, char *argv[])
 {
-    create_directory_if_not_existing("urls");
     create_directory_if_not_existing("sub-jobs");
+    clear_url("urls");
+
+    launch_hq_with_alloc_queue();
 
     // Read environment variables for configuration
     char const *port_cstr = std::getenv("PORT");
@@ -41,15 +67,15 @@
         port = atoi(port_cstr);
     }
 
-    // Start: Instaltialize multiple LB classes for multiple models on the regular server
+    char const *delay_cstr = std::getenv("HQ_SUBMIT_DELAY_MS");
+    if (delay_cstr != NULL)
+    {
+        hq_submit_delay_ms = atoi(delay_cstr);
+    }
+    std::cout << "HQ_SUBMIT_DELAY_MS set to " << hq_submit_delay_ms << std::endl;
 
-    // start a SLURM job for single request
-    const std::string job_id = submitJob("sbatch model.slurm");
-    const std::string server_url = readUrl("./urls/url-" + job_id + ".txt"); // read server url from txt file
-    // May use $SLURM_LOCALID in a .slurm file later
-    std::cout << "Hosting sub-server at : " << server_url << std::endl;
-    // List supported models
-    std::vector<std::string> model_names = umbridge::SupportedModels(server_url);
+    // Initialize load balancer for each available model on the model server.
+    const std::vector<std::string> model_names = get_model_names();
 
     std::vector<LoadBalancer> LB_vector;
     for (auto model_name : model_names)
@@ -58,9 +84,7 @@
         LB_vector.emplace_back(LoadBalancer{model_name});
     }
 
-    // End: Instaltialize multiple LB classes for multiple models on the regular server
-
-    // Create a new vector of pointers to LB_vector
+    // umbridge::serveModels currently only accepts raw pointers.
     std::vector<umbridge::Model*> LB_ptr_vector(LB_vector.size());
     std::transform(LB_vector.begin(), LB_vector.end(), LB_ptr_vector.begin(),
                    [](LoadBalancer& obj) { return &obj; });
 
@@ -68,4 +92,4 @@
     std::cout << "Load balancer running on host " << get_hostname()
               << " and bound to 0.0.0.0:" << port << std::endl;
 
     umbridge::serveModels(LB_ptr_vector, "0.0.0.0", port, false);
-}
\ No newline at end of file
+}
diff --git a/hpc/LoadBalancer.hpp b/hpc/LoadBalancer.hpp
index c8c4268..2db7ee1 100644
--- a/hpc/LoadBalancer.hpp
+++ b/hpc/LoadBalancer.hpp
@@ -6,6 +6,7 @@
 #include <fstream>
 #include <chrono>
 #include <thread>
+#include <filesystem>
 #include "lib/umbridge.h"
 
 // run and get the result of command
@@ -29,88 +30,18 @@ std::string getCommandOutput(const std::string command)
     return output;
 }
 
-// state = ["PENDING","RUNNING","COMPLETED","FAILED","CANCELLED"]
-bool waitForJobState(const std::string &job_id, const std::string &state = "COMPLETED")
+// wait until file is created
+bool waitForFile(const std::string &filename)
 {
-    const std::string command = "scontrol show job " + job_id + " | grep -oP '(?<=JobState=)[^ ]+'";
-    // std::cout << "Checking runtime: " << command << std::endl;
-    std::string job_status;
-
-    do
-    {
-        job_status = getCommandOutput(command);
-
-        // Delete the line break
-        if (!job_status.empty())
-            job_status.pop_back();
-
-        // Don't wait if there is an error or the job is ended
-        if (job_status == "" || (state != "COMPLETE" && job_status == "COMPLETED") || job_status == "FAILED" || job_status == "CANCELLED")
-        {
-            std::cerr << "Wait for job status failure, status : " << job_status << std::endl;
-            return false;
-        }
-        // std::cout << "Job status: " << job_status << std::endl;
-        sleep(1);
-    } while (job_status != state);
-
-    return true;
-}
-
-// wait until file is created, with timeout
-bool waitForFile(const std::string &filename, int timeout)
-{
-    auto start = std::chrono::steady_clock::now();
-    while (!std::filesystem::exists(filename))
-    {
-        std::this_thread::sleep_for(std::chrono::seconds(1));
-        auto curr = std::chrono::steady_clock::now();
-        if (std::chrono::duration_cast<std::chrono::seconds>(curr - start).count() > timeout)
-        {
-            std::cerr << "Timeout reached waiting for file " << filename << std::endl;
-            return false;
+    // Check if the file exists
+    while (!std::filesystem::exists(filename)) {
+        // If the file doesn't exist, wait for a certain period
+        std::this_thread::sleep_for(std::chrono::seconds(1));
     }
 
     return true;
 }
 
-// Start a slurm job and return job id
-std::string submitJob(const std::string &command)
-{
-    std::string sbatch_command = command + " | awk '{print $4}'"; // extract job ID from sbatch output
-    std::cout << "Submitting job with command: " << command << std::endl;
-
-    std::string job_id;
-    int i = 0;
-    do
-    {
-        job_id = getCommandOutput(sbatch_command);
-
-        // Delete the line break
-        if (!job_id.empty())
-            job_id.pop_back();
-
-        std::cout << "job_id: " << job_id << std::endl;
-        ++i;
-
-    } while (waitForJobState(job_id, "RUNNING") == false && i < 3 && waitForFile("./urls/url-" + job_id + ".txt", 20) == false);
-    // Wait to start all nodes on the cluster, call scontrol for every 1 sceond to check
-    // Also wait until job is running and url file is written
-    // Try maximum 3 times
-
-    // Check if the job is running
-    if (waitForJobState(job_id, "RUNNING") == false || waitForFile("./urls/url-" + job_id + ".txt", 10) == false)
-    {
-        std::cout << "Submit job failure." << std::endl;
-        exit(-1);
-    }
-
-    return job_id;
-}
-
 std::string readUrl(const std::string &filename)
 {
     std::ifstream file(filename);
@@ -134,134 +65,95 @@ std::string readUrl(const std::string &filename)
     return url;
 }
 
-class SingleSlurmJob
+std::mutex job_submission_mutex;
+int hq_submit_delay_ms = 0;
+
+class HyperQueueJob
 {
 public:
-    SingleSlurmJob(std::string model_name = "forward")
+    HyperQueueJob(std::string model_name, bool start_client=true)
     {
-        // start a SLURM job for single request
-        job_id = submitJob("sbatch model.slurm");
-
-        const std::string server_url = readUrl("./urls/url-" + job_id + ".txt"); // read server url from txt file
-        // May use $SLURM_LOCALID in a .slurm file later
+        job_id = submitHQJob();
 
-        std::cout << "Hosting sub-server at : " << server_url << std::endl;
+        // Get the server URL
+        server_url = readUrl("./urls/url-" + job_id + ".txt");
 
-        // List supported models
-        std::vector<std::string> models = umbridge::SupportedModels(server_url);
-        std::cout << "Supported models: " << std::endl;
-        for (auto model : models)
+        // Start a client, using unique pointer
+        if(start_client)
         {
-            std::cout << "  " << model << std::endl;
+            client_ptr = std::make_unique<umbridge::HTTPModel>(server_url, model_name);
         }
-        std::cout << "Using model: " << model_name << std::endl;
-
-        // Start a client, using unique pointer
-        client_ptr = std::make_unique<umbridge::HTTPModel>(server_url, model_name); // use the first model avaliable on server by default
     }
 
-    ~SingleSlurmJob()
+    ~HyperQueueJob()
     {
-        // Cancel the SLURM job
-        std::system(("scancel " + job_id).c_str());
+        // Cancel the HyperQueue job
+        std::system(("hq job cancel " + job_id).c_str());
 
         // Delete the url text file
         std::system(("rm ./urls/url-" + job_id + ".txt").c_str());
     }
 
+    std::string server_url;
     std::unique_ptr<umbridge::HTTPModel> client_ptr;
 
 private:
-    std::string job_id;
-};
-
-// state = ["WAITING", "RUNNING", "FINISHED", "CANCELED"]
-bool waitForHQJobState(const std::string &job_id, const std::string &state = "COMPLETED")
-{
-    const std::string command = "hq job info " + job_id + " | grep State | awk '{print $4}'";
-    // std::cout << "Checking runtime: " << command << std::endl;
-    std::string job_status;
-    do
-    {
-        job_status = getCommandOutput(command);
-
-        // Delete the line break
-        if (!job_status.empty())
-            job_status.pop_back();
-
-        // Don't wait if there is an error or the job is ended
-        if (job_status == "" || (state != "FINISHED" && job_status == "FINISHED") || job_status == "FAILED" || job_status == "CANCELED")
-        {
-            std::cerr << "Wait for job status failure, status : " << job_status << std::endl;
-            return false;
+    std::string submitHQJob()
+    {
+        if (hq_submit_delay_ms) {
+            std::lock_guard<std::mutex> lock(job_submission_mutex);
+            std::this_thread::sleep_for(std::chrono::milliseconds(hq_submit_delay_ms));
         }
-        // std::cout << "Job status: " << job_status << std::endl;
-        sleep(1);
-    } while (job_status != state);
-
-    return true;
-}
-
-class HyperQueueJob
-{
-public:
-    HyperQueueJob(std::string model_name = "forward")
-    {
-        // start a HyperQueue job
-        job_id = submitJob("hq submit hq_scripts/job.sh");
-
-        const std::string server_url = readUrl("./urls/url-" + job_id + ".txt"); // read server url from txt file
-
-        std::cout << "Hosting sub-server at : " << server_url << std::endl;
-
-        // Start a client, using unique pointer
-        client_ptr = std::make_unique<umbridge::HTTPModel>(server_url, model_name); // always uses the model "forward"
+
+        // Submit the HQ job and extract its job ID
+        std::string job_id = getCommandOutput("hq submit --output-mode=quiet hq_scripts/job.sh");
+
+        // Delete the line break
+        if (!job_id.empty())
+            job_id.pop_back();
+
+        // Wait until the job is running and the url file has been written
+        waitForHQJobState(job_id, "RUNNING");
+        waitForFile("./urls/url-" + job_id + ".txt");
+
+        return job_id;
     }
 
-    ~HyperQueueJob()
+    // state = ["WAITING", "RUNNING", "FINISHED", "CANCELED"]
+    bool waitForHQJobState(const std::string &job_id, const std::string &state)
     {
-        // Cancel the SLURM job
-        std::system(("hq job cancel " + job_id).c_str());
+        const std::string command = "hq job info " + job_id + " | grep State | awk '{print $4}'";
+        // std::cout << "Checking runtime: " << command << std::endl;
+        std::string job_status;
 
-        // Delete the url text file
-        std::system(("rm ./urls/url-" + job_id + ".txt").c_str());
-    }
+        do
+        {
+            job_status = getCommandOutput(command);
 
-    std::unique_ptr<umbridge::HTTPModel> client_ptr;
+            // Delete the line break
+            if (!job_status.empty())
+                job_status.pop_back();
+
+            // Don't wait if there is an error or the job is ended
+            if (job_status == "" || (state != "FINISHED" && job_status == "FINISHED") || job_status == "FAILED" || job_status == "CANCELED")
+            {
+                std::cerr << "Wait for job status failure, status : " << job_status << std::endl;
+                return false;
+            }
+
+            sleep(1);
+        } while (job_status != state);
+
+        return true;
+    }
 
-private:
     std::string job_id;
 };
 
@@ -269,31 +161,23 @@
 class LoadBalancer : public umbridge::Model
 {
 public:
-    LoadBalancer(std::string name = "forward") : umbridge::Model(name)
-    {
-        // Setup HyperQueue server
-        std::system("hq server start &");
-        sleep(1); // Workaround: give the HQ server enough time to start.
-
-        // Create allocation queue
-        std::system("hq_scripts/allocation_queue.sh");
-    }
+    LoadBalancer(std::string name) : umbridge::Model(name) {}
 
     std::vector<std::size_t> GetInputSizes(const json &config_json = json::parse("{}")) const override
     {
-        HyperQueueJob hq_job;
+        HyperQueueJob hq_job(name);
        return hq_job.client_ptr->GetInputSizes(config_json);
     }
 
     std::vector<std::size_t> GetOutputSizes(const json &config_json = json::parse("{}")) const override
     {
-        HyperQueueJob hq_job;
+        HyperQueueJob hq_job(name);
        return hq_job.client_ptr->GetOutputSizes(config_json);
     }
 
     std::vector<std::vector<double>> Evaluate(const std::vector<std::vector<double>> &inputs, json config_json = json::parse("{}")) override
     {
-        HyperQueueJob hq_job;
+        HyperQueueJob hq_job(name);
        return hq_job.client_ptr->Evaluate(inputs, config_json);
     }
 
@@ -303,7 +187,7 @@ class LoadBalancer : public umbridge::Model
                                 const std::vector<double> &sens,
                                 json config_json = json::parse("{}")) override
     {
-        HyperQueueJob hq_job;
+        HyperQueueJob hq_job(name);
        return hq_job.client_ptr->Gradient(outWrt, inWrt, inputs, sens, config_json);
     }
 
@@ -313,7 +197,7 @@ class LoadBalancer : public umbridge::Model
                                      const std::vector<double> &vec,
                                      json config_json = json::parse("{}")) override
     {
-        HyperQueueJob hq_job;
+        HyperQueueJob hq_job(name);
        return hq_job.client_ptr->ApplyJacobian(outWrt, inWrt, inputs, vec, config_json);
     }
 
@@ -325,28 +209,28 @@ class LoadBalancer : public umbridge::Model
                                     const std::vector<double> &vec,
                                     json config_json = json::parse("{}"))
     {
-        HyperQueueJob hq_job;
+        HyperQueueJob hq_job(name);
        return hq_job.client_ptr->ApplyHessian(outWrt, inWrt1, inWrt2, inputs, sens, vec, config_json);
     }
 
     bool SupportsEvaluate() override
     {
-        HyperQueueJob hq_job;
+        HyperQueueJob hq_job(name);
        return hq_job.client_ptr->SupportsEvaluate();
     }
     bool SupportsGradient() override
     {
-        HyperQueueJob hq_job;
+        HyperQueueJob hq_job(name);
        return hq_job.client_ptr->SupportsGradient();
     }
     bool SupportsApplyJacobian() override
     {
-        HyperQueueJob hq_job;
+        HyperQueueJob hq_job(name);
        return hq_job.client_ptr->SupportsApplyJacobian();
     }
     bool SupportsApplyHessian() override
     {
-        HyperQueueJob hq_job;
+        HyperQueueJob hq_job(name);
        return hq_job.client_ptr->SupportsApplyHessian();
     }
 };
diff --git a/hpc/README.md b/hpc/README.md
index a8b90a7..48917f0 100644
--- a/hpc/README.md
+++ b/hpc/README.md
@@ -1,59 +1,72 @@
-# README
+# HPC
 
-This load balancer allows any UM-Bridge client to control many parallel instances of any numerical model running on an HPC system.
+This load balancer allows scaling up UM-Bridge applications to HPC systems. To the client, it behaves like a regular UM-Bridge server, except that it can process concurrent model evaluation requests. When it receives requests, it will adaptively spawn model server instances on the HPC system, and forward evaluation requests to them. To each model server instance, the load balancer in turn appears as a regular UM-Bridge client.
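+
+For instance, a Python client can talk to the load balancer exactly as it would to any plain UM-Bridge server. A minimal sketch (the model name `forward`, the 2D input, and the default port 4242 are placeholder assumptions):
+
+```python
+import umbridge
+
+# To the client, the load balancer is indistinguishable from a regular server.
+model = umbridge.HTTPModel("http://localhost:4242", "forward")
+
+print(model([[0.0, 10.0]]))  # one evaluation request; many such requests may run concurrently
+```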
-## File descriptions
+## Installation
 
-- `LoadBalancer.hpp`
+1. **Build the load balancer**
+
+   Clone the UM-Bridge repository.
+
+   ```
+   git clone https://github.com/UM-Bridge/umbridge.git
+   ```
+
+   Then navigate to the `hpc` directory.
 
-  The main header file that implements the load balancer as a C++ class `LoadBalancer`.
+   ```
+   cd umbridge/hpc
+   ```
+
+   Finally, compile the load balancer. Depending on your HPC system, you likely have to load a module providing a recent C++ compiler.
 
-- `LoadBalancer.cpp`
+   ```
+   make
+   ```
 
-  Load balancer executable.
+2. **Download HyperQueue**
+
+   Download HyperQueue from the most recent release at https://github.com/It4innovations/hyperqueue/releases and place the `hq` binary in the `hpc` directory next to the load balancer.
 
-- `LoadBalancer.slurm`
+## Usage
 
-  A slurm configuration file, which is used to start a LoadBalancer on a compute node.
+The load balancer is primarily intended to run on a login node.
 
-- `model.slurm`
+1. **Configure resource allocation**
 
-  A slurm configuration file, which is used to start a slurm job running a model server on a compute node.
+   The load balancer instructs HyperQueue to allocate batches of resources on the HPC system, depending on demand for model evaluations. HyperQueue will submit SLURM or PBS jobs on the HPC system when needed, scheduling requested model runs within those jobs. When demand decreases, HyperQueue will cancel some of those jobs again.
+
+   Adapt the configuration in ``hpc/hq_scripts/allocation_queue.sh`` to your needs.
+   For example, when running a very fast UM-Bridge model on an HPC cluster, it is advisable to choose medium-sized jobs for resource allocation. That will avoid submitting large numbers of jobs to the HPC system's scheduler, while HyperQueue itself will handle large numbers of small model runs within those allocated jobs.
 
-## How to start the load balancer
+2. **Configure model job**
 
->The LoadBalancer server is supposed to run at login node, but it can also run at computing node.
+   Adapt the configuration in ``hpc/hq_scripts/job.sh`` to your needs:
+   * Specify what UM-Bridge model server to run,
+   * set `#HQ` variables at the top to specify what resources each instance should receive,
+   * and set the directory of your load balancer binary in `load_balancer_dir`.
 
-1. Load module that is necessary to compile `cpp` files
-> e.g. On Helix it's `module load compiler/gnu`
+   Importantly, the UM-Bridge model server must serve its models at the port specified by the environment variable `PORT` (see the sketch at the end of this README). The value of `PORT` is automatically determined by `job.sh`, avoiding potential conflicts if multiple servers run on the same compute node.
 
-2. (**Optional**) Set the port of load balancer: `export PORT=4243`
-> Sometimes the default port 4242 of the login node is occupied.
+   If your job is supposed to span multiple compute nodes via MPI, make sure that you forward the nodes HyperQueue allocates to you in `HQ_NODE_FILE` to MPI. See https://it4innovations.github.io/hyperqueue/stable/jobs/multinode/ for instructions.
 
-3. Compile and run the server
-   - Compile the load balancer: `make`
+3. **Run load balancer**
 
-   - Prepare a model server. Specify the path of your model server file in `model.slurm`, as the variable `server_file`.
-   > You can also specify slurm parameters in the file `regular-server.slurm`.
-   - Run the load balancer: `./load-balancer`
+   Navigate to the `hpc` directory and execute the load balancer.
 
-   > You can specify slurm parameters in the file `LoadBalancer.slurm`
-   > The the LoadBalancer server will occupy a terminal, so you need to start a new one if you want to run a client on the same node.
+   ```
+   ./load-balancer
+   ```
 
-> The Load Balancer will submit a new SLURM job whenever it receives an evaluation request, and cancel the SLURM job when the evaluation is finished.
-> The Load Balancer will listen to the hostname of node instead of localhost.
-> The regular server in SLURM job will also listen to the hostname and use a random port that is not in use.
+4. **Connect from client**
 
-## How to connect a client to the LoadBalancer
+   Once running, you can connect to the load balancer from any UM-Bridge client on the login node via `http://localhost:4242`. To the client, it will appear like any other UM-Bridge server, except that it can process concurrent evaluation requests.
 
-A client is supposed to run on the login node or at your own device, since it does not perform intensive calculations.
-
-Clients running directly on the login node may connect to the load balancer via `localhost`.
-
-Alternatively, you can create an SSH tunnel to the login node, and then run the client on your own device. For example:
+## (Optional) Running clients on your own machine while offloading runs to HPC
 
+Alternatively, a client may run on your own device. In order to connect UM-Bridge clients on your machine to the login node, you can create an SSH tunnel to the HPC system.
 
 ```
 ssh <username>@hpc.cluster.address -N -f -L 4242:localhost:4242
 # -N : do not execute remote commands, useful to simply forward ports
 # -f : request ssh to go to the background once the ssh connection has been established
 ```
 
@@ -62,10 +75,4 @@ Alternatively, you can create an SSH tunnel to the login node, and then run the
-While the SSH tunnel is running, you can run the client on your own device, and connect it to the load balancer via `localhost:4242`.
-
-## Example
-
-An example server is in the folder `test/MultiplyBy2`. The server `minimal-server.cpp` take the input written in `client.py`, multiply them by 2 and then return.
-
-Currently, it will run and test 4 models in parallel, but the LoadBalancer server will process them in sequence.
\ No newline at end of file
+While the SSH tunnel is running, you can run the client on your own device, and connect it to the load balancer via `http://localhost:4242`.
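+
+## Example: a model server honoring `PORT`
+
+A minimal sketch of a model server suitable for `job.sh` (the model itself is a placeholder; the only requirement is that the server reads the `PORT` environment variable set by `job.sh`):
+
+```python
+import os
+import umbridge
+
+class ExampleModel(umbridge.Model):
+    def __init__(self):
+        super().__init__("forward")  # model name; placeholder
+
+    def get_input_sizes(self, config):
+        return [1]
+
+    def get_output_sizes(self, config):
+        return [1]
+
+    def __call__(self, parameters, config):
+        return [[2.0 * parameters[0][0]]]  # trivial stand-in model
+
+    def supports_evaluate(self):
+        return True
+
+# Serve on the port chosen by job.sh; fall back to 4242 if unset.
+umbridge.serve_models([ExampleModel()], int(os.environ.get("PORT", 4242)))
+```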
diff --git a/hpc/hq_scripts/job.sh b/hpc/hq_scripts/job.sh
index af0d7a6..b55d97a 100755
--- a/hpc/hq_scripts/job.sh
+++ b/hpc/hq_scripts/job.sh
@@ -1,17 +1,37 @@
 #! /bin/bash
 #HQ --resource model=1
+#HQ --cpus=1
 #HQ --time-request=1m
 #HQ --time-limit=2m
 #HQ --stdout none
 #HQ --stderr none
 
-# Launch model server, send back slurm job ID
-# and wait to ensure that HQ won't schedule any more jobs to this allocation
+# Launch model server, send back server URL
+# and wait to ensure that HQ won't schedule any more jobs to this allocation.
 
-/your/model/server/call & # CHANGE ME!
+function get_available_port {
+    # Define the range of ports to select from
+    MIN_PORT=1024
+    MAX_PORT=65535
+
+    # Generate a random port number
+    port=$(shuf -i $MIN_PORT-$MAX_PORT -n 1)
+
+    # Check if the port is in use
+    while lsof -Pi :$port -sTCP:LISTEN -t >/dev/null; do
+        # If the port is in use, generate a new random port number
+        port=$(shuf -i $MIN_PORT-$MAX_PORT -n 1)
+    done
 
-port=4242
+    echo $port
+}
+
+port=$(get_available_port)
+export PORT=$port
+
+# Assume that the server sets its port according to the environment variable 'PORT'.
+/your/model/server/call & # CHANGE ME!
 
 load_balancer_dir="/load/balancer/directory" # CHANGE ME!
@@ -23,7 +43,8 @@ while ! curl -s "http://$host:$port/Info" > /dev/null; do sleep 1 done +# Write server URL to file identified by HQ job ID. mkdir -p "$load_balancer_dir/urls" echo "http://$host:$port" > "$load_balancer_dir/urls/url-$HQ_JOB_ID.txt" -sleep infinity # keep the job occupied \ No newline at end of file +sleep infinity # keep the job occupied diff --git a/hpc/hq_scripts/setup_model.sh b/hpc/hq_scripts/setup_model.sh deleted file mode 100755 index 1714a4a..0000000 --- a/hpc/hq_scripts/setup_model.sh +++ /dev/null @@ -1,47 +0,0 @@ -#! /bin/bash - -# Path to the directory of the load balancer executable, e.g. "$HOME/xxx" -# WARNING: This has to be an absolute path because this script -# will run in the .hq-server directory NOT in the CWD. -load_balancer_dir="..." # CHANGE ME! - -# Path to the model executable/source, e.g. "$HOME/xxx/server" or "$HOME/xxx/server.py" -# WARNING: This has to be an absolute path because this script -# will run in the .hq-server directory NOT in the CWD. -server_file="..." # CHANGE ME! - -function get_avaliable_port { - # Define the range of ports to select from - MIN_PORT=1024 - MAX_PORT=65535 - - # Generate a random port number - port=$(shuf -i $MIN_PORT-$MAX_PORT -n 1) - - # Check if the port is in use - while lsof -Pi :$port -sTCP:LISTEN -t >/dev/null; do - # If the port is in use, generate a new random port number - port=$(shuf -i $MIN_PORT-$MAX_PORT -n 1) - done - - echo $port -} - -port=$(get_avaliable_port) -export PORT=$port - -# Send the model URL to the load balancer -mkdir -p "$load_balancer_dir/urls" -echo "http://$(hostname):$port" > "$load_balancer_dir/urls/url-$SLURM_JOB_ID.txt" - -# Load any dependencies (e.g. activate a conda environment) -# ... - -# Start the model -# For an executable binary (e.g. C++) -$server_file & - -# For a Python model -#python $server_file & - -# May want to add a check here if the model is ready to be used, e.g. by sending a basic request. \ No newline at end of file diff --git a/hpc/lib/umbridge.h b/hpc/lib/umbridge.h index b557a8f..c614436 100644 --- a/hpc/lib/umbridge.h +++ b/hpc/lib/umbridge.h @@ -5,7 +5,8 @@ // Increase timeout to allow for long-running models. 
 // This should be (to be on the safe side) significantly greater than the maximum time your model may take
-#define CPPHTTPLIB_READ_TIMEOUT_SECOND 60*60
+
+#define CPPHTTPLIB_READ_TIMEOUT_SECOND 7 * 24 * 60 * 60
 
 #include <string>
 #include <vector>
diff --git a/hpc/model.slurm b/hpc/model.slurm
deleted file mode 100644
index b331001..0000000
--- a/hpc/model.slurm
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/bash
-#SBATCH --partition=devel
-#SBATCH --ntasks=1
-#SBATCH --time=00:10:00
-#SBATCH --mem=1gb
-#SBATCH --output=./sub-jobs/%j.out
-
-# start the server named "server"
-server_file="./server" # Assume your server files are placed in the project root directory, named "server"
-
-function get_avaliable_port {
-    # Define the range of ports to select from
-    MIN_PORT=1024
-    MAX_PORT=65535
-
-    # Generate a random port number
-    port=$(shuf -i $MIN_PORT-$MAX_PORT -n 1)
-
-    # Check if the port is in use
-    while lsof -Pi :$port -sTCP:LISTEN -t >/dev/null; do
-        # If the port is in use, generate a new random port number
-        port=$(shuf -i $MIN_PORT-$MAX_PORT -n 1)
-    done
-
-    echo $port
-}
-
-port=$(get_avaliable_port)
-export PORT=$port
-
-mkdir -p ./urls
-echo "http://$(hostname):$port" > ./urls/url-$SLURM_JOB_ID.txt # send the url to load-balancer
-
-$server_file
-
diff --git a/kubernetes/README.md b/kubernetes/README.md
index 2bc81ef..b3b3b3d 100644
--- a/kubernetes/README.md
+++ b/kubernetes/README.md
@@ -1,4 +1,4 @@
-# Cloud HPC
+# Kubernetes
 
 UM-Bridge provides a kubernetes-based solution for running any UM-Bridge model container on cloud platforms at HPC scale.
 
@@ -80,7 +80,7 @@ kubectl get services --namespace=haproxy-controller
 
 The model instances may be accessed from any UM-Bridge client, and up to `replicas` requests will be handled in parallel.
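+
+For instance, a minimal Python sketch of such a parallel client (the service address, model name, and 1D input are placeholder assumptions; use the cluster address obtained above):
+
+```python
+from concurrent.futures import ThreadPoolExecutor
+import umbridge
+
+model = umbridge.HTTPModel("http://<cluster address>", "forward")
+
+# Issue several evaluation requests at once; up to `replicas` of them
+# are processed in parallel by the model instances behind the service.
+with ThreadPoolExecutor(max_workers=12) as executor:
+    results = list(executor.map(model, [[[float(i)]] for i in range(12)]))
+
+print(results)
+```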
 
-# Multinode MPI on Cloud HPC
+# Multinode MPI on Kubernetes
 
 The instructions above work for any UM-Bridge model container, even ones that are MPI parallel. However, a single container is naturally limited to a single physical node. In order to parallelize across nodes (and therefore across containers) via MPI, the additional steps below are needed.
diff --git a/kubernetes/model.yaml b/kubernetes/model.yaml
index 86900ca..6cdb123 100644
--- a/kubernetes/model.yaml
+++ b/kubernetes/model.yaml
@@ -6,7 +6,7 @@ spec:
   selector:
     matchLabels:
       app: model
-  replicas: 4
+  replicas: 12
   template:
     metadata:
       labels:
diff --git a/tutorial/qmcpy-client-l2-sea.py b/tutorial/qmcpy-client-l2-sea.py
new file mode 100644
index 0000000..76f0878
--- /dev/null
+++ b/tutorial/qmcpy-client-l2-sea.py
@@ -0,0 +1,50 @@
+import argparse
+import qmcpy as qp
+from qmcpy.integrand.um_bridge_wrapper import UMBridgeWrapper
+import numpy as np
+import umbridge
+
+# Read URL from command line argument
+parser = argparse.ArgumentParser(description='QMCPy with UM-Bridge model demo.')
+parser.add_argument('url', metavar='url', type=str,
+                    help='the URL at which the model is running, for example http://localhost:4242')
+args = parser.parse_args()
+print(f"Connecting to host URL {args.url}")
+
+# Wrap a given model, exposing only its first two inputs and fixing the remaining (design) parameters to 0
+class TwoInputModel:
+    def __init__(self, model):
+        self.model = model
+
+    def get_input_sizes(self, config={}):
+        return [2]
+
+    def get_output_sizes(self, config={}):
+        return self.model.get_output_sizes(config)
+
+    def __call__(self, theta, config={}):
+        return self.model([[theta[0][0], theta[0][1], 0,0,0,0,0,0,0,0,0,0,0,0,0,0]], config)
+
+    def supports_evaluate(self):
+        return True
+
+
+# Set up umbridge model and model config
+l2sea_model = umbridge.HTTPModel(args.url, "forward")
+l2sea_fixed_design_params = TwoInputModel(l2sea_model)
+config = {"fidelity": 7}
+
+# Get input dimension from model
+d = l2sea_fixed_design_params.get_input_sizes(config)[0]
+
+# Froude [0.25,0.41]
+# Draft [-6.6, -5.7]
+# Choose a distribution to sample via QMC
+dnb2 = qp.DigitalNetB2(d)
+uniform_sobol = qp.Uniform(dnb2, lower_bound=[0.25,-6.6], upper_bound=[0.41,-5.7])
+
+integrand = UMBridgeWrapper(uniform_sobol, l2sea_fixed_design_params, config, parallel=False)
+
+qmc_sobol_algorithm = qp.CubQMCSobolG(integrand, abs_tol=1e-1, n_init = 256, n_max = 256)
+solution, data = qmc_sobol_algorithm.integrate()
+print(data)
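+
+# Example invocation, assuming the L2-Sea benchmark model from the UM-Bridge
+# benchmark library is running locally (the container name is an assumption):
+#   docker run -it -p 4242:4242 linusseelinger/benchmark-l2-sea
+#   python3 qmcpy-client-l2-sea.py http://localhost:4242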