diff --git a/api/boinc_api.cpp b/api/boinc_api.cpp index be7b345d3e9..6b1709f6d09 100644 --- a/api/boinc_api.cpp +++ b/api/boinc_api.cpp @@ -1052,7 +1052,8 @@ int boinc_report_app_status_aux( double _fraction_done, int other_pid, double _bytes_sent, - double _bytes_received + double _bytes_received, + double wss ) { char msg_buf[MSG_CHANNEL_SIZE], buf[1024]; if (standalone) return 0; @@ -1081,6 +1082,10 @@ int boinc_report_app_status_aux( sprintf(buf, "%d\n", ac_state); strlcat(msg_buf, buf, sizeof(msg_buf)); } + if (wss) { + sprintf(buf, "%f\n", wss); + strlcat(msg_buf, buf, sizeof(msg_buf)); + } #ifdef MSGS_FROM_FILE if (fout) { fputs(msg_buf, fout); @@ -1100,7 +1105,7 @@ int boinc_report_app_status( double _fraction_done ){ return boinc_report_app_status_aux( - cpu_time, checkpoint_cpu_time, _fraction_done, 0, 0, 0 + cpu_time, checkpoint_cpu_time, _fraction_done, 0, 0, 0, 0 ); } diff --git a/api/boinc_api.h b/api/boinc_api.h index acb65bb8452..758cf7e55b9 100644 --- a/api/boinc_api.h +++ b/api/boinc_api.h @@ -138,7 +138,8 @@ extern int boinc_upload_status(std::string& name); extern char* boinc_msg_prefix(char*, int); extern int boinc_report_app_status_aux( double cpu_time, double checkpoint_cpu_time, double _fraction_done, - int other_pid, double bytes_sent, double bytes_received + int other_pid, double bytes_sent, double bytes_received, + double wss ); extern int boinc_temporary_exit( int delay, const char* reason=NULL, bool is_notice=false diff --git a/client/app.cpp b/client/app.cpp index eda742eb22b..8fbfba2c773 100644 --- a/client/app.cpp +++ b/client/app.cpp @@ -110,6 +110,7 @@ ACTIVE_TASK::ACTIVE_TASK() { peak_disk_usage = 0; once_ran_edf = false; + wss_from_app = 0; fraction_done = 0; fraction_done_elapsed_time = 0; first_fraction_done = 0; @@ -420,6 +421,8 @@ void ACTIVE_TASK_SET::get_memory_usage() { // at least on Windows. Use the VM size instead. // pi.working_set_size_smoothed = atp->wup->rsc_memory_bound; + } else if (atp->wss_from_app > 0) { + pi.working_set_size_smoothed = .5*(pi.working_set_size_smoothed + atp->wss_from_app); } else { pi.working_set_size_smoothed = .5*(pi.working_set_size_smoothed + pi.working_set_size); } diff --git a/client/app.h b/client/app.h index d7b306bea2f..72534512ef8 100644 --- a/client/app.h +++ b/client/app.h @@ -53,12 +53,14 @@ typedef int PROCESS_ID; // Represents a job in progress. -// When an active task is created, it is assigned a "slot" +// When a job is started, it is assigned a "slot" // which determines the directory it runs in. -// This doesn't change over the life of the active task; -// thus the task can use the slot directory for temp files +// This doesn't change over the life of the job; +// so it can use the slot directory for temp files // that BOINC doesn't know about. +// If you add anything, initialize it in the constructor +// struct ACTIVE_TASK { #ifdef _WIN32 HANDLE process_handle, shm_handle; @@ -100,8 +102,12 @@ struct ACTIVE_TASK { // most recent CPU time reported by app bool once_ran_edf; - // END OF ITEMS SAVED IN STATE FILE + // END OF ITEMS SAVED IN STATE FILES + double wss_from_app; + // work set size reported by the app + // (e.g. docker_wrapper does this). + // If nonzero, use this instead of procinfo data double fraction_done; // App's estimate of how much of the work unit is done. // Passed from the application via an API call; diff --git a/client/app_control.cpp b/client/app_control.cpp index b03a9875db1..14c628461cb 100644 --- a/client/app_control.cpp +++ b/client/app_control.cpp @@ -1439,8 +1439,23 @@ bool ACTIVE_TASK::get_app_status_msg() { } } } - parse_double(msg_buf, "", current_cpu_time); - parse_double(msg_buf, "", checkpoint_cpu_time); + if (parse_double(msg_buf, "", current_cpu_time)) { + if (current_cpu_time < 0) { + msg_printf(result->project, MSG_INFO, + "app reporting negative CPU: %f", current_cpu_time + ); + current_cpu_time = 0; + } + } + if (parse_double(msg_buf, "", checkpoint_cpu_time)) { + if (checkpoint_cpu_time < 0) { + msg_printf(result->project, MSG_INFO, + "app reporting negative checkpoint CPU: %f", checkpoint_cpu_time + ); + checkpoint_cpu_time = 0; + } + } + parse_double(msg_buf, "", wss_from_app); parse_double(msg_buf, "", result->fpops_per_cpu_sec); parse_double(msg_buf, "", result->fpops_cumulative); parse_double(msg_buf, "", result->intops_per_cpu_sec); @@ -1470,18 +1485,6 @@ bool ACTIVE_TASK::get_app_status_msg() { if (parse_int(msg_buf, "", i)) { sporadic_ac_state = (SPORADIC_AC_STATE)i; } - if (current_cpu_time < 0) { - msg_printf(result->project, MSG_INFO, - "app reporting negative CPU: %f", current_cpu_time - ); - current_cpu_time = 0; - } - if (checkpoint_cpu_time < 0) { - msg_printf(result->project, MSG_INFO, - "app reporting negative checkpoint CPU: %f", checkpoint_cpu_time - ); - checkpoint_cpu_time = 0; - } return true; } diff --git a/client/app_test.cpp b/client/app_test.cpp index 62b9abebd31..57a2a862cfd 100644 --- a/client/app_test.cpp +++ b/client/app_test.cpp @@ -26,7 +26,7 @@ // input/output files, attributes, etc. // It currently has several test cases, selected with #ifdef // - build the BOINC client with these changes -// - make a BOINC data directory, say 'test' +// - Linux: make a BOINC data directory, say 'test' // (or you can use an existing BOINC data directory, // in which case the client will also run existing jobs) // - make a directory test/slots/app_test @@ -224,6 +224,20 @@ void CLIENT_STATE::app_test_init() { *make_file(app->project, "Dockerfile_copy", "Dockerfile", INPUT_FILE, true) ); #endif +#ifdef APP_DOCKER_WRAPPER_MOUNT + av->app_files.push_back( + *make_file(app->project, "docker_wrapper.exe", NULL, MAIN_PROG, false) + ); + av->app_files.push_back( + *make_file(app->project, "worker", NULL, INPUT_FILE, false) + ); + av->app_files.push_back( + *make_file(app->project, "job_copy.toml", "job.toml", INPUT_FILE, true) + ); + av->app_files.push_back( + *make_file(app->project, "Dockerfile_copy", "Dockerfile", INPUT_FILE, true) + ); +#endif // can put other stuff here like #if 0 @@ -243,11 +257,17 @@ void CLIENT_STATE::app_test_init() { ); #endif #ifdef APP_DOCKER_WRAPPER_COPY + wu->command_line = "--verbose"; wu->input_files.push_back( *make_file(proj, "infile", "in", INPUT_FILE, true) ); #endif - +#ifdef APP_DOCKER_WRAPPER_MOUNT + wu->command_line = "--verbose"; + wu->input_files.push_back( + *make_file(proj, "infile", "in", INPUT_FILE, false) + ); +#endif RESULT *result = make_result(av, wu); ////////////// OUTPUT FILES ///////////////// @@ -262,6 +282,11 @@ void CLIENT_STATE::app_test_init() { *make_file(proj, "outfile", "out", OUTPUT_FILE, true) ); #endif +#ifdef APP_DOCKER_WRAPPER_MOUNT + result->output_files.push_back( + *make_file(proj, "outfile", "out", OUTPUT_FILE, false) + ); +#endif // tell the client not to get work or run benchmarks // diff --git a/samples/docker_wrapper/docker_wrapper.cpp b/samples/docker_wrapper/docker_wrapper.cpp index 70d9df0039b..d31ebe6948d 100644 --- a/samples/docker_wrapper/docker_wrapper.cpp +++ b/samples/docker_wrapper/docker_wrapper.cpp @@ -30,7 +30,7 @@ // this is the first run of the job // if the image doesn't already exist // build image with 'docker build' -// (need a log around the above?) +// (need a lock around the above?) // create the container with -v to mount slot, project dirs // copy input files as needed // start container @@ -42,7 +42,7 @@ // image name // name: lower case letters, digits, separators (. _ -); max 4096 chars // tag: max 128 chars -// in the universal model, each WU has a different image +// in the universal model, each WU must have a different image // so we'll use: boinc____ // // container name: @@ -54,7 +54,7 @@ // image name: boinc // container name: boinc // slot dir: . -// project dir: project/ +// project dir (mount mode): project/ // enable standalone tests on Win // @@ -79,14 +79,16 @@ using std::string; using std::vector; #define POLL_PERIOD 1.0 +#define STATUS_PERIOD 10 + // reports status this often enum JOB_STATUS {JOB_IN_PROGRESS, JOB_SUCCESS, JOB_FAIL}; struct RSC_USAGE { - double cpu_time; + double cpu_frac; double wss; void clear() { - cpu_time = 0; + cpu_frac = 0; wss = 0; } }; @@ -127,7 +129,7 @@ char container_name[512]; APP_INIT_DATA aid; CONFIG config; bool running; -bool verbose = true; +bool verbose = false; const char* config_file = "job.toml"; const char* dockerfile = "Dockerfile"; const char* cli_prog; @@ -217,7 +219,24 @@ inline int run_docker_command(char* cmd, vector &out) { retval = read_from_pipe( ctl_wc.out_read, ctl_wc.proc_handle, output, TIMEOUT, "EOM" ); - if (retval) return retval; + if (retval) { + const char* msg = ""; + switch (retval) { + case PROC_DIED: + msg = "Process died"; + break; + case TIMEOUT: + msg = "Timeout"; + break; + case READ_ERROR: + msg = "Read Error"; + break; + default: + break; + } + fprintf(stderr, "read_from_pipe() error: %s\n", msg); + return retval; + } out = split(output, '\n'); #else retval = run_command(cmd, out); @@ -248,8 +267,9 @@ int image_exists(bool &exists) { sprintf(cmd, "%s images", cli_prog); int retval = run_docker_command(cmd, out); if (retval) return retval; + string image_name_space = image_name + string(" "); for (string line: out) { - if (line.find(image_name) != string::npos) { + if (line.find(image_name_space) != string::npos) { exists = true; return 0; } @@ -301,7 +321,7 @@ int container_exists(bool &exists) { int retval; vector out; - sprintf(cmd, "%s ps --filter \"name=%s\"", + sprintf(cmd, "%s ps --all --filter \"name=%s\"", cli_prog, container_name ); retval = run_docker_command(cmd, out); @@ -442,7 +462,10 @@ void poll_client_msgs() { } } -JOB_STATUS poll_app(RSC_USAGE &ru) { +// check whether job has exited +// Note: on both Podman and Docker this takes significant CPU time +// (like .03 sec) so do it infrequently (like 5 sec) +JOB_STATUS poll_app() { char cmd[1024]; vector out; int retval; @@ -461,6 +484,48 @@ JOB_STATUS poll_app(RSC_USAGE &ru) { return JOB_FAIL; } +// get CPU and mem usage +// This is also surprisingly slow +int get_stats(RSC_USAGE &ru) { + char cmd[1024]; + vector out; + int retval; + size_t n; + + sprintf(cmd, + "%s stats --no-stream --format \"{{.CPUPerc}} {{.MemUsage}}\" %s", + cli_prog, container_name + ); + retval = run_docker_command(cmd, out); + if (retval) return -1; + if (out.empty()) return -1; + const char *buf = out[0].c_str(); + // output is like + // 0.00% 420KiB / 503.8GiB + double cpu_pct, mem; + char mem_unit; + n = sscanf(buf, "%lf%% %lf%c", &cpu_pct, &mem, &mem_unit); + if (n != 3) return -1; + switch (mem_unit) { + case 'G': + case 'g': + mem *= GIGA; break; + case 'M': + case 'm': + mem *= MEGA; break; + case 'K': + case 'k': + mem *= KILO; break; + case 'B': + case 'b': + break; + default: return -1; + } + ru.cpu_frac = cpu_pct/100.; + ru.wss = mem; + return 0; +} + #ifdef _WIN32 // find a WSL distro with Docker and set up a command link to it // @@ -528,6 +593,7 @@ int main(int argc, char** argv) { boinc_init_options(&options); if (boinc_is_standalone()) { + verbose = true; strcpy(image_name, "boinc"); strcpy(container_name, "boinc"); strcpy(aid.project_dir, "./project"); @@ -543,6 +609,14 @@ int main(int argc, char** argv) { } if (verbose) config.print(); + if (sporadic) { + retval = boinc_sporadic_dir("."); + if (retval) { + fprintf(stderr, "can't create sporadic files\n"); + boinc_finish(retval); + } + } + #ifdef _WIN32 retval = wsl_init(); if (retval) { @@ -578,18 +652,38 @@ int main(int argc, char** argv) { boinc_finish(1); } running = true; - while (1) { + double cpu_time = 0; + for (int i=0; ; i++) { poll_client_msgs(); - switch(poll_app(ru)) { - case JOB_FAIL: - cleanup(); - boinc_finish(1); - break; - case JOB_SUCCESS: - copy_files_from_container(); - cleanup(); - boinc_finish(0); - break; + if (i%STATUS_PERIOD == 0) { + switch(poll_app()) { + case JOB_FAIL: + cleanup(); + boinc_finish(1); + break; + case JOB_SUCCESS: + copy_files_from_container(); + cleanup(); + boinc_finish(0); + break; + default: + break; + } + retval = get_stats(ru); + if (!retval) { + cpu_time += STATUS_PERIOD*ru.cpu_frac; + if (verbose) { + fprintf(stderr, "reporting CPU %f WSS %f\n", cpu_time, ru.wss); + } + boinc_report_app_status_aux( + cpu_time, + 0, // checkpoint CPU time + 0, // frac done + 0, // other PID + 0,0, // bytes send/received + ru.wss + ); + } } boinc_sleep(POLL_PERIOD); } diff --git a/samples/docker_wrapper/test_copy/in b/samples/docker_wrapper/test_copy/infile similarity index 100% rename from samples/docker_wrapper/test_copy/in rename to samples/docker_wrapper/test_copy/infile diff --git a/samples/vboxwrapper/vbox_common.cpp b/samples/vboxwrapper/vbox_common.cpp index 42664be0208..cd42a816ef1 100644 --- a/samples/vboxwrapper/vbox_common.cpp +++ b/samples/vboxwrapper/vbox_common.cpp @@ -316,7 +316,8 @@ void VBOX_VM::report_clean( fraction_done, vm_pid, bytes_sent, - bytes_received + bytes_received, + 0 ); } diff --git a/samples/vboxwrapper/vboxwrapper.cpp b/samples/vboxwrapper/vboxwrapper.cpp index f2392a2916c..efc478aae00 100644 --- a/samples/vboxwrapper/vboxwrapper.cpp +++ b/samples/vboxwrapper/vboxwrapper.cpp @@ -890,7 +890,8 @@ int main(int argc, char** argv) { fraction_done, pVM->vm_pid, bytes_sent, - bytes_received + bytes_received, + 0 ); // Wait for up to 5 minutes for the VM to switch states. @@ -1389,7 +1390,8 @@ int main(int argc, char** argv) { fraction_done, pVM->vm_pid, bytes_sent, - bytes_received + bytes_received, + 0 ); if (!retval) {