From 4adbf0f7d50c723d80ea16417fa3a1f25ecf2772 Mon Sep 17 00:00:00 2001 From: Abhinav Anil Sharma Date: Thu, 19 Dec 2024 09:52:30 -0500 Subject: [PATCH] i#7157 syscall sched: Handle static injected syscall traces in scheduler (#7158) Adds handling for statically-injected kernel syscall traces in the scheduler. Ensures that quantum and voluntary context switches are delayed until after the statically-injected syscall trace. This involved fixing the bookkeeping logic which is done on the next user-space instr now. Note that the injected kernel syscall traces do not include any scheduling-related markers such as TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE and TRACE_MARKER_TYPE_TIMESTAMP. For now we keep status quo on the scheduler behavior of showing the post-syscall markers before the switch. Adds a unit test for static-injected kernel syscall trace handling by the scheduler for various scenarios involving syscall sequences shorter and longer than the quantum, for system calls that do and do not cause context switches, and occurring at different offsets into the quantum. Issue: #7157 --- clients/drcachesim/scheduler/scheduler.h | 3 + .../scheduler/scheduler_dynamic.cpp | 71 +++-- clients/drcachesim/scheduler/scheduler_impl.h | 2 +- .../drcachesim/tests/scheduler_unit_tests.cpp | 256 +++++++++++++++++- 4 files changed, 306 insertions(+), 26 deletions(-) diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h index 3c7403e309c..dda51faa35c 100644 --- a/clients/drcachesim/scheduler/scheduler.h +++ b/clients/drcachesim/scheduler/scheduler.h @@ -758,6 +758,9 @@ template class scheduler_tmpl_t { * The scheduling quantum duration for preemption, in instruction count, * for #QUANTUM_INSTRUCTIONS. The time passed to next_record() is ignored * for purposes of quantum preempts. + * + * Instructions executed in a quantum may end up higher than the specified + * value to avoid interruption of the kernel system call sequence. 
*/ // We pick 10 million to match 2 instructions per nanosecond with a 5ms quantum. uint64_t quantum_duration_instrs = 10 * 1000 * 1000; diff --git a/clients/drcachesim/scheduler/scheduler_dynamic.cpp b/clients/drcachesim/scheduler/scheduler_dynamic.cpp index 52e5e6352a6..8f8dad07eb8 100644 --- a/clients/drcachesim/scheduler/scheduler_dynamic.cpp +++ b/clients/drcachesim/scheduler/scheduler_dynamic.cpp @@ -458,7 +458,9 @@ scheduler_dynamic_tmpl_t::check_for_input_switch( // boundaries so we live with those being before the switch. // XXX: Once we insert kernel traces, we may have to try harder // to stop before the post-syscall records. - if (this->record_type_is_instr_boundary(record, outputs_[output].last_record)) { + if (this->record_type_is_instr_boundary(record, outputs_[output].last_record) && + // We want to delay the context switch until after the injected syscall trace. + !outputs_[output].in_syscall_code) { if (input->switch_to_input != sched_type_t::INVALID_INPUT_ORDINAL) { // The switch request overrides any latency threshold. need_new_input = true; @@ -506,18 +508,28 @@ scheduler_dynamic_tmpl_t::check_for_input_switch( } if (options_.quantum_unit == sched_type_t::QUANTUM_INSTRUCTIONS && this->record_type_is_instr_boundary(record, outputs_[output].last_record) && - !outputs_[output].in_kernel_code) { + !outputs_[output].in_context_switch_code) { ++input->instrs_in_quantum; if (input->instrs_in_quantum > options_.quantum_duration_instrs) { - // We again prefer to switch to another input even if the current - // input has the oldest timestamp, prioritizing context switches - // over timestamp ordering. 
- VPRINT(this, 4, "next_record[%d]: input %d hit end of instr quantum\n", - output, input->index); - preempt = true; - need_new_input = true; - input->instrs_in_quantum = 0; - ++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_QUANTUM_PREEMPTS]; + if (outputs_[output].in_syscall_code) { + // XXX: Maybe this should be printed only once per-syscall-instance to + // reduce log spam. + VPRINT(this, 5, + "next_record[%d]: input %d delaying context switch " + "after end of instr quantum due to syscall trace\n", + output, input->index); + + } else { + // We again prefer to switch to another input even if the current + // input has the oldest timestamp, prioritizing context switches + // over timestamp ordering. + VPRINT(this, 4, "next_record[%d]: input %d hit end of instr quantum\n", + output, input->index); + preempt = true; + need_new_input = true; + input->instrs_in_quantum = 0; + ++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_QUANTUM_PREEMPTS]; + } } } else if (options_.quantum_unit == sched_type_t::QUANTUM_TIME) { if (cur_time == 0 || cur_time < input->prev_time_in_quantum) { @@ -535,14 +547,23 @@ scheduler_dynamic_tmpl_t::check_for_input_switch( // in between (e.g., scatter/gather long sequence of reads/writes) by // setting input->switching_pre_instruction. this->record_type_is_instr_boundary(record, outputs_[output].last_record)) { - VPRINT(this, 4, - "next_record[%d]: input %d hit end of time quantum after %" PRIu64 - "\n", - output, input->index, input->time_spent_in_quantum); - preempt = true; - need_new_input = true; - input->time_spent_in_quantum = 0; - ++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_QUANTUM_PREEMPTS]; + if (outputs_[output].in_syscall_code) { + // XXX: Maybe this should be printed only once per-syscall-instance to + // reduce log spam. 
+ VPRINT(this, 5, + "next_record[%d]: input %d delaying context switch after end of " + "time quantum after %" PRIu64 " due to syscall trace\n", + output, input->index, input->time_spent_in_quantum); + } else { + VPRINT(this, 4, + "next_record[%d]: input %d hit end of time quantum after %" PRIu64 + "\n", + output, input->index, input->time_spent_in_quantum); + preempt = true; + need_new_input = true; + input->time_spent_in_quantum = 0; + ++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_QUANTUM_PREEMPTS]; + } } } // For sched_type_t::DEPENDENCY_TIMESTAMPS: enforcing asked-for @@ -574,16 +595,20 @@ scheduler_dynamic_tmpl_t::process_marker( break; case TRACE_MARKER_TYPE_CONTEXT_SWITCH_START: outputs_[output].in_context_switch_code = true; - ANNOTATE_FALLTHROUGH; + break; case TRACE_MARKER_TYPE_SYSCALL_TRACE_START: - outputs_[output].in_kernel_code = true; + outputs_[output].in_syscall_code = true; break; case TRACE_MARKER_TYPE_CONTEXT_SWITCH_END: // We have to delay until the next record. outputs_[output].hit_switch_code_end = true; - ANNOTATE_FALLTHROUGH; + break; case TRACE_MARKER_TYPE_SYSCALL_TRACE_END: - outputs_[output].in_kernel_code = false; + outputs_[output].in_syscall_code = false; + break; + case TRACE_MARKER_TYPE_TIMESTAMP: + // Syscall sequences are not expected to have a timestamp. + assert(!outputs_[output].in_syscall_code); break; case TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH: { if (!options_.honor_direct_switches) diff --git a/clients/drcachesim/scheduler/scheduler_impl.h b/clients/drcachesim/scheduler/scheduler_impl.h index 260b5051aac..2ddbb710545 100644 --- a/clients/drcachesim/scheduler/scheduler_impl.h +++ b/clients/drcachesim/scheduler/scheduler_impl.h @@ -480,7 +480,7 @@ template class scheduler_impl_tmpl_t // This is accessed by other outputs for stealing and rebalancing. // Indirected so we can store it in our vector. 
std::unique_ptr> active; - bool in_kernel_code = false; + bool in_syscall_code = false; bool in_context_switch_code = false; bool hit_switch_code_end = false; // Used for time-based quanta. diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp index 83807e21a66..616112fbed4 100644 --- a/clients/drcachesim/tests/scheduler_unit_tests.cpp +++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp @@ -190,7 +190,8 @@ run_lockstep_simulation(scheduler_t &scheduler, int num_outputs, memref_tid_t ti int64_t meta_records = 0; // Record the threads, one char each. std::vector sched_as_string(num_outputs); - static constexpr char THREAD_LETTER_START = 'A'; + static constexpr char THREAD_LETTER_START_USER = 'A'; + static constexpr char THREAD_LETTER_START_KERNEL = 'a'; static constexpr char WAIT_SYMBOL = '-'; static constexpr char IDLE_SYMBOL = '_'; static constexpr char NON_INSTR_SYMBOL = '.'; @@ -229,8 +230,10 @@ run_lockstep_simulation(scheduler_t &scheduler, int num_outputs, memref_tid_t ti } assert(status == scheduler_t::STATUS_OK); if (type_is_instr(memref.instr.type)) { + bool is_kernel = outputs[i]->is_record_kernel(); sched_as_string[i] += - THREAD_LETTER_START + static_cast(memref.instr.tid - tid_base); + (is_kernel ? THREAD_LETTER_START_KERNEL : THREAD_LETTER_START_USER) + + static_cast(memref.instr.tid - tid_base); } else { // While this makes the string longer, it is just too confusing // with the same letter seemingly on 2 cores at once without these @@ -1354,6 +1357,254 @@ test_synthetic() } } +static void +test_synthetic_with_syscall_seq() +{ + std::cerr << "\n----------------\nTesting synthetic with syscall sequences\n"; + static constexpr int NUM_INPUTS = 7; + static constexpr int NUM_OUTPUTS = 2; + static constexpr int NUM_INSTRS = 9; + static constexpr int QUANTUM_DURATION = 3; + // We do not want to block for very long. 
+ static constexpr double BLOCK_SCALE = 0.01; + static constexpr uint64_t BLOCK_THRESHOLD = 100; + static constexpr memref_tid_t TID_BASE = 100; + static constexpr uint64_t KERNEL_CODE_OFFSET = 123456; + static constexpr uint64_t SYSTRACE_NUM = 84; + std::vector inputs[NUM_INPUTS]; + for (int i = 0; i < NUM_INPUTS; i++) { + memref_tid_t tid = TID_BASE + i; + inputs[i].push_back(make_thread(tid)); + inputs[i].push_back(make_pid(1)); + inputs[i].push_back(make_version(TRACE_ENTRY_VERSION)); + inputs[i].push_back(make_timestamp(10)); // All the same time priority. + for (int j = 0; j < NUM_INSTRS; j++) { + inputs[i].push_back(make_instr(42 + j * 4)); + // Test a syscall sequence starting at each offset within a quantum + // of instrs. + if (i <= QUANTUM_DURATION && i == j) { + inputs[i].push_back(make_timestamp(20)); + inputs[i].push_back(make_marker(TRACE_MARKER_TYPE_SYSCALL, SYSTRACE_NUM)); + if (i < 2) { + // Thresholds for only blocking syscalls are low enough to + // cause a context switch. So only A and B will try a voluntary + // switch (which may be delayed due to the syscall trace) after + // 1 or 2 instrs respectively. + inputs[i].push_back( + make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0)); + } + inputs[i].push_back(make_timestamp(120)); + inputs[i].push_back( + make_marker(TRACE_MARKER_TYPE_SYSCALL_TRACE_START, SYSTRACE_NUM)); + // A has just one syscall seq instr to show that it still does the + // voluntary switch after the syscall trace is done, even though there + // is still room for one more instr in its quantum. + // D has just one syscall seq instr to show that it will continue + // on without a switch after the syscall trace is done because more + // instrs were left in the same quantum. + // B and C have longer syscall seq to show that they will not be + // preempted by voluntary or quantum switches respectively. + int count_syscall_instrs = (i == 0 || i == 3) ? 
1 : QUANTUM_DURATION; + for (int k = 1; k <= count_syscall_instrs; ++k) + inputs[i].push_back(make_instr(KERNEL_CODE_OFFSET + k)); + inputs[i].push_back( + make_marker(TRACE_MARKER_TYPE_SYSCALL_TRACE_END, SYSTRACE_NUM)); + } + } + inputs[i].push_back(make_exit(tid)); + } + // A has a syscall sequence at [2,2], B has it at [3,5], C has it at [4,6], + // D has it at [5,5]. + + // Hardcoding here for the 2 outputs and 7 inputs. + // We make assumptions on the scheduler's initial runqueue assignment + // being round-robin, resulting in 4 on core0 (odd parity letters) and 3 on + // core1 (even parity letters). + // The dots are markers and thread exits. + // + // A has a voluntary switch after its first two letters, prompted by its + // first instr which is a blocking syscall with latency that exceeds switch + // threshold, but not before its 2nd instr which is from the syscall trace + // and must be shown before the switch happens. Despite there being room for + // 1 more instr left in the quantum, the voluntary switch still happens. + // When scheduled next, A has room to execute only one instr left in its + // quantum limit (which was carried over after the voluntary switch). + // + // B has a voluntary switch after its first 5 letters, prompted by its 2nd + // instr which is a blocking system call with latency that exceeds switch + // threshold, but not before its next three instrs which are from the + // syscall trace and must be shown before the switch happens. B ends up + // executing more instrs than its quantum limit because of the syscall + // trace. + // + // C has a syscall at its third letter (but it doesn't cause a switch + // because it doesn't have sufficiently high latency), followed by the + // syscall trace of three additional letters. C ends up + // executing more instrs than the quantum limit because of the syscall + // trace. + // + // D has a syscall at its 4th letter, followed by a 1-instr syscall + // trace. 
D continues with its regular instrs without a context switch + // at its 6th letter because there is still room for more instrs left in + // the quantum. + // + // Since core0 has an extra input, core1 finishes + // its runqueue first and then steals G from core0 (migration threshold is 0) + // and finishes it off. + static const char *const CORE0_SCHED_STRING = + "..A.....a...CCC....ccc...EEE..GGGACCCEEEGGGAAACCC.EEE.AAAA."; + static const char *const CORE1_SCHED_STRING = + "..BB.....bbb...DDD..FFFBBBD....d.DFFFBBBDDDFFF.B.D.GGG.____"; + { + // Test instruction quanta. + std::vector sched_inputs; + for (int i = 0; i < NUM_INPUTS; i++) { + std::vector readers; + readers.emplace_back( + std::unique_ptr(new mock_reader_t(inputs[i])), + std::unique_ptr(new mock_reader_t()), TID_BASE + i); + sched_inputs.emplace_back(std::move(readers)); + } + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_IGNORE, + scheduler_t::SCHEDULER_DEFAULTS, + /*verbosity=*/4); + sched_ops.quantum_duration_instrs = QUANTUM_DURATION; + // This was tuned with a 100us threshold: so avoid scheduler.h defaults + // changes from affecting our output. + sched_ops.blocking_switch_threshold = BLOCK_THRESHOLD; + sched_ops.block_time_multiplier = BLOCK_SCALE; + sched_ops.time_units_per_us = 1.; + // Migration is measured in wall-clock-time for instr quanta + // so avoid non-determinism by having no threshold. + sched_ops.migration_threshold_us = 0; + scheduler_t scheduler; + if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != + scheduler_t::STATUS_SUCCESS) + assert(false); + std::vector sched_as_string = + run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE); + for (int i = 0; i < NUM_OUTPUTS; i++) { + std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; + } + // Check scheduler stats. 
# switches is the # of letter transitions; # preempts + // is the instances where the same letter appears 3 times without another letter + // appearing in between (and ignoring the last letter for an input: EOF doesn't + // count as a preempt). # nops are the instances where the same input is picked + // to run because nothing else is waiting. + verify_scheduler_stats(scheduler.get_stream(0), /*switch_input_to_input=*/11, + /*switch_input_to_idle=*/0, /*switch_idle_to_input=*/0, + /*switch_nop=*/1, /*preempts=*/9, /*direct_attempts=*/0, + /*direct_successes=*/0, /*migrations=*/1); + verify_scheduler_stats(scheduler.get_stream(1), /*switch_input_to_input=*/11, + /*switch_input_to_idle=*/1, /*switch_idle_to_input=*/0, + /*switch_nop=*/0, /*preempts=*/8, /*direct_attempts=*/0, + /*direct_successes=*/0, /*migrations=*/0); + assert(scheduler.get_stream(0)->get_schedule_statistic( + memtrace_stream_t::SCHED_STAT_RUNQUEUE_STEALS) == 0); + assert(scheduler.get_stream(1)->get_schedule_statistic( + memtrace_stream_t::SCHED_STAT_RUNQUEUE_STEALS) == 1); +#ifndef WIN32 + // XXX: Windows microseconds on test VMs are very coarse and stay the same + // for long periods. Instruction quanta use wall-clock idle times, so + // the result is extreme variations here. We try to adjust by handling + // any schedule with below specific patterns. We just give up on checking the + // precise output for this test on Windows. + if (sched_as_string[0] != CORE0_SCHED_STRING || + sched_as_string[1] != CORE1_SCHED_STRING) { + // XXX: These bools could potentially be made into ints, but then + // maybe our check will become too strict, defeating the purpose of + // this relaxation. 
+            bool found_single_A = false, found_single_B = false, found_single_D = false;
+            for (int cpu = 0; cpu < NUM_OUTPUTS; ++cpu) {
+                for (size_t i = 1; i < sched_as_string[cpu].size() - 1; ++i) {
+                    // We expect a single 'A' for the first instr executed by 'A',
+                    // which will be followed by a marker ('.') for the syscall,
+                    // and the third instr executed by it which will be the only
+                    // instruction executed by it during that scheduling because
+                    // prior bookkeeping for that quantum exhausted all-but-one
+                    // instruction.
+                    if (sched_as_string[cpu][i] == 'A' &&
+                        sched_as_string[cpu][i - 1] != 'A' &&
+                        sched_as_string[cpu][i + 1] != 'A')
+                        found_single_A = true;
+                    // We expect a single 'B' for the last instr executed by B
+                    // which will have to be in its own separate 3-instr quantum.
+                    if (sched_as_string[cpu][i] == 'B' &&
+                        sched_as_string[cpu][i - 1] != 'B' &&
+                        sched_as_string[cpu][i + 1] != 'B')
+                        found_single_B = true;
+                    // We expect a single 'D' for the one quantum where the
+                    // 1st and 3rd instrs executed by D were regular, and the
+                    // 2nd one was from a syscall (which is 'd'). Also, the
+                    // last (10th) instr executed by D will have to be in its
+                    // own separate 3-instr quantum.
+                    if (sched_as_string[cpu][i] == 'D' &&
+                        sched_as_string[cpu][i - 1] != 'D' &&
+                        sched_as_string[cpu][i + 1] != 'D')
+                        found_single_D = true;
+                }
+            }
+            bool found_syscall_a = false, found_syscall_b = false,
+                 found_syscall_c = false, found_syscall_d = false;
+            for (int cpu = 0; cpu < NUM_OUTPUTS; ++cpu) {
+                // The '.' at beginning and end of each of the searched sequences
+                // below is for the syscall trace start and end markers. 
+ if (sched_as_string[cpu].find(".a.") != std::string::npos) { + found_syscall_a = true; + } + if (sched_as_string[cpu].find(".bbb.") != std::string::npos) { + found_syscall_b = true; + } + if (sched_as_string[cpu].find(".ccc.") != std::string::npos) { + found_syscall_c = true; + } + if (sched_as_string[cpu].find(".d.") != std::string::npos) { + found_syscall_d = true; + } + } + assert(found_single_A && found_single_B && found_single_D); + assert(found_syscall_a && found_syscall_b && found_syscall_c && + found_syscall_d); + } +#endif + } + { + // Test time quanta. + std::vector sched_inputs; + for (int i = 0; i < NUM_INPUTS; i++) { + std::vector readers; + readers.emplace_back( + std::unique_ptr(new mock_reader_t(inputs[i])), + std::unique_ptr(new mock_reader_t()), TID_BASE + i); + sched_inputs.emplace_back(std::move(readers)); + } + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_IGNORE, + scheduler_t::SCHEDULER_DEFAULTS, + /*verbosity=*/4); + sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; + // This was tuned with a 100us threshold: so avoid scheduler.h defaults + // changes from affecting our output. 
+ sched_ops.blocking_switch_threshold = BLOCK_THRESHOLD; + sched_ops.quantum_duration_us = QUANTUM_DURATION; + sched_ops.block_time_multiplier = BLOCK_SCALE; + sched_ops.migration_threshold_us = 0; + scheduler_t scheduler; + if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != + scheduler_t::STATUS_SUCCESS) + assert(false); + std::vector sched_as_string = + run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true); + for (int i = 0; i < NUM_OUTPUTS; i++) { + std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; + } + assert(sched_as_string[0] == CORE0_SCHED_STRING); + assert(sched_as_string[1] == CORE1_SCHED_STRING); + } +} + static void test_synthetic_time_quanta() { @@ -6424,6 +6675,7 @@ test_main(int argc, const char *argv[]) test_only_threads(); test_real_file_queries_and_filters(argv[1]); test_synthetic(); + test_synthetic_with_syscall_seq(); test_synthetic_time_quanta(); test_synthetic_with_timestamps(); test_synthetic_with_priorities();