diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h index 3c7403e309c..dda51faa35c 100644 --- a/clients/drcachesim/scheduler/scheduler.h +++ b/clients/drcachesim/scheduler/scheduler.h @@ -758,6 +758,9 @@ template class scheduler_tmpl_t { * The scheduling quantum duration for preemption, in instruction count, * for #QUANTUM_INSTRUCTIONS. The time passed to next_record() is ignored * for purposes of quantum preempts. + * + * Instructions executed in a quantum may end up higher than the specified + * value to avoid interruption of the kernel system call sequence. */ // We pick 10 million to match 2 instructions per nanosecond with a 5ms quantum. uint64_t quantum_duration_instrs = 10 * 1000 * 1000; diff --git a/clients/drcachesim/scheduler/scheduler_dynamic.cpp b/clients/drcachesim/scheduler/scheduler_dynamic.cpp index 52e5e6352a6..8f8dad07eb8 100644 --- a/clients/drcachesim/scheduler/scheduler_dynamic.cpp +++ b/clients/drcachesim/scheduler/scheduler_dynamic.cpp @@ -458,7 +458,9 @@ scheduler_dynamic_tmpl_t::check_for_input_switch( // boundaries so we live with those being before the switch. // XXX: Once we insert kernel traces, we may have to try harder // to stop before the post-syscall records. - if (this->record_type_is_instr_boundary(record, outputs_[output].last_record)) { + if (this->record_type_is_instr_boundary(record, outputs_[output].last_record) && + // We want to delay the context switch until after the injected syscall trace. + !outputs_[output].in_syscall_code) { if (input->switch_to_input != sched_type_t::INVALID_INPUT_ORDINAL) { // The switch request overrides any latency threshold. 
need_new_input = true; @@ -506,18 +508,28 @@ scheduler_dynamic_tmpl_t::check_for_input_switch( } if (options_.quantum_unit == sched_type_t::QUANTUM_INSTRUCTIONS && this->record_type_is_instr_boundary(record, outputs_[output].last_record) && - !outputs_[output].in_kernel_code) { + !outputs_[output].in_context_switch_code) { ++input->instrs_in_quantum; if (input->instrs_in_quantum > options_.quantum_duration_instrs) { - // We again prefer to switch to another input even if the current - // input has the oldest timestamp, prioritizing context switches - // over timestamp ordering. - VPRINT(this, 4, "next_record[%d]: input %d hit end of instr quantum\n", - output, input->index); - preempt = true; - need_new_input = true; - input->instrs_in_quantum = 0; - ++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_QUANTUM_PREEMPTS]; + if (outputs_[output].in_syscall_code) { + // XXX: Maybe this should be printed only once per-syscall-instance to + // reduce log spam. + VPRINT(this, 5, + "next_record[%d]: input %d delaying context switch " + "after end of instr quantum due to syscall trace\n", + output, input->index); + + } else { + // We again prefer to switch to another input even if the current + // input has the oldest timestamp, prioritizing context switches + // over timestamp ordering. + VPRINT(this, 4, "next_record[%d]: input %d hit end of instr quantum\n", + output, input->index); + preempt = true; + need_new_input = true; + input->instrs_in_quantum = 0; + ++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_QUANTUM_PREEMPTS]; + } } } else if (options_.quantum_unit == sched_type_t::QUANTUM_TIME) { if (cur_time == 0 || cur_time < input->prev_time_in_quantum) { @@ -535,14 +547,23 @@ scheduler_dynamic_tmpl_t::check_for_input_switch( // in between (e.g., scatter/gather long sequence of reads/writes) by // setting input->switching_pre_instruction. 
this->record_type_is_instr_boundary(record, outputs_[output].last_record)) { - VPRINT(this, 4, - "next_record[%d]: input %d hit end of time quantum after %" PRIu64 - "\n", - output, input->index, input->time_spent_in_quantum); - preempt = true; - need_new_input = true; - input->time_spent_in_quantum = 0; - ++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_QUANTUM_PREEMPTS]; + if (outputs_[output].in_syscall_code) { + // XXX: Maybe this should be printed only once per-syscall-instance to + // reduce log spam. + VPRINT(this, 5, + "next_record[%d]: input %d delaying context switch after end of " + "time quantum after %" PRIu64 " due to syscall trace\n", + output, input->index, input->time_spent_in_quantum); + } else { + VPRINT(this, 4, + "next_record[%d]: input %d hit end of time quantum after %" PRIu64 + "\n", + output, input->index, input->time_spent_in_quantum); + preempt = true; + need_new_input = true; + input->time_spent_in_quantum = 0; + ++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_QUANTUM_PREEMPTS]; + } } } // For sched_type_t::DEPENDENCY_TIMESTAMPS: enforcing asked-for @@ -574,16 +595,20 @@ scheduler_dynamic_tmpl_t::process_marker( break; case TRACE_MARKER_TYPE_CONTEXT_SWITCH_START: outputs_[output].in_context_switch_code = true; - ANNOTATE_FALLTHROUGH; + break; case TRACE_MARKER_TYPE_SYSCALL_TRACE_START: - outputs_[output].in_kernel_code = true; + outputs_[output].in_syscall_code = true; break; case TRACE_MARKER_TYPE_CONTEXT_SWITCH_END: // We have to delay until the next record. outputs_[output].hit_switch_code_end = true; - ANNOTATE_FALLTHROUGH; + break; case TRACE_MARKER_TYPE_SYSCALL_TRACE_END: - outputs_[output].in_kernel_code = false; + outputs_[output].in_syscall_code = false; + break; + case TRACE_MARKER_TYPE_TIMESTAMP: + // Syscall sequences are not expected to have a timestamp. 
+ assert(!outputs_[output].in_syscall_code); break; case TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH: { if (!options_.honor_direct_switches) diff --git a/clients/drcachesim/scheduler/scheduler_impl.h b/clients/drcachesim/scheduler/scheduler_impl.h index 260b5051aac..2ddbb710545 100644 --- a/clients/drcachesim/scheduler/scheduler_impl.h +++ b/clients/drcachesim/scheduler/scheduler_impl.h @@ -480,7 +480,7 @@ template class scheduler_impl_tmpl_t // This is accessed by other outputs for stealing and rebalancing. // Indirected so we can store it in our vector. std::unique_ptr> active; - bool in_kernel_code = false; + bool in_syscall_code = false; bool in_context_switch_code = false; bool hit_switch_code_end = false; // Used for time-based quanta. diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp index 83807e21a66..616112fbed4 100644 --- a/clients/drcachesim/tests/scheduler_unit_tests.cpp +++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp @@ -190,7 +190,8 @@ run_lockstep_simulation(scheduler_t &scheduler, int num_outputs, memref_tid_t ti int64_t meta_records = 0; // Record the threads, one char each. std::vector sched_as_string(num_outputs); - static constexpr char THREAD_LETTER_START = 'A'; + static constexpr char THREAD_LETTER_START_USER = 'A'; + static constexpr char THREAD_LETTER_START_KERNEL = 'a'; static constexpr char WAIT_SYMBOL = '-'; static constexpr char IDLE_SYMBOL = '_'; static constexpr char NON_INSTR_SYMBOL = '.'; @@ -229,8 +230,10 @@ run_lockstep_simulation(scheduler_t &scheduler, int num_outputs, memref_tid_t ti } assert(status == scheduler_t::STATUS_OK); if (type_is_instr(memref.instr.type)) { + bool is_kernel = outputs[i]->is_record_kernel(); sched_as_string[i] += - THREAD_LETTER_START + static_cast(memref.instr.tid - tid_base); + (is_kernel ? 
THREAD_LETTER_START_KERNEL : THREAD_LETTER_START_USER) + + static_cast(memref.instr.tid - tid_base); } else { // While this makes the string longer, it is just too confusing // with the same letter seemingly on 2 cores at once without these @@ -1354,6 +1357,254 @@ test_synthetic() } } +static void +test_synthetic_with_syscall_seq() +{ + std::cerr << "\n----------------\nTesting synthetic with syscall sequences\n"; + static constexpr int NUM_INPUTS = 7; + static constexpr int NUM_OUTPUTS = 2; + static constexpr int NUM_INSTRS = 9; + static constexpr int QUANTUM_DURATION = 3; + // We do not want to block for very long. + static constexpr double BLOCK_SCALE = 0.01; + static constexpr uint64_t BLOCK_THRESHOLD = 100; + static constexpr memref_tid_t TID_BASE = 100; + static constexpr uint64_t KERNEL_CODE_OFFSET = 123456; + static constexpr uint64_t SYSTRACE_NUM = 84; + std::vector inputs[NUM_INPUTS]; + for (int i = 0; i < NUM_INPUTS; i++) { + memref_tid_t tid = TID_BASE + i; + inputs[i].push_back(make_thread(tid)); + inputs[i].push_back(make_pid(1)); + inputs[i].push_back(make_version(TRACE_ENTRY_VERSION)); + inputs[i].push_back(make_timestamp(10)); // All the same time priority. + for (int j = 0; j < NUM_INSTRS; j++) { + inputs[i].push_back(make_instr(42 + j * 4)); + // Test a syscall sequence starting at each offset within a quantum + // of instrs. + if (i <= QUANTUM_DURATION && i == j) { + inputs[i].push_back(make_timestamp(20)); + inputs[i].push_back(make_marker(TRACE_MARKER_TYPE_SYSCALL, SYSTRACE_NUM)); + if (i < 2) { + // Thresholds for only blocking syscalls are low enough to + // cause a context switch. So only A and B will try a voluntary + // switch (which may be delayed due to the syscall trace) after + // 1 or 2 instrs respectively. 
+ inputs[i].push_back( + make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0)); + } + inputs[i].push_back(make_timestamp(120)); + inputs[i].push_back( + make_marker(TRACE_MARKER_TYPE_SYSCALL_TRACE_START, SYSTRACE_NUM)); + // A has just one syscall seq instr to show that it still does the + // voluntary switch after the syscall trace is done, even though there + // is still room for one more instr in its quantum. + // D has just one syscall seq instr to show that it will continue + // on without a switch after the syscall trace is done because more + // instrs were left in the same quantum. + // B and C have longer syscall seq to show that they will not be + // preempted by voluntary or quantum switches respectively. + int count_syscall_instrs = (i == 0 || i == 3) ? 1 : QUANTUM_DURATION; + for (int k = 1; k <= count_syscall_instrs; ++k) + inputs[i].push_back(make_instr(KERNEL_CODE_OFFSET + k)); + inputs[i].push_back( + make_marker(TRACE_MARKER_TYPE_SYSCALL_TRACE_END, SYSTRACE_NUM)); + } + } + inputs[i].push_back(make_exit(tid)); + } + // A has a syscall sequence at [2,2], B has it at [3,5], C has it at [4,6], + // D has it at [5,5]. + + // Hardcoding here for the 2 outputs and 7 inputs. + // We make assumptions on the scheduler's initial runqueue assignment + // being round-robin, resulting in 4 on core0 (odd parity letters) and 3 on + // core1 (even parity letters). + // The dots are markers and thread exits. + // + // A has a voluntary switch after its first two letters, prompted by its + // first instr which is a blocking syscall with latency that exceeds switch + // threshold, but not before its 2nd instr which is from the syscall trace + // and must be shown before the switch happens. Despite there being room for + // 1 more instr left in the quantum, the voluntary switch still happens. + // When scheduled next, A has room to execute only one instr left in its + // quantum limit (which was carried over after the voluntary switch). 
+ // + // B has a voluntary switch after its first 5 letters, prompted by its 2nd + // instr which is a blocking system call with latency that exceeds switch + // threshold, but not before its next three instrs which are from the + // syscall trace and must be shown before the switch happens. B ends up + // executing more instrs than its quantum limit because of the syscall + // trace. + // + // C has a syscall at its third letter (but it doesn't cause a switch + // because it doesn't have sufficiently high latency), followed by the + // syscall trace of three additional letters. C ends up + // executing more instrs than the quantum limit because of the syscall + // trace. + // + // D has a syscall at its 4th letter, followed by a 1-instr syscall + // trace. D continues with its regular instrs without a context switch + // at its 6th letter because there is still room for more instrs left in + // the quantum. + // + // Since core0 has an extra input, core1 finishes + // its runqueue first and then steals G from core0 (migration threshold is 0) + // and finishes it off. + static const char *const CORE0_SCHED_STRING = + "..A.....a...CCC....ccc...EEE..GGGACCCEEEGGGAAACCC.EEE.AAAA."; + static const char *const CORE1_SCHED_STRING = + "..BB.....bbb...DDD..FFFBBBD....d.DFFFBBBDDDFFF.B.D.GGG.____"; + { + // Test instruction quanta. + std::vector sched_inputs; + for (int i = 0; i < NUM_INPUTS; i++) { + std::vector readers; + readers.emplace_back( + std::unique_ptr(new mock_reader_t(inputs[i])), + std::unique_ptr(new mock_reader_t()), TID_BASE + i); + sched_inputs.emplace_back(std::move(readers)); + } + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_IGNORE, + scheduler_t::SCHEDULER_DEFAULTS, + /*verbosity=*/4); + sched_ops.quantum_duration_instrs = QUANTUM_DURATION; + // This was tuned with a 100us threshold: so avoid scheduler.h defaults + // changes from affecting our output. 
+ sched_ops.blocking_switch_threshold = BLOCK_THRESHOLD; + sched_ops.block_time_multiplier = BLOCK_SCALE; + sched_ops.time_units_per_us = 1.; + // Migration is measured in wall-clock-time for instr quanta + // so avoid non-determinism by having no threshold. + sched_ops.migration_threshold_us = 0; + scheduler_t scheduler; + if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != + scheduler_t::STATUS_SUCCESS) + assert(false); + std::vector sched_as_string = + run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE); + for (int i = 0; i < NUM_OUTPUTS; i++) { + std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; + } + // Check scheduler stats. # switches is the # of letter transitions; # preempts + // is the instances where the same letter appears 3 times without another letter + // appearing in between (and ignoring the last letter for an input: EOF doesn't + // count as a preempt). # nops are the instances where the same input is picked + // to run because nothing else is waiting. + verify_scheduler_stats(scheduler.get_stream(0), /*switch_input_to_input=*/11, + /*switch_input_to_idle=*/0, /*switch_idle_to_input=*/0, + /*switch_nop=*/1, /*preempts=*/9, /*direct_attempts=*/0, + /*direct_successes=*/0, /*migrations=*/1); + verify_scheduler_stats(scheduler.get_stream(1), /*switch_input_to_input=*/11, + /*switch_input_to_idle=*/1, /*switch_idle_to_input=*/0, + /*switch_nop=*/0, /*preempts=*/8, /*direct_attempts=*/0, + /*direct_successes=*/0, /*migrations=*/0); + assert(scheduler.get_stream(0)->get_schedule_statistic( + memtrace_stream_t::SCHED_STAT_RUNQUEUE_STEALS) == 0); + assert(scheduler.get_stream(1)->get_schedule_statistic( + memtrace_stream_t::SCHED_STAT_RUNQUEUE_STEALS) == 1); +#ifndef WIN32 + // XXX: Windows microseconds on test VMs are very coarse and stay the same + // for long periods. Instruction quanta use wall-clock idle times, so + // the result is extreme variations here. 
We try to adjust by handling + // any schedule with the specific patterns below. We just give up on checking the + // precise output for this test on Windows. + if (sched_as_string[0] != CORE0_SCHED_STRING || + sched_as_string[1] != CORE1_SCHED_STRING) { + // XXX: These bools could potentially be made into ints, but then + // maybe our check will become too strict, defeating the purpose of + // this relaxation. + bool found_single_A = false, found_single_B = false, found_single_D = false; + for (int cpu = 0; cpu < NUM_OUTPUTS; ++cpu) { + for (size_t i = 1; i < sched_as_string[cpu].size() - 1; ++i) { + // We expect a single 'A' for the first instr executed by 'A', + // which will be followed by a marker ('.') for the syscall, + // and the third instr executed by it which will be the only + // instruction executed by it during that scheduling because + // prior bookkeeping for that quantum exhausted all-but-one + // instruction. + if (sched_as_string[cpu][i] == 'A' && + sched_as_string[cpu][i - 1] != 'A' && + sched_as_string[cpu][i + 1] != 'A') + found_single_A = true; + // We expect a single 'B' for the last instr executed by B + // which will have to be in its own separate 3-instr quantum. + if (sched_as_string[cpu][i] == 'B' && + sched_as_string[cpu][i - 1] != 'B' && + sched_as_string[cpu][i + 1] != 'B') + found_single_B = true; + // We expect a single 'D' for the one quantum where the + // 1st and 3rd instrs executed by D were regular, and the + // 2nd one was from a syscall (which is 'd'). Also, the + // last (10th) instr executed by D will have to be in its + // own separate 3-instr quantum. + if (sched_as_string[cpu][i] == 'D' && + sched_as_string[cpu][i - 1] != 'D' && + sched_as_string[cpu][i + 1] != 'D') + found_single_D = true; + } + } + bool found_syscall_a = false, found_syscall_b = false, + found_syscall_c = false, found_syscall_d = false; + for (int cpu = 0; cpu < NUM_OUTPUTS; ++cpu) { + // The '.'
at beginning and end of each of the searched sequences + // below is for the syscall trace start and end markers. + if (sched_as_string[cpu].find(".a.") != std::string::npos) { + found_syscall_a = true; + } + if (sched_as_string[cpu].find(".bbb.") != std::string::npos) { + found_syscall_b = true; + } + if (sched_as_string[cpu].find(".ccc.") != std::string::npos) { + found_syscall_c = true; + } + if (sched_as_string[cpu].find(".d.") != std::string::npos) { + found_syscall_d = true; + } + } + assert(found_single_A && found_single_B && found_single_D); + assert(found_syscall_a && found_syscall_b && found_syscall_c && + found_syscall_d); + } +#endif + } + { + // Test time quanta. + std::vector sched_inputs; + for (int i = 0; i < NUM_INPUTS; i++) { + std::vector readers; + readers.emplace_back( + std::unique_ptr(new mock_reader_t(inputs[i])), + std::unique_ptr(new mock_reader_t()), TID_BASE + i); + sched_inputs.emplace_back(std::move(readers)); + } + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_IGNORE, + scheduler_t::SCHEDULER_DEFAULTS, + /*verbosity=*/4); + sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; + // This was tuned with a 100us threshold: so avoid scheduler.h defaults + // changes from affecting our output. 
+ sched_ops.blocking_switch_threshold = BLOCK_THRESHOLD; + sched_ops.quantum_duration_us = QUANTUM_DURATION; + sched_ops.block_time_multiplier = BLOCK_SCALE; + sched_ops.migration_threshold_us = 0; + scheduler_t scheduler; + if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != + scheduler_t::STATUS_SUCCESS) + assert(false); + std::vector sched_as_string = + run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true); + for (int i = 0; i < NUM_OUTPUTS; i++) { + std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; + } + assert(sched_as_string[0] == CORE0_SCHED_STRING); + assert(sched_as_string[1] == CORE1_SCHED_STRING); + } +} + static void test_synthetic_time_quanta() { @@ -6424,6 +6675,7 @@ test_main(int argc, const char *argv[]) test_only_threads(); test_real_file_queries_and_filters(argv[1]); test_synthetic(); + test_synthetic_with_syscall_seq(); test_synthetic_time_quanta(); test_synthetic_with_timestamps(); test_synthetic_with_priorities();