Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

i#7157 syscall sched: Handle static injected syscall traces in scheduler #7158

Merged
merged 10 commits into from
Dec 19, 2024
63 changes: 40 additions & 23 deletions clients/drcachesim/scheduler/scheduler_dynamic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -458,7 +458,9 @@ scheduler_dynamic_tmpl_t<RecordType, ReaderType>::check_for_input_switch(
// boundaries so we live with those being before the switch.
// XXX: Once we insert kernel traces, we may have to try harder
// to stop before the post-syscall records.
if (this->record_type_is_instr_boundary(record, outputs_[output].last_record)) {
if (this->record_type_is_instr_boundary(record, outputs_[output].last_record) &&
// We want to delay the context switch until after the injected syscall trace.
!outputs_[output].in_syscall_code) {
if (input->switch_to_input != sched_type_t::INVALID_INPUT_ORDINAL) {
// The switch request overrides any latency threshold.
need_new_input = true;
Expand Down Expand Up @@ -506,18 +508,26 @@ scheduler_dynamic_tmpl_t<RecordType, ReaderType>::check_for_input_switch(
}
if (options_.quantum_unit == sched_type_t::QUANTUM_INSTRUCTIONS &&
this->record_type_is_instr_boundary(record, outputs_[output].last_record) &&
!outputs_[output].in_kernel_code) {
!outputs_[output].in_context_switch_code) {
++input->instrs_in_quantum;
if (input->instrs_in_quantum > options_.quantum_duration_instrs) {
// We again prefer to switch to another input even if the current
// input has the oldest timestamp, prioritizing context switches
// over timestamp ordering.
VPRINT(this, 4, "next_record[%d]: input %d hit end of instr quantum\n",
output, input->index);
preempt = true;
need_new_input = true;
input->instrs_in_quantum = 0;
++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_QUANTUM_PREEMPTS];
if (outputs_[output].in_syscall_code) {
VPRINT(this, 4,
abhinav92003 marked this conversation as resolved.
Show resolved Hide resolved
"next_record[%d]: input %d delaying context switch "
"after end of instr quantum due to syscall trace\n",
abhinav92003 marked this conversation as resolved.
Show resolved Hide resolved
output, input->index);

} else {
// We again prefer to switch to another input even if the current
// input has the oldest timestamp, prioritizing context switches
// over timestamp ordering.
VPRINT(this, 4, "next_record[%d]: input %d hit end of instr quantum\n",
output, input->index);
preempt = true;
need_new_input = true;
input->instrs_in_quantum = 0;
++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_QUANTUM_PREEMPTS];
}
}
} else if (options_.quantum_unit == sched_type_t::QUANTUM_TIME) {
if (cur_time == 0 || cur_time < input->prev_time_in_quantum) {
Expand All @@ -535,14 +545,21 @@ scheduler_dynamic_tmpl_t<RecordType, ReaderType>::check_for_input_switch(
// in between (e.g., scatter/gather long sequence of reads/writes) by
// setting input->switching_pre_instruction.
this->record_type_is_instr_boundary(record, outputs_[output].last_record)) {
VPRINT(this, 4,
"next_record[%d]: input %d hit end of time quantum after %" PRIu64
"\n",
output, input->index, input->time_spent_in_quantum);
preempt = true;
need_new_input = true;
input->time_spent_in_quantum = 0;
++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_QUANTUM_PREEMPTS];
if (outputs_[output].in_syscall_code) {
VPRINT(this, 4,
"next_record[%d]: input %d delaying context switch after end of "
"time quantum after %" PRIu64 " due to syscall trace\n",
output, input->index, input->time_spent_in_quantum);
} else {
VPRINT(this, 4,
"next_record[%d]: input %d hit end of time quantum after %" PRIu64
"\n",
output, input->index, input->time_spent_in_quantum);
preempt = true;
need_new_input = true;
input->time_spent_in_quantum = 0;
++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_QUANTUM_PREEMPTS];
}
abhinav92003 marked this conversation as resolved.
Show resolved Hide resolved
}
}
// For sched_type_t::DEPENDENCY_TIMESTAMPS: enforcing asked-for
Expand Down Expand Up @@ -574,16 +591,16 @@ scheduler_dynamic_tmpl_t<RecordType, ReaderType>::process_marker(
break;
case TRACE_MARKER_TYPE_CONTEXT_SWITCH_START:
outputs_[output].in_context_switch_code = true;
ANNOTATE_FALLTHROUGH;
break;
case TRACE_MARKER_TYPE_SYSCALL_TRACE_START:
outputs_[output].in_kernel_code = true;
outputs_[output].in_syscall_code = true;
break;
case TRACE_MARKER_TYPE_CONTEXT_SWITCH_END:
// We have to delay until the next record.
outputs_[output].hit_switch_code_end = true;
ANNOTATE_FALLTHROUGH;
break;
case TRACE_MARKER_TYPE_SYSCALL_TRACE_END:
outputs_[output].in_kernel_code = false;
outputs_[output].in_syscall_code = false;
break;
case TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH: {
if (!options_.honor_direct_switches)
Expand Down
2 changes: 1 addition & 1 deletion clients/drcachesim/scheduler/scheduler_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,7 @@ template <typename RecordType, typename ReaderType> class scheduler_impl_tmpl_t
// This is accessed by other outputs for stealing and rebalancing.
// Indirected so we can store it in our vector.
std::unique_ptr<std::atomic<bool>> active;
bool in_kernel_code = false;
bool in_syscall_code = false;
bool in_context_switch_code = false;
bool hit_switch_code_end = false;
// Used for time-based quanta.
Expand Down
178 changes: 178 additions & 0 deletions clients/drcachesim/tests/scheduler_unit_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1354,6 +1354,183 @@ test_synthetic()
}
}

// Tests scheduling of synthetic inputs that contain injected syscall trace
// sequences, i.e., instruction records bracketed by
// TRACE_MARKER_TYPE_SYSCALL_TRACE_START and TRACE_MARKER_TYPE_SYSCALL_TRACE_END
// markers. The same inputs are run twice: once with instruction quanta and once
// with time quanta. Each syscall sequence holds more instructions than fit in one
// quantum, and the expected schedule strings below show every sequence running
// without another input interleaved on that core.
static void
test_synthetic_with_syscall_seq()
{
    std::cerr << "\n----------------\nTesting synthetic with syscall sequences\n";
    static constexpr int NUM_INPUTS = 7;
    static constexpr int NUM_OUTPUTS = 2;
    static constexpr int NUM_INSTRS = 9;
    static constexpr int QUANTUM_DURATION = 3;
    // We do not want to block for very long.
    static constexpr double BLOCK_SCALE = 0.01;
    static constexpr uint64_t BLOCK_THRESHOLD = 100;
    static constexpr memref_tid_t TID_BASE = 100;
    static constexpr uint64_t KERNEL_CODE_OFFSET = 123456;
    std::vector<trace_entry_t> inputs[NUM_INPUTS];
    for (int i = 0; i < NUM_INPUTS; i++) {
        memref_tid_t tid = TID_BASE + i;
        inputs[i].push_back(make_thread(tid));
        inputs[i].push_back(make_pid(1));
        inputs[i].push_back(make_version(TRACE_ENTRY_VERSION));
        inputs[i].push_back(make_timestamp(10)); // All the same time priority.
        for (int j = 0; j < NUM_INSTRS; j++) {
            inputs[i].push_back(make_instr(42 + j * 4));
            // Test accumulation of usage across voluntary switches.
            if ((i == 0 || i == 1) && j == 1) {
                inputs[i].push_back(make_timestamp(20));
                inputs[i].push_back(make_marker(TRACE_MARKER_TYPE_SYSCALL, 42));
                inputs[i].push_back(
                    make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0));
                inputs[i].push_back(make_timestamp(120));
            }
            // Test a syscall sequence starting at each offset within a quantum
            // of instrs.
            if (i <= QUANTUM_DURATION && i == j) {
                inputs[i].push_back(make_marker(TRACE_MARKER_TYPE_SYSCALL, 84));
                inputs[i].push_back(
                    make_marker(TRACE_MARKER_TYPE_SYSCALL_TRACE_START, 84));
                // QUANTUM_DURATION + 1 kernel instrs, so that the sequence is
                // longer than one instruction quantum.
                for (int k = 0; k <= QUANTUM_DURATION; ++k)
                    inputs[i].push_back(make_instr(KERNEL_CODE_OFFSET + k));
                inputs[i].push_back(make_marker(TRACE_MARKER_TYPE_SYSCALL_TRACE_END, 84));
            }
        }
        inputs[i].push_back(make_exit(tid));
    }
    // A has a syscall sequence at [2,5], B has it at [3,6], C has it at [4,7],
    // D has it at [5,8].
    // The syscall sequence consists of 4 instrs which is greater than the
    // #instr quanta.
    // Total instrs in A, B, C, and D are 9 + 4 == 13, others have just 9.

    // Hardcoding here for the 2 outputs and 7 inputs.
    // We make assumptions on the scheduler's initial runqueue assignment
    // being round-robin, resulting in 4 on core0 (odd parity letters) and 3 on
    // core1 (even parity letters).
    // The dots are markers and thread exits.
    // A has a voluntary switch at its 6th instr (1st in that scheduling). Its
    // CPU usage persists to its next scheduling which has only 2 letters.
    // B has a voluntary switch at its 2nd instr, but it doesn't take because a
    // syscall sequence starts just then.
    // Since core0 has an extra input, core1 finishes
    // its runqueue first and then steals G from core0 (migration threshold is 0)
    // and finishes it off.
    static const char *const CORE0_SCHED_STRING =
        "..A..AAAA...CCC..CCCC...EEE..GGGA....CCCEEEGGGAACCC.EEE.AAAAA.";
    static const char *const CORE1_SCHED_STRING =
        "..BB......BBBB...DDD..FFFBBBD..DDDD.FFFBBBDDDFFF.B.DD.GGG.____";
    {
        // Test instruction quanta.
        std::vector<scheduler_t::input_workload_t> sched_inputs;
        for (int i = 0; i < NUM_INPUTS; i++) {
            std::vector<scheduler_t::input_reader_t> readers;
            readers.emplace_back(
                std::unique_ptr<mock_reader_t>(new mock_reader_t(inputs[i])),
                std::unique_ptr<mock_reader_t>(new mock_reader_t()), TID_BASE + i);
            sched_inputs.emplace_back(std::move(readers));
        }
        scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT,
                                                   scheduler_t::DEPENDENCY_IGNORE,
                                                   scheduler_t::SCHEDULER_DEFAULTS,
                                                   /*verbosity=*/4);
        sched_ops.quantum_duration_instrs = QUANTUM_DURATION;
        // This was tuned with a 100us threshold: so avoid scheduler.h defaults
        // changes from affecting our output.
        sched_ops.blocking_switch_threshold = BLOCK_THRESHOLD;
        sched_ops.block_time_multiplier = BLOCK_SCALE;
        sched_ops.time_units_per_us = 1.;
        // Migration is measured in wall-clock-time for instr quanta
        // so avoid non-determinism by having no threshold.
        sched_ops.migration_threshold_us = 0;
        scheduler_t scheduler;
        if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) !=
            scheduler_t::STATUS_SUCCESS)
            assert(false);
        std::vector<std::string> sched_as_string =
            run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE);
        for (int i = 0; i < NUM_OUTPUTS; i++) {
            std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n";
        }
        // Check scheduler stats.  # switches is the # of letter transitions; # preempts
        // is the instances where the same letter appears 3 times without another letter
        // appearing in between (and ignoring the last letter for an input: EOF doesn't
        // count as a preempt). # nops are the instances where the same input is picked
        // to run because nothing else is waiting.
        verify_scheduler_stats(scheduler.get_stream(0), /*switch_input_to_input=*/11,
                               /*switch_input_to_idle=*/0, /*switch_idle_to_input=*/0,
                               /*switch_nop=*/1, /*preempts=*/9, /*direct_attempts=*/0,
                               /*direct_successes=*/0, /*migrations=*/1);
        verify_scheduler_stats(scheduler.get_stream(1), /*switch_input_to_input=*/11,
                               /*switch_input_to_idle=*/1, /*switch_idle_to_input=*/0,
                               /*switch_nop=*/0, /*preempts=*/8, /*direct_attempts=*/0,
                               /*direct_successes=*/0, /*migrations=*/0);
        assert(scheduler.get_stream(0)->get_schedule_statistic(
                   memtrace_stream_t::SCHED_STAT_RUNQUEUE_STEALS) == 0);
        assert(scheduler.get_stream(1)->get_schedule_statistic(
                   memtrace_stream_t::SCHED_STAT_RUNQUEUE_STEALS) == 1);
#ifndef WIN32
        // XXX: Windows microseconds on test VMs are very coarse and stay the same
        // for long periods.  Instruction quanta use wall-clock idle times, so
        // the result is extreme variations here.  We try to adjust by handling
        // any schedule with singleton 'A' and 'B', but in some cases on Windows
        // we see the A and B delayed all the way to the very end where they
        // are adjacent to their own letters.  We just give up on checking the
        // precise output for this test on Windows.
        if (sched_as_string[0] != CORE0_SCHED_STRING ||
            sched_as_string[1] != CORE1_SCHED_STRING) {
            bool found_single_A = false, found_single_B = false;
            for (int cpu = 0; cpu < NUM_OUTPUTS; ++cpu) {
                // Written as "i + 1 < size()" rather than "i < size() - 1" so that
                // an empty schedule string does not wrap the unsigned bound around
                // and index out of range.
                for (size_t i = 1; i + 1 < sched_as_string[cpu].size(); ++i) {
                    if (sched_as_string[cpu][i] == 'A' &&
                        sched_as_string[cpu][i - 1] != 'A' &&
                        sched_as_string[cpu][i + 1] != 'A')
                        found_single_A = true;
                    if (sched_as_string[cpu][i] == 'B' &&
                        sched_as_string[cpu][i - 1] != 'B' &&
                        sched_as_string[cpu][i + 1] != 'B')
                        found_single_B = true;
                }
            }
            assert(found_single_A && found_single_B);
        }
#endif
    }
    {
        // Test time quanta.
        std::vector<scheduler_t::input_workload_t> sched_inputs;
        for (int i = 0; i < NUM_INPUTS; i++) {
            std::vector<scheduler_t::input_reader_t> readers;
            readers.emplace_back(
                std::unique_ptr<mock_reader_t>(new mock_reader_t(inputs[i])),
                std::unique_ptr<mock_reader_t>(new mock_reader_t()), TID_BASE + i);
            sched_inputs.emplace_back(std::move(readers));
        }
        scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT,
                                                   scheduler_t::DEPENDENCY_IGNORE,
                                                   scheduler_t::SCHEDULER_DEFAULTS,
                                                   /*verbosity=*/4);
        sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME;
        sched_ops.time_units_per_us = 1.;
        // This was tuned with a 100us threshold: so avoid scheduler.h defaults
        // changes from affecting our output.
        sched_ops.blocking_switch_threshold = BLOCK_THRESHOLD;
        sched_ops.quantum_duration_us = QUANTUM_DURATION;
        sched_ops.block_time_multiplier = BLOCK_SCALE;
        sched_ops.migration_threshold_us = 0;
        scheduler_t scheduler;
        if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) !=
            scheduler_t::STATUS_SUCCESS)
            assert(false);
        // With time quanta the simulation passes the time in explicitly
        // (send_time), and the schedule is compared exactly on every platform.
        std::vector<std::string> sched_as_string =
            run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true);
        for (int i = 0; i < NUM_OUTPUTS; i++) {
            std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n";
        }
        assert(sched_as_string[0] == CORE0_SCHED_STRING);
        assert(sched_as_string[1] == CORE1_SCHED_STRING);
    }
}

static void
test_synthetic_time_quanta()
{
Expand Down Expand Up @@ -6424,6 +6601,7 @@ test_main(int argc, const char *argv[])
test_only_threads();
test_real_file_queries_and_filters(argv[1]);
test_synthetic();
test_synthetic_with_syscall_seq();
test_synthetic_time_quanta();
test_synthetic_with_timestamps();
test_synthetic_with_priorities();
Expand Down
Loading