diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h index 3c7403e309c..dda51faa35c 100644 --- a/clients/drcachesim/scheduler/scheduler.h +++ b/clients/drcachesim/scheduler/scheduler.h @@ -758,6 +758,9 @@ template class scheduler_tmpl_t { * The scheduling quantum duration for preemption, in instruction count, * for #QUANTUM_INSTRUCTIONS. The time passed to next_record() is ignored * for purposes of quantum preempts. + * + * Instructions executed in a quantum may end up higher than the specified + * value to avoid interruption of the kernel system call sequence. */ // We pick 10 million to match 2 instructions per nanosecond with a 5ms quantum. uint64_t quantum_duration_instrs = 10 * 1000 * 1000; diff --git a/clients/drcachesim/scheduler/scheduler_dynamic.cpp b/clients/drcachesim/scheduler/scheduler_dynamic.cpp index 52e5e6352a6..8f8dad07eb8 100644 --- a/clients/drcachesim/scheduler/scheduler_dynamic.cpp +++ b/clients/drcachesim/scheduler/scheduler_dynamic.cpp @@ -458,7 +458,9 @@ scheduler_dynamic_tmpl_t::check_for_input_switch( // boundaries so we live with those being before the switch. // XXX: Once we insert kernel traces, we may have to try harder // to stop before the post-syscall records. - if (this->record_type_is_instr_boundary(record, outputs_[output].last_record)) { + if (this->record_type_is_instr_boundary(record, outputs_[output].last_record) && + // We want to delay the context switch until after the injected syscall trace. + !outputs_[output].in_syscall_code) { if (input->switch_to_input != sched_type_t::INVALID_INPUT_ORDINAL) { // The switch request overrides any latency threshold. 
need_new_input = true; @@ -506,18 +508,28 @@ scheduler_dynamic_tmpl_t::check_for_input_switch( } if (options_.quantum_unit == sched_type_t::QUANTUM_INSTRUCTIONS && this->record_type_is_instr_boundary(record, outputs_[output].last_record) && - !outputs_[output].in_kernel_code) { + !outputs_[output].in_context_switch_code) { ++input->instrs_in_quantum; if (input->instrs_in_quantum > options_.quantum_duration_instrs) { - // We again prefer to switch to another input even if the current - // input has the oldest timestamp, prioritizing context switches - // over timestamp ordering. - VPRINT(this, 4, "next_record[%d]: input %d hit end of instr quantum\n", - output, input->index); - preempt = true; - need_new_input = true; - input->instrs_in_quantum = 0; - ++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_QUANTUM_PREEMPTS]; + if (outputs_[output].in_syscall_code) { + // XXX: Maybe this should be printed only once per-syscall-instance to + // reduce log spam. + VPRINT(this, 5, + "next_record[%d]: input %d delaying context switch " + "after end of instr quantum due to syscall trace\n", + output, input->index); + + } else { + // We again prefer to switch to another input even if the current + // input has the oldest timestamp, prioritizing context switches + // over timestamp ordering. + VPRINT(this, 4, "next_record[%d]: input %d hit end of instr quantum\n", + output, input->index); + preempt = true; + need_new_input = true; + input->instrs_in_quantum = 0; + ++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_QUANTUM_PREEMPTS]; + } } } else if (options_.quantum_unit == sched_type_t::QUANTUM_TIME) { if (cur_time == 0 || cur_time < input->prev_time_in_quantum) { @@ -535,14 +547,23 @@ scheduler_dynamic_tmpl_t::check_for_input_switch( // in between (e.g., scatter/gather long sequence of reads/writes) by // setting input->switching_pre_instruction. 
this->record_type_is_instr_boundary(record, outputs_[output].last_record)) { - VPRINT(this, 4, - "next_record[%d]: input %d hit end of time quantum after %" PRIu64 - "\n", - output, input->index, input->time_spent_in_quantum); - preempt = true; - need_new_input = true; - input->time_spent_in_quantum = 0; - ++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_QUANTUM_PREEMPTS]; + if (outputs_[output].in_syscall_code) { + // XXX: Maybe this should be printed only once per-syscall-instance to + // reduce log spam. + VPRINT(this, 5, + "next_record[%d]: input %d delaying context switch after end of " + "time quantum after %" PRIu64 " due to syscall trace\n", + output, input->index, input->time_spent_in_quantum); + } else { + VPRINT(this, 4, + "next_record[%d]: input %d hit end of time quantum after %" PRIu64 + "\n", + output, input->index, input->time_spent_in_quantum); + preempt = true; + need_new_input = true; + input->time_spent_in_quantum = 0; + ++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_QUANTUM_PREEMPTS]; + } } } // For sched_type_t::DEPENDENCY_TIMESTAMPS: enforcing asked-for @@ -574,16 +595,20 @@ scheduler_dynamic_tmpl_t::process_marker( break; case TRACE_MARKER_TYPE_CONTEXT_SWITCH_START: outputs_[output].in_context_switch_code = true; - ANNOTATE_FALLTHROUGH; + break; case TRACE_MARKER_TYPE_SYSCALL_TRACE_START: - outputs_[output].in_kernel_code = true; + outputs_[output].in_syscall_code = true; break; case TRACE_MARKER_TYPE_CONTEXT_SWITCH_END: // We have to delay until the next record. outputs_[output].hit_switch_code_end = true; - ANNOTATE_FALLTHROUGH; + break; case TRACE_MARKER_TYPE_SYSCALL_TRACE_END: - outputs_[output].in_kernel_code = false; + outputs_[output].in_syscall_code = false; + break; + case TRACE_MARKER_TYPE_TIMESTAMP: + // Syscall sequences are not expected to have a timestamp. 
+ assert(!outputs_[output].in_syscall_code); break; case TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH: { if (!options_.honor_direct_switches) diff --git a/clients/drcachesim/scheduler/scheduler_impl.h b/clients/drcachesim/scheduler/scheduler_impl.h index 260b5051aac..2ddbb710545 100644 --- a/clients/drcachesim/scheduler/scheduler_impl.h +++ b/clients/drcachesim/scheduler/scheduler_impl.h @@ -480,7 +480,7 @@ template class scheduler_impl_tmpl_t // This is accessed by other outputs for stealing and rebalancing. // Indirected so we can store it in our vector. std::unique_ptr> active; - bool in_kernel_code = false; + bool in_syscall_code = false; bool in_context_switch_code = false; bool hit_switch_code_end = false; // Used for time-based quanta. diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp index 83807e21a66..616112fbed4 100644 --- a/clients/drcachesim/tests/scheduler_unit_tests.cpp +++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp @@ -190,7 +190,8 @@ run_lockstep_simulation(scheduler_t &scheduler, int num_outputs, memref_tid_t ti int64_t meta_records = 0; // Record the threads, one char each. std::vector sched_as_string(num_outputs); - static constexpr char THREAD_LETTER_START = 'A'; + static constexpr char THREAD_LETTER_START_USER = 'A'; + static constexpr char THREAD_LETTER_START_KERNEL = 'a'; static constexpr char WAIT_SYMBOL = '-'; static constexpr char IDLE_SYMBOL = '_'; static constexpr char NON_INSTR_SYMBOL = '.'; @@ -229,8 +230,10 @@ run_lockstep_simulation(scheduler_t &scheduler, int num_outputs, memref_tid_t ti } assert(status == scheduler_t::STATUS_OK); if (type_is_instr(memref.instr.type)) { + bool is_kernel = outputs[i]->is_record_kernel(); sched_as_string[i] += - THREAD_LETTER_START + static_cast(memref.instr.tid - tid_base); + (is_kernel ? 
THREAD_LETTER_START_KERNEL : THREAD_LETTER_START_USER) + + static_cast(memref.instr.tid - tid_base); } else { // While this makes the string longer, it is just too confusing // with the same letter seemingly on 2 cores at once without these @@ -1354,6 +1357,254 @@ test_synthetic() } } +static void +test_synthetic_with_syscall_seq() +{ + std::cerr << "\n----------------\nTesting synthetic with syscall sequences\n"; + static constexpr int NUM_INPUTS = 7; + static constexpr int NUM_OUTPUTS = 2; + static constexpr int NUM_INSTRS = 9; + static constexpr int QUANTUM_DURATION = 3; + // We do not want to block for very long. + static constexpr double BLOCK_SCALE = 0.01; + static constexpr uint64_t BLOCK_THRESHOLD = 100; + static constexpr memref_tid_t TID_BASE = 100; + static constexpr uint64_t KERNEL_CODE_OFFSET = 123456; + static constexpr uint64_t SYSTRACE_NUM = 84; + std::vector inputs[NUM_INPUTS]; + for (int i = 0; i < NUM_INPUTS; i++) { + memref_tid_t tid = TID_BASE + i; + inputs[i].push_back(make_thread(tid)); + inputs[i].push_back(make_pid(1)); + inputs[i].push_back(make_version(TRACE_ENTRY_VERSION)); + inputs[i].push_back(make_timestamp(10)); // All the same time priority. + for (int j = 0; j < NUM_INSTRS; j++) { + inputs[i].push_back(make_instr(42 + j * 4)); + // Test a syscall sequence starting at each offset within a quantum + // of instrs. + if (i <= QUANTUM_DURATION && i == j) { + inputs[i].push_back(make_timestamp(20)); + inputs[i].push_back(make_marker(TRACE_MARKER_TYPE_SYSCALL, SYSTRACE_NUM)); + if (i < 2) { + // Thresholds for only blocking syscalls are low enough to + // cause a context switch. So only A and B will try a voluntary + // switch (which may be delayed due to the syscall trace) after + // 1 or 2 instrs respectively. 
+ inputs[i].push_back( + make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0)); + } + inputs[i].push_back(make_timestamp(120)); + inputs[i].push_back( + make_marker(TRACE_MARKER_TYPE_SYSCALL_TRACE_START, SYSTRACE_NUM)); + // A has just one syscall seq instr to show that it still does the + // voluntary switch after the syscall trace is done, even though there + // is still room for one more instr in its quantum. + // D has just one syscall seq instr to show that it will continue + // on without a switch after the syscall trace is done because more + // instrs were left in the same quantum. + // B and C have longer syscall seq to show that they will not be + // preempted by voluntary or quantum switches respectively. + int count_syscall_instrs = (i == 0 || i == 3) ? 1 : QUANTUM_DURATION; + for (int k = 1; k <= count_syscall_instrs; ++k) + inputs[i].push_back(make_instr(KERNEL_CODE_OFFSET + k)); + inputs[i].push_back( + make_marker(TRACE_MARKER_TYPE_SYSCALL_TRACE_END, SYSTRACE_NUM)); + } + } + inputs[i].push_back(make_exit(tid)); + } + // A has a syscall sequence at [2,2], B has it at [3,5], C has it at [4,6], + // D has it at [5,5]. + + // Hardcoding here for the 2 outputs and 7 inputs. + // We make assumptions on the scheduler's initial runqueue assignment + // being round-robin, resulting in 4 on core0 (odd parity letters) and 3 on + // core1 (even parity letters). + // The dots are markers and thread exits. + // + // A has a voluntary switch after its first two letters, prompted by its + // first instr which is a blocking syscall with latency that exceeds switch + // threshold, but not before its 2nd instr which is from the syscall trace + // and must be shown before the switch happens. Despite there being room for + // 1 more instr left in the quantum, the voluntary switch still happens. + // When scheduled next, A has room to execute only one instr left in its + // quantum limit (which was carried over after the voluntary switch). 
+ // + // B has a voluntary switch after its first 5 letters, prompted by its 2nd + // instr which is a blocking system call with latency that exceeds switch + // threshold, but not before its next three instrs which are from the + // syscall trace and must be shown before the switch happens. B ends up + // executing more instrs than its quantum limit because of the syscall + // trace. + // + // C has a syscall at its third letter (but it doesn't cause a switch + // because it doesn't have sufficiently high latency), followed by the + // syscall trace of three additional letters. C ends up + // executing more instrs than the quantum limit because of the syscall + // trace. + // + // D has a syscall at its 4th letter, followed by a 1-instr syscall + // trace. D continues with its regular instrs without a context switch + // at its 6th letter because there is still room for more instrs left in + // the quantum. + // + // Since core0 has an extra input, core1 finishes + // its runqueue first and then steals G from core0 (migration threshold is 0) + // and finishes it off. + static const char *const CORE0_SCHED_STRING = + "..A.....a...CCC....ccc...EEE..GGGACCCEEEGGGAAACCC.EEE.AAAA."; + static const char *const CORE1_SCHED_STRING = + "..BB.....bbb...DDD..FFFBBBD....d.DFFFBBBDDDFFF.B.D.GGG.____"; + { + // Test instruction quanta. + std::vector sched_inputs; + for (int i = 0; i < NUM_INPUTS; i++) { + std::vector readers; + readers.emplace_back( + std::unique_ptr(new mock_reader_t(inputs[i])), + std::unique_ptr(new mock_reader_t()), TID_BASE + i); + sched_inputs.emplace_back(std::move(readers)); + } + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_IGNORE, + scheduler_t::SCHEDULER_DEFAULTS, + /*verbosity=*/4); + sched_ops.quantum_duration_instrs = QUANTUM_DURATION; + // This was tuned with a 100us threshold: so avoid scheduler.h defaults + // changes from affecting our output. 
+ sched_ops.blocking_switch_threshold = BLOCK_THRESHOLD; + sched_ops.block_time_multiplier = BLOCK_SCALE; + sched_ops.time_units_per_us = 1.; + // Migration is measured in wall-clock-time for instr quanta + // so avoid non-determinism by having no threshold. + sched_ops.migration_threshold_us = 0; + scheduler_t scheduler; + if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != + scheduler_t::STATUS_SUCCESS) + assert(false); + std::vector sched_as_string = + run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE); + for (int i = 0; i < NUM_OUTPUTS; i++) { + std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; + } + // Check scheduler stats. # switches is the # of letter transitions; # preempts + // is the instances where the same letter appears 3 times without another letter + // appearing in between (and ignoring the last letter for an input: EOF doesn't + // count as a preempt). # nops are the instances where the same input is picked + // to run because nothing else is waiting. + verify_scheduler_stats(scheduler.get_stream(0), /*switch_input_to_input=*/11, + /*switch_input_to_idle=*/0, /*switch_idle_to_input=*/0, + /*switch_nop=*/1, /*preempts=*/9, /*direct_attempts=*/0, + /*direct_successes=*/0, /*migrations=*/1); + verify_scheduler_stats(scheduler.get_stream(1), /*switch_input_to_input=*/11, + /*switch_input_to_idle=*/1, /*switch_idle_to_input=*/0, + /*switch_nop=*/0, /*preempts=*/8, /*direct_attempts=*/0, + /*direct_successes=*/0, /*migrations=*/0); + assert(scheduler.get_stream(0)->get_schedule_statistic( + memtrace_stream_t::SCHED_STAT_RUNQUEUE_STEALS) == 0); + assert(scheduler.get_stream(1)->get_schedule_statistic( + memtrace_stream_t::SCHED_STAT_RUNQUEUE_STEALS) == 1); +#ifndef WIN32 + // XXX: Windows microseconds on test VMs are very coarse and stay the same + // for long periods. Instruction quanta use wall-clock idle times, so + // the result is extreme variations here. 
We try to adjust by handling + // any schedule with the specific patterns below. We just give up on checking the + // precise output for this test on Windows. + if (sched_as_string[0] != CORE0_SCHED_STRING || + sched_as_string[1] != CORE1_SCHED_STRING) { + // XXX: These bools could potentially be made into ints, but then + // maybe our check will become too strict, defeating the purpose of + // this relaxation. + bool found_single_A = false, found_single_B = false, found_single_D = false; + for (int cpu = 0; cpu < NUM_OUTPUTS; ++cpu) { + for (size_t i = 1; i < sched_as_string[cpu].size() - 1; ++i) { + // We expect a single 'A' for the first instr executed by 'A', + // which will be followed by a marker ('.') for the syscall, + // and the third instr executed by it which will be the only + // instruction executed by it during that scheduling because + // prior bookkeeping for that quantum exhausted all-but-one + // instruction. + if (sched_as_string[cpu][i] == 'A' && + sched_as_string[cpu][i - 1] != 'A' && + sched_as_string[cpu][i + 1] != 'A') + found_single_A = true; + // We expect a single 'B' for the last instr executed by B + // which will have to be in its own separate 3-instr quantum. + if (sched_as_string[cpu][i] == 'B' && + sched_as_string[cpu][i - 1] != 'B' && + sched_as_string[cpu][i + 1] != 'B') + found_single_B = true; + // We expect a single 'D' for the one quantum where the + // 1st and 3rd instrs executed by D were regular, and the + // 2nd one was from a syscall (which is 'd'). Also, the + // last (10th) instr executed by D will have to be in its + // own separate 3-instr quantum. + if (sched_as_string[cpu][i] == 'D' && + sched_as_string[cpu][i - 1] != 'D' && + sched_as_string[cpu][i + 1] != 'D') + found_single_D = true; + } + } + bool found_syscall_a = false, found_syscall_b = false, + found_syscall_c = false, found_syscall_d = false; + for (int cpu = 0; cpu < NUM_OUTPUTS; ++cpu) { + // The '.'
at beginning and end of each of the searched sequences + // below is for the syscall trace start and end markers. + if (sched_as_string[cpu].find(".a.") != std::string::npos) { + found_syscall_a = true; + } + if (sched_as_string[cpu].find(".bbb.") != std::string::npos) { + found_syscall_b = true; + } + if (sched_as_string[cpu].find(".ccc.") != std::string::npos) { + found_syscall_c = true; + } + if (sched_as_string[cpu].find(".d.") != std::string::npos) { + found_syscall_d = true; + } + } + assert(found_single_A && found_single_B && found_single_D); + assert(found_syscall_a && found_syscall_b && found_syscall_c && + found_syscall_d); + } +#endif + } + { + // Test time quanta. + std::vector sched_inputs; + for (int i = 0; i < NUM_INPUTS; i++) { + std::vector readers; + readers.emplace_back( + std::unique_ptr(new mock_reader_t(inputs[i])), + std::unique_ptr(new mock_reader_t()), TID_BASE + i); + sched_inputs.emplace_back(std::move(readers)); + } + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_IGNORE, + scheduler_t::SCHEDULER_DEFAULTS, + /*verbosity=*/4); + sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; + // This was tuned with a 100us threshold: so avoid scheduler.h defaults + // changes from affecting our output. 
+ sched_ops.blocking_switch_threshold = BLOCK_THRESHOLD; + sched_ops.quantum_duration_us = QUANTUM_DURATION; + sched_ops.block_time_multiplier = BLOCK_SCALE; + sched_ops.migration_threshold_us = 0; + scheduler_t scheduler; + if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != + scheduler_t::STATUS_SUCCESS) + assert(false); + std::vector sched_as_string = + run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true); + for (int i = 0; i < NUM_OUTPUTS; i++) { + std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; + } + assert(sched_as_string[0] == CORE0_SCHED_STRING); + assert(sched_as_string[1] == CORE1_SCHED_STRING); + } +} + static void test_synthetic_time_quanta() { @@ -6424,6 +6675,7 @@ test_main(int argc, const char *argv[]) test_only_threads(); test_real_file_queries_and_filters(argv[1]); test_synthetic(); + test_synthetic_with_syscall_seq(); test_synthetic_time_quanta(); test_synthetic_with_timestamps(); test_synthetic_with_priorities();