Skip to content

Commit

Permalink
Merge branch 'master' into i6495-fix-format-spec
Browse files Browse the repository at this point in the history
  • Loading branch information
abhinav92003 authored Dec 12, 2023
2 parents 5331484 + 278fc51 commit f5a5252
Show file tree
Hide file tree
Showing 9 changed files with 336 additions and 41 deletions.
2 changes: 2 additions & 0 deletions api/docs/release.dox
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,8 @@ changes:
to include direct switch dependencies. This is not a binary compatibility change
as the old value still refers purely to timestamps, but on a recompile it
refers to timestamps and direct switches, which is what most users should want.
- Renamed the macro INSTR_CREATE_mul_sve to INSTR_CREATE_mul_sve_imm to
differentiate it from the other SVE MUL instructions.

Further non-compatibility-affecting changes include:
- Added raw2trace support to inject system call kernel trace templates collected from
Expand Down
40 changes: 28 additions & 12 deletions clients/drcachesim/scheduler/scheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1724,8 +1724,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::set_cur_input(output_ordinal_t output,
if (prev_input == input)
return STATUS_OK;
std::lock_guard<std::mutex> lock(*inputs_[input].lock);
inputs_[input].instrs_in_quantum = 0;
inputs_[input].start_time_in_quantum = outputs_[output].cur_time;
inputs_[input].prev_time_in_quantum = outputs_[output].cur_time;
if (options_.schedule_record_ostream != nullptr) {
uint64_t instr_ord = inputs_[input].reader->get_instruction_ordinal();
if (!inputs_[input].recorded_in_schedule && instr_ord == 1) {
Expand Down Expand Up @@ -2104,8 +2103,8 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
auto lock = std::unique_lock<std::mutex>(*input->lock);
// Since we do not ask for a start time, we have to check for the first record from
// each input and set the time here.
if (input->start_time_in_quantum == 0)
input->start_time_in_quantum = cur_time;
if (input->prev_time_in_quantum == 0)
input->prev_time_in_quantum = cur_time;
if (!outputs_[output].speculation_stack.empty()) {
outputs_[output].prev_speculate_pc = outputs_[output].speculate_pc;
error_string_ = outputs_[output].speculator.next_record(
Expand Down Expand Up @@ -2170,7 +2169,9 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
input->index, input->reader->get_instruction_ordinal());
VDO(this, 5, print_record(record););
bool need_new_input = false;
bool preempt = false;
double block_time_factor = 0.;
uint64_t prev_time_in_quantum = 0;
if (options_.mapping == MAP_AS_PREVIOUSLY) {
assert(outputs_[output].record_index >= 0);
if (outputs_[output].record_index >=
Expand Down Expand Up @@ -2275,28 +2276,32 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
VPRINT(this, 4,
"next_record[%d]: input %d hit end of instr quantum\n", output,
input->index);
preempt = !need_new_input;
need_new_input = true;
input->instrs_in_quantum = 0;
}
} else if (options_.quantum_unit == QUANTUM_TIME) {
if (cur_time == 0 || cur_time < input->start_time_in_quantum) {
if (cur_time == 0 || cur_time < input->prev_time_in_quantum) {
VPRINT(this, 1,
"next_record[%d]: invalid time %" PRIu64 " vs start %" PRIu64
"\n",
output, cur_time, input->start_time_in_quantum);
output, cur_time, input->prev_time_in_quantum);
return sched_type_t::STATUS_INVALID;
}
if (cur_time - input->start_time_in_quantum >=
options_.quantum_duration &&
input->time_spent_in_quantum += cur_time - input->prev_time_in_quantum;
prev_time_in_quantum = input->prev_time_in_quantum;
input->prev_time_in_quantum = cur_time;
if (input->time_spent_in_quantum >= options_.quantum_duration &&
// We only switch on instruction boundaries. We could possibly switch
// in between (e.g., scatter/gather long sequence of reads/writes) by
// setting input->switching_pre_instruction.
record_type_is_instr(record)) {
VPRINT(this, 4,
"next_record[%d]: hit end of time quantum after %" PRIu64
" (%" PRIu64 " - %" PRIu64 ")\n",
output, cur_time - input->start_time_in_quantum, cur_time,
input->start_time_in_quantum);
"next_record[%d]: hit end of time quantum after %" PRIu64 "\n",
output, input->time_spent_in_quantum);
preempt = !need_new_input;
need_new_input = true;
input->time_spent_in_quantum = 0;
}
}
}
Expand Down Expand Up @@ -2335,6 +2340,15 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
lock.lock();
VPRINT(this, 5, "next_record_mid[%d]: switching from %d to %d\n", output,
prev_input, outputs_[output].cur_input);
if (!preempt) {
if (options_.quantum_unit == QUANTUM_INSTRUCTIONS &&
record_type_is_instr(record)) {
--inputs_[prev_input].instrs_in_quantum;
} else if (options_.quantum_unit == QUANTUM_TIME) {
inputs_[prev_input].time_spent_in_quantum -=
(cur_time - prev_time_in_quantum);
}
}
if (res == sched_type_t::STATUS_WAIT)
return res;
input = &inputs_[outputs_[output].cur_input];
Expand Down Expand Up @@ -2392,6 +2406,8 @@ scheduler_tmpl_t<RecordType, ReaderType>::unread_last_record(output_ordinal_t ou
record = outinfo.last_record;
input = &inputs_[outinfo.cur_input];
std::lock_guard<std::mutex> lock(*input->lock);
VPRINT(this, 4, "next_record[%d]: unreading last record, from %d\n", output,
input->index);
input->queue.push_back(outinfo.last_record);
if (options_.quantum_unit == QUANTUM_INSTRUCTIONS && record_type_is_instr(record))
--input->instrs_in_quantum;
Expand Down
3 changes: 2 additions & 1 deletion clients/drcachesim/scheduler/scheduler.h
Original file line number Diff line number Diff line change
Expand Up @@ -1015,7 +1015,8 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
// Used to switch before we've read the next instruction.
bool switching_pre_instruction = false;
// Used for time-based quanta.
uint64_t start_time_in_quantum = 0;
uint64_t prev_time_in_quantum = 0;
uint64_t time_spent_in_quantum = 0;
// These fields model waiting at a blocking syscall.
double block_time_factor = 0.;
uint64_t blocked_start_time = 0; // For QUANTUM_TIME only.
Expand Down
45 changes: 33 additions & 12 deletions clients/drcachesim/tests/scheduler_unit_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -898,22 +898,38 @@ test_synthetic()
static constexpr int NUM_OUTPUTS = 2;
static constexpr int NUM_INSTRS = 9;
static constexpr int QUANTUM_DURATION = 3;
static constexpr double BLOCK_SCALE = 0.1;
static constexpr memref_tid_t TID_BASE = 100;
std::vector<trace_entry_t> inputs[NUM_INPUTS];
for (int i = 0; i < NUM_INPUTS; i++) {
memref_tid_t tid = TID_BASE + i;
inputs[i].push_back(make_thread(tid));
inputs[i].push_back(make_pid(1));
for (int j = 0; j < NUM_INSTRS; j++)
inputs[i].push_back(make_version(TRACE_ENTRY_VERSION));
inputs[i].push_back(make_timestamp(10)); // All the same time priority.
for (int j = 0; j < NUM_INSTRS; j++) {
inputs[i].push_back(make_instr(42 + j * 4));
// Test accumulation of usage across voluntary switches.
if ((i == 0 || i == 1) && j == 1) {
inputs[i].push_back(make_timestamp(20));
inputs[i].push_back(make_marker(TRACE_MARKER_TYPE_SYSCALL, 42));
inputs[i].push_back(
make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0));
inputs[i].push_back(make_timestamp(120));
}
}
inputs[i].push_back(make_exit(tid));
}
// Hardcoding here for the 2 outputs and 7 inputs.
// We expect 3 letter sequences (our quantum) alternating every-other as each
// core alternates; with an odd number the 2nd core finishes early.
// The dots are thread exits.
static const char *const CORE0_SCHED_STRING = "AAACCCEEEGGGBBBDDDFFFAAA.CCC.EEE.GGG.";
static const char *const CORE1_SCHED_STRING = "BBBDDDFFFAAACCCEEEGGGBBB.DDD.FFF.____";
// core alternates. The dots are markers and thread exits.
// A and B have a voluntary switch after their 1st 2 letters, but we expect
// the usage to persist to their next scheduling which should only have
// a single letter.
static const char *const CORE0_SCHED_STRING =
"..AA......CCC..EEE..GGGEEEABGGGDDD.AAABBBAAA.___";
static const char *const CORE1_SCHED_STRING =
"..BB......DDD..FFFCCCDDDFFFCCC.EEE.FFF.GGG.BBB.";
{
// Test instruction quanta.
std::vector<scheduler_t::input_workload_t> sched_inputs;
Expand All @@ -929,6 +945,8 @@ test_synthetic()
scheduler_t::SCHEDULER_DEFAULTS,
/*verbosity=*/3);
sched_ops.quantum_duration = QUANTUM_DURATION;
// We do not want to block for very long.
sched_ops.block_time_scale = BLOCK_SCALE;
scheduler_t scheduler;
if (scheduler.init(sched_inputs, NUM_OUTPUTS, sched_ops) !=
scheduler_t::STATUS_SUCCESS)
Expand Down Expand Up @@ -957,6 +975,8 @@ test_synthetic()
/*verbosity=*/3);
sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME;
sched_ops.quantum_duration = QUANTUM_DURATION;
// QUANTUM_INSTRUCTIONS divides by the threshold so to match we multiply.
sched_ops.block_time_scale = sched_ops.blocking_switch_threshold * BLOCK_SCALE;
scheduler_t scheduler;
if (scheduler.init(sched_inputs, NUM_OUTPUTS, sched_ops) !=
scheduler_t::STATUS_SUCCESS)
Expand Down Expand Up @@ -1530,9 +1550,9 @@ test_synthetic_with_syscalls_multiple()
// with the "." in run_lockstep_simulation(). The omitted "." markers also
// explain why the two strings are different lengths.
assert(sched_as_string[0] ==
"BHHHFFFJJJJJJJBEEHHHIIIFFFAAAHHHBAAAGGGAAABGGG__B___B___B");
"BHHHFFFJJJJJJJBEEHHHIIIBIIIEEDDDBAAAEEGGGBDDD___B___B___B___B");
assert(sched_as_string[1] ==
"EECCCIIICCCJJFFFCCCBIIIEEDDDGGGDDDEEDDD____EB__________________________");
"EECCCIIICCCJJFFFCCCFFFAAAHHHGGGDDDAAAGGGE__________________________");
}

static void
Expand Down Expand Up @@ -1900,9 +1920,10 @@ test_synthetic_with_syscalls_idle()
// The timestamps provide the ABCD ordering, but A's blocking syscall after its
// 2nd instr makes it delayed for 3 full queue cycles of BCD BCD: A's duration
// of 2 is decremented after the 1st (to 1) and 2nd (to 0) and A is finally
// schedulable after the 3rd.
// schedulable after the 3rd, when it just gets 1 instruction in before its
// (accumulated) count equals the quantum.
assert(sched_as_string[0] ==
"..AA......BB.B..CC.C..DD.DBBBCCCDDDBBBCCCDDDAAABBB.CCC.DDD.AAAAAAA.");
"..AA......BB.B..CC.C..DD.DBBBCCCDDDBBBCCCDDDABBB.CCC.DDD.AAAAAAAAA.");
}

static void
Expand Down Expand Up @@ -3285,7 +3306,6 @@ test_inactive()
// Ensure cpu0 now picks up the input that was on cpu1.
// This is also the record we un-read earlier.
check_next(stream0, scheduler_t::STATUS_OK, TID_B, TRACE_TYPE_INSTR);
check_next(stream0, scheduler_t::STATUS_OK, TID_B, TRACE_TYPE_INSTR);
// End of quantum.
check_next(stream0, scheduler_t::STATUS_OK, TID_A, TRACE_TYPE_INSTR);
// Make cpu0 inactive and cpu1 active.
Expand All @@ -3296,6 +3316,7 @@ test_inactive()
assert(status == scheduler_t::STATUS_OK);
// Now cpu1 should finish things.
check_next(stream1, scheduler_t::STATUS_OK, TID_B, TRACE_TYPE_INSTR);
check_next(stream1, scheduler_t::STATUS_OK, TID_B, TRACE_TYPE_INSTR);
check_next(stream1, scheduler_t::STATUS_OK, TID_B, TRACE_TYPE_THREAD_EXIT);
check_next(stream1, scheduler_t::STATUS_OK, TID_A, TRACE_TYPE_THREAD_EXIT);
check_next(stream1, scheduler_t::STATUS_EOF);
Expand Down Expand Up @@ -3326,8 +3347,8 @@ test_inactive()
for (int i = 0; i < NUM_OUTPUTS; i++) {
std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n";
}
assert(sched_as_string[0] == "..AABBA._");
assert(sched_as_string[1] == "..B---B.");
assert(sched_as_string[0] == "..AABA.__");
assert(sched_as_string[1] == "..B--BB.");
}
#endif // HAS_ZIP
}
Expand Down
22 changes: 13 additions & 9 deletions core/ir/aarch64/codec_sve2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@
01000101001xxxxx101000xxxxxxxxxx n 1071 SVE2 histseg z_b_0 : z_b_5 z_b_16
11000100000xxxxx110xxxxxxxxxxxxx n 950 SVE2 ldnt1b z_d_0 : svemem_vec_30sd_gpr16 p10_zer_lo
10000100000xxxxx101xxxxxxxxxxxxx n 950 SVE2 ldnt1b z_s_0 : svemem_vec_30sd_gpr16 p10_zer_lo
11000101100xxxxx110xxxxxxxxxxxxx n 992 SVE2 ldnt1d z_d_0 : svemem_vec_30sd_gpr16 p10_zer_lo
11000101100xxxxx110xxxxxxxxxxxxx n 992 SVE2 ldnt1d z_d_0 : svemem_vec_30sd_gpr16 p10_zer_lo
11000100100xxxxx110xxxxxxxxxxxxx n 993 SVE2 ldnt1h z_d_0 : svemem_vec_30sd_gpr16 p10_zer_lo
10000100100xxxxx101xxxxxxxxxxxxx n 993 SVE2 ldnt1h z_s_0 : svemem_vec_30sd_gpr16 p10_zer_lo
11000100000xxxxx100xxxxxxxxxxxxx n 1186 SVE2 ldnt1sb z_d_0 : svemem_vec_30sd_gpr16 p10_zer_lo
Expand All @@ -97,8 +97,12 @@
10000100100xxxxx100xxxxxxxxxxxxx n 1187 SVE2 ldnt1sh z_s_0 : svemem_vec_30sd_gpr16 p10_zer_lo
11000101000xxxxx100xxxxxxxxxxxxx n 1188 SVE2 ldnt1sw z_d_0 : svemem_vec_30sd_gpr16 p10_zer_lo
11000101000xxxxx110xxxxxxxxxxxxx n 994 SVE2 ldnt1w z_d_0 : svemem_vec_30sd_gpr16 p10_zer_lo
10000101000xxxxx101xxxxxxxxxxxxx n 994 SVE2 ldnt1w z_s_0 : svemem_vec_30sd_gpr16 p10_zer_lo
10000101000xxxxx101xxxxxxxxxxxxx n 994 SVE2 ldnt1w z_s_0 : svemem_vec_30sd_gpr16 p10_zer_lo
01000101xx1xxxxx100xxxxxxxx0xxxx w 1189 SVE2 match p_size_bh_0 : p10_zer_lo z_size_bh_5 z_size_bh_16
00000100xx1xxxxx011000xxxxxxxxxx n 321 SVE2 mul z_size_bhsd_0 : z_size_bhsd_5 z_size_bhsd_16
01000100111xxxxx111110xxxxxxxxxx n 321 SVE2 mul z_d_0 : z_d_5 z4_d_16 i1_index_20
010001000x1xxxxx111110xxxxxxxxxx n 321 SVE2 mul z_h_0 : z_h_5 z3_h_16 i3_index_19
01000100101xxxxx111110xxxxxxxxxx n 321 SVE2 mul z_s_0 : z_s_5 z3_s_16 i2_index_19
00000100111xxxxx001111xxxxxxxxxx n 1072 SVE2 nbsl z_d_0 : z_d_0 z_d_16 z_d_5
01000101xx1xxxxx100xxxxxxxx1xxxx w 1190 SVE2 nmatch p_size_bh_0 : p10_zer_lo z_size_bh_5 z_size_bh_16
00000100001xxxxx011001xxxxxxxxxx n 328 SVE2 pmul z_msz_bhsd_0 : z_msz_bhsd_5 z_msz_bhsd_16
Expand Down Expand Up @@ -230,13 +234,13 @@
01000101xx0xxxxx100011xxxxxxxxxx n 1116 SVE2 ssubltb z_size_hsd_0 : z_sizep1_bhs_5 z_sizep1_bhs_16
01000101xx0xxxxx010100xxxxxxxxxx n 1117 SVE2 ssubwb z_size_hsd_0 : z_size_hsd_5 z_sizep1_bhs_16
01000101xx0xxxxx010101xxxxxxxxxx n 1118 SVE2 ssubwt z_size_hsd_0 : z_size_hsd_5 z_sizep1_bhs_16
11100100000xxxxx001xxxxxxxxxxxxx n 952 SVE2 stnt1b svemem_vec_22sd_gpr16 : z_d_0 p10_lo
11100100010xxxxx001xxxxxxxxxxxxx n 952 SVE2 stnt1b svemem_vec_22sd_gpr16 : z_s_0 p10_lo
11100101100xxxxx001xxxxxxxxxxxxx n 1004 SVE2 stnt1d svemem_vec_30sd_gpr16 : z_d_0 p10_lo
11100100100xxxxx001xxxxxxxxxxxxx n 1005 SVE2 stnt1h svemem_vec_22sd_gpr16 : z_d_0 p10_lo
11100100110xxxxx001xxxxxxxxxxxxx n 1005 SVE2 stnt1h svemem_vec_22sd_gpr16 : z_s_0 p10_lo
11100101000xxxxx001xxxxxxxxxxxxx n 1006 SVE2 stnt1w svemem_vec_22sd_gpr16 : z_d_0 p10_lo
11100101010xxxxx001xxxxxxxxxxxxx n 1006 SVE2 stnt1w svemem_vec_22sd_gpr16 : z_s_0 p10_lo
11100100000xxxxx001xxxxxxxxxxxxx n 952 SVE2 stnt1b svemem_vec_22sd_gpr16 : z_d_0 p10_lo
11100100010xxxxx001xxxxxxxxxxxxx n 952 SVE2 stnt1b svemem_vec_22sd_gpr16 : z_s_0 p10_lo
11100101100xxxxx001xxxxxxxxxxxxx n 1004 SVE2 stnt1d svemem_vec_30sd_gpr16 : z_d_0 p10_lo
11100100100xxxxx001xxxxxxxxxxxxx n 1005 SVE2 stnt1h svemem_vec_22sd_gpr16 : z_d_0 p10_lo
11100100110xxxxx001xxxxxxxxxxxxx n 1005 SVE2 stnt1h svemem_vec_22sd_gpr16 : z_s_0 p10_lo
11100101000xxxxx001xxxxxxxxxxxxx n 1006 SVE2 stnt1w svemem_vec_22sd_gpr16 : z_d_0 p10_lo
11100101010xxxxx001xxxxxxxxxxxxx n 1006 SVE2 stnt1w svemem_vec_22sd_gpr16 : z_s_0 p10_lo
01000101xx1xxxxx011100xxxxxxxxxx n 1119 SVE2 subhnb z_sizep1_bhs_0 : z_size_hsd_5 z_size_hsd_16
01000101xx1xxxxx011101xxxxxxxxxx n 1120 SVE2 subhnt z_sizep1_bhs_0 : z_sizep1_bhs_0 z_size_hsd_5 z_size_hsd_16
01000100xx011100100xxxxxxxxxxxxx n 474 SVE2 suqadd z_size_bhsd_0 : p10_mrg_lo z_size_bhsd_0 z_size_bhsd_5
Expand Down
35 changes: 34 additions & 1 deletion core/ir/aarch64/instr_create_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -5727,7 +5727,7 @@
* \param Zdn The first source and destination vector register, Z (Scalable)
* \param simm The signed immediate imm
*/
#define INSTR_CREATE_mul_sve(dc, Zdn, simm) \
#define INSTR_CREATE_mul_sve_imm(dc, Zdn, simm) \
instr_create_1dst_2src(dc, OP_mul, Zdn, Zdn, simm)

/**
Expand Down Expand Up @@ -18150,4 +18150,37 @@
dc, OP_dc_gzva, \
opnd_create_base_disp(opnd_get_reg(Rn), DR_REG_NULL, 0, 0, OPSZ_sys))

/**
* Creates an SVE MUL instruction that takes two vector register sources and
* no predicate or immediate operand.
*
* This macro is used to encode the forms:
\verbatim
MUL <Zd>.<Ts>, <Zn>.<Ts>, <Zm>.<Ts>
\endverbatim
* The destination and both sources share the same element size <Ts>.
* \param dc The void * dcontext used to allocate memory for the #instr_t.
* \param Zd The destination vector register. Can be Z.b, Z.h, Z.s or Z.d.
* \param Zn The first source vector register. Can be Z.b, Z.h, Z.s or Z.d.
* \param Zm The second source vector register. Can be Z.b, Z.h, Z.s or Z.d.
*/
#define INSTR_CREATE_mul_sve_vector(dc, Zd, Zn, Zm) \
instr_create_1dst_2src(dc, OP_mul, Zd, Zn, Zm)

/**
* Creates an SVE MUL instruction whose second source is an indexed element
* of a vector register.
*
* This macro is used to encode the forms:
\verbatim
MUL <Zd>.D, <Zn>.D, <Zm>.D[<index>]
MUL <Zd>.H, <Zn>.H, <Zm>.H[<index>]
MUL <Zd>.S, <Zn>.S, <Zm>.S[<index>]
\endverbatim
* Within each form the destination and both sources use the same element size.
* \param dc The void * dcontext used to allocate memory for the #instr_t.
* \param Zd The destination vector register. Can be Z.h, Z.s or Z.d.
* \param Zn The first source vector register. Can be Z.h, Z.s or Z.d.
* \param Zm The second source vector register. Can be Z.h, Z.s or Z.d.
* \param index The immediate index for Zm.
*/
#define INSTR_CREATE_mul_sve_idx(dc, Zd, Zn, Zm, index) \
instr_create_1dst_3src(dc, OP_mul, Zd, Zn, Zm, index)

#endif /* DR_IR_MACROS_AARCH64_H */
Loading

0 comments on commit f5a5252

Please sign in to comment.