-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathloops.asm
165 lines (125 loc) · 2.96 KB
/
loops.asm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
; NASM syntax, x86-64.
; NOTE(review): the routines in this file take their first argument in rcx
; (see perf_start), which matches the Microsoft x64 calling convention --
; confirm the intended target ABI.
bits 64
; All macros and routines below are assembled into the code section.
section .text
; public_fn <symbol>
;   Exports <symbol> via `global` and defines its label at the current
;   position, so the instructions that follow form that routine's body.
;   Because the label is non-local, it also opens a fresh scope for
;   dot-prefixed local labels such as `.lp`.
%macro public_fn 1
        global  %1
%1:
%endmacro
; Number of times every timed kernel body is executed between the two
; timestamp reads. Override at assembly time with -DPERF_ITERATIONS=<n>.
; The counter lives in ecx and the loop is closed by dec/jnz, so the value
; must be a non-zero 32-bit quantity (0 would wrap and run 2^32 times).
%ifndef PERF_ITERATIONS
%define PERF_ITERATIONS 10000
%endif
%if PERF_ITERATIONS < 1
%error "PERF_ITERATIONS must be at least 1"
%endif

; perf_start <name>
;   Opens a timed benchmark routine called <name>; must be paired with
;   perf_end, which closes the loop and returns the elapsed ticks.
;   In:    rcx = first argument (address the kernel body accesses via r9)
;   Sets:  r9  = copy of the first argument
;          r8  = 64-bit time-stamp counter value at start
;          ecx = PERF_ITERATIONS (loop counter, decremented by perf_end)
;   Clobb: rax, rdx (left holding timestamp pieces when the body begins)
;   The local label .lp marks the top of the timed loop; perf_end jumps to it.
%macro perf_start 1
        public_fn %1
        mov     r9, rcx                 ; save first arg; rcx becomes the loop counter
        rdtsc                           ; edx:eax = time-stamp counter
        shl     rdx, 32
        or      rdx, rax                ; rdx = full 64-bit start timestamp
        mov     r8, rdx                 ; keep it for perf_end
        mov     ecx, PERF_ITERATIONS    ; ecx = iterations remaining
.lp:
%endmacro
; perf_end
;   Closes a routine opened by perf_start: finishes the timed loop, reads the
;   time-stamp counter again and returns the difference.
;   In:    ecx = iterations remaining, r8 = start timestamp (from perf_start)
;   Out:   rax = end timestamp - start timestamp
;   Clobb: rcx, rdx, flags (rbx is saved and restored around cpuid)
;   NOTE(review): the cpuid after the final rdtsc is presumably there as a
;   serializing instruction before returning -- confirm the intent.
%macro perf_end 0
        dec     ecx                     ; one pass over the kernel body done
        jnz     .lp                     ; loop until the counter reaches zero
        rdtsc                           ; edx:eax = end timestamp
        shl     rdx, 32
        or      rax, rdx                ; rax = full 64-bit end timestamp
        sub     rax, r8                 ; rax = elapsed ticks
        push    rbx                     ; cpuid overwrites ebx (callee-saved)
        push    rax                     ; ... and eax, which holds the result
        cpuid
        pop     rax
        pop     rbx
        ret
%endmacro
; manual_xadd <mem>, <amount>
;   Adds <amount> to the 64-bit value at <mem> using a compare-and-swap
;   retry loop instead of a single locked add/xadd.
;   <mem>    : memory operand, e.g. [r9+8]
;   <amount> : register or constant usable inside an effective address
;   Clobb: rax (value observed in memory), rdx (value written), flags
%macro manual_xadd 2
%%retry:
        mov     rax, %1                 ; rax = value currently in memory
        lea     rdx, [rax + %2]         ; rdx = observed value plus the amount
        lock cmpxchg %1, rdx            ; store rdx only if memory still equals rax
        jnz     %%retry                 ; ZF clear: memory changed in between, try again
%endmacro
; ---- test kernels
; test_add
;   Timed kernel: four plain (non-locked) read-modify-write adds of rax into
;   the consecutive qwords at [r9] .. [r9+24]. rax is not initialised by the
;   kernel; it holds whatever perf_start left there.
perf_start test_add
        add     [r9],      rax
        add     [r9 + 8],  rax
        add     [r9 + 16], rax
        add     [r9 + 24], rax
perf_end
; test_dependent_adds
;   Timed kernel: four loads from [r9] .. [r9+24] accumulated into rax.
;   Nothing is written to memory; every add consumes the rax produced by the
;   previous one, so the adds form a single dependency chain.
perf_start test_dependent_adds
        add     rax, qword [r9]
        add     rax, qword [r9 + 8]
        add     rax, qword [r9 + 16]
        add     rax, qword [r9 + 24]
perf_end
; test_add_mfence
;   Timed kernel: the same four non-locked adds as test_add, but each one is
;   followed by an mfence.
perf_start test_add_mfence
        add     [r9],      rax
        mfence
        add     [r9 + 8],  rax
        mfence
        add     [r9 + 16], rax
        mfence
        add     [r9 + 24], rax
        mfence
perf_end
; test_lockadd
;   Timed kernel: four lock-prefixed (atomic) adds of rax into the qwords at
;   [r9] .. [r9+24].
perf_start test_lockadd
        lock add [r9],      rax
        lock add [r9 + 8],  rax
        lock add [r9 + 16], rax
        lock add [r9 + 24], rax
perf_end
; test_xadd
;   Timed kernel: four lock-prefixed exchange-and-add operations on the qwords
;   at [r9] .. [r9+24]. xadd leaves the previous memory value in rax, so each
;   instruction feeds the next one through rax.
perf_start test_xadd
        lock xadd [r9],      rax
        lock xadd [r9 + 8],  rax
        lock xadd [r9 + 16], rax
        lock xadd [r9 + 24], rax
perf_end
; test_cmpxchg
;   Timed kernel: four atomic adds on the qwords at [r9] .. [r9+24], each
;   implemented by manual_xadd as a lock cmpxchg retry loop.
;   The amount added is rbx, which is only read here (never written), so it
;   carries whatever value the caller had in it.
perf_start test_cmpxchg
        manual_xadd [r9],      rbx
        manual_xadd [r9 + 8],  rbx
        manual_xadd [r9 + 16], rbx
        manual_xadd [r9 + 24], rbx
perf_end
; test_swap
;   Timed kernel: four register/memory exchanges on the qwords at
;   [r9] .. [r9+24], alternating between rax and rdx. xchg with a memory
;   operand is atomic even without an explicit lock prefix.
perf_start test_swap
        xchg    qword [r9],      rax
        xchg    qword [r9 + 8],  rdx
        xchg    qword [r9 + 16], rax
        xchg    qword [r9 + 24], rdx
perf_end
; test_lockadd_unalign
;   Timed kernel: four lock-prefixed adds of the constant 1 to qwords at byte
;   offsets 33, 41, 49 and 57 from r9, i.e. offsets that are not multiples
;   of 8.
;   NOTE(review): if the buffer is 64-byte aligned, the last operand covers
;   bytes 57..64 and therefore straddles a cache-line boundary, unlike the
;   other three -- confirm this is intended.
perf_start test_lockadd_unalign
        lock add qword [r9 + 33], 1
        lock add qword [r9 + 41], 1
        lock add qword [r9 + 49], 1
        lock add qword [r9 + 57], 1
perf_end
; ---- interference kernels
; Number of passes each interference routine makes over its buffer.
; Override at assembly time with -DINTERFERENCE_ITERATIONS=<n>; must be a
; non-zero 32-bit value because the counter lives in edx and the loop is
; closed by dec/jnz. The guard lets the value be defined once elsewhere.
%ifndef INTERFERENCE_ITERATIONS
%define INTERFERENCE_ITERATIONS 10000000
%endif

; interference_read
;   Repeatedly loads the four qwords at [rcx] .. [rcx+24] without writing
;   to them.
;   In:    rcx = address of at least 32 readable bytes
;   Out:   none by design (rax ends up holding the last value loaded)
;   Clobb: rax, rdx, flags
public_fn interference_read
        mov     edx, INTERFERENCE_ITERATIONS    ; edx = passes remaining
.sweep:
        mov     rax, [rcx]
        mov     rax, [rcx + 8]
        mov     rax, [rcx + 16]
        mov     rax, [rcx + 24]
        dec     edx
        jnz     .sweep
        ret
; Number of passes each interference routine makes over its buffer.
; Override at assembly time with -DINTERFERENCE_ITERATIONS=<n>; must be a
; non-zero 32-bit value because the counter lives in edx and the loop is
; closed by dec/jnz. The guard lets the value be defined once elsewhere.
%ifndef INTERFERENCE_ITERATIONS
%define INTERFERENCE_ITERATIONS 10000000
%endif

; interference_write
;   Repeatedly reads and then overwrites the four qwords at
;   [rcx] .. [rcx+24]. The value stored is the current pass counter (rdx).
;   In:    rcx = address of at least 32 readable and writable bytes
;   Out:   none by design (rax holds a running sum of the values read)
;   Clobb: rax, rdx, flags
public_fn interference_write
        mov     edx, INTERFERENCE_ITERATIONS    ; edx = passes remaining
.sweep:
        ; NOTE: mix of reads and writes here;
        ; with pure writes it's easy to completely starve the cmpxchg variants.
        add     rax, [rcx]
        mov     [rcx], rdx
        add     rax, [rcx + 8]
        mov     [rcx + 8], rdx
        add     rax, [rcx + 16]
        mov     [rcx + 16], rdx
        add     rax, [rcx + 24]
        mov     [rcx + 24], rdx
        dec     edx
        jnz     .sweep
        ret