Skip to content

Commit

Permalink
Merge pull request OpenMathLib#4656 from zboszor/fix-x86-64-build-v2
Browse files Browse the repository at this point in the history
Add forgotten conditional uses of PREFETCH
  • Loading branch information
martin-frbg authored Apr 23, 2024
2 parents ae695d4 + ca64861 commit d421dec
Show file tree
Hide file tree
Showing 8 changed files with 104 additions and 8 deletions.
18 changes: 18 additions & 0 deletions kernel/x86_64/gemm_ncopy_4.S
Original file line number Diff line number Diff line change
Expand Up @@ -189,12 +189,16 @@
movss %xmm6, 6 * SIZE(B)
movss %xmm7, 7 * SIZE(B)

#ifdef PREFETCH
PREFETCH RPREFETCHSIZE * SIZE(AO1)
PREFETCH RPREFETCHSIZE * SIZE(AO2)
PREFETCH RPREFETCHSIZE * SIZE(AO3)
PREFETCH RPREFETCHSIZE * SIZE(AO4)
#endif

#ifdef PREFETCHW
PREFETCHW WPREFETCHSIZE * SIZE(B)
#endif

movss %xmm8, 8 * SIZE(B)
movss %xmm9, 9 * SIZE(B)
Expand All @@ -205,29 +209,39 @@
movss %xmm14, 14 * SIZE(B)
movss %xmm15, 15 * SIZE(B)
#else
#ifdef PREFETCH
PREFETCH RPREFETCHSIZE * SIZE(AO1)
#endif
movsd 0 * SIZE(AO1), %xmm0
movhpd 0 * SIZE(AO2), %xmm0
movsd 1 * SIZE(AO1), %xmm2
movhpd 1 * SIZE(AO2), %xmm2
#ifdef PREFETCH
PREFETCH RPREFETCHSIZE * SIZE(AO2)
#endif
movsd 2 * SIZE(AO1), %xmm4
movhpd 2 * SIZE(AO2), %xmm4
movsd 3 * SIZE(AO1), %xmm6
movhpd 3 * SIZE(AO2), %xmm6

#ifdef PREFETCH
PREFETCH RPREFETCHSIZE * SIZE(AO3)
#endif
movsd 0 * SIZE(AO3), %xmm1
movhpd 0 * SIZE(AO4), %xmm1
movsd 1 * SIZE(AO3), %xmm3
movhpd 1 * SIZE(AO4), %xmm3
#ifdef PREFETCH
PREFETCH RPREFETCHSIZE * SIZE(AO4)
#endif
movsd 2 * SIZE(AO3), %xmm5
movhpd 2 * SIZE(AO4), %xmm5
movsd 3 * SIZE(AO3), %xmm7
movhpd 3 * SIZE(AO4), %xmm7

#ifdef PREFETCHW
PREFETCHW WPREFETCHSIZE * SIZE(B)
#endif
movapd %xmm0, 0 * SIZE(B)
movapd %xmm1, 2 * SIZE(B)
movapd %xmm2, 4 * SIZE(B)
Expand Down Expand Up @@ -342,10 +356,14 @@
movapd %xmm3, 6 * SIZE(B)
#endif

#ifdef PREFETCH
PREFETCH RPREFETCHSIZE * SIZE(AO1)
PREFETCH RPREFETCHSIZE * SIZE(AO2)
#endif

#ifdef PREFETCHW
PREFETCHW WPREFETCHSIZE * SIZE(B)
#endif

addq $4 * SIZE, AO1
addq $4 * SIZE, AO2
Expand Down
10 changes: 10 additions & 0 deletions kernel/x86_64/gemm_tcopy_4.S
Original file line number Diff line number Diff line change
Expand Up @@ -219,31 +219,41 @@
movaps %xmm3, 12 * SIZE(BO)
#else

#ifdef PREFETCH
PREFETCH RPREFETCHSIZE * SIZE(AO1)
#endif
movsd 0 * SIZE(AO1), %xmm0
movhpd 1 * SIZE(AO1), %xmm0
movsd 2 * SIZE(AO1), %xmm1
movhpd 3 * SIZE(AO1), %xmm1

#ifdef PREFETCH
PREFETCH RPREFETCHSIZE * SIZE(AO2)
#endif
movsd 0 * SIZE(AO2), %xmm2
movhpd 1 * SIZE(AO2), %xmm2
movsd 2 * SIZE(AO2), %xmm3
movhpd 3 * SIZE(AO2), %xmm3

#ifdef PREFETCH
PREFETCH RPREFETCHSIZE * SIZE(AO3)
#endif
movsd 0 * SIZE(AO3), %xmm4
movhpd 1 * SIZE(AO3), %xmm4
movsd 2 * SIZE(AO3), %xmm5
movhpd 3 * SIZE(AO3), %xmm5

#ifdef PREFETCH
PREFETCH RPREFETCHSIZE * SIZE(AO4)
#endif
movsd 0 * SIZE(AO4), %xmm6
movhpd 1 * SIZE(AO4), %xmm6
movsd 2 * SIZE(AO4), %xmm7
movhpd 3 * SIZE(AO4), %xmm7

#ifdef PREFETCHW
PREFETCHW WPREFETCHSIZE * SIZE(BO)
#endif
movapd %xmm0, 0 * SIZE(BO)
movapd %xmm1, 2 * SIZE(BO)
movapd %xmm2, 4 * SIZE(BO)
Expand Down
32 changes: 30 additions & 2 deletions kernel/x86_64/zgemm_kernel_4x2_sse.S
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,14 @@
#define RPREFETCHSIZE (8 * 7 + 4)
#define WPREFETCHSIZE (8 * 8 + 4)

/*
 * Optional read-prefetch helpers for the unrolled kernel macros.
 *
 * Each helper expands to a complete "PREFETCH <address> ;" statement whose
 * address is taken relative to AO with %rax scaled by 4; (xx) is the unroll
 * offset supplied by the invoking macro.  The trailing ';' is part of the
 * expansion, so the call site only needs a line continuation after it.
 * On targets that do not define PREFETCH the helpers expand to nothing and
 * no instruction is emitted.
 */
#if !defined(PREFETCH)
#define PREFETCH_KERNEL1(xx)
#define PREFETCH_KERNEL4(xx)
#else
#define PREFETCH_KERNEL1(xx) PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;
#define PREFETCH_KERNEL4(xx) PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;
#endif

#ifndef GENERIC
#define KERNEL1(xx) \
mulps %xmm0, %xmm1 ;\
Expand All @@ -111,7 +119,7 @@
addps %xmm3, %xmm9 ;\
movaps -28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
mulps %xmm0, %xmm5 ;\
PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
PREFETCH_KERNEL1(xx) \
mulps -20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
addps %xmm5, %xmm10 ;\
movaps -24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
Expand Down Expand Up @@ -157,7 +165,7 @@
mulps -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
addps %xmm5, %xmm14 ;\
movaps 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
PREFETCH_KERNEL4(xx) \
addps %xmm6, %xmm15 ;\
movaps -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6

Expand Down Expand Up @@ -1026,7 +1034,9 @@

.L22:
mulps %xmm0, %xmm1
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif
addps %xmm1, %xmm8
movaps -28 * SIZE(BO), %xmm1
mulps %xmm0, %xmm1
Expand Down Expand Up @@ -1079,7 +1089,9 @@
movaps 0 * SIZE(AO), %xmm0

mulps %xmm2, %xmm1
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
#endif
addps %xmm1, %xmm8
movaps 36 * SIZE(BO), %xmm1
mulps %xmm2, %xmm1
Expand Down Expand Up @@ -1285,7 +1297,9 @@

.L32:
mulps %xmm0, %xmm1
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif
addps %xmm1, %xmm8
movaps -28 * SIZE(BO), %xmm1
mulps %xmm0, %xmm1
Expand Down Expand Up @@ -1679,7 +1693,9 @@

.L52:
mulps %xmm0, %xmm1
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif
mulps -28 * SIZE(BO), %xmm0
addps %xmm1, %xmm8
movaps -32 * SIZE(BO), %xmm1
Expand All @@ -1705,7 +1721,9 @@
addps %xmm0, %xmm13
movaps 32 * SIZE(AO), %xmm0

#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
#endif

mulps %xmm2, %xmm3
mulps -12 * SIZE(BO), %xmm2
Expand Down Expand Up @@ -1733,7 +1751,9 @@
addps %xmm2, %xmm13
movaps 48 * SIZE(AO), %xmm2

#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 32) * SIZE(AO)
#endif

mulps %xmm4, %xmm5
mulps 4 * SIZE(BO), %xmm4
Expand Down Expand Up @@ -1761,7 +1781,9 @@
addps %xmm4, %xmm13
movaps 64 * SIZE(AO), %xmm4

#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 48) * SIZE(AO)
#endif

mulps %xmm6, %xmm7
mulps 20 * SIZE(BO), %xmm6
Expand Down Expand Up @@ -1942,7 +1964,9 @@

.L62:
mulps %xmm0, %xmm1
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif
mulps -28 * SIZE(BO), %xmm0
addps %xmm1, %xmm8
movaps -24 * SIZE(BO), %xmm1
Expand All @@ -1968,7 +1992,9 @@
addps %xmm0, %xmm11
movaps 0 * SIZE(AO), %xmm0

#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
#endif

mulps %xmm2, %xmm5
mulps 4 * SIZE(BO), %xmm2
Expand Down Expand Up @@ -2130,7 +2156,9 @@

.L72:
mulps %xmm0, %xmm1
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif
addps %xmm1, %xmm8
movaps -28 * SIZE(BO), %xmm1
mulps %xmm0, %xmm1
Expand Down
8 changes: 8 additions & 0 deletions kernel/x86_64/zsymv_L_sse2.S
Original file line number Diff line number Diff line change
Expand Up @@ -484,7 +484,9 @@
addpd a1, yy1
MOVDDUP(1 * SIZE, A1, a1)

#ifdef PREFETCH
PREFETCH PREFETCHSIZE(A1)
#endif

movapd xtemp3, xt1
mulpd a2, xt1
Expand All @@ -507,7 +509,9 @@
addpd a2, yy2
MOVDDUP(0 * SIZE, A2, a2)

#ifdef PREFETCH
PREFETCH PREFETCHSIZE(XX)
#endif

movapd xtemp3, xt1
movapd 12 * SIZE(XX), xtemp3
Expand Down Expand Up @@ -546,7 +550,9 @@
addpd a2, yy1
MOVDDUP(6 * SIZE, A2, a2)

#ifdef PREFETCH
PREFETCH PREFETCHSIZE(A2)
#endif

movlpd yy1, 0 * SIZE(YY)
movhpd yy1, 1 * SIZE(YY)
Expand Down Expand Up @@ -574,7 +580,9 @@
addpd a1, yy1
MOVDDUP(6 * SIZE, A1, a1)

#ifdef PREFETCHW
PREFETCHW PREFETCHSIZE(YY)
#endif

movapd xtemp4, xt1
mulpd a2, xt1
Expand Down
8 changes: 8 additions & 0 deletions kernel/x86_64/zsymv_U_sse2.S
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,9 @@
addpd a1, yy1
MOVDDUP(3 * SIZE, A2, a1)

#ifdef PREFETCH
PREFETCH PREFETCHSIZE(A1)
#endif

movapd xtemp3, xt1
mulpd a2, xt1
Expand All @@ -465,7 +467,9 @@
addpd a1, yy2
MOVDDUP(3 * SIZE, A1, a1)

#ifdef PREFETCH
PREFETCH PREFETCHSIZE(XX)
#endif

movapd xtemp3, xt1
movapd 12 * SIZE(XX), xtemp3
Expand Down Expand Up @@ -504,7 +508,9 @@
addpd a2, yy1
MOVDDUP(5 * SIZE, A1, a2)

#ifdef PREFETCH
PREFETCH PREFETCHSIZE(A2)
#endif

movlpd yy1, 0 * SIZE(YY)
movhpd yy1, 1 * SIZE(YY)
Expand Down Expand Up @@ -532,7 +538,9 @@
addpd a2, yy1
MOVDDUP(4 * SIZE, A2, a2)

/*
 * Write-prefetch at PREFETCHSIZE(YY).  The guard must test PREFETCHW -- the
 * macro actually expanded on the next line -- rather than PREFETCH: a target
 * that defines only one of the two would otherwise either hand the assembler
 * an unexpanded PREFETCHW token or silently drop this prefetch.
 */
#ifdef PREFETCHW
PREFETCHW PREFETCHSIZE(YY)
#endif

movapd xtemp4, xt1
mulpd a3, xt1
Expand Down
12 changes: 10 additions & 2 deletions kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S
Original file line number Diff line number Diff line change
Expand Up @@ -109,12 +109,20 @@
#define PREFETCHSIZE (8 * 6 + 4)
#endif

/*
 * Optional read-prefetch helpers for the KERNEL macros defined below.
 *
 * Each helper expands to a complete "PREFETCH <address> ;" statement whose
 * address is an offset from AO; (xx) is the unroll offset supplied by the
 * invoking macro.  The trailing ';' is part of the expansion, so the call
 * site only needs a line continuation after it.  On targets that do not
 * define PREFETCH the helpers expand to nothing and no instruction is
 * emitted.
 */
#if !defined(PREFETCH)
#define PREFETCH_KERNEL1(xx)
#define PREFETCH_KERNEL5(xx)
#else
#define PREFETCH_KERNEL1(xx) PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;
#define PREFETCH_KERNEL5(xx) PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;
#endif

#define KERNEL1(xx) \
mulps %xmm8, %xmm9 ;\
addps %xmm9, %xmm0 ;\
movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
mulps %xmm8, %xmm11 ;\
PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\
PREFETCH_KERNEL1(xx) \
addps %xmm11, %xmm1 ;\
movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
mulps %xmm8, %xmm13 ;\
Expand Down Expand Up @@ -171,7 +179,7 @@
addps %xmm9, %xmm0 ;\
movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
mulps %xmm8, %xmm11 ;\
PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\
PREFETCH_KERNEL5(xx) \
addps %xmm11, %xmm1 ;\
movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
mulps %xmm8, %xmm13 ;\
Expand Down
12 changes: 10 additions & 2 deletions kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S
Original file line number Diff line number Diff line change
Expand Up @@ -109,12 +109,20 @@
#define PREFETCHSIZE (8 * 6 + 4)
#endif

/*
 * Optional read-prefetch helpers for the KERNEL macros defined below.
 *
 * Each helper expands to a complete "PREFETCH <address> ;" statement whose
 * address is an offset from AO; (xx) is the unroll offset supplied by the
 * invoking macro.  The trailing ';' is part of the expansion, so the call
 * site only needs a line continuation after it.  On targets that do not
 * define PREFETCH the helpers expand to nothing and no instruction is
 * emitted.
 */
#if !defined(PREFETCH)
#define PREFETCH_KERNEL1(xx)
#define PREFETCH_KERNEL5(xx)
#else
#define PREFETCH_KERNEL1(xx) PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;
#define PREFETCH_KERNEL5(xx) PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;
#endif

#define KERNEL1(xx) \
mulps %xmm8, %xmm9 ;\
addps %xmm9, %xmm0 ;\
movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
mulps %xmm8, %xmm11 ;\
PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\
PREFETCH_KERNEL1(xx) \
addps %xmm11, %xmm1 ;\
movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
mulps %xmm8, %xmm13 ;\
Expand Down Expand Up @@ -171,7 +179,7 @@
addps %xmm9, %xmm0 ;\
movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
mulps %xmm8, %xmm11 ;\
PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\
PREFETCH_KERNEL5(xx) \
addps %xmm11, %xmm1 ;\
movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
mulps %xmm8, %xmm13 ;\
Expand Down
Loading

0 comments on commit d421dec

Please sign in to comment.