diff --git a/.github/workflows/codegen.yml b/.github/workflows/codegen.yml index 69f06de..0029004 100644 --- a/.github/workflows/codegen.yml +++ b/.github/workflows/codegen.yml @@ -64,17 +64,20 @@ jobs: - name: pspamm-tests-generate run: | cd tests/ - python unit_tests_hsw.py + python unit_test.py hsw256 + python unit_test.py hsw128 - name: pspamm-tests-compile run: | cd tests/ - g++ -static -mavx512f build/hsw_testsuite.cpp -o build/hsw-test + g++ -static -mavx2 build/hsw256_testsuite.cpp -o build/hsw256-test + g++ -static -mavx2 build/hsw128_testsuite.cpp -o build/hsw128-test - name: pspamm-tests-run run: | cd tests/ - qemu-x86_64-static -cpu Haswell build/hsw-test + qemu-x86_64-static -cpu Haswell build/hsw256-test + qemu-x86_64-static -cpu Haswell build/hsw128-test pspamm-codegen-avx512-no-run: name: pspamm-codegen-avx512-no-run @@ -102,18 +105,24 @@ jobs: - name: pspamm-tests-generate run: | cd tests/ - python unit_tests_knl.py + python unit_test.py knl512 + python unit_test.py knl256 + python unit_test.py knl128 - name: pspamm-tests-compile run: | cd tests/ - g++ -static -mavx512f build/knl_testsuite.cpp -o build/knl-test + g++ -static -mavx512f build/knl512_testsuite.cpp -o build/knl512-test + g++ -static -mavx512f build/knl256_testsuite.cpp -o build/knl256-test + g++ -static -mavx512f build/knl128_testsuite.cpp -o build/knl128-test # disabled, since qemu doesn't support AVX512F (yet) with of Ubuntu 24.04 # - name: pspamm-tests-run # run: | # cd tests/ - # qemu-x86_64-static -cpu Skylake-Server build/knl-test + # qemu-x86_64-static -cpu Skylake-Server build/knl512-test + # qemu-x86_64-static -cpu Skylake-Server build/knl256-test + # qemu-x86_64-static -cpu Skylake-Server build/knl128-test pspamm-codegen-aarch64: name: pspamm-codegen-aarch64 @@ -141,17 +150,17 @@ jobs: - name: pspamm-tests-generate run: | cd tests/ - python unit_tests_arm.py + python unit_test.py arm128 - name: pspamm-tests-compile run: | cd tests/ - aarch64-linux-gnu-g++ -static -march=armv8.2-a build/arm_testsuite.cpp -o build/arm-test + aarch64-linux-gnu-g++ -static -march=armv8.2-a build/arm128_testsuite.cpp -o build/arm128-test - name: pspamm-tests-run run: | cd tests/ - qemu-aarch64-static -cpu max build/arm-test + qemu-aarch64-static -cpu max build/arm128-test pspamm-codegen-armsve: name: pspamm-codegen-armsve @@ -159,6 +168,7 @@ jobs: needs: install-pspamm # include vector lengths for SVE manually (for now) strategy: + fail-fast: false matrix: vectorlen: - 128 @@ -188,7 +198,7 @@ jobs: - name: pspamm-tests-generate run: | cd tests/ - python unit_tests_arm_sve.py ${{matrix.vectorlen}} + python unit_test.py arm_sve${{matrix.vectorlen}} - name: pspamm-tests-compile run: | diff --git a/README.md b/README.md index fa425cc..efa9cfa 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,10 @@ -# Code Generator for Sparse Matrix Multiplication -Generates inline-Assembly for sparse Matrix Multiplication. +# PSpaMM +A Code Generator For Small Sparse (and Dense) Matrix Multiplications. -Currently Intel Xeon Phi 'Knights Landing' (AVX512), Haswell/Zen2 (AVX2), and ARM Cortex-A53 (ARMv8) are supported. 
+Currently supported: + +* x86_64: AVX2, AVX512/AVX10.1 +* ARM/AARCH64: NEON, SVE (128,256,512,1024,2048 bit) ## Installation diff --git a/pspamm/VERSION b/pspamm/VERSION index ee1372d..0d91a54 100644 --- a/pspamm/VERSION +++ b/pspamm/VERSION @@ -1 +1 @@ -0.2.2 +0.3.0 diff --git a/pspamm/architecture.py b/pspamm/architecture.py index d0f504c..c6f8f35 100755 --- a/pspamm/architecture.py +++ b/pspamm/architecture.py @@ -8,17 +8,5 @@ def init(): generator = None operands = None - - -#https://stackoverflow.com/questions/452969/does-python-have-an-equivalent-to-java-class-forname - def get_class( kls ): return import_module(kls) - parts = kls.split('.') - module = ".".join(parts[:-1]) - m = __import__( module ) - for comp in parts[1:]: - m = getattr(m, comp) - return m - - diff --git a/pspamm/codegen/analysis.py b/pspamm/codegen/analysis.py index 259c594..d12224d 100644 --- a/pspamm/codegen/analysis.py +++ b/pspamm/codegen/analysis.py @@ -55,5 +55,3 @@ def visitBlock(self, block: Block): stmt.accept(self) self.stack.pop() - - diff --git a/pspamm/codegen/architectures/arm/blocksize.py b/pspamm/codegen/architectures/arm/blocksize.py new file mode 100644 index 0000000..c8e3740 --- /dev/null +++ b/pspamm/codegen/architectures/arm/blocksize.py @@ -0,0 +1,125 @@ + +class Old: + @classmethod + def getBlocksize(cls, m , n, bk, v_size, prec): + + bm = m + bn = n + + if cls.ARM_condition(bm, bn, bk, v_size): + while cls.ARM_condition(bm, bn, bk+1, v_size): + bk += 1 + return (bm, bn, bk) + + while not cls.ARM_condition(bm, bn, bk, v_size): + bm, bn = cls.lowerToNextDiv(m, n, bm, bn, v_size) + + while cls.ARM_condition(bm, bn, bk+1, v_size): + bk += 1 + + return (bm, bn, bk) + + @classmethod + def lowerToNextDiv(cls, m, n, bm, bn, v_size): + if bm > bn and bm > v_size: + bm -= v_size + while m % bm != 0: + bm -= v_size + else: + bn -= 1 + while n % bn != 0: + bn -= 1 + + return bm, bn + + @classmethod + def ARM_condition(cls, bm, bn, bk, v_size): + # ceiling division + vm = -(bm // -v_size) + return (bn+bk) * vm + bn*bk <= 32 + +class Max: + @classmethod + def getBlocksize(cls, m, n, bk, v_size, prec): + bm = 2 + bn = 1 + maxval = 0 + + for i in range(v_size, m+1, v_size): + for j in range(1, n+1): + if cls.ARM_condition(i, j, bk, v_size): + if i*j > maxval: + maxval = i*j + bm = i + bn = j + + while cls.ARM_condition(bm, bn, bk+1, v_size): + bk += 1 + + return (bm, bn, bk) + + + @classmethod + def ARM_condition(cls, bm, bn, bk, v_size): + # ceiling division + vm = -(bm // -v_size) + return (bn+bk) * vm + bn*bk <= 32 + +class MaxK: + @classmethod + def getBlocksize(cls, m, n, bk, v_size, prec): + bm = 2 + bn = 1 + maxval = 0 + + elem128 = 16 // prec.size() + + for i in range(v_size, m+1, v_size): + for j in range(1, n+1): + if cls.ARM_condition(i, j, bk, v_size, elem128): + if i*j > maxval: + maxval = i*j + bm = i + bn = j + + while cls.ARM_condition(bm, bn, bk+1, v_size, elem128): + bk += 1 + + return (bm, bn, bk) + + @classmethod + def ARM_condition(cls, bm, bn, bk, v_size, elem128): + # ceiling division + vm = -(bm // -v_size) + vk = -(bk // -elem128) + return (bn+bk) * vm + bn*vk <= 32 + +class Cube: + @classmethod + def getBlocksize(cls, m, n, bk, v_size, prec): + bm = 2 + bn = 1 + maxval = 0 + + elem128 = 16 // prec.size() + + for i in range(v_size, m+1, v_size): + for j in range(1, n+1): + for k in range(1, 200): + if cls.ARM_condition(i, j, k, v_size, elem128): + if i*j*k > maxval: + maxval = i*j*k + bm = i + bn = j + bk = k + + return (bm, bn, bk) + + @classmethod + def ARM_condition(cls, bm, bn, 
bk, v_size, elem128): + # ceiling division + vm = -(bm // -v_size) + vk = -(bk // -elem128) + return (bn+bk) * vm + bn*vk <= 32 + +Default = MaxK diff --git a/pspamm/codegen/architectures/arm/generator.py b/pspamm/codegen/architectures/arm/generator.py index 08d1725..b80e16f 100644 --- a/pspamm/codegen/architectures/arm/generator.py +++ b/pspamm/codegen/architectures/arm/generator.py @@ -32,36 +32,51 @@ class Generator(AbstractGenerator): """ def get_v_size(self): - if self.precision == Precision.DOUBLE: - return 2 - raise NotImplementedError + return 16 // self.precision.size() def get_template(self): return Generator.template + def use_broadcast(self): + return True + + def has_masks(self): + return False + + def init_mask(self, m, bm, v_size, tempreg, maskregs): + return block("") + def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int): assert(bm % v_size == 0) vm = bm//v_size - assert((bn+bk) * vm + bn * bk <= 32) # Needs to fit in NEON v registers - - A_regs = Matrix([[v(vm*c + r) for c in range(bk)] for r in range(vm)]) - B_regs = Matrix([[v(vm*bk + bn * r + c) for c in range(bn)] for r in range(bk)]) - C_regs = Matrix([[v(32 - vm*bn + vm*c + r) for c in range(bn)] + elem128 = 16 // self.get_precision().size() + vk = -(bk // -elem128) + assert((bn+bk) * vm + bn * vk <= 32) # Needs to fit in NEON v registers + + prec = { + Precision.DOUBLE: "2d", + Precision.SINGLE: "4s", + Precision.HALF: "8h", + }[self.get_precision()] + + A_regs = Matrix([[v(vm*c + r, prec) for c in range(bk)] for r in range(vm)]) + B_regs = Matrix([[v(vm*bk + bn * r + c, prec) for c in range(bn)] for r in range(vk)]) + C_regs = Matrix([[v(32 - vm*bn + vm*c + r, prec) for c in range(bn)] for r in range(vm)]) # get vector register number of the first vector in B_regs b_reg = vm*bk - alpha_reg = [v(b_reg), v(b_reg)] - beta_reg = [v(b_reg + 1), v(b_reg + 1)] + alpha_reg = [v(b_reg, prec), v(b_reg, prec)] + beta_reg = [v(b_reg + 1, prec), v(b_reg + 1, prec)] starting_regs = [r(0), r(1), r(2), r(3), r(4)] additional_regs = [r(11), xzr] - loop_reg = r(12) + loop_regs = [r(12), r(13), r(14)] - return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_reg, additional_regs + return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, [] def bcst_alpha_beta(self, @@ -118,16 +133,16 @@ def move_register_block(self, if (mask is None) or (mask[ir,ic]): cell_offset = Coords(down=ir*v_size, right=ic) addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) - addr.disp += 8 * load_offset + addr.disp += self.precision.size() * load_offset next_offset = [0, 0] if ir+1 < rows: next_offset = [1, 0] - elif ic +1 < rows: + elif ic +1 < cols: next_offset = [0, 1] addr_next, comment_next = cursor.look(cursor_ptr, block_offset, Coords(down=(ir+next_offset[0])*v_size, right=ic+next_offset[1])) - addr_next.disp += 8 * load_offset - if addr_next.disp == addr.disp + 8 * v_size: + addr_next.disp += self.precision.size() * load_offset + if addr_next.disp == addr.disp + 16: skipflag = True if addr.disp > 255: if(addr.disp - cur11 > 0 and addr.disp - cur11 < 256): @@ -137,17 +152,22 @@ def move_register_block(self, cur11 = addr.disp addr.disp = 0 addr.base = additional_regs[0] + if skipflag and addr.disp % 16 != 0: + asm.add(add(addr.disp, additional_regs[0], "", addr.base)) + cur11 = addr.disp + addr.disp = 0 + addr.base = additional_regs[0] - if not skipflag: - if store: - asm.add(st(registers[ir,ic], addr, True, comment)) - else: - asm.add(ld(addr, 
registers[ir,ic], True, comment)) - else: - if store: - asm.add(st(registers[ir,ic], addr, True, comment, registers[ir+next_offset[0],ic+next_offset[1]])) + if not skipflag: + if store: + asm.add(st(registers[ir,ic], addr, True, comment)) + else: + asm.add(ld(addr, registers[ir,ic], True, comment)) else: - asm.add(ld(addr, registers[ir,ic], True, comment, registers[ir+next_offset[0],ic+next_offset[1]])) + if store: + asm.add(st(registers[ir,ic], addr, True, comment, registers[ir+next_offset[0],ic+next_offset[1]])) + else: + asm.add(ld(addr, registers[ir,ic], True, comment, registers[ir+next_offset[0],ic+next_offset[1]])) return asm @@ -193,16 +213,19 @@ def make_microkernel(self, mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size) asm.add(self.move_register_block(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False)) - x = 0; + elem128 = 16 // self.get_precision().size() + vk = -(bk // -elem128) + bs = [] cur11 = -1000 for Vmi in range(bm//v_size): - for bki in range(bk): # inside this k-block - for bni in range(bn): # inside this n-block + for bni in range(bn): # inside this n-block + for bki in range(bk): # inside this k-block to_cell = Coords(down=bki, right=bni) + bki_reg = bki // elem128 if B.has_nonzero_cell(B_ptr, to_B_block, to_cell): B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_cell) - if B_regs[bki, bni] not in bs: + if B_regs[bki_reg, bni] not in bs: if B_cell_addr.disp > 255: if(B_cell_addr.disp - cur11 > 0 and B_cell_addr.disp - cur11 < 256): B_cell_addr.disp = B_cell_addr.disp - cur11 @@ -213,17 +236,23 @@ def make_microkernel(self, B_cell_addr.base = additional_regs[0] - asm.add(ld(B_cell_addr, B_regs[bki, bni], True, B_comment)) - bs.append(B_regs[bki, bni]) + asm.add(ld(B_cell_addr, B_regs[bki_reg, bni], True, B_comment)) + bs.append(B_regs[bki_reg, bni]) for Vmi in range(bm//v_size): + # TODO: refactor cell_indices into the cursors/blocks + cell_indices = {} for bki in range(bk): # inside this k-block for bni in range(bn): # inside this n-block to_cell = Coords(down=bki, right=bni) if B.has_nonzero_cell(B_ptr, to_B_block, to_cell): B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_cell) comment = "C[{}:{},{}] += A[{}:{},{}]*{}".format(Vmi*v_size, Vmi*v_size+v_size, bni, Vmi*v_size, Vmi*v_size+v_size, bki, B_comment) - asm.add(fma(B_regs[bki, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment)) + bki_reg = bki // elem128 + if (bki_reg, bni) not in cell_indices: + cell_indices[(bki_reg, bni)] = 0 + asm.add(fma(B_regs[bki_reg, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, bcast=cell_indices[(bki_reg, bni)])) + cell_indices[(bki_reg, bni)] += 1 return asm diff --git a/pspamm/codegen/architectures/arm/inlineprinter.py b/pspamm/codegen/architectures/arm/inlineprinter.py index abdea08..7b3e68b 100644 --- a/pspamm/codegen/architectures/arm/inlineprinter.py +++ b/pspamm/codegen/architectures/arm/inlineprinter.py @@ -20,7 +20,8 @@ class InlinePrinter(Visitor): def __init__(self, precision: Precision): self.output = [] self.stack = [] - assert precision == Precision.DOUBLE + self.precision = precision + assert precision in (Precision.HALF, Precision.SINGLE, Precision.DOUBLE) def show(self): print("\n".join(self.output)) @@ -48,70 +49,75 @@ def visitFma(self, stmt: FmaStmt): b = stmt.bcast_src.ugly m = stmt.mult_src.ugly a = stmt.add_dest.ugly - if stmt.bcast: - s = "fmla {}, {}, {}[0]".format(a,m,b) + if stmt.bcast is not None: + s = f"fmla {a}, {m}, {b}[{stmt.bcast}]" else: - s = "fmla {}, {}, 
{}".format(a,m,b) + s = f"fmla {a}, {m}, {b}" self.addLine(s, stmt.comment) def visitMul(self, stmt: MulStmt): b = stmt.src.ugly m = stmt.mult_src.ugly a = stmt.dest.ugly - s = "fmul {}, {}, {}".format(a,m,b) + s = f"fmul {a}, {m}, {b}" self.addLine(s, stmt.comment) def visitBcst(self, stmt: BcstStmt): - b = stmt.bcast_src.ugly + b = stmt.bcast_src.ugly if self.precision == Precision.DOUBLE else stmt.bcast_src.ugly_b32 a = stmt.dest.ugly - s = "dup {}, {}".format(a, b) + s = f"dup {a}, {b}" self.addLine(s, stmt.comment) def visitAdd(self, stmt: AddStmt): + if isinstance(stmt.src, Constant) and stmt.src.value == 0: + # avoid 0 instructions + return if isinstance(stmt.src, Constant) and (stmt.src.value > 4095 or stmt.src.value < -4095): if (stmt.src.value >> 16) & 0xFFFF > 0 and stmt.src.value < 0: s = "mov x11, #-1" - s1 = "movk x11, #{}".format((stmt.src.value) & 0xFFFF) - val = ((stmt.src.value >> 16) & 0xFFFF) - s2 = "movk x11, #{}, lsl #16".format(val) + val1 = (stmt.src.value) & 0xFFFF + s1 = f"movk x11, #{val1}" + val2 = ((stmt.src.value >> 16) & 0xFFFF) + s2 = f"movk x11, #{val2}, lsl #16" self.addLine(s, "") self.addLine(s1, "load lower 16 bit of immediate that requires more than 16 bit") self.addLine(s2, "load upper 16 bit of immediate that requires more than 16 bit") elif (stmt.src.value >> 16) & 0xFFFF: - s1 = "mov x11, #{}".format((stmt.src.value) & 0xFFFF) - val = ((stmt.src.value >> 16) & 0xFFFF) - s2 = "movk x11, #{}, lsl #16".format(val) + val1 = (stmt.src.value) & 0xFFFF + s1 = f"mov x11, #{val1}" + val2 = ((stmt.src.value >> 16) & 0xFFFF) + s2 = f"movk x11, #{val2}, lsl #16" self.addLine(s1, "load lower 16 bit of immediate that requires more than 16 bit") self.addLine(s2, "load upper 16 bit of immediate that requires more than 16 bit") else: - s = "mov x11, {}".format(stmt.src.ugly) + s = f"mov x11, {stmt.src.ugly}" self.addLine(s, "load lower 16 bit of immediate ") if stmt.dest.ugly != "x11": - s = "add {}, {}, x11".format(stmt.dest.ugly,stmt.dest.ugly) + s = f"add {stmt.dest.ugly}, {stmt.dest.ugly}, x11" self.addLine(s, stmt.comment) if stmt.additional is not None: - s = "add {}, {}, {}".format(stmt.dest.ugly,stmt.dest.ugly,stmt.additional.ugly) + s = f"add {stmt.dest.ugly}, {stmt.dest.ugly}, {stmt.additional.ugly}" self.addLine(s, stmt.comment) else: if stmt.additional is not None: - s = "add {}, {}, {}".format(stmt.dest.ugly,stmt.additional.ugly,stmt.src.ugly) + s = f"add {stmt.dest.ugly}, {stmt.additional.ugly}, {stmt.src.ugly}" else: - s = "add {}, {}, {}".format(stmt.dest.ugly,stmt.dest.ugly,stmt.src.ugly) + s = f"add {stmt.dest.ugly}, {stmt.dest.ugly}, {stmt.src.ugly}" self.addLine(s, stmt.comment) def visitLabel(self, stmt: LabelStmt): - s = "{}:".format(stmt.label.ugly) + s = f"{stmt.label.ugly}:" self.addLine(s, stmt.comment) def visitCmp(self, stmt: CmpStmt): - s = "cmp {}, {}".format(stmt.rhs.ugly,stmt.lhs.ugly) + s = f"cmp {stmt.rhs.ugly}, {stmt.lhs.ugly}" self.addLine(s, stmt.comment) def visitJump(self, stmt: JumpStmt): - s = "b.lo {}".format(stmt.destination.ugly) + s = f"b.lo {stmt.destination.ugly}" self.addLine(s, stmt.comment) def visitMov(self, stmt: MovStmt): @@ -120,9 +126,9 @@ def visitMov(self, stmt: MovStmt): else: src_str = stmt.src.ugly if stmt.typ == AsmType.f64x8: - s = "fmov {}, {}".format(stmt.dest.ugly_scalar_1d,src_str) + s = f"fmov {stmt.dest.ugly_scalar_1d}, {src_str}" else: - s = "mov {}, {}".format(stmt.dest.ugly,src_str) + s = f"mov {stmt.dest.ugly}, {src_str}" self.addLine(s, stmt.comment) @@ -133,12 +139,12 @@ def visitLoad(self, 
stmt: LoadStmt): src_str = stmt.src.ugly if stmt.typ == AsmType.i64: - s = "add {}, {}, {}".format(stmt.dest.ugly,stmt.dest.ugly,src_str) + s = f"add {stmt.dest.ugly}, {stmt.dest.ugly}, {src_str}" elif stmt.typ == AsmType.f64x8 and stmt.aligned: if stmt.dest2 is not None: - s = "ldp {}, {}, {}".format(stmt.dest.ugly_scalar,stmt.dest2.ugly_scalar,src_str) + s = f"ldp {stmt.dest.ugly_scalar}, {stmt.dest2.ugly_scalar}, {src_str}" else: - s = "ldr {}, {}".format(stmt.dest.ugly_scalar,src_str) + s = f"ldr {stmt.dest.ugly_scalar}, {src_str}" else: raise NotImplementedError() self.addLine(s, stmt.comment) @@ -151,12 +157,12 @@ def visitStore(self, stmt: StoreStmt): src_str = stmt.src.ugly if stmt.typ == AsmType.i64: - s = "add {}, {}, {}".format(stmt.dest.ugly,stmt.dest.ugly,src_str) + s = f"add {stmt.dest.ugly}, {stmt.dest.ugly}, {src_str}" elif stmt.typ == AsmType.f64x8 and stmt.aligned: if stmt.src2 is not None: - s = "stp {}, {}, {}".format(stmt.src.ugly_scalar,stmt.src2.ugly_scalar,stmt.dest.ugly) + s = f"stp {stmt.src.ugly_scalar}, {stmt.src2.ugly_scalar}, {stmt.dest.ugly}" else: - s = "str {}, {}".format(stmt.src.ugly_scalar,stmt.dest.ugly) + s = f"str {stmt.src.ugly_scalar}, {stmt.dest.ugly}" else: raise NotImplementedError() self.addLine(s, stmt.comment) diff --git a/pspamm/codegen/architectures/arm/operands.py b/pspamm/codegen/architectures/arm/operands.py index 3b9e3e0..2642c70 100644 --- a/pspamm/codegen/architectures/arm/operands.py +++ b/pspamm/codegen/architectures/arm/operands.py @@ -48,10 +48,23 @@ class Register_ARM(Register): @property def ugly(self): return self.value + + @property + def ugly_precision(self): + return self.value.split(".")[1] + + @property + def ugly_lsl_shift(self): + return { + "d": 3, + "s": 2, + "h": 1 + }[self.ugly_precision] @property def clobbered(self): - return (self.value.split(".")[0]).replace("x", "r") + # removed [this comment should stay here for now---in case there's some compiler expecting it]: .replace("x", "r") + return (self.value.split(".")[0]) @property def ugly_scalar(self): @@ -60,15 +73,15 @@ def ugly_scalar(self): @property def ugly_scalar_1d(self): return (self.value.split(".")[0]).replace("v", "d") - + @property - def ugly_1d(self): - return self.value.replace("2d", "1d") + def ugly_b32(self): + return (self.value.split(".")[0]).replace("x", "w") r = lambda n: Register_ARM(AsmType.i64, "x" + str(n)) xzr = Register_ARM(AsmType.i64, "xzr") -v = lambda n: Register_ARM(AsmType.f64x8, "v" + str(n) + ".2d") +v = lambda n, prec: Register_ARM(AsmType.f64x8, "v" + str(n) + "." + prec) class MemoryAddress_ARM(MemoryAddress): diff --git a/pspamm/codegen/architectures/arm_sve/blocksize.py b/pspamm/codegen/architectures/arm_sve/blocksize.py new file mode 100644 index 0000000..8b8c264 --- /dev/null +++ b/pspamm/codegen/architectures/arm_sve/blocksize.py @@ -0,0 +1,117 @@ +class Max: + @classmethod + def getBlocksize(cls, m, n, bk, v_size, prec): + # v_size default is 2, however for SVE that parameter will always be larger + bm = 2 + bn = 1 + maxval = 0 + + for i in range(1, m + 1, 1): + next_multiple = i + while next_multiple % v_size != 0: + next_multiple += 1 + for j in range(1, n + 1): + if cls.ARM_condition(next_multiple, j, bk, v_size) and cls.tileable(m, i): + if i * j >= maxval: + maxval = i * j + bm = i + bn = j + + if maxval == 0: + raise RuntimeError("Could not find an appropriate block size. 
We suggest padding the matrix dimensions") + + while cls.ARM_condition(bm, bn, bk+1, v_size): + bk += 1 + + return (bm, bn, bk) + + @classmethod + def ARM_condition(cls, bm, bn, bk, v_size): + # ceiling division + vm = -(bm // -v_size) + return (bn + bk) * vm + bn*bk <= 32 + + @classmethod + def tileable(cls, m, bm): + return m % bm == 0 + +class MaxK: + @classmethod + def getBlocksize(cls, m, n, bk, v_size, prec): + # v_size default is 2, however for SVE that parameter will always be larger + bm = 2 + bn = 1 + maxval = 0 + + elem128 = 16 // prec.size() + + for i in range(1, m + 1, 1): + next_multiple = -(i // -v_size) * v_size + for j in range(1, n + 1): + if cls.ARM_condition(next_multiple, j, bk, v_size, elem128) and cls.tileable(m, i): + if i * j >= maxval: + maxval = i * j + bm = i + bn = j + + if maxval == 0: + raise RuntimeError("Could not find an appropriate block size. We suggest padding the matrix dimensions") + + while cls.ARM_condition(bm, bn, bk+1, v_size, elem128): + bk += 1 + + return (bm, bn, bk) + + @classmethod + def ARM_condition(cls, bm, bn, bk, v_size, elem128): + # ceiling division + vkext = -(bk // -elem128) + isvkext = bn*vkext < 16 if elem128 == 2 else bn*vkext < 8 + vm = -(bm // -v_size) + vk = vkext if isvkext else bk + return (bn + bk) * vm + bn*vk <= 32 + + @classmethod + def tileable(cls, m, bm): + return m % bm == 0 + +class Cube: + @classmethod + def getBlocksize(cls, m, n, bk, v_size, prec): + # v_size default is 2, however for SVE that parameter will always be larger + bm = 2 + bn = 1 + maxval = 0 + + elem128 = 16 // prec.size() + + for i in range(1, m + 1, 1): + next_multiple = -(i // -v_size) * v_size + for j in range(1, n + 1): + for k in range(1, 200): + if cls.ARM_condition(next_multiple, j, k, v_size, elem128) and cls.tileable(m, i): + if i * j * k >= maxval: + maxval = i * j * k + bm = i + bn = j + bk = k + + if maxval == 0: + raise RuntimeError("Could not find an appropriate block size. 
We suggest padding the matrix dimensions") + + return (bm, bn, bk) + + @classmethod + def ARM_condition(cls, bm, bn, bk, v_size, elem128): + # ceiling division + vkext = -(bk // -elem128) + isvkext = bn*vkext < 16 if elem128 == 2 else bn*vkext < 8 + vm = -(bm // -v_size) + vk = vkext if isvkext else bk + return (bn + bk) * vm + bn*vk <= 32 + + @classmethod + def tileable(cls, m, bm): + return m % bm == 0 + +Default = MaxK diff --git a/pspamm/codegen/architectures/arm_sve/generator.py b/pspamm/codegen/architectures/arm_sve/generator.py index 062673c..80a744e 100644 --- a/pspamm/codegen/architectures/arm_sve/generator.py +++ b/pspamm/codegen/architectures/arm_sve/generator.py @@ -36,19 +36,22 @@ class Generator(AbstractGenerator): prefetch_count = 0 is_sparse = False v_len = 4 # vector register length: v_len * 128 bit + predicates = {} def get_v_size(self): - if self.precision == Precision.DOUBLE: - return 2 * self.v_len # 128 bit == 2 x 64 bit (double) - elif self.precision == Precision.SINGLE: - return 4 * self.v_len # 128 bit == 4 x 32 bit (float) - raise NotImplementedError + return (16 // self.precision.size()) * self.v_len def get_precision(self): return self.precision def get_template(self): return self.template + + def use_broadcast(self): + return True + + def has_masks(self): + return True def pred_n_trues(self, num_trues: int, v_size: int, suffix: str = None) -> Register_ARM: """pred takes num_trues=num of true elements and suffix=type of predicate (m or z) for merging or zeroing @@ -59,33 +62,55 @@ def pred_n_trues(self, num_trues: int, v_size: int, suffix: str = None) -> Regis assert (suffix == "m" or suffix == "z" or suffix is None) # we only use p7 or p0 as predicates (1 == p0, 8 == p7) - num_trues = 8 if num_trues >= v_size else 1 + index = 7 if num_trues >= v_size else self.predicates[num_trues] if suffix is None: - s = "p{}".format(num_trues - 1) + s = f"p{index}" else: - s = "p{}/{}".format(num_trues - 1, suffix) + s = f"p{index}/{suffix}" return Register_ARM(AsmType.p64x8, s) - # taken from https://stackoverflow.com/questions/14822184/is-there-a-ceiling-equivalent-of-operator-in-python - def ceil_div(self, n, d): - return -(n // -d) - # is called at most one time in matmul.py def set_sparse(self): self.is_sparse = True def make_reg_blocks(self, bm: int, bn: int, bk: int, v_size: int, nnz: int, m: int, n: int, k: int): vm = self.ceil_div(bm, v_size) # vm can be 0 if bm < v_size -> makes ceil_div necessary - assert ((bn + bk) * vm + bn * bk <= 32) # Needs to fit in SVE z registers - prec = "d" if self.get_precision() == Precision.DOUBLE else "s" - # use max(vm, 1) in case bm < v_size, otherwise we get no A_regs/C_regs - A_regs = Matrix([[z(max(vm, 1) * c + r , prec) for c in range(bk)] for r in range(max(vm, 1))]) - B_regs = Matrix([[z(max(vm, 1) * bk + bn * r + c, prec) for c in range(bn)] for r in range(bk)]) - C_regs = Matrix([[z(32 - max(vm, 1) * bn + max(vm, 1) * c + r, prec) for c in range(bn)] for r in range(max(vm, 1))]) + # k-broadcasting only works in 128-bit lanes + elem128 = 16 // self.get_precision().size() + vkext = -(bk // -elem128) - b_reg = max(vm, 1) * bk + # inline broadcasting is only allowed for the lower-numbered registers + self.inline_broadcast = False + if bn*vkext < 16 if self.get_precision().size() == 8 else bn*vkext < 8: + self.inline_broadcast = True + if bk == 1: + self.inline_broadcast = False + + if self.inline_broadcast: + vk = vkext + else: + vk = bk + + assert ((bn + bk) * vm + bn * vk <= 32) # Needs to fit in SVE z registers + + prec = { 
+ Precision.DOUBLE: "d", + Precision.SINGLE: "s", + Precision.HALF: "h", + Precision.BFLOAT16: "h", + }[self.get_precision()] + + # make place for the two broadcasting registers + a_offset = 1 if bn * vk == 1 else 0 + assert ((bn + bk) * vm + bn * vk + a_offset <= 32) + + A_regs = Matrix([[z(vm * c + r + bn * vk + a_offset, prec) for c in range(bk)] for r in range(vm)]) + B_regs = Matrix([[z(bn * r + c, prec) for c in range(bn)] for r in range(vk)]) + C_regs = Matrix([[z(32 - vm * bn + vm * c + r, prec) for c in range(bn)] for r in range(vm)]) + + b_reg = 0 alpha_reg = [z(b_reg, prec), z(b_reg, prec)] beta_reg = [z(b_reg + 1, prec), z(b_reg + 1, prec)] @@ -93,11 +118,13 @@ def make_reg_blocks(self, bm: int, bn: int, bk: int, v_size: int, nnz: int, m: i additional_regs = [r(11), l("0.0"), r(10), r(8)] # r10 used for scaling offsets - loop_reg = r(12) + loop_regs = [r(12), r(13), r(14)] - self.init_registers(bm, v_size) + mask_regs = [p(0), p(7)] - return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_reg, additional_regs + self.init_registers(m, bm, k, bk, v_size, nnz) + + return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, mask_regs def bcst_alpha_beta(self, alpha_reg: Register, @@ -124,32 +151,81 @@ def make_b_pointers(self, asm = block("No register based scaling") return asm + def init_mask(self, + m: int, + bm: int, + v_size: int, + tempreg, + maskreg + ) -> Block: + + asm = block("No register based scaling") + return asm + def init_registers(self, + m: int, bm: int, - v_size: int + k: int, + bk: int, + v_size: int, + nnz: int ) -> None: bmmod = bm % v_size + elem128 = 16 // self.get_precision().size() + bkmod = bk % elem128 if self.inline_broadcast else 0 + kmod = (k % bk) % elem128 if self.inline_broadcast else 0 + mmod = (m % bm) % v_size eol = "\\n\\t" # define the "end of line" sequence for easy assembly - p_suffix = "d" if v_size == 2 * self.v_len else "s" # determine whether predicate suffix is '.d' or '.s - gen_reg = "x" if v_size == 2 * self.v_len else "w" # determine if 'dup' registers are 64 bit or 32 bit + # determine the predicate suffix + p_suffix = { + Precision.DOUBLE: "d", + Precision.SINGLE: "s", + Precision.HALF: "h", + Precision.BFLOAT16: "h", + }[self.get_precision()] + # determine length of 'dup' registers + gen_reg = "w" if self.get_precision().size() <= 4 else "x" overhead_counter = 6 - comment = "//p7 denotes the 'all-true' predicate and, if given, p0 denotes the 'bm % v_size' predicate\n\t" + comment = "// p7 denotes the 'all-true' predicate\n\t" + comment += "// if given, p0 denotes the 'bm % v_size' predicate\n\t" + comment += "// if given, p1 denotes the 'bk % elem128' predicate\n\t" + comment += "// if given, p2 denotes the 'k % elem128' predicate\n\t" + comment += "// if given, p4 denotes the 'k % v_size' predicate\n\t" + + self.has_k_overhead = kmod != 0 + self.has_bk_overhead = bkmod != 0 + self.has_nnz_overhead = nnz % elem128 != 0 + # specification for ptrue: https://developer.arm.com/documentation/ddi0596/2021-12/SVE-Instructions/PTRUE--Initialise-predicate-from-named-constraint- # search for 'DecodePredCount' for the explanation of how the pattern in 'ptrue p{d}.{suffix}, #pattern' is decoded: # https://developer.arm.com/documentation/ddi0596/2020-12/Shared-Pseudocode/AArch64-Functions?lang=en#impl-aarch64.DecodePredCount.2 # 'ptrue' doesnt work for initialising overhead predicate when using single precision -> see valid patterns from above # overhead = "\"ptrue p0.{suffix}, 
#{overhead}{eol}\"\n\t" if bm != 0 else "" # define overhead predicate - overhead = "\"mov {gen_reg}{overhead_counter}, #{overhead}{eol}\"\n\t\"whilelo p0.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if bmmod != 0 else "" + overhead_bm = "\"mov {gen_reg}{overhead_counter}, #{overhead_bm}{eol}\"\n\t\"whilelo p0.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if bmmod != 0 else "" + overhead_bk = "\"mov {gen_reg}{overhead_counter}, #{overhead_bk}{eol}\"\n\t\"whilelo p1.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if self.has_bk_overhead else "" + overhead_k = "\"mov {gen_reg}{overhead_counter}, #{overhead_k}{eol}\"\n\t\"whilelo p2.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if self.has_k_overhead else "" + overhead_nnz = "\"mov {gen_reg}{overhead_counter}, #{overhead_nnz}{eol}\"\n\t\"whilelo p3.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if self.has_nnz_overhead else "" + overhead_m = "\"mov {gen_reg}{overhead_counter}, #{overhead_m}{eol}\"\n\t\"whilelo p4.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if mmod != 0 else "" all_true = "\"ptrue p7.{suffix}, #31{eol}\"" # define all true predicate - init_registers = (comment + overhead + all_true).format(suffix=p_suffix, - gen_reg=gen_reg, - overhead_counter=overhead_counter, - v_size=v_size, - overhead=bmmod, - eol=eol) + init_registers = (comment + overhead_bm + overhead_bk + overhead_k + overhead_nnz + overhead_m + all_true).format(suffix=p_suffix, + gen_reg=gen_reg, + overhead_counter=overhead_counter, + v_size=v_size, + overhead_bm=bmmod, + overhead_bk=bkmod, + overhead_k=kmod, + overhead_m=mmod, + overhead_nnz=nnz % elem128, + eol=eol) + + self.predicates[v_size] = 7 + if bmmod != 0: self.predicates[bmmod] = 0 + if bkmod != 0: self.predicates[bkmod] = 1 + if kmod != 0: self.predicates[kmod] = 2 + if mmod != 0: self.predicates[mmod] = 4 # since .format() doesn't allow partial formatting, we need to re-include the # placeholders that are replaced at the end of generating a kernel @@ -198,24 +274,31 @@ def move_register_block(self, # this gives us the base register of 'cursor' irrespective of the dummy offset we use prev_base = cursor.look(cursor_ptr, block_offset, Coords(down=0, right=0))[0].base + process_size = min(v_size, cursor.br) + for ic in range(cols): for ir in range(rows): if (mask is None) or (mask[ir, ic]): - processed = ir * v_size - p = self.pred_n_trues(b_row - processed, v_size) if not is_B else self.pred_n_trues(v_size, v_size) - p_zeroing = self.pred_n_trues(b_row - processed, v_size, "z") if not is_B else self.pred_n_trues(v_size, v_size, "z") + processed = ir * process_size + if processed >= b_row: + continue + p = self.pred_n_trues(min(b_row - processed, process_size), v_size) if not is_B else self.pred_n_trues(process_size, v_size) + p_zeroing = self.pred_n_trues(min(b_row - processed, process_size), v_size, "z") if not is_B else self.pred_n_trues(process_size, v_size, "z") cell_offset = Coords(down=ir * v_size, right=ic) # addr = base "pointer" + relative offset in bytes addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) - addr.disp += self.precision.value * load_offset + addr.disp += self.precision.size() * load_offset + + offset = addr.disp - prev_disp # count how many elements we have processed between last step and this step - cont_counter = ((addr.disp - prev_disp) // mul_vl) + cont_counter = (offset // mul_vl) larger_max_offset = cont_counter > max_mem_ins_mult + non_dividing_offset = offset % 
mul_vl != 0 - if larger_max_offset or (prev_overhead and addr.disp > 0): - offset_comment = "disp > {}".format(max_offset) if larger_max_offset else "previous mem. instr. used p0" + if larger_max_offset or (prev_overhead and addr.disp > 0) or non_dividing_offset: + offset_comment = f"disp > {max_offset}" if larger_max_offset else ("disp % VL != 0" if non_dividing_offset else "previous mem. instr. used p0") asm.add(add(addr.disp, additional_regs[0], offset_comment, addr.base)) prev_disp = addr.disp addr.base = additional_regs[0] @@ -240,7 +323,7 @@ def move_register_block(self, asm.add(ld(addr, registers[ir, ic], True, comment, pred=p_zeroing, is_B=is_B, scalar_offs=False, add_reg=additional_regs[2])) - prev_overhead = int(p.ugly[1]) == 0 # determine if we previously used p0 (overhead predicate) + prev_overhead = p is None or int(p.ugly[1]) == 0 # determine if we previously used p0 (overhead predicate) return asm @@ -266,8 +349,7 @@ def make_microkernel(self, v_size: int, additional_regs, to_A_block: Coords = Coords(), - to_B_block: Coords = Coords(), - is_B: bool = True + to_B_block: Coords = Coords() ) -> Block: """ make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. @@ -283,7 +365,7 @@ def make_microkernel(self, bk, bn, bidx, bpattern = B.get_block(B_ptr, to_B_block) # tell sparse_mask() that we use sve - mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size, is_sve=True) + mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size, True) asm.add(self.move_register_block(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False)) # x = 0; @@ -291,34 +373,55 @@ def make_microkernel(self, cur11 = -1000 Vm = max(self.ceil_div(bm, v_size), 1) - multiple = self.precision.value + multiple = self.precision.size() # for ld1rw (single prec): immediate offset is multiple of 4 in range of 0 to 252 # for ld1rd (double prec): immediate offset is multiple of 8 in range of 0 to 504 # in both cases: instruction encodes the immediate offset within 6 bits - max_offs = (2 ** 6 - 1) * multiple + if not self.inline_broadcast: + max_offs = (2 ** 6 - 1) * multiple + divider = 1 + elem128 = 1 + vk = bk + else: + max_offs = 127 + divider = 16 + elem128 = 16 // self.get_precision().size() + vk = -(bk // -elem128) + + preg = self.pred_n_trues(elem128, elem128, 'z') + preg_last = preg if bk % elem128 == 0 else self.pred_n_trues(bk % elem128, elem128, 'z') for Vmi in range(Vm): # set to all v_size predicates to true, we want to replicate a B element into a whole vector - p_zeroing = self.pred_n_trues(v_size, v_size, "z") - for bki in range(bk): # inside this k-block - for bni in range(bn): # inside this n-block + for bni in range(bn): # inside this n-block + for bki in range(bk): # inside this k-block + bki_reg = bki // elem128 to_cell = Coords(down=bki, right=bni) if B.has_nonzero_cell(B_ptr, to_B_block, to_cell): B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_cell) - if B_regs[bki, bni] not in bs: + if B_regs[bki_reg, bni] not in bs: + p_zeroing = preg_last if bki_reg + 1 == vk else preg + # max_offs is the maximum allowed immediate offset when using ld1rd/ld1rw to broadcast a scalar value - if B_cell_addr.disp > max_offs: - if B_cell_addr.disp - cur11 > 0 and B_cell_addr.disp - cur11 <= max_offs: - B_cell_addr.disp = B_cell_addr.disp - cur11 + if B_cell_addr.disp > max_offs or B_cell_addr.disp % divider != 0: + moved = B_cell_addr.disp - cur11 + if moved > 0 and moved <= max_offs and moved % divider == 
0: + B_cell_addr.disp = moved else: asm.add(add(B_cell_addr.disp, additional_regs[0], "", B_cell_addr.base)) cur11 = B_cell_addr.disp B_cell_addr.disp = 0 B_cell_addr.base = additional_regs[0] - asm.add(ld(B_cell_addr, B_regs[bki, bni], True, B_comment, pred=p_zeroing, is_B=is_B)) - bs.append(B_regs[bki, bni]) + + if not self.inline_broadcast: + asm.add(ld(B_cell_addr, B_regs[bki_reg, bni], True, B_comment, pred=p_zeroing, is_B=True)) + else: + asm.add(ld(B_cell_addr, B_regs[bki_reg, bni], True, B_comment, pred=p_zeroing, sub128=True)) + bs.append(B_regs[bki_reg, bni]) for Vmi in range(Vm): + # TODO: refactor cell_indices into the cursors/blocks + cell_indices = {} p_merging = self.pred_n_trues(bm - Vmi * v_size, v_size, "m") end_index = bm if Vmi + 1 == Vm else Vmi * v_size + v_size # end_index helps us print the right index ranges for bki in range(bk): # inside this k-block @@ -328,7 +431,16 @@ def make_microkernel(self, B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_cell) comment = "C[{}:{},{}] += A[{}:{},{}]*{}".format(Vmi * v_size, end_index, bni, Vmi * v_size, end_index, bki, B_comment) - asm.add(fma(B_regs[bki, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, pred=p_merging)) + + bki_reg = bki // elem128 + if (bki_reg, bni) not in cell_indices: + cell_indices[(bki_reg, bni)] = 0 + if not self.inline_broadcast: + bcast = None + else: + bcast = cell_indices[(bki_reg, bni)] + asm.add(fma(B_regs[bki_reg, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, pred=p_merging, bcast=bcast)) + cell_indices[(bki_reg, bni)] += 1 return asm def init_prefetching(self, prefetching): diff --git a/pspamm/codegen/architectures/arm_sve/inlineprinter.py b/pspamm/codegen/architectures/arm_sve/inlineprinter.py index fba3837..26ce670 100644 --- a/pspamm/codegen/architectures/arm_sve/inlineprinter.py +++ b/pspamm/codegen/architectures/arm_sve/inlineprinter.py @@ -19,6 +19,14 @@ def __init__(self, precision: Precision): self.output = [] self.stack = [] self.precision = precision + self.ugly_precision ={ + Precision.DOUBLE: "d", + Precision.SINGLE: "w", + Precision.HALF: "h", + Precision.BFLOAT16: "h", + }[self.precision] + + assert precision in (Precision.BFLOAT16, Precision.HALF, Precision.SINGLE, Precision.DOUBLE) def show(self): print("\n".join(self.output)) @@ -43,7 +51,11 @@ def visitFma(self, stmt: FmaStmt): m = stmt.mult_src.ugly a = stmt.add_dest.ugly p = self.p_string(stmt.pred) - s = "fmla {}, {}{}, {}".format(a, p, m, b) + if stmt.bcast is not None: + # NOTE: ignores predicate + s = f"fmla {a}, {m}, {b}[{stmt.bcast}]" + else: + s = f"fmla {a}, {p}{m}, {b}" self.addLine(s, stmt.comment) @@ -53,73 +65,78 @@ def visitMul(self, stmt: MulStmt): a = stmt.dest.ugly if a != b: - s1 = "movprfx {}, {}".format(a.split(".")[0], b.split(".")[0]) + s1 = f"movprfx {a.split('.')[0]}, {b.split('.')[0]}" self.addLine(s1, "move {} into {}".format(b, a)) b = a p = self.p_string(stmt.pred) - s = "fmul {}, {}{}, {}".format(a, p, b, m) + s = f"fmul {a}, {p}{b}, {m}" self.addLine(s, stmt.comment) def visitBcst(self, stmt: BcstStmt): # Used to broadcast a scalar register into a vector register b = stmt.bcast_src.ugly a = stmt.dest.ugly - # make sure the src register is a W register when using single precision - if self.precision == Precision.SINGLE: + # make sure the src register is a W register when using single/half precision + if self.precision.size() <= 4: b = "w" + b[1:] - s = "dup {}, {}".format(a, b) + s = f"dup {a}, {b}" self.addLine(s, stmt.comment) def visitAdd(self, stmt: AddStmt): - if 
isinstance(stmt.src, Constant) and (stmt.src.value > 4095 or stmt.src.value < -4095): + if isinstance(stmt.src, Constant) and stmt.src.value == 0: + # avoid 0 instructions + return + if isinstance(stmt.src, Constant) and (stmt.src.value > 4095 or stmt.src.value < -4095): # This condition is probably related to immediate values being restricted to 12 bits for add instructions # https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/ADD--immediate- # https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/ADD--immediate---Add--immediate-- if (stmt.src.value >> 16) & 0xFFFF > 0 and stmt.src.value < 0: s = "mov x11, #-1" - s1 = "movk x11, #{}".format((stmt.src.value) & 0xFFFF) - val = ((stmt.src.value >> 16) & 0xFFFF) - s2 = "movk x11, #{}, lsl #16".format(val) + val1 = (stmt.src.value) & 0xFFFF + s1 = f"movk x11, #{val1}" + val2 = ((stmt.src.value >> 16) & 0xFFFF) + s2 = f"movk x11, #{val2}, lsl #16" self.addLine(s, "") self.addLine(s1, "load lower 16 bit of immediate that requires more than 16 bit") self.addLine(s2, "load upper 16 bit of immediate that requires more than 16 bit") elif (stmt.src.value >> 16) != 0: - s1 = "mov x11, #{}".format((stmt.src.value) & 0xFFFF) - val = ((stmt.src.value >> 16) & 0xFFFF) - s2 = "movk x11, #{}, lsl #16".format(val) + val1 = (stmt.src.value) & 0xFFFF + s1 = f"mov x11, #{val1}" + val2 = ((stmt.src.value >> 16) & 0xFFFF) + s2 = f"movk x11, #{val2}, lsl #16" self.addLine(s1, "load lower 16 bit of immediate that requires more than 16 bit") self.addLine(s2, "load upper 16 bit of immediate that requires more than 16 bit") else: - s = "mov x11, {}".format(stmt.src.ugly) + s = f"mov x11, {stmt.src.ugly}" self.addLine(s, "load lower 16 bit of immediate ") if stmt.dest.ugly != "x11": - s = "add {}, {}, x11".format(stmt.dest.ugly, stmt.dest.ugly) + s = f"add {stmt.dest.ugly}, {stmt.dest.ugly}, x11" self.addLine(s, stmt.comment) if stmt.additional is not None: - s = "add {}, {}, {}".format(stmt.dest.ugly, stmt.dest.ugly, stmt.additional.ugly) + s = f"add {stmt.dest.ugly}, {stmt.dest.ugly}, {stmt.additional.ugly}" self.addLine(s, stmt.comment) else: # if stmt.src is a Constant but outside of the above range of value < -4095 or value > 4095 # we can simply add the Constant to a register if stmt.additional is not None: - s = "add {}, {}, {}".format(stmt.dest.ugly, stmt.additional.ugly, stmt.src.ugly) + s = f"add {stmt.dest.ugly}, {stmt.additional.ugly}, {stmt.src.ugly}" else: - s = "add {}, {}, {}".format(stmt.dest.ugly, stmt.dest.ugly, stmt.src.ugly) + s = f"add {stmt.dest.ugly}, {stmt.dest.ugly}, {stmt.src.ugly}" self.addLine(s, stmt.comment) def visitLabel(self, stmt: LabelStmt): - s = "{}:".format(stmt.label.ugly) + s = f"{stmt.label.ugly}:" self.addLine(s, stmt.comment) def visitCmp(self, stmt: CmpStmt): - s = "cmp {}, {}".format(stmt.rhs.ugly, stmt.lhs.ugly) + s = f"cmp {stmt.rhs.ugly}, {stmt.lhs.ugly}" self.addLine(s, stmt.comment) def visitJump(self, stmt: JumpStmt): - s = "b.lo {}".format(stmt.destination.ugly) + s = f"b.lo {stmt.destination.ugly}" self.addLine(s, stmt.comment) def visitMov(self, stmt: MovStmt): @@ -128,31 +145,41 @@ def visitMov(self, stmt: MovStmt): else: src_str = stmt.src.ugly if stmt.typ == AsmType.f64x8: - s = "fmov {}, {}".format(stmt.dest.ugly, src_str) + s = f"fmov {stmt.dest.ugly}, {src_str}" else: - s = "mov {}, {}".format(stmt.dest.ugly, src_str) + s = f"mov {stmt.dest.ugly}, {src_str}" self.addLine(s, stmt.comment) def visitLoad(self, stmt: LoadStmt): if isinstance(stmt.src, Label): src_str = "#" 
+ stmt.src.ugly - elif stmt.src.ugly_offset != "0" and stmt.scalar_offs: - self.addLine("mov {}, #{}".format(stmt.add_reg.ugly, stmt.src.ugly_offset), "move immediate offset into {}".format(stmt.add_reg.ugly)) + elif (stmt.src.ugly_offset != "0" and stmt.scalar_offs): + self.addLine(f"mov {stmt.add_reg.ugly}, #{stmt.src.ugly_offset}", f"move immediate offset into {stmt.add_reg.ugly}") # TODO: adapt ugly_lsl_shift to account for possible single precision instead of double precision - src_str = "[{}, {}, LSL #{}]".format(stmt.src.ugly_base, stmt.add_reg.ugly, stmt.dest.ugly_lsl_shift) + src_str = f"[{stmt.src.ugly_base}, {stmt.add_reg.ugly}, LSL #{stmt.dest.ugly_lsl_shift}]" + elif stmt.typ == AsmType.f64x4 or stmt.typ == AsmType.f64x2: + # (note: the 128-bit and 256-bit broadcasts need the following more rudimentary format here) + if stmt.src.ugly_offset == '0': + src_str = f"[{stmt.src.ugly_base}]" + else: + src_str = f"[{stmt.src.ugly_base}, #{stmt.src.ugly_offset}]" else: src_str = stmt.src.ugly if not stmt.is_B else stmt.src.ugly_no_vl_scaling p = self.p_string(stmt.pred) - prec = "d" if stmt.dest.ugly_precision == "d" else "w" + prec = self.ugly_precision if stmt.typ == AsmType.i64: - s = "add {}, {}, {}".format(stmt.dest.ugly, stmt.dest.ugly, src_str) + s = f"add {stmt.dest.ugly}, {stmt.dest.ugly}, {src_str}" elif stmt.typ == AsmType.f64x8 and stmt.aligned: if stmt.is_B: - s = "ld1r{} {}, {}{}".format(prec, stmt.dest.ugly, p, src_str) + s = f"ld1r{prec} {stmt.dest.ugly}, {p}{src_str}" else: - s = "ld1{} {}, {}{}".format(prec, stmt.dest.ugly, p, src_str) + s = f"ld1{prec} {stmt.dest.ugly}, {p}{src_str}" + elif stmt.typ == AsmType.f64x4 and stmt.aligned: + s = f"ld1ro{prec} {stmt.dest.ugly}, {p}{src_str}" + elif stmt.typ == AsmType.f64x2 and stmt.aligned: + s = f"ld1rq{prec} {stmt.dest.ugly}, {p}{src_str}" else: raise NotImplementedError() self.addLine(s, stmt.comment) @@ -161,20 +188,21 @@ def visitStore(self, stmt: StoreStmt): if isinstance(stmt.src, Label): src_str = "#" + stmt.src.ugly elif stmt.dest.ugly_offset != "0" and stmt.scalar_offs: - self.addLine("mov {}, #{}".format(stmt.add_reg.ugly, stmt.dest.ugly_offset), - "move immediate offset into {}".format(stmt.add_reg.ugly)) + self.addLine(f"mov {stmt.add_reg.ugly}, #{stmt.dest.ugly_offset}", + f"move immediate offset into {stmt.add_reg.ugly}") # TODO: adapt ugly_lsl_shift to account for possible single precision instead of double precision - dest_str = "[{}, {}, LSL #{}]".format(stmt.dest.ugly_base, stmt.add_reg.ugly, stmt.src.ugly_lsl_shift) + regsize = stmt.add_dest.size() // 16 + dest_str = f"[{stmt.dest.ugly_base}, {stmt.add_reg.ugly}, LSL #{stmt.src.ugly_lsl_shift}]" else: dest_str = stmt.dest.ugly p = self.p_string(stmt.pred) - prec = "d" if stmt.src.ugly_precision == "d" else "w" + prec = self.ugly_precision if stmt.typ == AsmType.i64: - s = "add {}, {}, {}".format(stmt.dest.ugly, stmt.dest.ugly, src_str) + s = f"add {stmt.dest.ugly}, {stmt.dest.ugly}, {src_str}" elif stmt.typ == AsmType.f64x8 and stmt.aligned: - s = "st1{} {}, {}{}".format(prec, stmt.src.ugly, p, dest_str) + s = f"st1{prec} {stmt.src.ugly}, {p}{dest_str}" else: raise NotImplementedError() self.addLine(s, stmt.comment) @@ -185,10 +213,10 @@ def visitPrefetch(self, stmt: PrefetchStmt): temporality = "KEEP" # could use "STRM" for non-temporal prefetching if needed xn = stmt.dest.ugly_base offset = stmt.dest.ugly_offset - src_string = "[{}, {}, MUL VL]".format(xn, offset) + src_string = f"[{xn}, {offset}, MUL VL]" p = self.p_string(stmt.pred) - prec = "d" if 
stmt.precision == Precision.DOUBLE else "w" - s = "prf{} P{}{}{}, {}{}".format(prec, stmt.access_type, cache_level, temporality, p.split('/')[0], src_string) + prec = self.ugly_precision + s = f"prf{prec} P{stmt.access_type}{cache_level}{temporality}, {p.split('/')[0]}{src_string}" self.addLine(s, "prefetch from memory") def visitBlock(self, block: Block): diff --git a/pspamm/codegen/architectures/arm_sve/operands.py b/pspamm/codegen/architectures/arm_sve/operands.py index 93d4828..8c962b3 100644 --- a/pspamm/codegen/architectures/arm_sve/operands.py +++ b/pspamm/codegen/architectures/arm_sve/operands.py @@ -52,7 +52,11 @@ def ugly_precision(self): @property def ugly_lsl_shift(self): - return 3 if self.ugly_precision == "d" else 2 + return { + "d": 3, + "s": 2, + "h": 1 + }[self.ugly_precision] @property def clobbered(self): @@ -68,15 +72,11 @@ def ugly_scalar_1d(self): #turns "Vn.2d" into "Dn" return (self.value.split(".")[0]).replace("v", "d") - @property - def ugly_1d(self): - return self.value.replace("2d", "1d") - r = lambda n: Register_ARM(AsmType.i64, "x" + str(n)) xzr = Register_ARM(AsmType.i64, "xzr") z = lambda n, prec: Register_ARM(AsmType.f64x8, "z" + str(n) + "." + prec) - +p = lambda n: Register_ARM(AsmType.i64, "p" + str(n)) class MemoryAddress_ARM(MemoryAddress): @property diff --git a/pspamm/codegen/architectures/hsw/blocksize.py b/pspamm/codegen/architectures/hsw/blocksize.py new file mode 100644 index 0000000..0a38028 --- /dev/null +++ b/pspamm/codegen/architectures/hsw/blocksize.py @@ -0,0 +1,107 @@ +class Old: + @classmethod + def getBlocksize(cls, m, n, bk, v_size, prec): + + bm = m + bn = n + + if cls.HSW_condition(bm, bn, bk, v_size): + while cls.HSW_condition(bm, bn, bk+1, v_size): + bk += 1 + return (bm, bn, bk) + + while not cls.HSW_condition(bm, bn, bk, v_size): + bm, bn = cls.lowerToNextDiv(m, n, bm, bn, v_size) + + while cls.HSW_condition(bm, bn, bk+1, v_size): + bk += 1 + + return (bm, bn, bk) + + @classmethod + def lowerToNextDiv(cls, m, n, bm, bn, v_size): + if bm > bn and bm > v_size: + bm -= v_size + while m % bm != 0: + bm -= v_size + else: + bn -= 1 + while n % bn != 0: + bn -= 1 + + return bm, bn + + @classmethod + def HSW_condition(cls, bm, bn, bk, v_size): + # ceiling division + vm = -(bm // -v_size) + return (bn + bk) * vm + bn * bk <= 16 + +class Max: + @classmethod + def getBlocksize(cls, m, n, bk, v_size, prec): + bm = 4 + bn = 1 + maxval = 0 + + for i in range(v_size, m+1, v_size): + for j in range(1, n+1): + # can be replaced by cls.HSW_condition_extended here + # (but that seemed to be slower in the end) + if cls.HSW_condition(i, j, bk, v_size): + if i*j > maxval and (cls.HSW_condition(i, j, bk, v_size) or j > 1): + maxval = i*j + bm = i + bn = j + + while cls.HSW_condition(bm, bn, bk+1, v_size): + bk += 1 + + return (bm, bn, bk) + + @classmethod + def HSW_condition(cls, bm, bn, bk, v_size): + # ceiling division + vm = -(bm // -v_size) + return (bn + bk) * vm + bn * bk <= 16 + + @classmethod + def HSW_condition_extended(cls, bm, bn, bk, v_size): + # ceiling division + vm = -(bm // -v_size) + return bn * vm + bn * bk + 1 <= 16 + +class Cube: + @classmethod + def getBlocksize(cls, m, n, bk, v_size, prec): + bm = 4 + bn = 1 + maxval = 0 + + for i in range(v_size, m+1, v_size): + for j in range(1, n+1): + for k in range(1, 200): + # can be replaced by cls.HSW_condition_extended here + # (but that seemed to be slower in the end) + if cls.HSW_condition(i, j, bk, v_size): + if i*j*k >= maxval and (cls.HSW_condition(i, j, k, v_size) or j > 1): + maxval = 
i*j*k + bm = i + bn = j + bk = k + + return (bm, bn, bk) + + @classmethod + def HSW_condition(cls, bm, bn, bk, v_size): + # ceiling division + vm = -(bm // -v_size) + return (bn + bk) * vm + bn * bk <= 16 + + @classmethod + def HSW_condition_extended(cls, bm, bn, bk, v_size): + # ceiling division + vm = -(bm // -v_size) + return bn * vm + bn * bk + 1 <= 16 + +Default = Max diff --git a/pspamm/codegen/architectures/hsw/generator.py b/pspamm/codegen/architectures/hsw/generator.py index 0b5e87b..5ad7c28 100644 --- a/pspamm/codegen/architectures/hsw/generator.py +++ b/pspamm/codegen/architectures/hsw/generator.py @@ -5,7 +5,7 @@ from pspamm.codegen.sugar import * from pspamm.codegen.generator import * from pspamm.codegen.precision import * - +from pspamm.codegen.regcache import * class Generator(AbstractGenerator): template = """ @@ -32,52 +32,76 @@ class Generator(AbstractGenerator): }}}}; """ + v_len = 2 + def get_v_size(self): - if self.precision == Precision.DOUBLE: - return 4 - elif self.precision == Precision.SINGLE: - return 8 - raise NotImplementedError + return (16 // self.precision.size()) * self.v_len def get_template(self): return Generator.template + def use_broadcast(self): + return True + + def has_masks(self): + return False + + def init_mask(self, m, bm, v_size, tempreg, maskregs): + return block("") + def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int): assert(bm % v_size == 0) vm = bm//v_size - assert((bn + bk) * vm + bn * bk <= 16) # Needs to fit in AVX/AVX2 ymm registers - A_regs = Matrix([[ymm(vm*c + r) for c in range(bk)] for r in range(vm)]) - B_regs = Matrix([[ymm(vm*bk + bn * r + c) for c in range(bn)] for r in range(bk)]) - C_regs = Matrix([[ymm(16 - vm*bn + vm*c + r) for c in range(bn)] - for r in range(vm)]) - print([[ymm(vm*c + r ).ugly for c in range(bk)] for r in range(vm)]) - print([[ymm(vm*bk + bn * r + c).ugly for c in range(bn)] for r in range(bk)]) - print([[ymm(16 - vm*bn + vm*c + r).ugly for c in range(bn)] + # Needs to fit in AVX/AVX2 ymm registers + if (bn + bk) * vm + bn * bk <= 16: + self.preloadA = True + else: + self.preloadA = False + assert(bn * vm + bn * bk + 1 <= 16) + + vmm = { + 1: xmm, + 2: ymm + }[self.v_len] + + if self.preloadA: + A_regs = Matrix([[vmm(vm*c + r) for c in range(bk)] for r in range(vm)]) + Aoffset = vm*bk + else: + A_regs = Matrix([[vmm(0) for c in range(bk)] for r in range(vm)]) + Aoffset = 1 + + B_regs = Matrix([[vmm(Aoffset + bn * r + c) for c in range(bn)] for r in range(bk)]) + C_regs = Matrix([[vmm(16 - vm*bn + vm*c + r) for c in range(bn)] for r in range(vm)]) starting_regs = [rdi, rsi, rdx, rbx, rcx] - b_reg = vm*bk - alpha_reg = [xmm(b_reg), ymm(b_reg)] - beta_reg = [xmm(b_reg + 1), ymm(b_reg + 1)] + b_reg = Aoffset + alpha_reg = [xmm(b_reg), vmm(b_reg)] + beta_reg = [xmm(b_reg + 1), vmm(b_reg + 1)] - available_regs = [r(9),r(10),r(11),r(13),r(14),r(15),rax] + available_regs = [r(9),r(10),r(11),r(15),rax] # ,r(13),r(14) additional_regs = [r(8)] reg_count = 0 - for i in range(1024, min(max(nnz * self.precision.value, m*k*self.precision.value, m*n*self.precision.value),8000), 2048): + self.spontaneous_scaling = False + for i in range(1024, min(max(nnz * self.precision.size(), m*k*self.precision.size(), m*n*self.precision.size()),8000), 2048): additional_regs.append(available_regs[reg_count]) reg_count += 1 - for i in range(8192, min(nnz * self.precision.value, 33000), 8192): + for i in range(8192, min(nnz * self.precision.size(), 33000), 8192): + if reg_count == 
len(available_regs): + self.spontaneous_scaling = True + break additional_regs.append(available_regs[reg_count]) reg_count += 1 - loop_reg = r(12) + loop_regs = [r(12), r(13), r(14)] - return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_reg, additional_regs + return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, [] def bcst_alpha_beta(self, @@ -99,8 +123,9 @@ def make_scaling_offsets(self, asm = block("Optimize usage of offsets when accessing B Matrix") - for i in range(1, min(len(additional_regs), 5)): - asm.add(mov(c(1024 + (i-1) * 2048), additional_regs[i], False)) + if not self.spontaneous_scaling: + for i in range(1, min(len(additional_regs), 5)): + asm.add(mov(c(1024 + (i-1) * 2048), additional_regs[i], False)) return asm @@ -112,34 +137,47 @@ def make_b_pointers(self, asm = block("Optimize usage of offsets when accessing B Matrix") - reg_count = 5 + if not self.spontaneous_scaling: + reg_count = 5 - for i in range(8192, min(nnz * self.precision.value, 33000), 8192): - asm.add(lea(B_reg, additional_regs[reg_count], i)) - reg_count += 1 + for i in range(8192, min(nnz * self.precision.size(), 33000), 8192): + asm.add(lea(B_reg, additional_regs[reg_count], i)) + reg_count += 1 return asm - def reg_based_scaling(self, addr: MemoryAddress, additional_regs: List[Register], with_index: bool): - if addr.disp >= 1024 and ((addr.disp < 32768 and with_index) or addr.disp < 8192): - scaling_and_register = { - 1: (1, 1), - 2: (2, 1), - 3: (1, 2), - 4: (4, 1), - 5: (1, 3), - 6: (2, 2), - 7: (1, 4) - } - if addr.disp % 8192 >= 1024: - addr.scaling, reg = scaling_and_register[ (addr.disp % 8192) // 1024 ] - addr.index = additional_regs[reg] - - if addr.disp >= 8192: - addr.base = additional_regs[addr.disp // 8192 + 4] - - addr.disp = addr.disp % 1024 + def reg_based_scaling(self, regcache, asm, addr: MemoryAddress, additional_regs: List[Register], with_index: bool): + if addr.disp >= 1024: + if ((addr.disp < 32768 and with_index) or addr.disp < 8192) and not self.spontaneous_scaling: + scaling_and_register = { + 1: (1, 1), + 2: (2, 1), + 3: (1, 2), + 4: (4, 1), + 5: (1, 3), + 6: (2, 2), + 7: (1, 4) + } + if addr.disp % 8192 >= 1024: + addr.scaling, reg = scaling_and_register[ (addr.disp % 8192) // 1024 ] + addr.index = additional_regs[reg] + + if addr.disp >= 8192 and not self.spontaneous_scaling: + addr.base = additional_regs[addr.disp // 8192 + 4] + + addr.disp = addr.disp % 1024 + else: + # TODO: not 100%ly sure about this code here... 
+ large_offset = addr.disp // 1024 + + basereg, load = regcache.get(large_offset) + if load: + asm.add(mov(addr.base, basereg, False)) + asm.add(add(c(large_offset * 1024), basereg)) + + addr.base = basereg + addr.disp = addr.disp % 1024 def move_register_block(self, cursor: Cursor, @@ -163,7 +201,7 @@ def move_register_block(self, if (mask is None) or (mask[ir,ic]): cell_offset = Coords(down=ir*v_size, right=ic) addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) - addr.disp += self.precision.value * load_offset + addr.disp += self.precision.size() * load_offset if store: asm.add(mov(registers[ir,ic], addr, True, comment)) if prefetching == 'BL2viaC': @@ -172,6 +210,30 @@ def move_register_block(self, asm.add(mov(addr, registers[ir,ic], True, comment)) return asm + def move_register_single(self, + cursor: Cursor, + cursor_ptr: CursorLocation, + block_offset: Coords, + registers: Matrix[Register], + v_size: int, + additional_regs, + ir, + ic, + mask: Matrix[bool] = None, + store: bool = False, + prefetching: str = None, + load_offset: int = 0 + ) -> Block: + + asm = block("") + + if (mask is None) or (mask[ir,ic]): + cell_offset = Coords(down=ir*v_size, right=ic) + addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) + addr.disp += self.precision.size() * load_offset + asm.add(mov(addr, registers[ir,ic], True, comment)) + return asm + def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block: rows, cols = registers.shape @@ -210,17 +272,22 @@ def make_microkernel(self, assert(bm % v_size == 0) mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size) - asm.add(self.move_register_block(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False)) + if self.preloadA: + asm.add(self.move_register_block(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False)) + else: + asm.add(self.move_register_single(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, 0, 0, mask, store=False)) + + regcache = RegisterCache(additional_regs) bs = [] bsv = [] for Vmi in range(bm//v_size): - for bki in range(bk): # inside this k-block - for bni in range(bn): # inside this n-block + for bni in range(bn): # inside this n-block + for bki in range(bk): # inside this k-block to_cell = Coords(down=bki, right=bni) if B.has_nonzero_cell(B_ptr, to_B_block, to_cell): B_addr, B_comment = B.look(B_ptr, to_B_block, to_cell) - self.reg_based_scaling(B_addr, additional_regs, True) + self.reg_based_scaling(regcache, asm, B_addr, additional_regs, True) if B_regs[bki, bni] not in bs: asm.add(bcst(B_addr, B_regs[bki, bni], comment=B_comment)) bs.append(B_regs[bki, bni]) @@ -231,13 +298,15 @@ def make_microkernel(self, for Vmi in range(bm//v_size): for bki in range(bk): # inside this k-block + if not self.preloadA and not (Vmi, bki) == (0,0): + asm.add(self.move_register_single(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, Vmi, bki, mask, store=False)) for bni in range(bn): # inside this n-block to_cell = Coords(down=bki, right=bni) if B.has_nonzero_cell(B_ptr, to_B_block, to_cell): B_addr, B_comment = B.look(B_ptr, to_B_block, to_cell) - self.reg_based_scaling(B_addr, additional_regs, True) + self.reg_based_scaling(regcache, asm, B_addr, additional_regs, True) comment = "C[{}:{},{}] += A[{}:{},{}]*{}".format(Vmi*v_size,Vmi*v_size+v_size,bni,Vmi*v_size,Vmi*v_size+v_size,bki,B_comment) - asm.add(fma(B_regs[bki, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, bcast=False)) + asm.add(fma(B_regs[bki, bni], 
A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, bcast=None)) return asm def init_prefetching(self, prefetching): diff --git a/pspamm/codegen/architectures/hsw/inlineprinter.py b/pspamm/codegen/architectures/hsw/inlineprinter.py index 00ab77b..7ac5ce7 100644 --- a/pspamm/codegen/architectures/hsw/inlineprinter.py +++ b/pspamm/codegen/architectures/hsw/inlineprinter.py @@ -20,13 +20,16 @@ class InlinePrinter(Visitor): def __init__(self, precision: Precision): self.output = [] self.stack = [] - assert precision in [Precision.SINGLE, Precision.DOUBLE] - self.precision = 'd' if precision == Precision.DOUBLE else 's' + assert precision in (Precision.SINGLE, Precision.DOUBLE) + self.precision = precision + self.psuffix = { + Precision.DOUBLE: "d", + Precision.SINGLE: "s" + }[precision] def show(self): print("\n".join(self.output)) - def addLine(self, stmt: str, comment: str): line = " "*self.lmargin + self.indent*self.depth @@ -43,22 +46,20 @@ def addLine(self, stmt: str, comment: str): self.output.append(line) - - def visitFma(self, stmt: FmaStmt): b = stmt.bcast_src.ugly m = stmt.mult_src.ugly a = stmt.add_dest.ugly # no broadcasting supported inside the instruction (unlike AVX-512) - s = "vfmadd231p{} {}, {}, {}".format(self.precision, b,m,a) + s = f"vfmadd231p{self.psuffix} {b}, {m}, {a}" self.addLine(s, stmt.comment) def visitMul(self, stmt: MulStmt): b = stmt.src.ugly m = stmt.mult_src.ugly a = stmt.dest.ugly - s = "vmulp{} {}, {}, {}".format(self.precision, b,m,a) + s = f"vmulp{self.psuffix} {b}, {m}, {a}" self.addLine(s, stmt.comment) def visitBcst(self, stmt: BcstStmt): @@ -68,23 +69,28 @@ def visitBcst(self, stmt: BcstStmt): if isinstance(stmt.bcast_src, Register): # reformat bcast_src to be a memory address b = "0({})".format(b) - s = "vbroadcasts{} {}, {}".format(self.precision, b, a) + regsize = stmt.dest.size() + instruction = "vmovddup" if self.precision == Precision.DOUBLE and regsize == 16 else f"vbroadcasts{self.psuffix}" + s = f"{instruction} {b}, {a}" self.addLine(s, stmt.comment) def visitAdd(self, stmt: AddStmt): - s = "addq {}, {}".format(stmt.src.ugly,stmt.dest.ugly) + if isinstance(stmt.src, Constant) and stmt.src.value == 0: + # avoid 0 instructions + return + s = f"addq {stmt.src.ugly}, {stmt.dest.ugly}" self.addLine(s, stmt.comment) def visitLabel(self, stmt: LabelStmt): - s = "{}:".format(stmt.label.ugly) + s = f"{stmt.label.ugly}:" self.addLine(s, stmt.comment) def visitCmp(self, stmt: CmpStmt): - s = "cmp {}, {}".format(stmt.lhs.ugly,stmt.rhs.ugly) + s = f"cmp {stmt.lhs.ugly}, {stmt.rhs.ugly}" self.addLine(s, stmt.comment) def visitJump(self, stmt: JumpStmt): - s = "jl {}".format(stmt.destination.ugly) + s = f"jl {stmt.destination.ugly}" self.addLine(s, stmt.comment) def visitMov(self, stmt: MovStmt): @@ -94,22 +100,22 @@ def visitMov(self, stmt: MovStmt): src_str = stmt.src.ugly if stmt.typ == AsmType.i64: - s = "movq {}, {}".format(src_str,stmt.dest.ugly) + s = f"movq {src_str}, {stmt.dest.ugly}" elif stmt.typ == AsmType.f64x8 and stmt.aligned: if isinstance(stmt.src, Constant) and stmt.src.value == 0: - s = "vpxor {}, {}, {}".format(stmt.dest.ugly,stmt.dest.ugly,stmt.dest.ugly) + s = f"vpxor {stmt.dest.ugly}, {stmt.dest.ugly}, {stmt.dest.ugly}" else: - s = "vmovup{} {}, {}".format(self.precision, src_str,stmt.dest.ugly) + s = f"vmovup{self.psuffix} {src_str}, {stmt.dest.ugly}" else: raise NotImplementedError() self.addLine(s, stmt.comment) def visitLea(self, stmt: LeaStmt): - s = "leaq {}({}), {}".format(stmt.offset,stmt.src.ugly,stmt.dest.ugly) + s = f"leaq 
{stmt.offset}({stmt.src.ugly}), {stmt.dest.ugly}" self.addLine(s, stmt.comment) def visitPrefetch(self, stmt: PrefetchStmt): - s = "prefetcht1 {}".format(stmt.dest.ugly) + s = f"prefetcht1 {stmt.dest.ugly}" self.addLine(s, stmt.comment) def visitBlock(self, block: Block): diff --git a/pspamm/codegen/architectures/knl/blocksize.py b/pspamm/codegen/architectures/knl/blocksize.py new file mode 100644 index 0000000..e51165f --- /dev/null +++ b/pspamm/codegen/architectures/knl/blocksize.py @@ -0,0 +1,119 @@ +class Old: + @classmethod + def getBlocksize(cls, m, n, bk, v_size, prec): + + bm = m + bn = n + + if cls.KNL_condition(bm, bn, bk, v_size): + while cls.KNL_condition(bm, bn, bk+1, v_size): + bk += 1 + return (bm, bn) + + while not cls.KNL_condition(bm, bn, bk, v_size): + bm, bn = cls.lowerToNextDiv(m, n, bm, bn, v_size) + + while cls.KNL_condition(bm, bn, bk+1, v_size): + bk += 1 + + return (bm, bn) + + @classmethod + def lowerToNextDiv(cls, m, n, bm, bn, v_size): + if bm > bn and bm > v_size: + bm -= v_size + while m % bm != 0: + bm -= v_size + else: + bn -= 1 + while n % bn != 0: + bn -= 1 + + return bm, bn + + @classmethod + def KNL_condition(cls, bm, bn, bk, v_size): + # ceiling division + vm = -(bm // -v_size) + return (bn+bk) * vm <= 32 + +class Max: + @classmethod + def getBlocksize(cls, m, n, bk, v_size, prec): + + bm = 8 + bn = 1 + maxval = 0 + + for i in range(1, m+1): + next_multiple = -(bm // -v_size) + for j in range(1, n+1): + if cls.KNL_condition(next_multiple, j, bk, v_size) and cls.tileable(m, bm): + if i*j > maxval: + maxval = i*j + bm = i + bn = j + + while cls.KNL_condition(bm, bn, bk+1, v_size): + bk += 1 + + return (bm, bn, bk) + + @classmethod + def KNL_condition(cls, bm, bn, bk, v_size): + # ceiling division + vm = -(bm // -v_size) + return (bn+bk) * vm <= 32 + + @classmethod + def tileable(cls, m, bm): + return m % bm == 0 + +class MaxBn: + @classmethod + def getBlocksize(cls, m, n, bk, v_size, prec): + + bm = v_size + bn = 1 + + for j in range(1, n+1): + if cls.KNL_condition(bm, j, bk, v_size): + bn = j + + while cls.KNL_condition(bm, bn, bk+1, v_size): + bk += 1 + + return (bm, bn, bk) + + @classmethod + def KNL_condition(cls, bm, bn, bk, v_size): + # ceiling division + vm = -(bm // -v_size) + return (bn+bk) * vm <= 32 + +class CubeBn: + @classmethod + def getBlocksize(cls, m, n, bk, v_size, prec): + + bm = v_size + bn = 1 + + maxval = 0 + + for j in range(1, n+1): + for k in range(1, 200): + if cls.KNL_condition(bm, j, k, v_size): + if j*k >= maxval: + maxval = j*k + bn = j + bk = k + + return (bm, bn, bk) + + @classmethod + def KNL_condition(cls, bm, bn, bk, v_size): + # ceiling division + vm = -(bm // -v_size) + return (bn+bk) * vm <= 32 + +Default = MaxBn diff --git a/pspamm/codegen/architectures/knl/generator.py b/pspamm/codegen/architectures/knl/generator.py index 8c984a3..493bbdf 100644 --- a/pspamm/codegen/architectures/knl/generator.py +++ b/pspamm/codegen/architectures/knl/generator.py @@ -5,7 +5,7 @@ from pspamm.codegen.sugar import * from pspamm.codegen.generator import * from pspamm.codegen.precision import * - +from pspamm.codegen.regcache import * class Generator(AbstractGenerator): template = """ @@ -32,24 +32,41 @@ class Generator(AbstractGenerator): }}}}; """ + v_len = 4 + predicates = {0:mask(0)} + def get_v_size(self): - if self.precision == Precision.DOUBLE: - return 8 - elif self.precision == Precision.SINGLE: - return 16 - raise NotImplementedError + return (16 // self.precision.size()) * self.v_len def get_template(self): return 
Generator.template + def use_broadcast(self): + return False + + def has_masks(self): + return True # for now + + def pred_n_trues(self, count, v_size, mode): + # a bit hacky at the moment (won't work for all masks) + if count < v_size: + return Predicate(self.predicates[count], mode=='z') + else: + return None + def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int): - assert(bm % v_size == 0) - vm = bm//v_size - assert((bn+bk) * vm <= 32) # Needs to fit in AVX512 zmm registers + vm = self.ceil_div(bm, v_size) + assert((bn+bk) * vm <= 32) # Needs to fit in AVX512 xmm/ymm/zmm registers + + vmm = { + 1: xmm, + 2: ymm, + 4: zmm + }[self.v_len] - A_regs = Matrix([[zmm(vm*c + r) for c in range(bk)] for r in range(vm)]) + A_regs = Matrix([[vmm(vm*c + r) for c in range(bk)] for r in range(vm)]) B_regs = [] - C_regs = Matrix([[zmm(32 - vm*bn + vm*c + r) for c in range(bn)] + C_regs = Matrix([[vmm(32 - vm*bn + vm*c + r) for c in range(bn)] for r in range(vm)]) starting_regs = [rdi, rsi, rdx, rbx, rcx] @@ -57,24 +74,53 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: alpha_reg = [rbx, rbx] beta_reg = [rcx, rcx] - available_regs = [r(9),r(10),r(11),r(13),r(14),r(15),rax] + available_regs = [r(9),r(10),r(11),r(15),rax] # ,r(13),r(14) additional_regs = [r(8)] + mask_regs = [mask(1), mask(2)] + reg_count = 0 - for i in range(1024, min(max(nnz * self.precision.value, m*k*self.precision.value, m*n*self.precision.value),8000), 2048): + self.spontaneous_scaling = False + for i in range(1024, min(max(nnz * self.precision.size(), m*k*self.precision.size(), m*n*self.precision.size()),8000), 2048): additional_regs.append(available_regs[reg_count]) reg_count += 1 - for i in range(8192, min(nnz * self.precision.value, 33000), 8192): + for i in range(8192, min(nnz * self.precision.size(), 33000), 8192): + if reg_count == len(available_regs): + self.spontaneous_scaling = True + break additional_regs.append(available_regs[reg_count]) reg_count += 1 - loop_reg = r(12) - - return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_reg, additional_regs - + loop_regs = [r(12), r(13), r(14)] + + # FIXME: a bit hacky to have the mask setup here + rest = bm % v_size + rest2 = (m % bm) % v_size + self.predicates[rest] = mask(1) + self.predicates[rest2] = mask(2) + self.predicates[0] = mask(0) + + return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, mask_regs + + def init_mask(self, m, bm, v_size, tempreg, maskregs): + rest = bm % v_size + rest2 = (m % bm) % v_size + if rest == 0 and rest2 == 0: + return block("") + else: + asm = block("Set mask registers") + if rest > 0: + restval = (1 << rest) - 1 + asm.add(mov(restval, tempreg, False)) + asm.add(mov(tempreg, maskregs[0], False)) + if rest2 > 0: + restval2 = (1 << rest2) - 1 + asm.add(mov(restval2, tempreg, False)) + asm.add(mov(tempreg, maskregs[1], False)) + return asm def bcst_alpha_beta(self, alpha_reg: Register, @@ -82,9 +128,6 @@ def bcst_alpha_beta(self, ) -> Block: asm = block("Broadcast alpha and beta using inline broadcasting") - -# asm.add(bcst(alpha_reg[0], alpha_reg[1])) -# asm.add(bcst(beta_reg[0], beta_reg[1])) return asm @@ -95,8 +138,9 @@ def make_scaling_offsets(self, asm = block("Optimize usage of offsets when accessing B Matrix") - for i in range(1, min(len(additional_regs), 5)): - asm.add(mov(c(1024 + (i-1) * 2048), additional_regs[i], False)) + if not self.spontaneous_scaling: + for i in range(1, min(len(additional_regs), 
5)): + asm.add(mov(c(1024 + (i-1) * 2048), additional_regs[i], False)) return asm @@ -108,34 +152,46 @@ def make_b_pointers(self, asm = block("Optimize usage of offsets when accessing B Matrix") - reg_count = 5 + if not self.spontaneous_scaling: + reg_count = 5 - for i in range(8192, min(nnz * self.precision.value, 33000), 8192): - asm.add(lea(B_reg, additional_regs[reg_count], i)) - reg_count += 1 + for i in range(8192, min(nnz * self.precision.size(), 33000), 8192): + asm.add(lea(B_reg, additional_regs[reg_count], i)) + reg_count += 1 return asm - def reg_based_scaling(self, addr: MemoryAddress, additional_regs: List[Register], with_index: bool): - if addr.disp >= 1024 and ((addr.disp < 32768 and with_index) or addr.disp < 8192): - scaling_and_register = { - 1: (1, 1), - 2: (2, 1), - 3: (1, 2), - 4: (4, 1), - 5: (1, 3), - 6: (2, 2), - 7: (1, 4) - } - if addr.disp % 8192 >= 1024: - addr.scaling, reg = scaling_and_register[ (addr.disp % 8192) // 1024 ] - addr.index = additional_regs[reg] - - if addr.disp >= 8192: - addr.base = additional_regs[addr.disp // 8192 + 4] - - addr.disp = addr.disp % 1024 + def reg_based_scaling(self, regcache, asm, addr: MemoryAddress, additional_regs: List[Register], with_index: bool): + if addr.disp >= 1024: + if ((addr.disp < 32768 and with_index) or addr.disp < 8192) and not self.spontaneous_scaling: + scaling_and_register = { + 1: (1, 1), + 2: (2, 1), + 3: (1, 2), + 4: (4, 1), + 5: (1, 3), + 6: (2, 2), + 7: (1, 4) + } + if addr.disp % 8192 >= 1024: + addr.scaling, reg = scaling_and_register[ (addr.disp % 8192) // 1024 ] + addr.index = additional_regs[reg] + + if addr.disp >= 8192 and not self.spontaneous_scaling: + addr.base = additional_regs[addr.disp // 8192 + 4] + + addr.disp = addr.disp % 1024 + else: + large_offset = addr.disp // 1024 + + basereg, load = regcache.get(large_offset) + if load: + asm.add(mov(addr.base, basereg, False)) + asm.add(add(c(large_offset * 1024), basereg)) + + addr.base = basereg + addr.disp = addr.disp % 1024 def move_register_block(self, cursor: Cursor, @@ -154,18 +210,27 @@ def move_register_block(self, action = "Store" if store else "Load" asm = block("{} {} register block @ {}".format(action,cursor.name,block_offset)) + b_row, _, _, _ = cursor.get_block(cursor_ptr, block_offset) + + process_size = min(v_size, cursor.br) + for ic in range(cols): for ir in range(rows): if (mask is None) or (mask[ir,ic]): cell_offset = Coords(down=ir*v_size, right=ic) addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) - addr.disp += self.precision.value * load_offset + addr.disp += self.precision.size() * load_offset + + processed = ir * process_size + if processed >= b_row: + continue + p = self.pred_n_trues(min(process_size, b_row - processed), v_size, 'm') if store: - asm.add(mov(registers[ir,ic], addr, True, comment)) + asm.add(mov(registers[ir,ic], addr, True, comment, pred=p)) if prefetching == 'BL2viaC': asm.add(prefetch(mem(additional_regs[0], addr.disp))) else: - asm.add(mov(addr, registers[ir,ic], True, comment)) + asm.add(mov(addr, registers[ir,ic], True, comment, pred=p)) return asm def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block: @@ -203,9 +268,10 @@ def make_microkernel(self, asm = block("Block GEMM microkernel") bm,bk,aidx,apattern = A.get_block(A_ptr, to_A_block) bk,bn,bidx,bpattern = B.get_block(B_ptr, to_B_block) - assert(bm % v_size == 0) - mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size) + regcache = RegisterCache(additional_regs) + + mask = 
sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size, True) asm.add(self.move_register_block(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False)) for Vmi in range(bm//v_size): @@ -214,9 +280,9 @@ def make_microkernel(self, to_cell = Coords(down=bki, right=bni) if B.has_nonzero_cell(B_ptr, to_B_block, to_cell): B_addr, B_comment = B.look(B_ptr, to_B_block, to_cell) - self.reg_based_scaling(B_addr, additional_regs, True) + self.reg_based_scaling(regcache, asm, B_addr, additional_regs, True) comment = "C[{}:{},{}] += A[{}:{},{}]*{}".format(Vmi*v_size,Vmi*v_size+v_size,bni,Vmi*v_size,Vmi*v_size+v_size,bki,B_comment) - asm.add(fma(B_addr, A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment)) + asm.add(fma(B_addr, A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, bcast=0)) return asm def init_prefetching(self, prefetching): diff --git a/pspamm/codegen/architectures/knl/inlineprinter.py b/pspamm/codegen/architectures/knl/inlineprinter.py index 0763294..8f5e73d 100644 --- a/pspamm/codegen/architectures/knl/inlineprinter.py +++ b/pspamm/codegen/architectures/knl/inlineprinter.py @@ -20,8 +20,26 @@ class InlinePrinter(Visitor): def __init__(self, precision: Precision): self.output = [] self.stack = [] - assert precision in [Precision.SINGLE, Precision.DOUBLE] - self.precision = 'd' if precision == Precision.DOUBLE else 's' + assert precision in (Precision.BFLOAT16, Precision.HALF, Precision.SINGLE, Precision.DOUBLE) + self.precision = precision + self.psuffix = { + Precision.DOUBLE: 'd', + Precision.SINGLE: 's', + Precision.HALF: 'h', + Precision.BFLOAT16: 'h' + }[precision] + self.alupsuffix = { + Precision.DOUBLE: 'pd', + Precision.SINGLE: 'ps', + Precision.HALF: 'ph', + Precision.BFLOAT16: 'nepbf16' + }[precision] + self.broadcast_multiplier = { + Precision.DOUBLE: 2, + Precision.SINGLE: 4, + Precision.HALF: 8, + Precision.BFLOAT16: 8 + }[precision] def show(self): print("\n".join(self.output)) @@ -43,78 +61,112 @@ def addLine(self, stmt: str, comment: str): self.output.append(line) - + def maskformat(self, pred): + if pred is None: + return '' + elif pred.zero: + return f'%{{{pred.register.ugly}%}}%{{z%}}' + else: + return f'%{{{pred.register.ugly}%}}' def visitFma(self, stmt: FmaStmt): + mask = self.maskformat(stmt.pred) b = stmt.bcast_src.ugly m = stmt.mult_src.ugly a = stmt.add_dest.ugly - if stmt.bcast: - s = "vfmadd231p{} {}%{{1to{}%}}, {}, {}".format(self.precision, b, 8 if self.precision == 'd' else 16, m, a) + regsize = stmt.add_dest.size() // 16 + extent = regsize * self.broadcast_multiplier + if stmt.bcast is not None: + s = f"vfmadd231{self.alupsuffix} {b}%{{1to{extent}%}}, {m}, {a} {mask}" else: if stmt.mult_src.typeinfo == AsmType.i64: # in this case, m is a Register that points to alpha; manually format to be a memory address - s = "vfmadd231p{} 0({})%{{1to{}%}}, {}, {}".format(self.precision, m, 8 if self.precision == 'd' else 16, b, a) + s = f"vfmadd231{self.alupsuffix} 0({m})%{{1to{extent}%}}, {b}, {a} {mask}" else: - s = "vfmadd231p{} {}, {}, {}".format(self.precision, b,m,a) + s = f"vfmadd231{self.alupsuffix} {b}, {m}, {a} {mask}" self.addLine(s, stmt.comment) def visitMul(self, stmt: MulStmt): + mask = self.maskformat(stmt.pred) b = stmt.src.ugly m = stmt.mult_src.ugly a = stmt.dest.ugly + regsize = stmt.dest.size() // 16 + extent = regsize * self.broadcast_multiplier if stmt.mult_src.typeinfo == AsmType.i64: # in this case, m is a Register that points to alpha/beta; manually format to be a memory address - s = "vmulp{} 
0({})%{{1to{}%}}, {}, {}".format(self.precision, m, 8 if self.precision == 'd' else 16, b, a) + s = f"vmul{self.alupsuffix} 0({m})%{{1to{extent}%}}, {b}, {a} {mask}" else: - s = "vmulp{} {}, {}, {}".format(self.precision, b,m,a) + s = f"vmul{self.alupsuffix} {b}, {m}, {a} {mask}" self.addLine(s, stmt.comment) def visitBcst(self, stmt: BcstStmt): + mask = self.maskformat(stmt.pred) b = stmt.bcast_src.ugly a = stmt.dest.ugly - s = "vbroadcasts{} {}, {}".format(self.precision, b,a) + regsize = stmt.dest.size() + if self.precision == Precision.HALF or self.precision == Precision.BFLOAT16: + instruction = 'vpbroadcastw' + elif self.precision == Precision.DOUBLE and regsize == 16: + instruction = 'vmovddup' + else: + instruction = f"vbroadcasts{self.psuffix}" + s = f"{instruction} {b}, {a} {mask}" self.addLine(s, stmt.comment) def visitAdd(self, stmt: AddStmt): - s = "addq {}, {}".format(stmt.src.ugly,stmt.dest.ugly) + if isinstance(stmt.src, Constant) and stmt.src.value == 0: + # avoid 0 instructions + return + + # only used for scalar addition right now + s = f"addq {stmt.src.ugly}, {stmt.dest.ugly}" self.addLine(s, stmt.comment) def visitLabel(self, stmt: LabelStmt): - s = "{}:".format(stmt.label.ugly) + s = f"{stmt.label.ugly}:" self.addLine(s, stmt.comment) def visitCmp(self, stmt: CmpStmt): - s = "cmp {}, {}".format(stmt.lhs.ugly,stmt.rhs.ugly) + mask = self.maskformat(stmt.pred) + s = f"cmp {stmt.lhs.ugly}, {stmt.rhs.ugly} {mask}" self.addLine(s, stmt.comment) def visitJump(self, stmt: JumpStmt): - s = "jl {}".format(stmt.destination.ugly) + s = f"jl {stmt.destination.ugly}" self.addLine(s, stmt.comment) def visitMov(self, stmt: MovStmt): + mask = self.maskformat(stmt.pred) + if isinstance(stmt.src, Label): src_str = "$" + stmt.src.ugly else: src_str = stmt.src.ugly if stmt.typ == AsmType.i64: - s = "movq {}, {}".format(src_str,stmt.dest.ugly) + assert(stmt.pred == None) + # FIXME: no hack + if stmt.dest.ugly[2] == 'k': + s = f"kmovq {src_str}, {stmt.dest.ugly}" + else: + s = f"movq {src_str}, {stmt.dest.ugly}" elif stmt.typ == AsmType.f64x8 and stmt.aligned: if isinstance(stmt.src, Constant) and stmt.src.value == 0: - s = "vpxord {}, {}, {}".format(stmt.dest.ugly,stmt.dest.ugly,stmt.dest.ugly) + s = f"vpxord {stmt.dest.ugly}, {stmt.dest.ugly}, {stmt.dest.ugly} {mask}" else: - s = "vmovup{} {}, {}".format(self.precision, src_str,stmt.dest.ugly) + s = f"vmovupd {src_str}, {stmt.dest.ugly} {mask}" else: raise NotImplementedError() self.addLine(s, stmt.comment) def visitLea(self, stmt: LeaStmt): - s = "leaq {}({}), {}".format(stmt.offset,stmt.src.ugly,stmt.dest.ugly) + mask = self.maskformat(stmt.pred) + s = f"leaq {stmt.offset}({stmt.src.ugly}), {stmt.dest.ugly} {mask}" self.addLine(s, stmt.comment) def visitPrefetch(self, stmt: PrefetchStmt): - s = "prefetcht1 {}".format(stmt.dest.ugly) + s = f"prefetcht1 {stmt.dest.ugly}" self.addLine(s, stmt.comment) def visitBlock(self, block: Block): diff --git a/pspamm/codegen/architectures/knl/operands.py b/pspamm/codegen/architectures/knl/operands.py index 3c9b09b..b0fe394 100644 --- a/pspamm/codegen/architectures/knl/operands.py +++ b/pspamm/codegen/architectures/knl/operands.py @@ -38,7 +38,6 @@ class Register_KNL(Register): def ugly(self): return "%%" + self.value - class MemoryAddress_KNL(MemoryAddress): def __init__(self, @@ -72,8 +71,13 @@ def mem(base, offset, index=None, scaling=None): rdi = Register_KNL(AsmType.i64, "rdi") rsi = Register_KNL(AsmType.i64, "rsi") -r = lambda n: Register_KNL(AsmType.i64, "r"+str(n)) if n > 7 else gen_regs[n] 
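
For orientation, the two pieces of AVX-512 syntax the printer above assembles are the `{1toN}` embedded-broadcast extent and the `{%kN}` / `{%kN}{z}` write-mask decoration. The following standalone sketch reproduces that derivation with made-up helper names and plain strings (the `%{ %}` escaping required for GCC inline assembly, which the generator emits, is omitted here).

```python
# Standalone illustration (simplified, hypothetical helpers): how the broadcast extent and
# mask decoration of an AVX-512 FMA are derived from precision and register width.

ALU_SUFFIX = {"d": "pd", "s": "ps", "h": "ph", "bf16": "nepbf16"}
BROADCAST_MULTIPLIER = {"d": 2, "s": 4, "h": 8, "bf16": 8}   # elements per 128-bit lane

def mask_decoration(maskreg=None, zeroing=False):
    if maskreg is None:
        return ""
    return f"{{%{maskreg}}}{{z}}" if zeroing else f"{{%{maskreg}}}"

def fma_with_broadcast(prec, reg_bytes, mem_operand, mult_reg, acc_reg, maskreg=None):
    # extent = number of elements the broadcast memory operand expands to
    extent = (reg_bytes // 16) * BROADCAST_MULTIPLIER[prec]
    return (f"vfmadd231{ALU_SUFFIX[prec]} {mem_operand}{{1to{extent}}}, "
            f"{mult_reg}, {acc_reg} {mask_decoration(maskreg)}")

if __name__ == "__main__":
    # double precision in a 512-bit zmm register -> {1to8}, masked by k1
    print(fma_with_broadcast("d", 64, "64(%r8)", "%zmm1", "%zmm30", maskreg="k1"))
    # single precision in a 256-bit ymm register -> also {1to8}, unmasked
    print(fma_with_broadcast("s", 32, "32(%r8)", "%ymm1", "%ymm14"))
```
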
-xmm = lambda n: Register_KNL(AsmType.f64x2, "xmm"+str(n)) -ymm = lambda n: Register_KNL(AsmType.f64x4, "ymm"+str(n)) -zmm = lambda n: Register_KNL(AsmType.f64x8, "zmm"+str(n)) +r = lambda n: Register_KNL(AsmType.i64, "r"+str(n)) if n > 7 else gen_regs[n] +xmm = lambda n: Register_KNL(AsmType.f64x2, "xmm"+str(n)) +ymm = lambda n: Register_KNL(AsmType.f64x4, "ymm"+str(n)) +zmm = lambda n: Register_KNL(AsmType.f64x8, "zmm"+str(n)) +mask = lambda n: Register_KNL(AsmType.i64, "k"+str(n)) +class Predicate: + def __init__(self, register: Register_KNL, zero: bool): + self.register = register + self.zero = zero diff --git a/pspamm/codegen/ast.py b/pspamm/codegen/ast.py index 9287930..356478c 100644 --- a/pspamm/codegen/ast.py +++ b/pspamm/codegen/ast.py @@ -29,6 +29,7 @@ class MovStmt(AsmStmt): dest = None typ = None aligned = False + pred = None def accept(self, visitor: "Visitor"): visitor.visitMov(self) @@ -39,6 +40,7 @@ class LeaStmt(AsmStmt): offset = None typ = None aligned = False + pred = None def accept(self, visitor: "Visitor"): visitor.visitLea(self) @@ -105,6 +107,7 @@ def accept(self, visitor: "Visitor"): class BcstStmt(AsmStmt): bcast_src = None dest = None + pred = None def accept(self, visitor: "Visitor"): visitor.visitBcst(self) @@ -114,6 +117,7 @@ class AddStmt(AsmStmt): dest = None typ = None additional = None + pred = None def accept(self, visitor: "Visitor"): visitor.visitAdd(self) @@ -121,6 +125,7 @@ def accept(self, visitor: "Visitor"): class CmpStmt(AsmStmt): lhs = None rhs = None + pred = None def accept(self, visitor: "Visitor"): visitor.visitCmp(self) diff --git a/pspamm/codegen/ccode.py b/pspamm/codegen/ccode.py index a691b4e..00c5721 100644 --- a/pspamm/codegen/ccode.py +++ b/pspamm/codegen/ccode.py @@ -15,9 +15,8 @@ def make_cfunc(funcName:str, template:str, body:Block, flop:int, starting_regs:L analyzer = Analyzer(starting_regs) body.accept(analyzer) - regs = ['"{}"'.format(reg.clobbered) for reg in analyzer.clobbered_registers] - regs.sort() - clobbered = ",".join(regs) + regs = set('"{}"'.format(reg.clobbered) for reg in analyzer.clobbered_registers) + clobbered = ",".join(sorted(regs)) return template.format(funcName = funcName, body_text = body_text, clobbered = clobbered, diff --git a/pspamm/codegen/forms.py b/pspamm/codegen/forms.py index 4b87fa8..8a232fe 100644 --- a/pspamm/codegen/forms.py +++ b/pspamm/codegen/forms.py @@ -12,7 +12,8 @@ def __init__(self, initial_val: int, final_val: int, increment: int = 1, - body_contents: Block = None + body_contents: Block = None, + unroll: int = 1 ) -> None: self.iteration_var = iteration_var @@ -20,6 +21,8 @@ def __init__(self, self.final_val = final_val self.increment = increment self.body_contents = body_contents + self.unroll = unroll + assert self.unroll == 1 or self.initial_val == 0 self.label = "loop_top_" + str(len(Loop._labels)) Loop._labels.append(self.label) @@ -29,16 +32,30 @@ def __init__(self, @property def contents(self): - return [mov(self.initial_val, self.iteration_var, vector=False), - label(self.label), - *(self.body_contents.contents), - add(self.increment, self.iteration_var), - cmp(self.final_val, self.iteration_var), + onestep = [*(self.body_contents.contents), + add(self.increment, self.iteration_var)] + body = [] + rest = [] + for _ in range(self.unroll): + body += onestep + for _ in range(self.final_val % self.unroll): + rest += onestep + corrected_final_val = (self.final_val // self.unroll) * self.unroll + + allcode = [] + if corrected_final_val == self.initial_val + self.unroll: + allcode += 
body + elif corrected_final_val > self.initial_val: + allcode += [mov(self.initial_val, self.iteration_var, vector=False), + label(self.label)] + body + [cmp(corrected_final_val, self.iteration_var), jump(self.label, backwards=True)] + allcode += rest + + return allcode def body(self, *args): self.body_contents = block("Loop body", *args) return self -def loop(iter_var, initial_val, final_val, increment): - return Loop(iter_var, initial_val, final_val, increment) +def loop(iter_var, initial_val, final_val, increment, unroll=1): + return Loop(iter_var, initial_val, final_val, increment, unroll=unroll) diff --git a/pspamm/codegen/generator.py b/pspamm/codegen/generator.py index 51e852f..a20247d 100644 --- a/pspamm/codegen/generator.py +++ b/pspamm/codegen/generator.py @@ -9,6 +9,25 @@ def __init__(self, precision: Precision): def get_precision(self): return self.precision + + def set_sparse(self): + pass + + # taken from https://stackoverflow.com/questions/14822184/is-there-a-ceiling-equivalent-of-operator-in-python + def ceil_div(self, n, d): + return -(n // -d) + + @abstractmethod + def init_mask(self, bm, v_size, tempreg, maskreg): + pass + + @abstractmethod + def use_broadcast(self): + pass + + @abstractmethod + def has_masks(self): + pass @abstractmethod def get_v_size(self): diff --git a/pspamm/codegen/operands.py b/pspamm/codegen/operands.py index c22d2ea..0d42c1e 100644 --- a/pspamm/codegen/operands.py +++ b/pspamm/codegen/operands.py @@ -43,6 +43,32 @@ class Register(Operand): def __init__(self, typeinfo, value) -> None: self.typeinfo = typeinfo self.value = str(value) + + def size(self): + if self.typeinfo == AsmType.i8: + return 1 + if self.typeinfo == AsmType.i16: + return 2 + if self.typeinfo == AsmType.i32: + return 4 + if self.typeinfo == AsmType.i64: + return 8 + if self.typeinfo == AsmType.f32: + return 4 + if self.typeinfo == AsmType.f64: + return 8 + if self.typeinfo == AsmType.f32x4: + return 16 + if self.typeinfo == AsmType.f32x8: + return 32 + if self.typeinfo == AsmType.f32x16: + return 64 + if self.typeinfo == AsmType.f64x2: + return 16 + if self.typeinfo == AsmType.f64x4: + return 32 + if self.typeinfo == AsmType.f64x8: + return 64 @property def ugly(self): diff --git a/pspamm/codegen/precision.py b/pspamm/codegen/precision.py index 0672b3d..417c9a6 100644 --- a/pspamm/codegen/precision.py +++ b/pspamm/codegen/precision.py @@ -3,9 +3,29 @@ class Precision(Enum): DOUBLE = 8 SINGLE = 4 + HALF = 2 + BFLOAT16 = 2.1 @classmethod def getCType(cls, precision): - ctype = {cls.DOUBLE: 'double', cls.SINGLE: 'float'} + ctype = {cls.DOUBLE: 'double', cls.SINGLE: 'float', cls.HALF: 'uint16_t', cls.BFLOAT16: 'uint16_t'} return ctype[precision] + + def ctype(self): + return self.getCType(self) + + def size(self): + return { + self.DOUBLE: 8, + self.SINGLE: 4, + self.HALF: 2, + self.BFLOAT16: 2 + }[self] + raise NotImplementedError() + + def __repr__(self): + return self.getCType(self) + + def __str__(self): + return self.getCType(self) diff --git a/pspamm/codegen/regcache.py b/pspamm/codegen/regcache.py new file mode 100644 index 0000000..44bb0ae --- /dev/null +++ b/pspamm/codegen/regcache.py @@ -0,0 +1,28 @@ + +class RegisterCache: + def __init__(self, registers): + self.access = 0 + self.lru = [-1] * len(registers) + self.registers = registers + self.storage = {} + + def get(self, value): + self.access += 1 + + evicted = False + + if value not in self.storage: + evicted = True + minaccess = self.access + minidx = -1 + for i, last in enumerate(self.lru): + if last < minaccess: + 
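
The register cache introduced here can be exercised on its own. Below is a minimal, self-contained model of the same least-recently-used policy, independent of the generator's `Register` objects; register names and the demo offsets are placeholders, and unlike the generator's class it also drops the stale mapping of an evicted value.

```python
# Minimal standalone model of the LRU register cache used for large B offsets.
# get() maps a value (here: a 1 KiB offset multiple) to a register slot and reports
# whether the value had to be (re)loaded into that register.

class SimpleRegisterCache:
    def __init__(self, registers):
        self.registers = list(registers)
        self.last_use = [-1] * len(registers)   # access stamp per register slot
        self.slot_of = {}                       # value -> slot index
        self.clock = 0

    def get(self, value):
        self.clock += 1
        load_needed = value not in self.slot_of
        if load_needed:
            # evict the least recently used slot
            slot = min(range(len(self.registers)), key=lambda i: self.last_use[i])
            # forget whichever value occupied that slot before
            self.slot_of = {v: s for v, s in self.slot_of.items() if s != slot}
            self.slot_of[value] = slot
        slot = self.slot_of[value]
        self.last_use[slot] = self.clock
        return self.registers[slot], load_needed

if __name__ == "__main__":
    cache = SimpleRegisterCache(["%r9", "%r10"])
    for offset in (1, 2, 1, 3, 2):   # offsets in units of 1024 bytes
        reg, load = cache.get(offset)
        print(f"offset {offset * 1024}: {reg}", "(load)" if load else "(cached)")
```
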
minaccess = last + minidx = i + self.storage[value] = minidx + + regidx = self.storage[value] + + self.lru[regidx] = self.access + + return (self.registers[regidx], evicted) diff --git a/pspamm/codegen/sugar.py b/pspamm/codegen/sugar.py index 2014ab2..70b052d 100644 --- a/pspamm/codegen/sugar.py +++ b/pspamm/codegen/sugar.py @@ -19,7 +19,7 @@ def label(name: str): stmt.label = pspamm.architecture.operands.l(name) return stmt -def fma(bcast_src: Register, mult_src: Register, add_dest: Register, comment: str = None, bcast: bool = True, pred: Register = None): +def fma(bcast_src: Register, mult_src: Register, add_dest: Register, comment: str = None, bcast: Union[int, None] = None, pred: Register = None): stmt = FmaStmt() stmt.bcast_src = bcast_src stmt.mult_src = mult_src @@ -58,11 +58,12 @@ def jump(label: str, backwards=True): stmt.destination = pspamm.architecture.operands.l(label) return stmt -def mov(src: Union[Operand, int], dest: Operand, vector: bool, comment:str = None): +def mov(src: Union[Operand, int], dest: Operand, vector: bool, comment:str = None, pred = None): stmt = MovStmt() stmt.src = src if isinstance(src, Operand) else pspamm.architecture.operands.c(src) stmt.dest = dest stmt.comment = comment + stmt.pred = pred if vector: stmt.aligned = True stmt.typ = AsmType.f64x8 @@ -79,7 +80,7 @@ def lea(src: Register, dest: Operand, offset: int, comment:str = None): stmt.comment = comment return stmt -def ld(src: Union[Operand, int], dest: Operand, vector: bool, comment:str = None, dest2: Operand = None, pred: Register = None, is_B: bool = False, scalar_offs: bool = False, add_reg: AsmType.i64 = None): +def ld(src: Union[Operand, int], dest: Operand, vector: bool, comment:str = None, dest2: Operand = None, pred: Register = None, is_B: bool = False, scalar_offs: bool = False, add_reg: AsmType.i64 = None, sub128: bool = False): stmt = LoadStmt() stmt.src = src if isinstance(src, Operand) else pspamm.architecture.operands.c(src) stmt.dest = dest @@ -93,7 +94,10 @@ def ld(src: Union[Operand, int], dest: Operand, vector: bool, comment:str = None if vector: stmt.aligned = True - stmt.typ = AsmType.f64x8 + if sub128: + stmt.typ = AsmType.f64x2 + else: + stmt.typ = AsmType.f64x8 else: stmt.aligned = False stmt.typ = AsmType.i64 diff --git a/pspamm/cursors/blockcursor.py b/pspamm/cursors/blockcursor.py index 65e14b4..de620e9 100644 --- a/pspamm/cursors/blockcursor.py +++ b/pspamm/cursors/blockcursor.py @@ -194,7 +194,7 @@ def sparse_mask(A_regs: Matrix[Register], B_ptr: CursorLocation, B_block_offset: Coords, v_size: int, - is_sve: bool = False + has_mask: bool = False ) -> Matrix[bool]: Vr, Vc = A_regs.shape @@ -202,7 +202,7 @@ def sparse_mask(A_regs: Matrix[Register], A_br, A_bc, A_idx, A_pat = A.get_block(A_ptr, A_block_offset) B_br, B_bc, B_idx, B_pat = B.get_block(B_ptr, B_block_offset) - if not is_sve: + if not has_mask: assert (Vr * v_size == A_br) # bm must tile m exactly for now in NEON and AVX512 assert(Vc >= A_bc) # Matrix block must fit in register block assert(A_bc == B_br) # Matrix blocks are compatible diff --git a/pspamm/cursors/densecursor.py b/pspamm/cursors/densecursor.py index ad1f3a7..841cb58 100644 --- a/pspamm/cursors/densecursor.py +++ b/pspamm/cursors/densecursor.py @@ -119,4 +119,14 @@ def get_block(self, src: CursorLocation=None, dest_block: Coords=None) -> BlockI pattern = cast(Matrix[bool], pattern) return BlockInfo(br, bc, index, pattern) - + def has_nonzero_block(self, src: CursorLocation, dest_block: Coords) -> bool: + return True + + def 
has_nonzero_cell(self, + src_loc: CursorLocation, + dest_block: Coords, + dest_cell: Coords) -> bool: + return True + + def start(self) -> CursorLocation: + return CursorLocation() diff --git a/pspamm/matmul.py b/pspamm/matmul.py index 2204af3..cc72f6f 100644 --- a/pspamm/matmul.py +++ b/pspamm/matmul.py @@ -5,11 +5,6 @@ from pspamm.codegen.forms import * from pspamm.codegen.precision import * -import pspamm.scripts.old_arm -import pspamm.scripts.max_bn_knl -import pspamm.scripts.max_bn_hsw -import pspamm.scripts.max_arm_sve - from pspamm.cursors import * import pspamm.architecture @@ -96,9 +91,9 @@ def __init__(self, except: self.beta = 'generic' - if arch == 'skx': - arch = 'knl' - + if arch.startswith('skx'): + arch = 'knl' + arch[3:] + # hacky implementation of multi-register length if arch.startswith('arm_sve'): if len(arch) == 7: @@ -108,24 +103,52 @@ def __init__(self, assert v_len_bits % 128 == 0 and v_len_bits <= 2048 v_len_regs = v_len_bits // 128 arch = 'arm_sve' + + if arch.startswith('knl'): + if len(arch) == 3: + v_len_regs = 4 + else: + v_len_bits = int(arch[3:]) + assert v_len_bits in (128, 256, 512) + v_len_regs = v_len_bits // 128 + arch = 'knl' + + if arch.startswith('hsw'): + if len(arch) == 3: + v_len_regs = 2 + else: + v_len_bits = int(arch[3:]) + assert v_len_bits in (128, 256) + v_len_regs = v_len_bits // 128 + arch = 'hsw' + + if arch.startswith('arm') and not arch.startswith('arm_sve'): + # only 128 supported + arch = 'arm' self.arch = arch - assert precision.lower() in ['s', 'd'] - self.precision = Precision.DOUBLE if precision.lower() == 'd' else Precision.SINGLE + assert precision.lower() in ['bf16', 'h', 's', 'd'] + self.precision = { + 'h' : Precision.HALF, + 's' : Precision.SINGLE, + 'd' : Precision.DOUBLE, + 'bf16' : Precision.BFLOAT16 + }[precision.lower()] pspamm.architecture.init() pspamm.architecture.arch = arch pspamm.architecture.Generator = pspamm.architecture.get_class("pspamm.codegen.architectures." + arch + ".generator").Generator pspamm.architecture.operands = pspamm.architecture.get_class("pspamm.codegen.architectures." + arch + ".operands") + pspamm.architecture.blocksize = pspamm.architecture.get_class("pspamm.codegen.architectures." 
+ arch + ".blocksize").Default self.generator = pspamm.architecture.Generator(self.precision) # flag that determines if a matmul kernel uses sve instructions -> needed for sve predicates - self.is_sve = arch == "arm_sve" + self.masks = self.generator.has_masks() # define which architectures need to use an explicit broadcast, necessary for alpha/beta values - self.use_bcst = arch in ["arm", "arm_sve", "hsw"] + self.use_bcst = self.generator.use_broadcast() - if self.is_sve: + if arch in ('arm_sve', 'hsw', 'knl'): self.generator.v_len = v_len_regs self.v_size = self.generator.get_v_size() @@ -135,18 +158,17 @@ def __init__(self, if bm == None or bn == None: if arch == 'knl': - (self.bm, self.bn) = pspamm.scripts.max_bn_knl.getBlocksize(m, n, bk, self.v_size) + (self.bm, self.bn, self.bk) = pspamm.architecture.blocksize.getBlocksize(m, n, bk, self.v_size, self.precision) elif arch == 'hsw': - (self.bm, self.bn) = pspamm.scripts.max_bn_hsw.getBlocksize(m, n, bk, self.v_size) + (self.bm, self.bn, self.bk) = pspamm.architecture.blocksize.getBlocksize(m, n, bk, self.v_size, self.precision) elif arch == 'arm': - (self.bm, self.bn) = pspamm.scripts.old_arm.getBlocksize(m, n, bk, self.v_size) + (self.bm, self.bn, self.bk) = pspamm.architecture.blocksize.getBlocksize(m, n, bk, self.v_size, self.precision) elif arch == 'arm_sve': - (self.bm, self.bn) = pspamm.scripts.max_arm_sve.getBlocksize(m, n, bk, self.v_size) + (self.bm, self.bn, self.bk) = pspamm.architecture.blocksize.getBlocksize(m, n, bk, self.v_size, self.precision) else: self.bm = bm self.bn = bn - - self.bk = bk + self.bk = bk self.prefetching = prefetching @@ -158,17 +180,20 @@ def __init__(self, self.output_overwrite = output_overwrite if ldb == 0: - pattern = Matrix.load(mtx_filename) - if self.is_sve: + bpattern = Matrix.load(mtx_filename) + if self.masks: self.generator.set_sparse() else: - mtx = numpy.zeros((k, n)) - for i in range(k): - for j in range(n): - mtx[i, j] = 1 - pattern = Matrix(mtx) - - blocks,patterns,mtx_overhead = decompose_pattern(self.k, self.n, pattern, self.bk, self.bn) + assert self.k <= ldb + + if lda == 0: + apattern = Matrix.load(mtx_filename) + if self.masks: + self.generator.set_sparse() + else: + assert self.m <= lda + + assert self.m <= ldc self.nnz = 0 self.flop = 0 @@ -176,39 +201,45 @@ def __init__(self, if ldb == 0: for i in range(n): for j in range(k): - if pattern[j,i]: + if bpattern[j,i]: self.nnz += 1 self.flop = self.nnz * m * 2 - self.nnz += sum(mtx_overhead) else: self.nnz = ldb * self.n self.flop = m * n * k * 2 - #if prefetching is not None: - # prefetchReg = self.generator.init_prefetching(self.prefetching) - #else: - # prefetchReg = None prefetchReg = self.generator.init_prefetching(self.prefetching) # if matrices are always padded to multiple of v_size, we can remove the if-part and execute the assert for SVE too - if not self.is_sve: + if not self.masks: assert(self.m % self.v_size == 0) - self.A_regs, self.B_regs, self.C_regs, self.starting_regs, self.alpha_reg, self.beta_reg, self.loop_reg, self.additional_regs = self.generator.make_reg_blocks(self.bm, self.bn, self.bk, self.v_size, self.nnz, self.m, self.n, self.k) + self.A_regs, self.B_regs, self.C_regs, self.starting_regs, self.alpha_reg, self.beta_reg, self.loop_regs, self.additional_regs, self.mask_regs = self.generator.make_reg_blocks(self.bm, self.bn, self.bk, self.v_size, self.nnz, self.m, self.n, self.k) self.alpha_bcst_reg, self.beta_bcst_reg = self.starting_regs[3], self.starting_regs[4] + if lda == 0: + blocks, patterns, 
mtx_overhead = decompose_pattern(self.m, self.k, apattern, self.bm, self.bk) + self.A = BlockCursor("A", self.starting_regs[0], self.m, self.k, self.lda, self.bm, self.bk, self.precision.size(), blocks, patterns, mtx_overhead) + self.nnz += sum(mtx_overhead) + else: + self.A = DenseCursor("A", self.starting_regs[0], self.m, self.k, self.lda, self.bm, self.bk, self.precision.size()) + if ldb == 0: + blocks, patterns, mtx_overhead = decompose_pattern(self.k, self.n, bpattern, self.bk, self.bn) + self.B = BlockCursor("B", self.starting_regs[1], self.k, self.n, self.ldb, self.bk, self.bn, self.precision.size(), blocks, patterns, mtx_overhead) + self.nnz += sum(mtx_overhead) + else: + self.B = DenseCursor("B", self.starting_regs[1], self.k, self.n, self.ldb, self.bk, self.bn, self.precision.size()) + self.C = DenseCursor("C", self.starting_regs[2], self.m, self.n, self.ldc, self.bm, self.bn, self.precision.size()) + self.C_pf = DenseCursor("C_pf", prefetchReg, self.m, self.n, self.ldc, self.bm, self.bn, self.precision.size()) if prefetchReg else None - self.A = DenseCursor("A", self.starting_regs[0], self.m, self.k, self.lda, self.bm, self.bk, self.precision.value) - self.B = BlockCursor("B", self.starting_regs[1], self.k, self.n, self.ldb, self.bk, self.bn, self.precision.value, blocks, patterns,mtx_overhead) - self.C = DenseCursor("C", self.starting_regs[2], self.m, self.n, self.ldc, self.bm, self.bn, self.precision.value) - self.C_pf = DenseCursor("C_pf", prefetchReg, self.m, self.n, self.ldc, self.bm, self.bn, self.precision.value) if prefetchReg else None + self.unroll = ldb == 0 or lda == 0 - def make_nk_unroll(self): + def make_nk_unroll(self, unroll=True): asm = block("Unrolling over bn and bk") - A_ptr = CursorLocation() + A_ptr = self.A.start() B_ptr = self.B.start() C_ptr = CursorLocation() C_pf_ptr = CursorLocation() @@ -216,7 +247,7 @@ def make_nk_unroll(self): Bn = self.n // self.bn Bk = self.k // self.bk # handle fringe case of SVE -> allow bm < v_size - vm = self.bm // self.v_size if not self.is_sve else self.generator.ceil_div(self.bm, self.v_size) + vm = self.bm // self.v_size if not self.masks else self.generator.ceil_div(self.bm, self.v_size) n_overhead = self.n % self.bn k_overhead = self.k % self.bk @@ -228,8 +259,7 @@ def make_nk_unroll(self): asm.add(self.generator.make_b_pointers(self.starting_regs[1], self.additional_regs, self.nnz)) - for Bni in range(0, Bn): - + def kernelN(asm, Bni, A_ptr, B_ptr, C_ptr): regs = self.C_regs if Bni + 1 == Bn and n_overhead > 0: @@ -242,18 +272,39 @@ def make_nk_unroll(self): asm.add(bcst(self.beta_bcst_reg, self.beta_reg[1], "Broadcast beta")) for ic in range(regs.shape[1]): for ir in range(regs.shape[0]): - pred_m = None if not self.is_sve else self.generator.pred_n_trues(self.bm - ir * self.v_size, self.v_size, "m") + pred_m = None if not self.masks else self.generator.pred_n_trues(self.bm - ir * self.v_size, self.v_size, "m") asm.add(mul(regs[ir,ic], self.beta_reg[1], regs[ir,ic], "C = beta * C", pred=pred_m)) else: asm.add(self.generator.make_zero_block(regs, self.additional_regs)) - for Bki in range(0,Bk): - - to_A = Coords(right=Bki) - to_B = Coords(right=Bni, down=Bki, absolute=True) - - if self.B.has_nonzero_block(B_ptr, to_B): + def kernelK(asm, Bki, A_ptr, B_ptr): + if unroll: + to_A = Coords(right=Bki) + to_B = Coords(right=Bni, down=Bki, absolute=True) + keep = self.B.has_nonzero_block(B_ptr, to_B) and self.A.has_nonzero_block(A_ptr, to_A) + else: + # setting A_ptr, B_ptr here may be a bit too hacky... 
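
To make the new control flow easier to follow: when an operand is dense, `make_nk_unroll` no longer unrolls every k-block statically but emits a runtime loop (unrolled four times) over the first `Bk-1` blocks plus a peeled final block. The sketch below is only a counting model of those two schedules under that assumption; it ignores the `add`/`cmp`/`jump` instructions and the cursor moves the real loop also emits.

```python
# Illustrative counting model (not generator code) of the two k-block schedules
# make_nk_unroll can produce for one n-block.

def unrolled_schedule(Bk):
    # fully static: one microkernel per k-block
    return [("block", bki) for bki in range(Bk)]

def looped_schedule(Bk, unroll=4):
    # runtime loop over Bk-1 blocks, unrolled 'unroll' times, plus a peeled last block
    schedule = []
    full, rest = (Bk - 1) // unroll, (Bk - 1) % unroll
    for _ in range(full):
        schedule += [("block", "loop-body")] * unroll   # unrolled loop body
    schedule += [("block", "loop-body")] * rest         # remainder iterations
    schedule += [("block", Bk - 1)]                      # peeled final k-block
    return schedule

if __name__ == "__main__":
    Bk = 7
    assert len(unrolled_schedule(Bk)) == len(looped_schedule(Bk))
    print(unrolled_schedule(Bk))
    print(looped_schedule(Bk))
```

Both schedules touch the same number of k-blocks; only the amount of emitted code differs.
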
+ A_ptr = CursorLocation(Coords(right=Bki, absolute=True)) + B_ptr = CursorLocation(Coords(right=Bni, down=Bki, absolute=True)) + to_A = Coords() + to_B = Coords() + keep = True + + if keep: asm.add(self.generator.make_microkernel(self.A, self.B, A_ptr, B_ptr, self.A_regs, self.B_regs, regs, self.v_size, self.additional_regs, to_A, to_B)) + + if unroll: + for Bki in range(Bk): + kernelK(asm, Bki, A_ptr, B_ptr) + else: + loopblock = block("microkernel") + kernelK(loopblock, 0, A_ptr, B_ptr) + loopblock.add(self.B.move(B_ptr, Coords(down=1))[0]) + loopblock.add(self.A.move(A_ptr, Coords(right=1))[0]) + asm.add(loop(self.loop_regs[2], 0, Bk-1, 1, unroll=4).body(loopblock)) + kernelK(asm, Bk-1, A_ptr, B_ptr) + asm.add(self.B.move(B_ptr, Coords(down=1-Bk))[0]) + asm.add(self.A.move(A_ptr, Coords(right=1-Bk))[0]) if self.alpha != 1.0: store_block = block("") @@ -271,16 +322,15 @@ def make_nk_unroll(self): for ir in range(A_regs_cut.shape[0]): for ic in range(A_regs_cut.shape[1]): - pred_m = None if not self.is_sve else self.generator.pred_n_trues(self.bm - ir*self.v_size, self.v_size, "m") + pred_m = None if not self.masks else self.generator.pred_n_trues(self.bm - ir*self.v_size, self.v_size, "m") if self.beta != 0.0 and self.beta != 1.0: - store_block.add(mul(A_regs_cut[ir,ic], self.beta_reg[1], A_regs_cut[ir,ic], pred=pred_m)) + store_block.add(mul(A_regs_cut[ir,ic], self.beta_reg[1], A_regs_cut[ir,ic], "C = beta * C + alpha * AB", pred=pred_m)) if self.beta == 0.0: - store_block.add(mul(regs[ir, x + ic], self.alpha_reg[1], A_regs_cut[ir, ic], "C = C + alpha * AB", pred=pred_m)) + store_block.add(mul(regs[ir, x + ic], self.alpha_reg[1], A_regs_cut[ir, ic], "C = alpha * AB", pred=pred_m)) else: - store_block.add(fma(regs[ir, x + ic], self.alpha_reg[1], A_regs_cut[ir, ic], "C = C + alpha * AB", False, pred=pred_m)) + store_block.add(fma(regs[ir, x + ic], self.alpha_reg[1], A_regs_cut[ir, ic], "C = C + alpha * AB", None, pred=pred_m)) store_block.add(self.generator.move_register_block(self.C, C_ptr, Coords(), A_regs_cut, self.v_size, self.additional_regs, None, True, self.prefetching, self.ldc * x)) asm.add(store_block) - else: asm.add(self.generator.move_register_block(self.C, C_ptr, Coords(), regs, self.v_size, self.additional_regs, None, True, self.prefetching)) @@ -291,13 +341,21 @@ def make_nk_unroll(self): move_C_pf, C_pf_ptr = self.C_pf.move(C_pf_ptr, Coords(right=1)) asm.add(move_C_pf) + if unroll: + for Bni in range(0, Bn): + kernelN(asm, Bni, A_ptr, B_ptr, C_ptr) + else: + if Bn > 1: + loopblock = block("microkernel") + kernelN(loopblock, 0, A_ptr, B_ptr, C_ptr) + loopblock.add(self.B.move(B_ptr, Coords(right=1))[0]) + asm.add(loop(self.loop_regs[1], 0, Bn-1, 1).body(loopblock)) + kernelN(asm, Bn-1, A_ptr, B_ptr, C_ptr) + asm.add(self.B.move(B_ptr, Coords(right=1-Bn))[0]) return asm - - def make(self): - A_ptr = CursorLocation() C_ptr = CursorLocation() C_pf_ptr = CursorLocation() @@ -310,7 +368,7 @@ def make(self): Bn += 1 loopBody = [ - self.make_nk_unroll(), + self.make_nk_unroll(self.unroll), self.A.move(A_ptr, Coords(down=1))[0], self.C.move(C_ptr, Coords(down=1, right=1-Bn))[0] ] @@ -320,18 +378,19 @@ def make(self): asm = block("unrolled_{}x{}x{}".format(self.m,self.n,self.k), self.generator.bcst_alpha_beta(self.alpha_reg, self.beta_reg), self.generator.make_scaling_offsets(self.additional_regs, self.nnz), - loop(self.loop_reg, 0, Bm, 1).body(*loopBody) + self.generator.init_mask(self.m, self.bm, self.v_size, self.loop_regs[0], self.mask_regs), + loop(self.loop_regs[0], 0, Bm, 
1).body(*loopBody) ) - vm_overhead = (self.m % self.bm) // self.v_size + m_overhead = self.m % self.bm + vm_overhead = -(m_overhead // -self.v_size) if vm_overhead > 0: self.m = self.m % self.bm self.bm = self.m % self.bm - self.A_regs = self.A_regs[0:self.bm // self.v_size, 0:self.bk] - self.C_regs = self.C_regs[0:self.bm // self.v_size, 0:self.bn] + self.A_regs = self.A_regs[0:vm_overhead, 0:self.bk] + self.C_regs = self.C_regs[0:vm_overhead, 0:self.bn] self.A.r = self.m - asm.add(self.make_nk_unroll()) - + asm.add(self.make_nk_unroll(self.unroll)) return asm diff --git a/pspamm/pspamm.py b/pspamm/pspamm.py index 9260ec0..3c6d3b4 100755 --- a/pspamm/pspamm.py +++ b/pspamm/pspamm.py @@ -17,7 +17,7 @@ def generate(alg: MatMul) -> None: block = alg.make() - text = make_cfunc(alg.output_funcname, alg.generator.get_template(), block, alg.flop, alg.starting_regs, alg.generator.get_precision()) + text = make_cfunc(alg.output_funcname, alg.generator.get_template(), block, alg.flop, alg.starting_regs + alg.mask_regs, alg.generator.get_precision()) if alg.output_filename is None: print(text) @@ -50,7 +50,7 @@ def main() -> None: parser.add_argument("--bk", type=int, help="Size of k-blocks") parser.add_argument("--arch", help="Architecture", default="knl") - parser.add_argument("--precision", help="Single (s) or double (d) precision", default="d") + parser.add_argument("--precision", help="Precision of the matrix multiplication, either half (h), single (s), or double (d)", default="d") parser.add_argument("--prefetching", help="Prefetching") diff --git a/pspamm/scripts/__init__.py b/pspamm/scripts/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/pspamm/scripts/max_arm.py b/pspamm/scripts/max_arm.py deleted file mode 100755 index 6cfd5c1..0000000 --- a/pspamm/scripts/max_arm.py +++ /dev/null @@ -1,22 +0,0 @@ -def getBlocksize(m , n, bk): - - bm = 2 - bn = 1 - maxval = 0 - - for i in range(2, m+1, 2): - for j in range(1, n+1): - if ARM_condition(i, j, bk): - if i*j > maxval: - maxval = i*j - bm = i - bn = j - - return (bm, bn) - - -def ARM_condition(bm, bn, bk): - v_size = 2 - # ceiling division - vm = -(bm // -v_size) - return (bn+bk) * vm + bn <= 32 diff --git a/pspamm/scripts/max_arm_sve.py b/pspamm/scripts/max_arm_sve.py deleted file mode 100644 index c8064a5..0000000 --- a/pspamm/scripts/max_arm_sve.py +++ /dev/null @@ -1,31 +0,0 @@ -def getBlocksize(m, n, bk, v_size=2): - # v_size default is 2, however for SVE that parameter will always be larger - bm = 2 - bn = 1 - maxval = 0 - - for i in range(1, m + 1, 1): - next_multiple = i - while next_multiple % v_size != 0: - next_multiple += 1 - for j in range(1, n + 1): - if ARM_condition(next_multiple, j, bk, v_size) and tileable(m, i): - if i * j >= maxval: - maxval = i * j - bm = i - bn = j - - if maxval == 0: - raise RuntimeError("Could not find an appropriate block size. 
We suggest padding the matrix dimensions") - - return (bm, bn) - - -def ARM_condition(bm, bn, bk, v_size): - # ceiling division - vm = -(bm // -v_size) - return (bn + bk) * vm + bn <= 32 - - -def tileable(m, bm): - return m % bm == 0 diff --git a/pspamm/scripts/max_bn_hsw.py b/pspamm/scripts/max_bn_hsw.py deleted file mode 100755 index 7cc6284..0000000 --- a/pspamm/scripts/max_bn_hsw.py +++ /dev/null @@ -1,16 +0,0 @@ -def getBlocksize(m, n, bk, v_size=4): - - bm = v_size - bn = 1 - - for j in range(1, n+1): - if HSW_condition(bm, j, bk, v_size): - bn = j - - return (bm, bn) - - -def HSW_condition(bm, bn, bk, v_size): - # ceiling division - vm = -(bm // -v_size) - return (bn + bk) * vm + bn * bk <= 16 diff --git a/pspamm/scripts/max_bn_knl.py b/pspamm/scripts/max_bn_knl.py deleted file mode 100755 index 2074d83..0000000 --- a/pspamm/scripts/max_bn_knl.py +++ /dev/null @@ -1,16 +0,0 @@ -def getBlocksize(m, n, bk, v_size=8): - - bm = v_size - bn = 1 - - for j in range(1, n+1): - if KNL_condition(bm, j, bk, v_size): - bn = j - - return (bm, bn) - - -def KNL_condition(bm, bn, bk, v_size): - # ceiling division - vm = -(bm // -v_size) - return (bn+bk) * vm <= 32 diff --git a/pspamm/scripts/max_hsw.py b/pspamm/scripts/max_hsw.py deleted file mode 100755 index 156cd5b..0000000 --- a/pspamm/scripts/max_hsw.py +++ /dev/null @@ -1,22 +0,0 @@ -def getBlocksize(m , n, bk): - - bm = 4 - bn = 1 - maxval = 0 - - for i in range(4, m+1, 4): - for j in range(1, n+1): - if HSW_condition(i, j, bk): - if i*j > maxval: - maxval = i*j - bm = i - bn = j - - return (bm, bn) - - -def HSW_condition(bm, bn, bk): - v_size = 4 - # ceiling division - vm = -(bm // -v_size) - return (bn + bk) * vm + bn * bk <= 16 diff --git a/pspamm/scripts/max_knl.py b/pspamm/scripts/max_knl.py deleted file mode 100755 index e3e3ea7..0000000 --- a/pspamm/scripts/max_knl.py +++ /dev/null @@ -1,22 +0,0 @@ -def getBlocksize(m , n, bk): - - bm = 8 - bn = 1 - maxval = 0 - - for i in range(8, m+1, 8): - for j in range(1, n+1): - if KNL_condition(i, j, bk): - if i*j > maxval: - maxval = i*j - bm = i - bn = j - - return (bm, bn) - - -def KNL_condition(bm, bn, bk): - v_size = 8 - # ceiling division - vm = -(bm // -v_size) - return (bn+bk) * vm <= 32 diff --git a/pspamm/scripts/old_arm.py b/pspamm/scripts/old_arm.py deleted file mode 100755 index 48622f0..0000000 --- a/pspamm/scripts/old_arm.py +++ /dev/null @@ -1,31 +0,0 @@ -def getBlocksize(m , n, bk, v_size=2): - - bm = m - bn = n - - if ARM_condition(bm, bn, bk, v_size): - return (bm, bn) - - while not ARM_condition(bm, bn, bk, v_size): - bm, bn = lowerToNextDiv(m, n, bm, bn, v_size) - - return (bm, bn) - - -def lowerToNextDiv(m, n, bm, bn, v_size): - if bm > bn and bm > v_size: - bm -= v_size - while m % bm != 0: - bm -= v_size - else: - bn -= 1 - while n % bn != 0: - bn -= 1 - - return bm, bn - - -def ARM_condition(bm, bn, bk, v_size): - # ceiling division - vm = -(bm // -v_size) - return (bn+bk) * vm + bn <= 32 diff --git a/pspamm/scripts/old_hsw.py b/pspamm/scripts/old_hsw.py deleted file mode 100755 index 7394c93..0000000 --- a/pspamm/scripts/old_hsw.py +++ /dev/null @@ -1,32 +0,0 @@ -def getBlocksize(m , n, bk): - - bm = m - bn = n - - if HSW_condition(bm, bn, bk): - return (bm, bn) - - while not HSW_condition(bm, bn, bk): - bm, bn = lowerToNextDiv(m, n, bm, bn) - - return (bm, bn) - - -def lowerToNextDiv(m, n, bm, bn): - if bm > bn and bm > 4: - bm -= 4 - while m % bm != 0: - bm -= 4 - else: - bn -= 1 - while n % bn != 0: - bn -= 1 - - return bm, bn - - -def HSW_condition(bm, bn, bk): - 
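
For reference while reviewing these deletions: the legacy scripts removed here all encode per-architecture register-budget inequalities, now centralized in the `blocksize` modules. The sketch below collects the checks exactly as the deleted files state them (`vm` is the number of vector registers covering the m-direction, i.e. the ceiling of `bm / v_size`); the default `v_size` values correspond to double precision and are assumptions of this sketch.

```python
# Quick reference (illustrative only): register-budget checks from the removed legacy scripts.

def vm(bm, v_size):
    return -(bm // -v_size)            # ceiling division

def fits_knl(bm, bn, bk, v_size=8):    # 32 zmm registers
    return (bn + bk) * vm(bm, v_size) <= 32

def fits_hsw(bm, bn, bk, v_size=4):    # 16 ymm registers
    return (bn + bk) * vm(bm, v_size) + bn * bk <= 16

def fits_arm(bm, bn, bk, v_size=2):    # 32 NEON registers
    return (bn + bk) * vm(bm, v_size) + bn <= 32

if __name__ == "__main__":
    # e.g. an 8x3 block with bk=1 fits the AVX2 budget, an 8x7 block does not
    print(fits_hsw(8, 3, 1), fits_hsw(8, 7, 1))
```
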
v_size = 4 - # ceiling division - vm = -(bm // -v_size) - return (bn + bk) * vm + bn * bk <= 16 diff --git a/pspamm/scripts/old_knl.py b/pspamm/scripts/old_knl.py deleted file mode 100755 index 3686b4c..0000000 --- a/pspamm/scripts/old_knl.py +++ /dev/null @@ -1,32 +0,0 @@ -def getBlocksize(m , n, bk): - - bm = m - bn = n - - if KNL_condition(bm, bn, bk): - return (bm, bn) - - while not KNL_condition(bm, bn, bk): - bm, bn = lowerToNextDiv(m, n, bm, bn) - - return (bm, bn) - - -def lowerToNextDiv(m, n, bm, bn): - if bm > bn and bm > 8: - bm -= 8 - while m % bm != 0: - bm -= 8 - else: - bn -= 1 - while n % bn != 0: - bn -= 1 - - return bm, bn - - -def KNL_condition(bm, bn, bk): - v_size = 8 - # ceiling division - vm = -(bm // -v_size) - return (bn+bk) * vm <= 32 diff --git a/tests/arm/.gitignore b/tests/arm/.gitignore deleted file mode 100644 index d6b7ef3..0000000 --- a/tests/arm/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore diff --git a/tests/arm_sve/.gitignore b/tests/arm_sve/.gitignore deleted file mode 100644 index d6b7ef3..0000000 --- a/tests/arm_sve/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore diff --git a/tests/runall-sve.sh b/tests/runall-sve.sh index 1d85fde..da38854 100755 --- a/tests/runall-sve.sh +++ b/tests/runall-sve.sh @@ -8,7 +8,7 @@ do echo "" echo "" echo "Testing $BITLEN bit SVE register GEMM" - python unit_tests_arm_sve.py $BITLEN + python unit_test.py arm_sve$BITLEN aarch64-linux-gnu-g++ -static -march=armv8.2-a+sve -msve-vector-bits=${BITLEN} build/arm_sve${BITLEN}_testsuite.cpp -o build/sve${BITLEN}-test qemu-aarch64-static -cpu max,sve${BITLEN}=on,sve-default-vector-length=-1 build/sve${BITLEN}-test done diff --git a/tests/sve_testsuite_generator.py b/tests/sve_testsuite_generator.py deleted file mode 100644 index 6b7cfe3..0000000 --- a/tests/sve_testsuite_generator.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/python3 -from collections import namedtuple -import subprocess -import numpy as np -import random -import sys -import os -import testsuite_generator as test_generator - -BASEDIR = 'build' - -SparseKernel = namedtuple('SparseKernel', 'name m n k lda ldb ldc alpha beta block_sizes mtx delta') -DenseKernel = namedtuple('DenseKernel', 'name m n k lda ldb ldc alpha beta block_sizes delta') - -SparseKernelS = namedtuple('SparseKernelS', 'name m n k lda ldb ldc alpha beta block_sizes mtx delta') -DenseKernelS = namedtuple('DenseKernelS', 'name m n k lda ldb ldc alpha beta block_sizes delta') - -setup_prefetching = """ -template -void setup_prefetch(T*& prefetch, T* matrix, unsigned n, unsigned ldc) { - posix_memalign(reinterpret_cast(&prefetch), 64, ldc*n*sizeof(T)); - std::memcpy(prefetch, matrix, ldc*n*sizeof(T)); -} -""" - -def generateMTX(k, n, nnz): - return test_generator.generateMTX(k, n, nnz) - -def make(kernels, arch): - os.makedirs(os.path.join(BASEDIR, arch), exist_ok=True) - - f = open(os.path.join(BASEDIR, f'{arch}_testsuite.cpp'), 'w') - - f.write(test_generator.head_of_testsuite) - - include_single_prec = False - - for kern in kernels: - arguments = ['pspamm-generator', str(kern.m), str(kern.n), str(kern.k), str(kern.lda), - str(kern.ldb), str(kern.ldc), str(kern.alpha), str(kern.beta)] - - if isinstance(kern, SparseKernel) or isinstance(kern, SparseKernelS): - arguments += ['--mtx_filename', kern.mtx] - - prec = 's' if isinstance(kern, SparseKernelS) or isinstance(kern, DenseKernelS) else 'd' - arguments += ['--precision', prec] - if prec == 's': - include_single_prec = True - - block_sizes = list(set(kern.block_sizes)) - - for 
bs in block_sizes: - bm = bs[0] - bn = bs[1] - - if arch == "knl": - assert (bm % 8 == 0 and (bn + 1) * (bm / 8) <= 32) - elif arch == "arm": - assert (bm % 2 == 0 and (bn + 1) * (bm / 2) + bn <= 32) - elif arch.startswith("arm_sve"): - veclen = int(arch[7:]) - assert veclen % 128 == 0 and veclen <= 2048 - reglen = veclen // 128 - v_len = 2 * reglen if prec == 'd' else 4 * reglen - # this should be the same assertion as in ../scripts/max_arm_sve.py - bk = 1 - # ceiling division - vm = -(bm // -v_len) - if not ((bn + bk) * vm + bn * bk <= 32): - print(f'Skipping block size {bm}x{bn} for {arch}') - continue - - name = kern.name + '_' + str(bm) + '_' + str(bn) - - additional_args = ['--output_funcname', name, '--output_filename', os.path.join(BASEDIR, arch, name + '.h'), - '--output_overwrite'] - additional_args += ['--bm', str(bm), '--bn', str(bn), '--arch', arch, '--prefetching', 'BL2viaC'] - - try: - subprocess.check_output(arguments + additional_args, stderr=subprocess.STDOUT) - except subprocess.CalledProcessError as e: - raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) - - f.write('#include "' + arch + '/' + kern.name + '_' + str(bm) + '_' + str(bn) + '.h"\n') - - f.write('\n') - # necessary functions are defined in testsuite_generator.py - f.write(test_generator.function_definitions) - f.write(setup_prefetching) - f.write(test_generator.setup_main) - # add variable declarations for single precision test cases - f.write(""" std::tuple fpointers; - float falpha; float fbeta; - double* prefetch; - float* fprefetch; - """) - - for kern in kernels: - - block_sizes = list(set(kern.block_sizes)) - - for bs in block_sizes: - bm = bs[0] - bn = bs[1] - - prec = 's' if isinstance(kern, SparseKernelS) or isinstance(kern, DenseKernelS) else 'd' - - if arch.startswith("arm_sve"): - veclen = int(arch[7:]) - assert veclen % 128 == 0 and veclen <= 2048 - reglen = veclen // 128 - v_len = 2 * reglen if prec == 'd' else 4 * reglen - # this should be the same assertion as in ../scripts/max_arm_sve.py - bk = 1 - # ceiling division - vm = -( bm // -v_len) - if not ((bn + bk) * vm + bn * bk <= 32): - # print(f'Skipping block size {bm}x{bn} for {arch}') - continue - - name = kern.name + '_' + str(bm) + '_' + str(bn) - - if isinstance(kern, SparseKernel) or isinstance(kern, SparseKernelS): - mtx = kern.mtx - else: - mtx = "" - # for double precision: set prec to '' to conform to test_generator.function_definitions - prec = 'f' if isinstance(kern, SparseKernelS) or isinstance(kern, DenseKernelS) else '' - - f.write(""" - {p}alpha = {alpha}; {p}beta = {beta}; ldb = {ldb}; - {p}pointers = pre<{T}>({m}, {n}, {k}, {lda}, ldb, {ldc}, "{mtx}"); - setup_prefetch({p}prefetch, std::get<3>({p}pointers), {n}, {ldc}); - {name}(std::get<0>({p}pointers), std::get<{sparse}>({p}pointers), std::get<3>({p}pointers), {p}alpha, {p}beta, {p}prefetch); - result = post<{T}>({m}, {n}, {k}, {lda}, &ldb, {ldc}, &{p}alpha, &{p}beta, std::get<0>({p}pointers), std::get<1>({p}pointers), std::get<3>({p}pointers), std::get<4>({p}pointers), {delta:.7f}); - results.push_back(std::make_tuple("{name}", result)); - free(std::get<0>({p}pointers)); free(std::get<1>({p}pointers)); free(std::get<2>({p}pointers)); free(std::get<3>({p}pointers)); free(std::get<4>({p}pointers)); free({p}prefetch); -""".format(m=kern.m, n=kern.n, k=kern.k, lda=kern.lda, ldb=kern.ldb, ldc=kern.ldc, alpha=kern.alpha, beta=kern.beta, - mtx=mtx, delta=kern.delta, name=name, sparse=2 if kern.ldb == 0 else 1, p=prec, 
T="float" if prec == 'f' else "double")) - - f.write(test_generator.end_of_testsuite) diff --git a/tests/testsuite_generator.py b/tests/testsuite_generator.py index 9457038..13cf7d1 100755 --- a/tests/testsuite_generator.py +++ b/tests/testsuite_generator.py @@ -4,19 +4,23 @@ import random import sys import os.path +from pspamm.codegen.precision import * BASEDIR = 'build' -SparseKernel = namedtuple('SparseKernel', 'name m n k lda ldb ldc alpha beta block_sizes mtx delta') -DenseKernel = namedtuple('DenseKernel', 'name m n k lda ldb ldc alpha beta block_sizes delta') +SparseKernel = namedtuple('SparseKernel', 'name precision m n k lda ldb ldc alpha beta block_sizes mtx delta') +DenseKernel = namedtuple('DenseKernel', 'name precision m n k lda ldb ldc alpha beta block_sizes delta') head_of_testsuite = """#include #include +#include #include #include #include -#include +#include +#include #include +#include long long pspamm_num_total_flops = 0; @@ -49,6 +53,12 @@ } } +template +void setup_prefetch(T*& prefetch, T* matrix, unsigned n, unsigned ldc) { + posix_memalign(reinterpret_cast(&prefetch), 64, ldc*n*sizeof(T)); + std::memcpy(prefetch, matrix, ldc*n*sizeof(T)); +} + template std::tuple pre(unsigned M, unsigned N, unsigned K, unsigned LDA, unsigned LDB, unsigned LDC, std::string MTX) { @@ -120,68 +130,71 @@ } template -int post(unsigned M, unsigned N, unsigned K, unsigned LDA, unsigned* LDB, unsigned LDC, T* ALPHA, T* BETA, T* A, T* B, T* C, T* Cref, T DELTA) { +bool post(const std::string& name, unsigned M, unsigned N, unsigned K, unsigned LDA, unsigned* LDB, unsigned LDC, T* ALPHA, T* BETA, T* A, T* B, T* C, T* Cref, T DELTA) { if(*LDB == 0) *LDB = K; gemm_ref(M, N, K, LDA, *LDB, LDC, *ALPHA, *BETA, A, B, Cref); - + + bool failed = false; + double diffAbsMax = 0; + double diffRelMax = 0; for(int i = 0; i < M; i++) { for(int j = 0; j < N; j++) { // we use the relative error instead of the absolute error because of an issue we found for sparse single precision // kernels presumably due to limited precision of floats - if(std::abs((C[i + j * LDC] - Cref[i + j * LDC])) / Cref[i + j * LDC] > DELTA) { - return 0; - } + const double diffAbs = std::abs((static_cast(C[i + j * LDC]) - static_cast(Cref[i + j * LDC]))); + const double diffRel = diffAbs / static_cast(Cref[i + j * LDC]); + + diffAbsMax = std::max(diffAbs, diffAbsMax); + diffRelMax = std::max(diffRel, diffRelMax); + + failed |= diffRel > DELTA; } } - return 1; + const std::string resultString = failed ? 
"fail" : "success"; + + std::cout << std::scientific << name << ": " << resultString << " (abs: " << diffAbsMax << ", rel: " << diffRelMax << ")" << std::endl; + + return !failed; } """ setup_main = """ int main() { - std::vector> results; - std::tuple pointers; - int result; - - // A compiler related issue makes it necessary to store certain values in variables before using them - unsigned ldb; - double alpha; double beta; + int results = 0; + int correct = 0; """ setup_single_testcase = """ - ldb = {ldb}; alpha = {alpha}; beta = {beta}; - pointers = pre({m}, {n}, {k}, {lda}, ldb, {ldc}, "{mtx}"); +{{ + unsigned ldb = {ldb}; + {precision} alpha = {alpha}; + {precision} beta = {beta}; + {precision}* prefetch = nullptr; + auto pointers = pre<{precision}>({m}, {n}, {k}, {lda}, ldb, {ldc}, "{mtx}"); + setup_prefetch(prefetch, std::get<3>(pointers), {n}, {ldc}); {name}(std::get<0>(pointers), std::get<{sparse}>(pointers), std::get<3>(pointers), {alpha}, {beta}, nullptr); - result = post({m}, {n}, {k}, {lda}, &ldb, {ldc}, &alpha, &beta, std::get<0>(pointers), std::get<1>(pointers), std::get<3>(pointers), std::get<4>(pointers), {delta:.7f}); - results.push_back(std::make_tuple("{name}", result)); + const auto result = post<{precision}>(\"{name}\", {m}, {n}, {k}, {lda}, &ldb, {ldc}, &alpha, &beta, std::get<0>(pointers), std::get<1>(pointers), std::get<3>(pointers), std::get<4>(pointers), {delta:.15e}); + + if (result) {{ + ++correct; + }} + ++results; + free(std::get<0>(pointers)); free(std::get<1>(pointers)); free(std::get<2>(pointers)); free(std::get<3>(pointers)); free(std::get<4>(pointers)); +}} """ end_of_testsuite = """ - int correct = 0; - for(int i = 0; i < results.size(); i++) - { - if(std::get<1>(results[i])) - { - ++correct; - printf("%s succeeded.\\n", (std::get<0>(results[i])).c_str()); - } - else - { - printf("%s failed!\\n", (std::get<0>(results[i])).c_str()); - } - } + std::cout << correct << " out of " << results << " succeeded." << std::endl; - printf("\\n%i out of %lu test successful!\\n", correct, results.size()); - - return correct == results.size() ? 0 : 1; + return correct == results ? 
0 : 1; } """ @@ -219,6 +232,8 @@ def make(kernels, arch): f.write(head_of_testsuite) + testcases = [] + for kern in kernels: arguments = ['pspamm-generator', str(kern.m), str(kern.n), str(kern.k), str(kern.lda), str(kern.ldb), @@ -227,22 +242,55 @@ def make(kernels, arch): if isinstance(kern, SparseKernel): arguments += ['--mtx_filename', kern.mtx] - block_sizes = list(set(kern.block_sizes)) + prec = 's' if kern.precision == Precision.SINGLE else 'd' + arguments += ['--precision', prec] + + block_sizes = list(set(bs if len(bs) > 2 else (bs[0], bs[1], 1) for bs in kern.block_sizes)) for bs in block_sizes: bm = bs[0] bn = bs[1] + bk = bs[2] - if arch == "knl": - assert (bm % 8 == 0 and (bn + 1) * (bm / 8) <= 32) - elif arch == "arm": - assert (bm % 2 == 0 and (bn + 1) * (bm / 2) + bn <= 32) - - name = kern.name + '_' + str(bm) + '_' + str(bn) + if arch.startswith("arm_sve"): + veclen = int(arch[7:]) if arch[7:] != '' else 128 + else: + veclen = int(arch[3:]) if arch[3:] != '' else 128 + assert veclen % 128 == 0 + reglen = veclen // 128 + v_len = (16 // kern.precision.size()) * reglen + # this should be the same assertion as in ../scripts/max_arm_sve.py + # ceiling division + vm = -(bm // -v_len) + v_size = v_len + elem128 = (16 // kern.precision.size()) + + if arch.startswith("knl"): + if not ((bn+bk) * vm <= 32): + print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') + continue + elif arch.startswith("hsw"): + if not ((bn+bk) * vm + bn * bk <= 16) or not (kern.m % v_size) == 0 or not (bm % v_size) == 0: + print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') + continue + elif arch.startswith("arm_sve"): + vkext = -(bk // -elem128) + isvkext = bn*vkext < 16 if elem128 == 2 else bn*vkext < 8 + vk = vkext if isvkext else bk + if not ((bn+bk) * vm + bn * vk <= 32): + print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') + continue + elif arch.startswith("arm"): + vk = -(bk // -elem128) + if not ((bn+bk) * vm + bn * vk <= 32) or not (kern.m % v_size) == 0 or not (bm % v_size) == 0: + print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') + continue + + name = f'{kern.name}_{kern.precision}_{bm}_{bn}_{bk}' additional_args = ['--output_funcname', name, '--output_filename', os.path.join(BASEDIR, arch, name + '.h'), '--output_overwrite'] - additional_args += ['--bm', str(bm), '--bn', str(bn), '--arch', arch] + additional_args += ['--bm', str(bm), '--bn', str(bn), '--bk', str(bk), '--arch', arch] try: print(' '.join(arguments + additional_args)) @@ -250,29 +298,26 @@ def make(kernels, arch): except subprocess.CalledProcessError as e: raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) - f.write('#include "' + arch + '/' + kern.name + '_' + str(bm) + '_' + str(bn) + '.h"\n') - - f.write('\n') - - f.write(function_definitions) - f.write(setup_main) - - for kern in kernels: - - block_sizes = list(set(kern.block_sizes)) - - for bs in block_sizes: - bm = bs[0] - bn = bs[1] - name = kern.name + '_' + str(bm) + '_' + str(bn) + f.write('#include "' + arch + '/' + name + '.h"\n') if isinstance(kern, SparseKernel): mtx = kern.mtx else: mtx = "" - f.write(setup_single_testcase.format( + testcases += [ + setup_single_testcase.format( m=kern.m, n=kern.n, k=kern.k, lda=kern.lda, ldb=kern.ldb, ldc=kern.ldc, alpha=kern.alpha, - beta=kern.beta, mtx=mtx, delta=kern.delta, name=name, sparse=2 if kern.ldb == 0 else 1)) + beta=kern.beta, mtx=mtx, delta=kern.delta, name=name, sparse=2 if kern.ldb == 0 else 1, + 
precision=kern.precision.ctype()) + ] + + f.write('\n') + + f.write(function_definitions) + f.write(setup_main) + + for testcase in testcases: + f.write(testcase) f.write(end_of_testsuite) diff --git a/tests/unit_test.py b/tests/unit_test.py new file mode 100644 index 0000000..2ddeebc --- /dev/null +++ b/tests/unit_test.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 + +import testsuite_generator as generator +from importlib import import_module + +from pspamm.codegen.precision import * + +import sys +import re + +arch = sys.argv[1] + +parsedarch = re.fullmatch(r'(?P[a-zA-Z_]+)(?P\d+)', arch) + +archname = parsedarch.group('name') +archprec = parsedarch.group('prec') + +blocksize = import_module("pspamm.codegen.architectures." + archname + ".blocksize") + +scripts = { + "arm": lambda blocksize: [blocksize.Old, blocksize.Max, blocksize.MaxK, blocksize.Cube], + "arm_sve": lambda blocksize: [blocksize.Max, blocksize.MaxK, blocksize.Cube], + "knl": lambda blocksize: [blocksize.Old, blocksize.Max, blocksize.MaxBn, blocksize.CubeBn], + "hsw": lambda blocksize: [blocksize.Old, blocksize.Max, blocksize.Cube], +} + +blocksize_algs = scripts[archname](blocksize) + [blocksize.Default] + +bitlen = int(archprec) +v_len = bitlen // 128 +v_size_fun = lambda prec: (16 // prec.size()) * v_len + +# define the maximum allowed difference between elements of our solution and the reference solution for +# double and single precision +delta_hp = 1e-2 +delta_sp = 1e-4 # epsilon is around e-7 => /2 ... For most cases, 1e-6 is enough +delta_dp = 1e-6 # epsilon is around e-15 => /2 + +kernels = [] + +for precision, delta in zip((Precision.SINGLE, Precision.DOUBLE), (delta_sp, delta_dp)): + v_size = v_size_fun(precision) + kernels.append(generator.DenseKernel("testlarge", precision, 40, 100, 100, 100, 100, 100, 2.5, 1.0, [(8, 5), (8,2)] + [x.getBlocksize(10, 10, 1, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.SparseKernel("test1", precision, 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(56, 56, 30), delta)) + kernels.append(generator.DenseKernel("test2", precision, 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("test3", precision, 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.SparseKernel("knl_only_test1", precision, 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1,2)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(1, 2, 1), delta)) + kernels.append(generator.SparseKernel("knl_only_test2", precision, 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2,2), (16,7,2)] + [x.getBlocksize(24, 40, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(40, 40, 20), delta)) + + kernels.append(generator.SparseKernel("knl_only_test3", precision, 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1,2)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(1, 2, 2), delta)) + kernels.append(generator.SparseKernel("knl_only_test4", precision, 24, 20, 10, 40, 0, 24, 35.222, 0.0, [(8, 20,2), (24,3,2)] + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(10, 20, 1), delta)) + kernels.append(generator.SparseKernel("knl_only_test5", precision, 64, 5, 10, 64, 0, 64, 2.3, 0.0, [(32, 
2,2), (8,14,2)] + [x.getBlocksize(64, 5, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(10, 5, 1), delta)) + kernels.append(generator.SparseKernel("knl_only_test6", precision, 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1,2)] + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(1, 1, 1), delta)) + kernels.append(generator.SparseKernel("knl_only_test7", precision, 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8, 24,2), (8,1,2)] + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(40, 24, 1), delta)) + + kernels.append(generator.DenseKernel("knl_only_test8", precision, 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1,2)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("knl_only_test9", precision, 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2,2), (16,7,2)] + [x.getBlocksize(32, 40, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("knl_only_test10", precision, 56, 28, 56, 56, 56, 56, 0.1, 3.0, [(8, 28,2)], delta)) + kernels.append(generator.DenseKernel("knl_only_test11", precision, 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8, 20,2), (8,3,2)] + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("knl_only_test12", precision, 64, 5, 10, 64, 12, 64, 1.0, 1.0, [(32, 2,2), (8,14,2)] + [x.getBlocksize(64, 5, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("knl_only_test13", precision, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1,2)] + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("knl_only_test14", precision, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [(8, 24,2)] + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs], delta)) + + kernels.append(generator.SparseKernel("hswtest1", precision, 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(56, 56, 30), delta)) + kernels.append(generator.DenseKernel("hswtest2", precision, 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8,2)] + [x.getBlocksize(8, 40, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("hswtest3", precision, 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3)] + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.SparseKernel("hsw_only_test1", precision, 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(1, 2, 1), delta)) + kernels.append(generator.SparseKernel("hsw_only_test2", precision, 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2)] + [x.getBlocksize(24, 40, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(40, 40, 20), delta)) + + kernels.append(generator.SparseKernel("hsw_only_test3", precision, 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(1, 2, 2), delta)) + kernels.append(generator.SparseKernel("hsw_only_test4", precision, 24, 20, 10, 40, 0, 24, 35.222, 0.0, [] + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(10, 20, 1), delta)) + kernels.append(generator.SparseKernel("hsw_only_test5", precision, 64, 5, 10, 64, 0, 64, 2.3, 0.0, [] + [x.getBlocksize(64, 5, 2, v_size, precision) 
for x in blocksize_algs], generator.generateMTX(10, 5, 1), delta)) + kernels.append(generator.SparseKernel("hsw_only_test6", precision, 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(1, 1, 1), delta)) + kernels.append(generator.SparseKernel("hsw_only_test7", precision, 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8,1)] + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(40, 24, 1), delta)) + + kernels.append(generator.DenseKernel("hsw_only_test8", precision, 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("hsw_only_test9", precision, 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2)] + [x.getBlocksize(32, 40, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("hsw_only_test10", precision, 56, 28, 56, 56, 56, 56, 0.1, 3.0, [x.getBlocksize(56, 28, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("hsw_only_test11", precision, 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8,3)] + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("hsw_only_test12", precision, 64, 5, 10, 64, 12, 64, 1.0, 1.0, [] + [x.getBlocksize(64, 5, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("hsw_only_test13", precision, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("hsw_only_test14", precision, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [] + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs], delta)) + + kernels.append(generator.DenseKernel("itest4", precision, 4, 4, 4, 4, 4, 4, 2.0, 2.0, [(4, 4), (4,4,2), (4,4,4), (4,4,8)], delta)) + + kernels.append(generator.SparseKernel("itest1", precision, 8, 56, 56, 8, 0, 8, 1.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(56, 56, 30), delta)) + kernels.append(generator.DenseKernel("itest2", precision, 8, 40, 40, 8, 40, 8, 3.0, 2.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 1, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("itest3", precision, 8, 56, 56, 8, 56, 8, 0.0, 0.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 1, v_size, precision) for x in blocksize_algs], delta)) + + kernels.append(generator.SparseKernel("arm_only_test1", precision, 2, 3, 4, 2, 0, 2, 1.1233, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(4, 3, 5), delta)) + kernels.append(generator.SparseKernel("arm_only_test2", precision, 2, 3, 4, 20, 0, 14, 1.0, 1.0, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(4, 3, 5), delta)) + kernels.append(generator.SparseKernel("arm_only_test3", precision, 32, 80, 50, 32, 0, 32, 1.0, 3.0, [(8, 5)] + [x.getBlocksize(32, 80, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(50, 80, 294), delta)) + kernels.append(generator.SparseKernel("arm_only_test4", precision, 32, 32, 32, 34, 0, 32, 1.0, 0.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(32, 32, 24), delta)) + 
kernels.append(generator.SparseKernel("arm_only_test5", precision, 2, 1, 1, 2, 0, 8, 1.0, -1.0, [(2, 1)] + [x.getBlocksize(2, 1, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(1, 1, 1), delta)) + kernels.append(generator.SparseKernel("arm_only_test6", precision, 2, 2, 2, 2, 0, 2, 2.0, 234234.123, [(2, 1)] + [x.getBlocksize(2, 2, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(2, 2, 1), delta)) + kernels.append(generator.SparseKernel("arm_only_test7", precision, 16, 5, 7, 16, 0, 16, 0.0, -1.123, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(7, 5, 35), delta)) + + kernels.append(generator.DenseKernel("arm_only_test8", precision, 2, 3, 4, 2, 4, 2, 1.0, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("arm_only_test9", precision, 2, 3, 4, 20, 12, 14, 2.0, 1.123, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("arm_only_test10", precision, 32, 80, 50, 32, 50, 32, 0.0, 0.2, [(8, 5)] + [x.getBlocksize(32, 80, 1, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("arm_only_test11", precision, 32, 32, 32, 33, 68, 32, 1231.0, 14443.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("arm_only_test12", precision, 2, 1, 1, 2, 1, 8, 1.0, 3.0, [(2, 1)] + [x.getBlocksize(2, 1, 1, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("arm_only_test13", precision, 2, 3, 3, 2, 3, 2, 1.0, 0.0, [(2, 1)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("arm_only_test14", precision, 16, 5, 7, 16, 7, 16, 1.0, 1.0, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1, v_size, precision) for x in blocksize_algs], delta)) + + kernels.append(generator.DenseKernel("sve_mixed_test1", precision, 9, 9, 9, 9, 9, 9, 1.0, 0.0, [(3, 3)] + [x.getBlocksize(9, 9, 1, v_size, precision) for x in blocksize_algs], delta_dp)) + kernels.append(generator.SparseKernel("sve_mixed_test2", precision, 9, 9, 9, 9, 0, 9, 4.0, 2.5, [(3, 3)] + [x.getBlocksize(9, 9, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(9, 9, 20), delta_dp)) + kernels.append(generator.SparseKernel("sve_mixed_test3", precision, 18, 18, 18, 18, 0, 18, 3.4, -2.5, [(1, 1), (3, 3), (6, 6)] + [x.getBlocksize(18, 18, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(18, 18, 59), delta_dp)) + kernels.append(generator.SparseKernel("sve_mixed_test4", precision, 80, 80, 80, 80, 0, 80, 0.0, -2.5, [(4, 4), (8, 8)] + [x.getBlocksize(80, 80, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(80, 80, 312), delta_dp)) + kernels.append(generator.SparseKernel("sve_mixed_test5", precision, 8, 8, 8, 10, 0, 8, 3.0, -0.9, [(2, 2), (4, 4)] + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(8, 8, 6), delta_dp)) + kernels.append(generator.DenseKernel("sve_mixed_test6", precision, 8, 8, 8, 10, 8, 8, 3.0, -0.9, [(2, 2), (4, 4)] + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], delta_dp)) + + kernels.append(generator.DenseKernel("sve_test4", precision, 4, 4, 4, 4, 4, 4, 2.0, 2.0, [(4, 4)], delta_dp)) + + kernels.append(generator.SparseKernel("sve_test1", precision, 
8, 56, 56, 8, 0, 8, 1.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(56, 56, 30), delta_dp)) + kernels.append(generator.DenseKernel("sve_test2", precision, 8, 40, 40, 8, 40, 8, 3.0, 2.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 1, v_size, precision) for x in blocksize_algs], delta_dp)) + kernels.append(generator.DenseKernel("sve_test3", precision, 8, 56, 56, 8, 56, 8, 0.0, 0.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 1, v_size, precision) for x in blocksize_algs], delta_dp)) + + kernels.append(generator.SparseKernel("sve_arm_only_test1", precision, 2, 3, 4, 2, 0, 2, 1.1233, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(4, 3, 5), delta_dp)) + kernels.append(generator.SparseKernel("sve_arm_only_test2", precision, 2, 3, 4, 20, 0, 14, 1.0, 1.0, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(4, 3, 5), delta_dp)) + kernels.append(generator.SparseKernel("sve_arm_only_test3", precision, 32, 80, 50, 32, 0, 32, 1.0, 3.0, [(8, 5)] + [x.getBlocksize(32, 80, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(50, 80, 294), delta_dp)) + kernels.append(generator.SparseKernel("sve_arm_only_test4", precision, 32, 32, 32, 34, 0, 32, 1.0, 0.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(32, 32, 24), delta_dp)) + kernels.append(generator.SparseKernel("sve_arm_only_test5", precision, 2, 1, 1, 2, 0, 8, 1.0, -1.0, [(2, 1)] + [x.getBlocksize(2, 1, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(1, 1, 1), delta_dp)) + kernels.append(generator.SparseKernel("sve_arm_only_test6", precision, 2, 2, 2, 2, 0, 2, 2.0, 234234.123, [(2, 1)] + [x.getBlocksize(2, 2, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(2, 2, 1), delta_dp)) + kernels.append(generator.SparseKernel("sve_arm_only_test7", precision, 16, 5, 7, 16, 0, 16, 0.0, -1.123, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(7, 5, 35), delta_dp)) + + kernels.append(generator.DenseKernel("sve_arm_only_test8", precision, 2, 3, 4, 2, 4, 2, 1.0, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], delta_dp)) + kernels.append(generator.DenseKernel("sve_arm_only_test9", precision, 2, 3, 4, 20, 12, 14, 2.0, 1.123, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], delta_dp)) + kernels.append(generator.DenseKernel("sve_arm_only_test10", precision, 32, 80, 50, 32, 50, 32, 0.0, 0.2, [(8, 5)] + [x.getBlocksize(32, 80, 1, v_size, precision) for x in blocksize_algs], delta_dp)) + kernels.append(generator.DenseKernel("sve_arm_only_test11", precision, 32, 32, 32, 33, 68, 32, 1231.0, 14443.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1, v_size, precision) for x in blocksize_algs], delta_dp)) + kernels.append(generator.DenseKernel("sve_arm_only_test12", precision, 2, 1, 1, 2, 1, 8, 1.0, 3.0, [(2, 1)] + [x.getBlocksize(2, 1, 1, v_size, precision) for x in blocksize_algs], delta_dp)) + kernels.append(generator.DenseKernel("sve_arm_only_test13", precision, 2, 3, 3, 2, 3, 2, 1.0, 0.0, [(2, 1)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], delta_dp)) + kernels.append(generator.DenseKernel("sve_arm_only_test14", precision, 16, 5, 7, 16, 7, 16, 1.0, 1.0, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 
1, v_size, precision) for x in blocksize_algs], delta_dp)) + + kernels.append(generator.DenseKernel("sve_arm_only_test15", precision, 23, 29, 31, 23, 31, 23, 1.32, 0.96, [x.getBlocksize(23, 29, 1, v_size, precision) for x in blocksize_algs], delta_dp)) + kernels.append(generator.SparseKernel("sve_arm_only_test16", precision, 23, 29, 31, 23, 0, 23, 1.32, 0.96, [x.getBlocksize(23, 29, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(31, 29, 61), delta_dp)) + + kernels.append(generator.DenseKernel("sve_single_prec_test_S1", precision, 9, 9, 9, 9, 9, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size, precision) for x in blocksize_algs], delta_sp)) + kernels.append(generator.DenseKernel("sve_single_prec_test_S2", precision, 15, 15, 15, 15, 15, 15, -3.14, 6.28, [x.getBlocksize(15, 15, 1, v_size, precision) for x in blocksize_algs], delta_sp)) + kernels.append(generator.DenseKernel("sve_single_prec_test_S3", precision, 23, 23, 23, 23, 23, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size, precision) for x in blocksize_algs], delta_sp)) + kernels.append(generator.DenseKernel("sve_single_prec_test_S4", precision, 23, 31, 13, 23, 13, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size, precision) for x in blocksize_algs], delta_sp)) + kernels.append(generator.SparseKernel("sve_single_prec_test_S5", precision, 9, 9, 9, 9, 0, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(9, 9, 8), delta_sp)) + kernels.append(generator.SparseKernel("sve_single_prec_test_S6", precision, 15, 15, 15, 15, 0, 15, -3.14, 6.28, [x.getBlocksize(15, 15, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(15, 15, 22), delta_sp)) + kernels.append(generator.SparseKernel("sve_single_prec_test_S7", precision, 23, 23, 23, 23, 0, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(23, 23, 52), delta_sp)) + kernels.append(generator.SparseKernel("sve_single_prec_test_S8", precision, 23, 31, 13, 23, 0, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(13, 31, 40), delta_sp)) + +generator.make(kernels, arch) diff --git a/tests/unit_tests_arm.py b/tests/unit_tests_arm.py deleted file mode 100755 index 3d39dcd..0000000 --- a/tests/unit_tests_arm.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 - -import testsuite_generator as generator - -import pspamm.scripts.max_arm as max_square -import pspamm.scripts.old_arm as old - -blocksize_algs = [max_square, old] - -kernels = [] - -kernels.append(generator.DenseKernel("test3", 4, 4, 4, 4, 4, 4, 2.0, 2.0, [(4, 4)], 0.0000001)) - -kernels.append(generator.SparseKernel("test1", 8, 56, 56, 8, 0, 8, 1.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 1) for x in blocksize_algs], generator.generateMTX(56, 56, 30), 0.0000001)) -kernels.append(generator.DenseKernel("test2", 8, 40, 40, 8, 40, 8, 3.0, 2.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 1) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("test3", 8, 56, 56, 8, 56, 8, 0.0, 0.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 1) for x in blocksize_algs], 0.0000001)) - -kernels.append(generator.SparseKernel("arm_only_test1", 2, 3, 4, 2, 0, 2, 1.1233, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], generator.generateMTX(4, 3, 5), 0.0000001)) -kernels.append(generator.SparseKernel("arm_only_test2", 2, 3, 4, 20, 0, 14, 1.0, 1.0, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1) for x in 
blocksize_algs], generator.generateMTX(4, 3, 5), 0.0000001)) -kernels.append(generator.SparseKernel("arm_only_test3", 32, 80, 50, 32, 0, 32, 1.0, 3.0, [(8, 5)] + [x.getBlocksize(32, 80, 1) for x in blocksize_algs], generator.generateMTX(50, 80, 294), 0.0000001)) -kernels.append(generator.SparseKernel("arm_only_test4", 32, 32, 32, 34, 0, 32, 1.0, 0.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1) for x in blocksize_algs], generator.generateMTX(32, 32, 24), 0.0000001)) -kernels.append(generator.SparseKernel("arm_only_test5", 2, 1, 1, 2, 0, 8, 1.0, -1.0, [(2, 1)] + [x.getBlocksize(2, 1, 1) for x in blocksize_algs], generator.generateMTX(1, 1, 1), 0.0000001)) -kernels.append(generator.SparseKernel("arm_only_test6", 2, 2, 2, 2, 0, 2, 2.0, 234234.123, [(2, 1)] + [x.getBlocksize(2, 2, 1) for x in blocksize_algs], generator.generateMTX(2, 2, 1), 0.0000001)) -kernels.append(generator.SparseKernel("arm_only_test7", 16, 5, 7, 16, 0, 16, 0.0, -1.123, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1) for x in blocksize_algs], generator.generateMTX(7, 5, 35), 0.0000001)) - -kernels.append(generator.DenseKernel("arm_only_test8", 2, 3, 4, 2, 4, 2, 1.0, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("arm_only_test9", 2, 3, 4, 20, 12, 14, 2.0, 1.123, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("arm_only_test10", 32, 80, 50, 32, 50, 32, 0.0, 0.2, [(8, 5)] + [x.getBlocksize(32, 80, 1) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("arm_only_test11", 32, 32, 32, 33, 68, 32, 1231.0, 14443.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("arm_only_test12", 2, 1, 1, 2, 1, 8, 1.0, 3.0, [(2, 1)] + [x.getBlocksize(2, 1, 1) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("arm_only_test13", 2, 3, 3, 2, 3, 2, 1.0, 0.0, [(2, 1)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("arm_only_test14", 16, 5, 7, 16, 7, 16, 1.0, 1.0, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1) for x in blocksize_algs], 0.0000001)) - - -generator.make(kernels, "arm") diff --git a/tests/unit_tests_arm_sve.py b/tests/unit_tests_arm_sve.py deleted file mode 100644 index bf147ba..0000000 --- a/tests/unit_tests_arm_sve.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python3 - -import sve_testsuite_generator as generator - -import pspamm.scripts.max_arm_sve as max_sve - -import sys - -v_len = 4 - -if len(sys.argv) == 2: - v_len = int(sys.argv[1]) // 128 - -blocksize_algs = [max_sve] -v_size = 2 * v_len -v_size_s = 4 * v_len -bitlen = v_len * 128 -kernels = [] - -# define the maximum allowed difference between elements of our solution and the reference solution for -# double and single precision -delta_sp = 1e-4 # epsilon is around e-7 => /2 ... 
For most cases, 1e-6 is enough -delta_dp = 1e-7 # epsilon is around e-15 => /2 - -# test cases for double precision multiplication -kernels.append(generator.DenseKernel("sve_mixed_test1", 9, 9, 9, 9, 9, 9, 1.0, 0.0, [(3, 3)] + [x.getBlocksize(9, 9, 1, v_size) for x in blocksize_algs], delta_dp)) -kernels.append(generator.SparseKernel("sve_mixed_test2", 9, 9, 9, 9, 0, 9, 4.0, 2.5, [(3, 3)] + [x.getBlocksize(9, 9, 1, v_size) for x in blocksize_algs], generator.generateMTX(9, 9, 20), delta_dp)) -kernels.append(generator.SparseKernel("sve_mixed_test3", 18, 18, 18, 18, 0, 18, 3.4, -2.5, [(1, 1), (3, 3), (6, 6)] + [x.getBlocksize(18, 18, 1, v_size) for x in blocksize_algs], generator.generateMTX(18, 18, 59), delta_dp)) -kernels.append(generator.SparseKernel("sve_mixed_test4", 80, 80, 80, 80, 0, 80, 0.0, -2.5, [(4, 4), (8, 8)] + [x.getBlocksize(80, 80, 1, v_size) for x in blocksize_algs], generator.generateMTX(80, 80, 312), delta_dp)) -kernels.append(generator.SparseKernel("sve_mixed_test5", 8, 8, 8, 10, 0, 8, 3.0, -0.9, [(2, 2), (4, 4)] + [x.getBlocksize(8, 8, 1, v_size) for x in blocksize_algs], generator.generateMTX(8, 8, 6), delta_dp)) -kernels.append(generator.DenseKernel("sve_mixed_test6", 8, 8, 8, 10, 8, 8, 3.0, -0.9, [(2, 2), (4, 4)] + [x.getBlocksize(8, 8, 1, v_size) for x in blocksize_algs], delta_dp)) - -kernels.append(generator.DenseKernel("sve_test3", 4, 4, 4, 4, 4, 4, 2.0, 2.0, [(4, 4)], delta_dp)) - -kernels.append(generator.SparseKernel("sve_test1", 8, 56, 56, 8, 0, 8, 1.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 1, v_size) for x in blocksize_algs], generator.generateMTX(56, 56, 30), delta_dp)) -kernels.append(generator.DenseKernel("sve_test2", 8, 40, 40, 8, 40, 8, 3.0, 2.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 1, v_size) for x in blocksize_algs], delta_dp)) -kernels.append(generator.DenseKernel("sve_test3", 8, 56, 56, 8, 56, 8, 0.0, 0.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 1, v_size) for x in blocksize_algs], delta_dp)) - -kernels.append(generator.SparseKernel("sve_arm_only_test1", 2, 3, 4, 2, 0, 2, 1.1233, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1, v_size) for x in blocksize_algs], generator.generateMTX(4, 3, 5), delta_dp)) -kernels.append(generator.SparseKernel("sve_arm_only_test2", 2, 3, 4, 20, 0, 14, 1.0, 1.0, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1, v_size) for x in blocksize_algs], generator.generateMTX(4, 3, 5), delta_dp)) -kernels.append(generator.SparseKernel("sve_arm_only_test3", 32, 80, 50, 32, 0, 32, 1.0, 3.0, [(8, 5)] + [x.getBlocksize(32, 80, 1, v_size) for x in blocksize_algs], generator.generateMTX(50, 80, 294), delta_dp)) -kernels.append(generator.SparseKernel("sve_arm_only_test4", 32, 32, 32, 34, 0, 32, 1.0, 0.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1, v_size) for x in blocksize_algs], generator.generateMTX(32, 32, 24), delta_dp)) -kernels.append(generator.SparseKernel("sve_arm_only_test5", 2, 1, 1, 2, 0, 8, 1.0, -1.0, [(2, 1)] + [x.getBlocksize(2, 1, 1, v_size) for x in blocksize_algs], generator.generateMTX(1, 1, 1), delta_dp)) -kernels.append(generator.SparseKernel("sve_arm_only_test6", 2, 2, 2, 2, 0, 2, 2.0, 234234.123, [(2, 1)] + [x.getBlocksize(2, 2, 1, v_size) for x in blocksize_algs], generator.generateMTX(2, 2, 1), delta_dp)) -kernels.append(generator.SparseKernel("sve_arm_only_test7", 16, 5, 7, 16, 0, 16, 0.0, -1.123, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1, v_size) for x in blocksize_algs], generator.generateMTX(7, 5, 35), delta_dp)) - -kernels.append(generator.DenseKernel("sve_arm_only_test8", 2, 3, 4, 2, 4, 2, 1.0, 
0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1, v_size) for x in blocksize_algs], delta_dp)) -kernels.append(generator.DenseKernel("sve_arm_only_test9", 2, 3, 4, 20, 12, 14, 2.0, 1.123, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1, v_size) for x in blocksize_algs], delta_dp)) -kernels.append(generator.DenseKernel("sve_arm_only_test10", 32, 80, 50, 32, 50, 32, 0.0, 0.2, [(8, 5)] + [x.getBlocksize(32, 80, 1, v_size) for x in blocksize_algs], delta_dp)) -kernels.append(generator.DenseKernel("sve_arm_only_test11", 32, 32, 32, 33, 68, 32, 1231.0, 14443.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1, v_size) for x in blocksize_algs], delta_dp)) -kernels.append(generator.DenseKernel("sve_arm_only_test12", 2, 1, 1, 2, 1, 8, 1.0, 3.0, [(2, 1)] + [x.getBlocksize(2, 1, 1, v_size) for x in blocksize_algs], delta_dp)) -kernels.append(generator.DenseKernel("sve_arm_only_test13", 2, 3, 3, 2, 3, 2, 1.0, 0.0, [(2, 1)] + [x.getBlocksize(2, 3, 1, v_size) for x in blocksize_algs], delta_dp)) -kernels.append(generator.DenseKernel("sve_arm_only_test14", 16, 5, 7, 16, 7, 16, 1.0, 1.0, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1, v_size) for x in blocksize_algs], delta_dp)) - -kernels.append(generator.DenseKernel("sve_arm_only_test15", 23, 29, 31, 23, 31, 23, 1.32, 0.96, [x.getBlocksize(23, 29, 1, v_size) for x in blocksize_algs], delta_dp)) -kernels.append(generator.SparseKernel("sve_arm_only_test16", 23, 29, 31, 23, 0, 23, 1.32, 0.96, [x.getBlocksize(23, 29, 1, v_size) for x in blocksize_algs], generator.generateMTX(31, 29, 61), delta_dp)) - -# test cases for single precision multiplication -kernels.append(generator.DenseKernelS("sve_single_prec_test_S1", 9, 9, 9, 9, 9, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size_s) for x in blocksize_algs], delta_sp)) -kernels.append(generator.DenseKernelS("sve_single_prec_test_S2", 15, 15, 15, 15, 15, 15, -3.14, 6.28, [x.getBlocksize(15, 15, 1, v_size_s) for x in blocksize_algs], delta_sp)) -kernels.append(generator.DenseKernelS("sve_single_prec_test_S3", 23, 23, 23, 23, 23, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size_s) for x in blocksize_algs], delta_sp)) -kernels.append(generator.DenseKernelS("sve_single_prec_test_S4", 23, 31, 13, 23, 13, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size_s) for x in blocksize_algs], delta_sp)) -kernels.append(generator.SparseKernelS("sve_single_prec_test_S5", 9, 9, 9, 9, 0, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(9, 9, 8), delta_sp)) -kernels.append(generator.SparseKernelS("sve_single_prec_test_S6", 15, 15, 15, 15, 0, 15, -3.14, 6.28, [x.getBlocksize(15, 15, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(15, 15, 22), delta_sp)) -kernels.append(generator.SparseKernelS("sve_single_prec_test_S7", 23, 23, 23, 23, 0, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(23, 23, 52), delta_sp)) -kernels.append(generator.SparseKernelS("sve_single_prec_test_S8", 23, 31, 13, 23, 0, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(13, 31, 40), delta_sp)) - -generator.make(kernels, f"arm_sve{bitlen}") diff --git a/tests/unit_tests_hsw.py b/tests/unit_tests_hsw.py deleted file mode 100755 index d9a60c7..0000000 --- a/tests/unit_tests_hsw.py +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env python3 - -import testsuite_generator as generator - -import pspamm.scripts.max_hsw as max_square -import pspamm.scripts.max_bn_hsw as max_bn -import pspamm.scripts.old_hsw as old - -blocksize_algs = 
[max_square, max_bn, old] - -kernels = [] -kernels.append(generator.SparseKernel("test1", 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], generator.generateMTX(56, 56, 30), 0.0000001)) -kernels.append(generator.DenseKernel("test2", 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8,2)] + [x.getBlocksize(8, 40, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("test3", 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.SparseKernel("hsw_only_test1", 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 1), 0.0000001)) -kernels.append(generator.SparseKernel("hsw_only_test2", 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2)] + [x.getBlocksize(24, 40, 2) for x in blocksize_algs], generator.generateMTX(40, 40, 20), 0.0000001)) - -kernels.append(generator.SparseKernel("hsw_only_test3", 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 2), 0.0000001)) -kernels.append(generator.SparseKernel("hsw_only_test4", 24, 20, 10, 40, 0, 24, 35.222, 0.0, [] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], generator.generateMTX(10, 20, 1), 0.0000001)) -kernels.append(generator.SparseKernel("hsw_only_test5", 64, 5, 10, 64, 0, 64, 2.3, 0.0, [] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], generator.generateMTX(10, 5, 1), 0.0000001)) -kernels.append(generator.SparseKernel("hsw_only_test6", 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], generator.generateMTX(1, 1, 1), 0.0000001)) -kernels.append(generator.SparseKernel("hsw_only_test7", 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8,1)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], generator.generateMTX(40, 24, 1), 0.0000001)) - -kernels.append(generator.DenseKernel("hsw_only_test8", 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("hsw_only_test9", 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2)] + [x.getBlocksize(32, 40, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("hsw_only_test10", 56, 28, 56, 56, 56, 56, 0.1, 3.0, [x.getBlocksize(56, 28, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("hsw_only_test11", 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8,3)] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("hsw_only_test12", 64, 5, 10, 64, 12, 64, 1.0, 1.0, [] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("hsw_only_test13", 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("hsw_only_test14", 8, 24, 40, 8, 41, 8, 2.0, 1.0, [] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], 0.0000001)) - -generator.make(kernels, "hsw") - - diff --git a/tests/unit_tests_knl.py b/tests/unit_tests_knl.py deleted file mode 100755 index 713f58e..0000000 --- a/tests/unit_tests_knl.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 - -import testsuite_generator as generator - -import pspamm.scripts.max_knl as max_square -import pspamm.scripts.max_bn_knl as max_bn -import pspamm.scripts.old_knl as old - -blocksize_algs = [max_square, max_bn, old] - -kernels = [] - 
-kernels.append(generator.SparseKernel("test1", 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], generator.generateMTX(56, 56, 30), 0.0000001)) -kernels.append(generator.DenseKernel("test2", 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("test3", 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.SparseKernel("knl_only_test1", 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 1), 0.0000001)) -kernels.append(generator.SparseKernel("knl_only_test2", 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2), (16,7)] + [x.getBlocksize(24, 40, 2) for x in blocksize_algs], generator.generateMTX(40, 40, 20), 0.0000001)) - -kernels.append(generator.SparseKernel("knl_only_test3", 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 2), 0.0000001)) -kernels.append(generator.SparseKernel("knl_only_test4", 24, 20, 10, 40, 0, 24, 35.222, 0.0, [(8, 20), (24,3)] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], generator.generateMTX(10, 20, 1), 0.0000001)) -kernels.append(generator.SparseKernel("knl_only_test5", 64, 5, 10, 64, 0, 64, 2.3, 0.0, [(32, 2), (8,14)] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], generator.generateMTX(10, 5, 1), 0.0000001)) -kernels.append(generator.SparseKernel("knl_only_test6", 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], generator.generateMTX(1, 1, 1), 0.0000001)) -kernels.append(generator.SparseKernel("knl_only_test7", 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8, 24), (8,1)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], generator.generateMTX(40, 24, 1), 0.0000001)) - -kernels.append(generator.DenseKernel("knl_only_test8", 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("knl_only_test9", 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2), (16,7)] + [x.getBlocksize(32, 40, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("knl_only_test10", 56, 28, 56, 56, 56, 56, 0.1, 3.0, [(8, 28)], 0.0000001)) -kernels.append(generator.DenseKernel("knl_only_test11", 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8, 20), (8,3)] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("knl_only_test12", 64, 5, 10, 64, 12, 64, 1.0, 1.0, [(32, 2), (8,14)] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("knl_only_test13", 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("knl_only_test14", 8, 24, 40, 8, 41, 8, 2.0, 1.0, [(8, 24)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], 0.0000001)) - -generator.make(kernels, "knl") - -
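
For reference, the block-size skip logic added to tests/testsuite_generator.py reduces to a per-architecture register-budget check on each (bm, bn, bk) candidate, with bm rounded up to whole vector registers via ceiling division. The standalone Python sketch below mirrors those conditions; the helper name, its signature, and the veclen/prec_bytes parameters are illustrative only and not part of the repository.

def fits_register_budget(arch, m, bm, bn, bk, prec_bytes, veclen):
    # veclen: vector register width in bits (e.g. 256 for hsw256, 512 for knl512)
    # prec_bytes: 8 for double precision, 4 for single precision
    reglen = veclen // 128
    v_size = (16 // prec_bytes) * reglen   # elements per full vector register
    elem128 = 16 // prec_bytes             # elements per 128-bit register
    vm = -(bm // -v_size)                  # ceiling division: registers per block column

    if arch.startswith("knl"):
        return (bn + bk) * vm <= 32
    if arch.startswith("hsw"):
        return (bn + bk) * vm + bn * bk <= 16 and m % v_size == 0 and bm % v_size == 0
    if arch.startswith("arm_sve"):         # must be tested before the plain "arm" prefix
        vkext = -(bk // -elem128)
        isvkext = bn * vkext < 16 if elem128 == 2 else bn * vkext < 8
        vk = vkext if isvkext else bk
        return (bn + bk) * vm + bn * vk <= 32
    if arch.startswith("arm"):
        vk = -(bk // -elem128)
        return (bn + bk) * vm + bn * vk <= 32 and m % v_size == 0 and bm % v_size == 0
    return False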
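
The new tests/unit_test.py drives every target from a single arch token (e.g. hsw256, knl512, arm128, arm_sve512): a regex splits the token into the architecture name and the register width in bits, and the per-precision vector length is derived from that width. A small sketch of that derivation follows; the regex pattern restores the named groups 'name' and 'prec' that the script reads back, and the example values are assumptions for illustration.

import re

def parse_arch(arch):
    # e.g. "arm_sve512" -> ("arm_sve", 512); "hsw256" -> ("hsw", 256)
    m = re.fullmatch(r'(?P<name>[a-zA-Z_]+)(?P<prec>\d+)', arch)
    return m.group('name'), int(m.group('prec'))

def v_size(bits, prec_bytes):
    # elements of the given precision that fit one full vector register
    v_len = bits // 128                    # number of 128-bit lanes
    return (16 // prec_bytes) * v_len

name, bits = parse_arch("arm_sve512")
print(name, bits, v_size(bits, 8), v_size(bits, 4))   # arm_sve 512 8 16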