From 0ce03c976e6835e38de0f207ca7c9862e7e8c1bc Mon Sep 17 00:00:00 2001 From: David Schneller Date: Tue, 23 Jul 2024 13:24:49 +0200 Subject: [PATCH 01/64] Add half precision --- codegen/architectures/arm_sve/generator.py | 22 ++++++++++++++++--- .../architectures/arm_sve/inlineprinter.py | 10 ++++++--- codegen/architectures/arm_sve/operands.py | 6 ++++- codegen/architectures/knl/generator.py | 2 ++ codegen/architectures/knl/inlineprinter.py | 17 +++++++++----- codegen/precision.py | 3 ++- 6 files changed, 46 insertions(+), 14 deletions(-) diff --git a/codegen/architectures/arm_sve/generator.py b/codegen/architectures/arm_sve/generator.py index ecc8950..36e66e1 100644 --- a/codegen/architectures/arm_sve/generator.py +++ b/codegen/architectures/arm_sve/generator.py @@ -42,6 +42,8 @@ def get_v_size(self): return 2 * self.v_len # 128 bit == 2 x 64 bit (double) elif self.precision == Precision.SINGLE: return 4 * self.v_len # 128 bit == 4 x 32 bit (float) + elif self.precision == Precision.HALF: + return 8 * self.v_len # 128 bit == 8 x 16 bit (half) raise NotImplementedError def get_precision(self): @@ -78,7 +80,11 @@ def set_sparse(self): def make_reg_blocks(self, bm: int, bn: int, bk: int, v_size: int, nnz: int, m: int, n: int, k: int): vm = self.ceil_div(bm, v_size) # vm can be 0 if bm < v_size -> makes ceil_div necessary assert ((bn + bk) * vm + bn * bk + 2 <= 32) # Needs to fit in SVE z registers - prec = "d" if self.get_precision() == Precision.DOUBLE else "s" + prec = { + Precision.DOUBLE: "d", + Precision.FLOAT: "s", + Precision.HALF: "h" + }[self.get_precision()] # use max(vm, 1) in case bm < v_size, otherwise we get no A_regs/C_regs A_regs = Matrix([[z(max(vm, 1) * c + r + 2, prec) for c in range(bk)] for r in range(max(vm, 1))]) @@ -131,8 +137,18 @@ def init_registers(self, bmmod = bm % v_size eol = "\\n\\t" # define the "end of line" sequence for easy assembly - p_suffix = "d" if v_size == 2 * self.v_len else "s" # determine whether predicate suffix is '.d' or 
'.s - gen_reg = "x" if v_size == 2 * self.v_len else "w" # determine if 'dup' registers are 64 bit or 32 bit + # determine the predicate suffix + p_suffix = { + Precision.DOUBLE: "d", + Precision.FLOAT: "s", + Precision.HALF: "h" + }[self.get_precision()] + # determine length of 'dup' registers + gen_reg = { + Precision.DOUBLE: "x", + Precision.FLOAT: "w", + Precision.HALF: "w" + }[self.get_precision()] overhead_counter = 6 # https://developer.arm.com/documentation/102374/0101/Registers-in-AArch64---general-purpose-registers diff --git a/codegen/architectures/arm_sve/inlineprinter.py b/codegen/architectures/arm_sve/inlineprinter.py index 8efd2cc..67564b8 100644 --- a/codegen/architectures/arm_sve/inlineprinter.py +++ b/codegen/architectures/arm_sve/inlineprinter.py @@ -19,6 +19,10 @@ def __init__(self, precision: Precision): self.output = [] self.stack = [] self.precision = precision + if stmt.dest.ugly_precision == "d": + self.ugly_precision = "d" + else: + self.ugly_precision = "w" def show(self): print("\n".join(self.output)) @@ -134,7 +138,7 @@ def visitLoad(self, stmt: LoadStmt): src_str = stmt.src.ugly if not stmt.is_B else stmt.src.ugly_no_vl_scaling p = self.p_string(stmt.pred) - prec = "d" if stmt.dest.ugly_precision == "d" else "w" + prec = self.ugly_precision if stmt.typ == AsmType.i64: s = "add {}, {}, {}".format(stmt.dest.ugly, stmt.dest.ugly, src_str) @@ -159,7 +163,7 @@ def visitStore(self, stmt: StoreStmt): dest_str = stmt.dest.ugly p = self.p_string(stmt.pred) - prec = "d" if stmt.src.ugly_precision == "d" else "w" + prec = self.ugly_precision if stmt.typ == AsmType.i64: s = "add {}, {}, {}".format(stmt.dest.ugly, stmt.dest.ugly, src_str) @@ -177,7 +181,7 @@ def visitPrefetch(self, stmt: PrefetchStmt): offset = stmt.dest.ugly_offset src_string = "[{}, {}, MUL VL]".format(xn, offset) p = self.p_string(stmt.pred) - prec = "d" if stmt.precision == Precision.DOUBLE else "w" + prec = self.ugly_precision s = "prf{} P{}{}{}, {}{}".format(prec, 
stmt.access_type, cache_level, temporality, p.split('/')[0], src_string) self.addLine(s, "prefetch from memory") diff --git a/codegen/architectures/arm_sve/operands.py b/codegen/architectures/arm_sve/operands.py index 16b3c94..747e343 100644 --- a/codegen/architectures/arm_sve/operands.py +++ b/codegen/architectures/arm_sve/operands.py @@ -52,7 +52,11 @@ def ugly_precision(self): @property def ugly_lsl_shift(self): - return 3 if self.ugly_precision == "d" else 2 + return { + "d": 3, + "s": 2, + "h": 1 + }[self.ugly_precision] @property def clobbered(self): diff --git a/codegen/architectures/knl/generator.py b/codegen/architectures/knl/generator.py index d7ac396..e198782 100644 --- a/codegen/architectures/knl/generator.py +++ b/codegen/architectures/knl/generator.py @@ -35,6 +35,8 @@ def get_v_size(self): return 8 elif self.precision == Precision.SINGLE: return 16 + elif self.precision == Precision.HALF: + return 32 raise NotImplementedError def get_template(self): diff --git a/codegen/architectures/knl/inlineprinter.py b/codegen/architectures/knl/inlineprinter.py index 1f32491..492c959 100644 --- a/codegen/architectures/knl/inlineprinter.py +++ b/codegen/architectures/knl/inlineprinter.py @@ -20,13 +20,16 @@ class InlinePrinter(Visitor): def __init__(self, precision: Precision): self.output = [] self.stack = [] - assert precision in [Precision.SINGLE, Precision.DOUBLE] - self.precision = 'd' if precision == Precision.DOUBLE else 's' + assert precision in [Precision.HALF, Precision.SINGLE, Precision.DOUBLE] + self.precision = { + Precision.DOUBLE: "d", + Precision.FLOAT: "s", + Precision.HALF: "h" + }[precision] def show(self): print("\n".join(self.output)) - def addLine(self, stmt: str, comment: str): line = " "*self.lmargin + self.indent*self.depth @@ -43,14 +46,16 @@ def addLine(self, stmt: str, comment: str): self.output.append(line) - - def visitFma(self, stmt: FmaStmt): b = stmt.bcast_src.ugly m = stmt.mult_src.ugly a = stmt.add_dest.ugly if stmt.bcast: - s = 
"vfmadd231p{} {}%{{1to{}%}}, {}, {}".format(self.precision, b, 8 if self.precision == 'd' else 16, m, a) + s = "vfmadd231p{} {}%{{1to{}%}}, {}, {}".format(self.precision, b, { + 'd': 8, + 's': 16, + 'h': 32 + }[self.precision], m, a) else: s = "vfmadd231p{} {}, {}, {}".format(self.precision, b,m,a) self.addLine(s, stmt.comment) diff --git a/codegen/precision.py b/codegen/precision.py index 0672b3d..e0b5546 100644 --- a/codegen/precision.py +++ b/codegen/precision.py @@ -3,9 +3,10 @@ class Precision(Enum): DOUBLE = 8 SINGLE = 4 + HALF = 2 @classmethod def getCType(cls, precision): - ctype = {cls.DOUBLE: 'double', cls.SINGLE: 'float'} + ctype = {cls.DOUBLE: 'double', cls.SINGLE: 'float', cls.HALF: 'half'} return ctype[precision] From 7dc330e64d75b346f287e94875bb016f4c97fd98 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Tue, 23 Jul 2024 13:41:34 +0200 Subject: [PATCH 02/64] Fix half precision, add full AVX10 support --- pspamm/codegen/architectures/hsw/generator.py | 27 ++++++++++++------- .../architectures/hsw/inlineprinter.py | 5 ++-- pspamm/codegen/architectures/knl/generator.py | 20 +++++++++----- .../architectures/knl/inlineprinter.py | 21 +++++++++++---- pspamm/codegen/operands.py | 26 ++++++++++++++++++ 5 files changed, 75 insertions(+), 24 deletions(-) diff --git a/pspamm/codegen/architectures/hsw/generator.py b/pspamm/codegen/architectures/hsw/generator.py index 0b5e87b..a3bc1c5 100644 --- a/pspamm/codegen/architectures/hsw/generator.py +++ b/pspamm/codegen/architectures/hsw/generator.py @@ -32,11 +32,13 @@ class Generator(AbstractGenerator): }}}}; """ + v_len = 2 + def get_v_size(self): if self.precision == Precision.DOUBLE: - return 4 + return 2 * self.v_len elif self.precision == Precision.SINGLE: - return 8 + return 4 * self.v_len raise NotImplementedError def get_template(self): @@ -47,19 +49,24 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: vm = bm//v_size assert((bn + bk) * vm + bn * bk <= 16) # Needs to fit 
in AVX/AVX2 ymm registers - A_regs = Matrix([[ymm(vm*c + r) for c in range(bk)] for r in range(vm)]) - B_regs = Matrix([[ymm(vm*bk + bn * r + c) for c in range(bn)] for r in range(bk)]) - C_regs = Matrix([[ymm(16 - vm*bn + vm*c + r) for c in range(bn)] + vmm = { + 1: xmm, + 2: ymm + }[self.v_len] + + A_regs = Matrix([[vmm(vm*c + r) for c in range(bk)] for r in range(vm)]) + B_regs = Matrix([[vmm(vm*bk + bn * r + c) for c in range(bn)] for r in range(bk)]) + C_regs = Matrix([[vmm(16 - vm*bn + vm*c + r) for c in range(bn)] for r in range(vm)]) - print([[ymm(vm*c + r ).ugly for c in range(bk)] for r in range(vm)]) - print([[ymm(vm*bk + bn * r + c).ugly for c in range(bn)] for r in range(bk)]) - print([[ymm(16 - vm*bn + vm*c + r).ugly for c in range(bn)] + print([[vmm(vm*c + r ).ugly for c in range(bk)] for r in range(vm)]) + print([[vmm(vm*bk + bn * r + c).ugly for c in range(bn)] for r in range(bk)]) + print([[vmm(16 - vm*bn + vm*c + r).ugly for c in range(bn)] for r in range(vm)]) starting_regs = [rdi, rsi, rdx, rbx, rcx] b_reg = vm*bk - alpha_reg = [xmm(b_reg), ymm(b_reg)] - beta_reg = [xmm(b_reg + 1), ymm(b_reg + 1)] + alpha_reg = [xmm(b_reg), vmm(b_reg)] + beta_reg = [xmm(b_reg + 1), vmm(b_reg + 1)] available_regs = [r(9),r(10),r(11),r(13),r(14),r(15),rax] diff --git a/pspamm/codegen/architectures/hsw/inlineprinter.py b/pspamm/codegen/architectures/hsw/inlineprinter.py index 32520df..48e10d2 100644 --- a/pspamm/codegen/architectures/hsw/inlineprinter.py +++ b/pspamm/codegen/architectures/hsw/inlineprinter.py @@ -20,11 +20,10 @@ class InlinePrinter(Visitor): def __init__(self, precision: Precision): self.output = [] self.stack = [] - assert precision in [Precision.HALF, Precision.SINGLE, Precision.DOUBLE] + assert precision in [Precision.SINGLE, Precision.DOUBLE] self.precision = { Precision.DOUBLE: "d", - Precision.FLOAT: "s", - Precision.HALF: "h" + Precision.FLOAT: "s" }[precision] def show(self): diff --git a/pspamm/codegen/architectures/knl/generator.py 
b/pspamm/codegen/architectures/knl/generator.py index 9bcf266..cade37f 100644 --- a/pspamm/codegen/architectures/knl/generator.py +++ b/pspamm/codegen/architectures/knl/generator.py @@ -32,13 +32,15 @@ class Generator(AbstractGenerator): }}}}; """ + v_len = 4 + def get_v_size(self): if self.precision == Precision.DOUBLE: - return 8 + return 2 * self.v_len elif self.precision == Precision.SINGLE: - return 16 + return 4 * self.v_len elif self.precision == Precision.HALF: - return 32 + return 8 * self.v_len raise NotImplementedError def get_template(self): @@ -47,11 +49,17 @@ def get_template(self): def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int): assert(bm % v_size == 0) vm = bm//v_size - assert((bn+bk) * vm <= 32) # Needs to fit in AVX512 zmm registers + assert((bn+bk) * vm <= 32) # Needs to fit in AVX512 xmm/ymm/zmm registers + + vmm = { + 1: xmm, + 2: ymm, + 4: zmm + }[self.v_len] - A_regs = Matrix([[zmm(vm*c + r) for c in range(bk)] for r in range(vm)]) + A_regs = Matrix([[vmm(vm*c + r) for c in range(bk)] for r in range(vm)]) B_regs = [] - C_regs = Matrix([[zmm(32 - vm*bn + vm*c + r) for c in range(bn)] + C_regs = Matrix([[vmm(32 - vm*bn + vm*c + r) for c in range(bn)] for r in range(vm)]) starting_regs = [rdi, rsi, rdx, rbx, rcx] diff --git a/pspamm/codegen/architectures/knl/inlineprinter.py b/pspamm/codegen/architectures/knl/inlineprinter.py index 0763294..fbfe2a0 100644 --- a/pspamm/codegen/architectures/knl/inlineprinter.py +++ b/pspamm/codegen/architectures/knl/inlineprinter.py @@ -20,8 +20,17 @@ class InlinePrinter(Visitor): def __init__(self, precision: Precision): self.output = [] self.stack = [] - assert precision in [Precision.SINGLE, Precision.DOUBLE] - self.precision = 'd' if precision == Precision.DOUBLE else 's' + assert precision in [Precision.HALF, Precision.SINGLE, Precision.DOUBLE] + self.precision = { + Precision.DOUBLE: 'd', + Precision.SINGLE: 's', + Precision.HALF: 'h' + }[precision] + 
self.broadcast_multiplier = { + Precision.DOUBLE: 2, + Precision.SINGLE: 4, + Precision.HALF: 8 + }[precision] def show(self): print("\n".join(self.output)) @@ -49,12 +58,13 @@ def visitFma(self, stmt: FmaStmt): b = stmt.bcast_src.ugly m = stmt.mult_src.ugly a = stmt.add_dest.ugly + regsize = stmt.add_dest.size() // 16 if stmt.bcast: - s = "vfmadd231p{} {}%{{1to{}%}}, {}, {}".format(self.precision, b, 8 if self.precision == 'd' else 16, m, a) + s = "vfmadd231p{} {}%{{1to{}%}}, {}, {}".format(self.precision, b, regsize * self.broadcast_multiplier, m, a) else: if stmt.mult_src.typeinfo == AsmType.i64: # in this case, m is a Register that points to alpha; manually format to be a memory address - s = "vfmadd231p{} 0({})%{{1to{}%}}, {}, {}".format(self.precision, m, 8 if self.precision == 'd' else 16, b, a) + s = "vfmadd231p{} 0({})%{{1to{}%}}, {}, {}".format(self.precision, m, regsize * self.broadcast_multiplier, b, a) else: s = "vfmadd231p{} {}, {}, {}".format(self.precision, b,m,a) self.addLine(s, stmt.comment) @@ -63,9 +73,10 @@ def visitMul(self, stmt: MulStmt): b = stmt.src.ugly m = stmt.mult_src.ugly a = stmt.dest.ugly + regsize = stmt.add_dest.size() // 16 if stmt.mult_src.typeinfo == AsmType.i64: # in this case, m is a Register that points to alpha/beta; manually format to be a memory address - s = "vmulp{} 0({})%{{1to{}%}}, {}, {}".format(self.precision, m, 8 if self.precision == 'd' else 16, b, a) + s = "vmulp{} 0({})%{{1to{}%}}, {}, {}".format(self.precision, m, regsize * self.broadcast_multiplier, b, a) else: s = "vmulp{} {}, {}, {}".format(self.precision, b,m,a) self.addLine(s, stmt.comment) diff --git a/pspamm/codegen/operands.py b/pspamm/codegen/operands.py index c22d2ea..0d42c1e 100644 --- a/pspamm/codegen/operands.py +++ b/pspamm/codegen/operands.py @@ -43,6 +43,32 @@ class Register(Operand): def __init__(self, typeinfo, value) -> None: self.typeinfo = typeinfo self.value = str(value) + + def size(self): + if self.typeinfo == AsmType.i8: + return 1 + 
if self.typeinfo == AsmType.i16: + return 2 + if self.typeinfo == AsmType.i32: + return 4 + if self.typeinfo == AsmType.i64: + return 8 + if self.typeinfo == AsmType.f32: + return 4 + if self.typeinfo == AsmType.f64: + return 8 + if self.typeinfo == AsmType.f32x4: + return 16 + if self.typeinfo == AsmType.f32x8: + return 32 + if self.typeinfo == AsmType.f32x16: + return 64 + if self.typeinfo == AsmType.f64x2: + return 16 + if self.typeinfo == AsmType.f64x4: + return 32 + if self.typeinfo == AsmType.f64x8: + return 64 @property def ugly(self): From bfb36777ceb4569b3e4771a4ce8631d196c66586 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Tue, 23 Jul 2024 13:59:26 +0200 Subject: [PATCH 03/64] Forward half precision, refactor a bit --- pspamm/codegen/architectures/arm/generator.py | 6 +++++ .../architectures/arm_sve/generator.py | 6 +++++ pspamm/codegen/architectures/hsw/generator.py | 6 +++++ pspamm/codegen/architectures/knl/generator.py | 6 +++++ pspamm/codegen/generator.py | 11 ++++++++ pspamm/matmul.py | 26 +++++++++++-------- pspamm/pspamm.py | 2 +- 7 files changed, 51 insertions(+), 12 deletions(-) diff --git a/pspamm/codegen/architectures/arm/generator.py b/pspamm/codegen/architectures/arm/generator.py index 08d1725..4b6639a 100644 --- a/pspamm/codegen/architectures/arm/generator.py +++ b/pspamm/codegen/architectures/arm/generator.py @@ -39,6 +39,12 @@ def get_v_size(self): def get_template(self): return Generator.template + def use_broadcast(self): + return True + + def has_masks(self): + return False + def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int): assert(bm % v_size == 0) vm = bm//v_size diff --git a/pspamm/codegen/architectures/arm_sve/generator.py b/pspamm/codegen/architectures/arm_sve/generator.py index 30430c2..7144cba 100644 --- a/pspamm/codegen/architectures/arm_sve/generator.py +++ b/pspamm/codegen/architectures/arm_sve/generator.py @@ -51,6 +51,12 @@ def get_precision(self): def get_template(self): 
return self.template + + def use_broadcast(self): + return True + + def has_masks(self): + return True def pred_n_trues(self, num_trues: int, v_size: int, suffix: str = None) -> Register_ARM: """pred takes num_trues=num of true elements and suffix=type of predicate (m or z) for merging or zeroing diff --git a/pspamm/codegen/architectures/hsw/generator.py b/pspamm/codegen/architectures/hsw/generator.py index a3bc1c5..0904bbd 100644 --- a/pspamm/codegen/architectures/hsw/generator.py +++ b/pspamm/codegen/architectures/hsw/generator.py @@ -44,6 +44,12 @@ def get_v_size(self): def get_template(self): return Generator.template + def use_broadcast(self): + return True + + def has_masks(self): + return False + def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int): assert(bm % v_size == 0) vm = bm//v_size diff --git a/pspamm/codegen/architectures/knl/generator.py b/pspamm/codegen/architectures/knl/generator.py index cade37f..4bee213 100644 --- a/pspamm/codegen/architectures/knl/generator.py +++ b/pspamm/codegen/architectures/knl/generator.py @@ -46,6 +46,12 @@ def get_v_size(self): def get_template(self): return Generator.template + def use_broadcast(self): + return False + + def has_masks(self): + return False # for now + def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int): assert(bm % v_size == 0) vm = bm//v_size diff --git a/pspamm/codegen/generator.py b/pspamm/codegen/generator.py index 51e852f..f2db7fb 100644 --- a/pspamm/codegen/generator.py +++ b/pspamm/codegen/generator.py @@ -9,6 +9,17 @@ def __init__(self, precision: Precision): def get_precision(self): return self.precision + + def set_sparse(self): + pass + + @abstractmethod + def use_broadcast(self): + pass + + @abstractmethod + def has_masks(self): + pass @abstractmethod def get_v_size(self): diff --git a/pspamm/matmul.py b/pspamm/matmul.py index 2204af3..69cfeb6 100644 --- a/pspamm/matmul.py +++ b/pspamm/matmul.py @@ -98,7 +98,7 @@ 
def __init__(self, if arch == 'skx': arch = 'knl' - + # hacky implementation of multi-register length if arch.startswith('arm_sve'): if len(arch) == 7: @@ -110,8 +110,12 @@ def __init__(self, arch = 'arm_sve' self.arch = arch - assert precision.lower() in ['s', 'd'] - self.precision = Precision.DOUBLE if precision.lower() == 'd' else Precision.SINGLE + assert precision.lower() in ['h', 's', 'd'] + self.precision = { + 'h' : Precision.HALF, + 's' : Precision.SINGLE, + 'd' : Precision.DOUBLE + }[precision.lower()] pspamm.architecture.init() pspamm.architecture.arch = arch @@ -121,11 +125,11 @@ def __init__(self, self.generator = pspamm.architecture.Generator(self.precision) # flag that determines if a matmul kernel uses sve instructions -> needed for sve predicates - self.is_sve = arch == "arm_sve" + self.masks = self.generator.has_masks() # define which architectures need to use an explicit broadcast, necessary for alpha/beta values - self.use_bcst = arch in ["arm", "arm_sve", "hsw"] + self.use_bcst = self.generator.use_broadcast() - if self.is_sve: + if arch.startswith('arm_sve'): self.generator.v_len = v_len_regs self.v_size = self.generator.get_v_size() @@ -159,7 +163,7 @@ def __init__(self, if ldb == 0: pattern = Matrix.load(mtx_filename) - if self.is_sve: + if self.masks: self.generator.set_sparse() else: mtx = numpy.zeros((k, n)) @@ -191,7 +195,7 @@ def __init__(self, prefetchReg = self.generator.init_prefetching(self.prefetching) # if matrices are always padded to multiple of v_size, we can remove the if-part and execute the assert for SVE too - if not self.is_sve: + if not self.masks: assert(self.m % self.v_size == 0) self.A_regs, self.B_regs, self.C_regs, self.starting_regs, self.alpha_reg, self.beta_reg, self.loop_reg, self.additional_regs = self.generator.make_reg_blocks(self.bm, self.bn, self.bk, self.v_size, self.nnz, self.m, self.n, self.k) @@ -216,7 +220,7 @@ def make_nk_unroll(self): Bn = self.n // self.bn Bk = self.k // self.bk # handle fringe case 
of SVE -> allow bm < v_size - vm = self.bm // self.v_size if not self.is_sve else self.generator.ceil_div(self.bm, self.v_size) + vm = self.bm // self.v_size if not self.masks else self.generator.ceil_div(self.bm, self.v_size) n_overhead = self.n % self.bn k_overhead = self.k % self.bk @@ -242,7 +246,7 @@ def make_nk_unroll(self): asm.add(bcst(self.beta_bcst_reg, self.beta_reg[1], "Broadcast beta")) for ic in range(regs.shape[1]): for ir in range(regs.shape[0]): - pred_m = None if not self.is_sve else self.generator.pred_n_trues(self.bm - ir * self.v_size, self.v_size, "m") + pred_m = None if not self.masks else self.generator.pred_n_trues(self.bm - ir * self.v_size, self.v_size, "m") asm.add(mul(regs[ir,ic], self.beta_reg[1], regs[ir,ic], "C = beta * C", pred=pred_m)) else: asm.add(self.generator.make_zero_block(regs, self.additional_regs)) @@ -271,7 +275,7 @@ def make_nk_unroll(self): for ir in range(A_regs_cut.shape[0]): for ic in range(A_regs_cut.shape[1]): - pred_m = None if not self.is_sve else self.generator.pred_n_trues(self.bm - ir*self.v_size, self.v_size, "m") + pred_m = None if not self.masks else self.generator.pred_n_trues(self.bm - ir*self.v_size, self.v_size, "m") if self.beta != 0.0 and self.beta != 1.0: store_block.add(mul(A_regs_cut[ir,ic], self.beta_reg[1], A_regs_cut[ir,ic], pred=pred_m)) if self.beta == 0.0: diff --git a/pspamm/pspamm.py b/pspamm/pspamm.py index 9260ec0..c1b5a68 100755 --- a/pspamm/pspamm.py +++ b/pspamm/pspamm.py @@ -50,7 +50,7 @@ def main() -> None: parser.add_argument("--bk", type=int, help="Size of k-blocks") parser.add_argument("--arch", help="Architecture", default="knl") - parser.add_argument("--precision", help="Single (s) or double (d) precision", default="d") + parser.add_argument("--precision", help="Precision of the matrix multiplication, either half (h), single (s), or double (d)", default="d") parser.add_argument("--prefetching", help="Prefetching") From 56287e739f75558489810b81b804b871b193134a Mon Sep 17 
00:00:00 2001 From: David Schneller Date: Tue, 23 Jul 2024 14:18:24 +0200 Subject: [PATCH 04/64] Forward new vector lengths --- pspamm/matmul.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/pspamm/matmul.py b/pspamm/matmul.py index 69cfeb6..2adba30 100644 --- a/pspamm/matmul.py +++ b/pspamm/matmul.py @@ -96,8 +96,8 @@ def __init__(self, except: self.beta = 'generic' - if arch == 'skx': - arch = 'knl' + if arch.startswith('skx'): + arch = 'knl' + arch[3:] # hacky implementation of multi-register length if arch.startswith('arm_sve'): @@ -108,6 +108,24 @@ def __init__(self, assert v_len_bits % 128 == 0 and v_len_bits <= 2048 v_len_regs = v_len_bits // 128 arch = 'arm_sve' + + if arch.startswith('knl'): + if len(arch) == 3: + v_len_regs = 4 + else: + v_len_bits = int(arch[3:]) + assert v_len_bits in (128, 256, 512) + v_len_regs = v_len_bits // 128 + arch = 'knl' + + if arch.startswith('hsw'): + if len(arch) == 3: + v_len_regs = 2 + else: + v_len_bits = int(arch[3:]) + assert v_len_bits in (128, 256) + v_len_regs = v_len_bits // 128 + arch = 'hsw' self.arch = arch assert precision.lower() in ['h', 's', 'd'] @@ -129,7 +147,7 @@ def __init__(self, # define which architectures need to use an explicit broadcast, necessary for alpha/beta values self.use_bcst = self.generator.use_broadcast() - if arch.startswith('arm_sve'): + if arch in ('arm_sve', 'hsw', 'knl'): self.generator.v_len = v_len_regs self.v_size = self.generator.get_v_size() From aa29c9bc3a3dcdd57e3b4f50030e6289f74c578b Mon Sep 17 00:00:00 2001 From: David Schneller Date: Tue, 6 Aug 2024 12:28:18 +0200 Subject: [PATCH 05/64] Prepare for BFLOAT16, AVX512 masks, single-precision NEON --- pspamm/codegen/analysis.py | 2 - pspamm/codegen/architectures/arm/generator.py | 21 ++++++--- .../architectures/arm/inlineprinter.py | 2 +- pspamm/codegen/architectures/arm/operands.py | 18 +++++--- .../architectures/arm_sve/generator.py | 44 +++++++++---------- 
.../architectures/arm_sve/inlineprinter.py | 11 +++-- .../codegen/architectures/arm_sve/operands.py | 6 +-- pspamm/codegen/architectures/hsw/generator.py | 11 +++-- .../architectures/hsw/inlineprinter.py | 4 +- pspamm/codegen/architectures/knl/generator.py | 30 +++++++------ .../architectures/knl/inlineprinter.py | 13 ++++-- pspamm/codegen/architectures/knl/operands.py | 11 +++-- pspamm/codegen/generator.py | 8 ++++ pspamm/codegen/precision.py | 12 ++++- pspamm/cursors/blockcursor.py | 4 +- pspamm/matmul.py | 5 +-- pspamm/pspamm.py | 2 +- pspamm/scripts/max_arm.py | 7 ++- pspamm/scripts/max_hsw.py | 7 ++- pspamm/scripts/max_knl.py | 7 ++- pspamm/scripts/old_hsw.py | 19 ++++---- pspamm/scripts/old_knl.py | 19 ++++---- 22 files changed, 146 insertions(+), 117 deletions(-) diff --git a/pspamm/codegen/analysis.py b/pspamm/codegen/analysis.py index 259c594..d12224d 100644 --- a/pspamm/codegen/analysis.py +++ b/pspamm/codegen/analysis.py @@ -55,5 +55,3 @@ def visitBlock(self, block: Block): stmt.accept(self) self.stack.pop() - - diff --git a/pspamm/codegen/architectures/arm/generator.py b/pspamm/codegen/architectures/arm/generator.py index 4b6639a..f87da77 100644 --- a/pspamm/codegen/architectures/arm/generator.py +++ b/pspamm/codegen/architectures/arm/generator.py @@ -44,21 +44,30 @@ def use_broadcast(self): def has_masks(self): return False + + def init_mask(self, bm, v_size, tempreg, maskregs): + return block() def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int): assert(bm % v_size == 0) vm = bm//v_size assert((bn+bk) * vm + bn * bk <= 32) # Needs to fit in NEON v registers - A_regs = Matrix([[v(vm*c + r) for c in range(bk)] for r in range(vm)]) - B_regs = Matrix([[v(vm*bk + bn * r + c) for c in range(bn)] for r in range(bk)]) - C_regs = Matrix([[v(32 - vm*bn + vm*c + r) for c in range(bn)] + prec = { + Precision.DOUBLE: "2D", + Precision.SINGLE: "4S", + Precision.HALF: "8H", + }[self.get_precision()] + + A_regs = 
Matrix([[v(vm*c + r, prec) for c in range(bk)] for r in range(vm)]) + B_regs = Matrix([[v(vm*bk + bn * r + c, prec) for c in range(bn)] for r in range(bk)]) + C_regs = Matrix([[v(32 - vm*bn + vm*c + r, prec) for c in range(bn)] for r in range(vm)]) # get vector register number of the first vector in B_regs b_reg = vm*bk - alpha_reg = [v(b_reg), v(b_reg)] - beta_reg = [v(b_reg + 1), v(b_reg + 1)] + alpha_reg = [v(b_reg, prec), v(b_reg, prec)] + beta_reg = [v(b_reg + 1, prec), v(b_reg + 1, prec)] starting_regs = [r(0), r(1), r(2), r(3), r(4)] @@ -67,7 +76,7 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: loop_reg = r(12) - return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_reg, additional_regs + return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_reg, additional_regs, [] def bcst_alpha_beta(self, diff --git a/pspamm/codegen/architectures/arm/inlineprinter.py b/pspamm/codegen/architectures/arm/inlineprinter.py index abdea08..7ecd7ba 100644 --- a/pspamm/codegen/architectures/arm/inlineprinter.py +++ b/pspamm/codegen/architectures/arm/inlineprinter.py @@ -20,7 +20,7 @@ class InlinePrinter(Visitor): def __init__(self, precision: Precision): self.output = [] self.stack = [] - assert precision == Precision.DOUBLE + assert precision in (Precision.HALF, Precision.SINGLE, Precision.DOUBLE) def show(self): print("\n".join(self.output)) diff --git a/pspamm/codegen/architectures/arm/operands.py b/pspamm/codegen/architectures/arm/operands.py index 3b9e3e0..5d2c643 100644 --- a/pspamm/codegen/architectures/arm/operands.py +++ b/pspamm/codegen/architectures/arm/operands.py @@ -48,6 +48,18 @@ class Register_ARM(Register): @property def ugly(self): return self.value + + @property + def ugly_precision(self): + return self.value.split(".")[1] + + @property + def ugly_lsl_shift(self): + return { + "d": 3, + "s": 2, + "h": 1 + }[self.ugly_precision] @property def clobbered(self): @@ -61,14 +73,10 @@ def 
ugly_scalar(self): def ugly_scalar_1d(self): return (self.value.split(".")[0]).replace("v", "d") - @property - def ugly_1d(self): - return self.value.replace("2d", "1d") - r = lambda n: Register_ARM(AsmType.i64, "x" + str(n)) xzr = Register_ARM(AsmType.i64, "xzr") -v = lambda n: Register_ARM(AsmType.f64x8, "v" + str(n) + ".2d") +v = lambda n, prec: Register_ARM(AsmType.f64x8, "v" + str(n) + "." + prec) class MemoryAddress_ARM(MemoryAddress): diff --git a/pspamm/codegen/architectures/arm_sve/generator.py b/pspamm/codegen/architectures/arm_sve/generator.py index 7144cba..ee467e4 100644 --- a/pspamm/codegen/architectures/arm_sve/generator.py +++ b/pspamm/codegen/architectures/arm_sve/generator.py @@ -38,13 +38,7 @@ class Generator(AbstractGenerator): v_len = 4 # vector register length: v_len * 128 bit def get_v_size(self): - if self.precision == Precision.DOUBLE: - return 2 * self.v_len # 128 bit == 2 x 64 bit (double) - elif self.precision == Precision.SINGLE: - return 4 * self.v_len # 128 bit == 4 x 32 bit (float) - elif self.precision == Precision.HALF: - return 8 * self.v_len # 128 bit == 8 x 16 bit (half) - raise NotImplementedError + return (16 // self.precision.size()) * self.v_len def get_precision(self): return self.precision @@ -75,10 +69,6 @@ def pred_n_trues(self, num_trues: int, v_size: int, suffix: str = None) -> Regis s = "p{}/{}".format(num_trues - 1, suffix) return Register_ARM(AsmType.p64x8, s) - # taken from https://stackoverflow.com/questions/14822184/is-there-a-ceiling-equivalent-of-operator-in-python - def ceil_div(self, n, d): - return -(n // -d) - # is called at most one time in matmul.py def set_sparse(self): self.is_sparse = True @@ -88,8 +78,9 @@ def make_reg_blocks(self, bm: int, bn: int, bk: int, v_size: int, nnz: int, m: i assert ((bn + bk) * vm + bn * bk <= 32) # Needs to fit in SVE z registers prec = { Precision.DOUBLE: "d", - Precision.FLOAT: "s", - Precision.HALF: "h" + Precision.SINGLE: "s", + Precision.HALF: "h", + 
Precision.BFLOAT16: "h", }[self.get_precision()] # use max(vm, 1) in case bm < v_size, otherwise we get no A_regs/C_regs @@ -107,9 +98,11 @@ def make_reg_blocks(self, bm: int, bn: int, bk: int, v_size: int, nnz: int, m: i loop_reg = r(12) + mask_regs = [p(0), p(7)] + self.init_registers(bm, v_size) - return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_reg, additional_regs + return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_reg, additional_regs, mask_regs def bcst_alpha_beta(self, alpha_reg: Register, @@ -136,6 +129,16 @@ def make_b_pointers(self, asm = block("No register based scaling") return asm + def init_mask(self, + bm: int, + v_size: int, + tempreg, + maskreg + ) -> Block: + + asm = block("No register based scaling") + return asm + def init_registers(self, bm: int, v_size: int @@ -147,15 +150,12 @@ def init_registers(self, # determine the predicate suffix p_suffix = { Precision.DOUBLE: "d", - Precision.FLOAT: "s", - Precision.HALF: "h" + Precision.SINGLE: "s", + Precision.HALF: "h", + Precision.BFLOAT16: "h", }[self.get_precision()] # determine length of 'dup' registers - gen_reg = { - Precision.DOUBLE: "x", - Precision.FLOAT: "w", - Precision.HALF: "w" - }[self.get_precision()] + gen_reg = "w" if self.get_precision().size() <= 4 else "x" overhead_counter = 6 comment = "//p7 denotes the 'all-true' predicate and, if given, p0 denotes the 'bm % v_size' predicate\n\t" @@ -305,7 +305,7 @@ def make_microkernel(self, bk, bn, bidx, bpattern = B.get_block(B_ptr, to_B_block) # tell sparse_mask() that we use sve - mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size, is_sve=True) + mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size, True) asm.add(self.move_register_block(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False)) # x = 0; diff --git a/pspamm/codegen/architectures/arm_sve/inlineprinter.py b/pspamm/codegen/architectures/arm_sve/inlineprinter.py 
index e03c2ae..73f9768 100644 --- a/pspamm/codegen/architectures/arm_sve/inlineprinter.py +++ b/pspamm/codegen/architectures/arm_sve/inlineprinter.py @@ -19,10 +19,9 @@ def __init__(self, precision: Precision): self.output = [] self.stack = [] self.precision = precision - if stmt.dest.ugly_precision == "d": - self.ugly_precision = "d" - else: - self.ugly_precision = "w" + self.ugly_precision = "w" if self.precision.size() <= 4 else "x" + + assert precision in (Precision.BFLOAT16, Precision.HALF, Precision.SINGLE, Precision.DOUBLE) def show(self): print("\n".join(self.output)) @@ -69,8 +68,8 @@ def visitBcst(self, stmt: BcstStmt): # Used to broadcast a scalar register into a vector register b = stmt.bcast_src.ugly a = stmt.dest.ugly - # make sure the src register is a W register when using single precision - if self.precision == Precision.SINGLE: + # make sure the src register is a W register when using single/half precision + if self.precision.size() <= 4: b = "w" + b[1:] s = "dup {}, {}".format(a, b) self.addLine(s, stmt.comment) diff --git a/pspamm/codegen/architectures/arm_sve/operands.py b/pspamm/codegen/architectures/arm_sve/operands.py index e016156..8c962b3 100644 --- a/pspamm/codegen/architectures/arm_sve/operands.py +++ b/pspamm/codegen/architectures/arm_sve/operands.py @@ -72,15 +72,11 @@ def ugly_scalar_1d(self): #turns "Vn.2d" into "Dn" return (self.value.split(".")[0]).replace("v", "d") - @property - def ugly_1d(self): - return self.value.replace("2d", "1d") - r = lambda n: Register_ARM(AsmType.i64, "x" + str(n)) xzr = Register_ARM(AsmType.i64, "xzr") z = lambda n, prec: Register_ARM(AsmType.f64x8, "z" + str(n) + "." 
+ prec) - +p = lambda n: Register_ARM(AsmType.i64, "p" + str(n)) class MemoryAddress_ARM(MemoryAddress): @property diff --git a/pspamm/codegen/architectures/hsw/generator.py b/pspamm/codegen/architectures/hsw/generator.py index 0904bbd..679ba8e 100644 --- a/pspamm/codegen/architectures/hsw/generator.py +++ b/pspamm/codegen/architectures/hsw/generator.py @@ -35,11 +35,7 @@ class Generator(AbstractGenerator): v_len = 2 def get_v_size(self): - if self.precision == Precision.DOUBLE: - return 2 * self.v_len - elif self.precision == Precision.SINGLE: - return 4 * self.v_len - raise NotImplementedError + return (16 // self.precision.size()) * self.v_len def get_template(self): return Generator.template @@ -50,6 +46,9 @@ def use_broadcast(self): def has_masks(self): return False + def init_mask(self, bm, v_size, tempreg, maskregs): + return block() + def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int): assert(bm % v_size == 0) vm = bm//v_size @@ -90,7 +89,7 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: loop_reg = r(12) - return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_reg, additional_regs + return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_reg, additional_regs, [] def bcst_alpha_beta(self, diff --git a/pspamm/codegen/architectures/hsw/inlineprinter.py b/pspamm/codegen/architectures/hsw/inlineprinter.py index 48e10d2..f39ab34 100644 --- a/pspamm/codegen/architectures/hsw/inlineprinter.py +++ b/pspamm/codegen/architectures/hsw/inlineprinter.py @@ -20,10 +20,10 @@ class InlinePrinter(Visitor): def __init__(self, precision: Precision): self.output = [] self.stack = [] - assert precision in [Precision.SINGLE, Precision.DOUBLE] + assert precision in (Precision.SINGLE, Precision.DOUBLE) self.precision = { Precision.DOUBLE: "d", - Precision.FLOAT: "s" + Precision.SINGLE: "s" }[precision] def show(self): diff --git 
a/pspamm/codegen/architectures/knl/generator.py b/pspamm/codegen/architectures/knl/generator.py index 4bee213..a928916 100644 --- a/pspamm/codegen/architectures/knl/generator.py +++ b/pspamm/codegen/architectures/knl/generator.py @@ -35,13 +35,7 @@ class Generator(AbstractGenerator): v_len = 4 def get_v_size(self): - if self.precision == Precision.DOUBLE: - return 2 * self.v_len - elif self.precision == Precision.SINGLE: - return 4 * self.v_len - elif self.precision == Precision.HALF: - return 8 * self.v_len - raise NotImplementedError + return (16 // self.precision.size()) * self.v_len def get_template(self): return Generator.template @@ -50,11 +44,10 @@ def use_broadcast(self): return False def has_masks(self): - return False # for now + return True def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int): - assert(bm % v_size == 0) - vm = bm//v_size + vm = self.ceil_div(bm, v_size) assert((bn+bk) * vm <= 32) # Needs to fit in AVX512 xmm/ymm/zmm registers vmm = { @@ -77,6 +70,8 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: additional_regs = [r(8)] + mask_regs = [mask(0)] + reg_count = 0 for i in range(1024, min(max(nnz * self.precision.value, m*k*self.precision.value, m*n*self.precision.value),8000), 2048): @@ -89,8 +84,18 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: loop_reg = r(12) - return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_reg, additional_regs + return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_reg, additional_regs, mask_regs + def init_mask(self, bm, v_size, tempreg, maskregs): + rest = bm % v_size + if rest == 0: + return block("") + else: + asm = block("Set mask register") + restval = (1 << rest) - 1 + asm.add(mov(restval, tempreg)) + asm.add(mov(tempreg, maskreg[0])) + return asm def bcst_alpha_beta(self, alpha_reg: Register, @@ -219,9 +224,8 @@ def make_microkernel(self, asm = block("Block 
GEMM microkernel") bm,bk,aidx,apattern = A.get_block(A_ptr, to_A_block) bk,bn,bidx,bpattern = B.get_block(B_ptr, to_B_block) - assert(bm % v_size == 0) - mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size) + mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size, True) asm.add(self.move_register_block(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False)) for Vmi in range(bm//v_size): diff --git a/pspamm/codegen/architectures/knl/inlineprinter.py b/pspamm/codegen/architectures/knl/inlineprinter.py index fbfe2a0..fe8aa1c 100644 --- a/pspamm/codegen/architectures/knl/inlineprinter.py +++ b/pspamm/codegen/architectures/knl/inlineprinter.py @@ -20,16 +20,18 @@ class InlinePrinter(Visitor): def __init__(self, precision: Precision): self.output = [] self.stack = [] - assert precision in [Precision.HALF, Precision.SINGLE, Precision.DOUBLE] + assert precision in (Precision.BFLOAT16, Precision.HALF, Precision.SINGLE, Precision.DOUBLE) self.precision = { Precision.DOUBLE: 'd', Precision.SINGLE: 's', - Precision.HALF: 'h' + Precision.HALF: 'h', + Precision.BFLOAT16: 'bf16' }[precision] self.broadcast_multiplier = { Precision.DOUBLE: 2, Precision.SINGLE: 4, - Precision.HALF: 8 + Precision.HALF: 8, + Precision.BFLOAT16: 8 }[precision] def show(self): @@ -110,7 +112,10 @@ def visitMov(self, stmt: MovStmt): src_str = stmt.src.ugly if stmt.typ == AsmType.i64: - s = "movq {}, {}".format(src_str,stmt.dest.ugly) + if stmt.dest.ugly[0] == 'k': + s = "kmovq {}, {}".format(src_str,stmt.dest.ugly) + else: + s = "movq {}, {}".format(src_str,stmt.dest.ugly) elif stmt.typ == AsmType.f64x8 and stmt.aligned: if isinstance(stmt.src, Constant) and stmt.src.value == 0: s = "vpxord {}, {}, {}".format(stmt.dest.ugly,stmt.dest.ugly,stmt.dest.ugly) diff --git a/pspamm/codegen/architectures/knl/operands.py b/pspamm/codegen/architectures/knl/operands.py index 3c9b09b..b7c450f 100644 --- a/pspamm/codegen/architectures/knl/operands.py 
+++ b/pspamm/codegen/architectures/knl/operands.py @@ -38,7 +38,6 @@ class Register_KNL(Register): def ugly(self): return "%%" + self.value - class MemoryAddress_KNL(MemoryAddress): def __init__(self, @@ -72,8 +71,8 @@ def mem(base, offset, index=None, scaling=None): rdi = Register_KNL(AsmType.i64, "rdi") rsi = Register_KNL(AsmType.i64, "rsi") -r = lambda n: Register_KNL(AsmType.i64, "r"+str(n)) if n > 7 else gen_regs[n] -xmm = lambda n: Register_KNL(AsmType.f64x2, "xmm"+str(n)) -ymm = lambda n: Register_KNL(AsmType.f64x4, "ymm"+str(n)) -zmm = lambda n: Register_KNL(AsmType.f64x8, "zmm"+str(n)) - +r = lambda n: Register_KNL(AsmType.i64, "r"+str(n)) if n > 7 else gen_regs[n] +xmm = lambda n: Register_KNL(AsmType.f64x2, "xmm"+str(n)) +ymm = lambda n: Register_KNL(AsmType.f64x4, "ymm"+str(n)) +zmm = lambda n: Register_KNL(AsmType.f64x8, "zmm"+str(n)) +mask = lambda n: Register_KNL(AsmType.i64, "k"+str(n)) diff --git a/pspamm/codegen/generator.py b/pspamm/codegen/generator.py index f2db7fb..a20247d 100644 --- a/pspamm/codegen/generator.py +++ b/pspamm/codegen/generator.py @@ -13,6 +13,14 @@ def get_precision(self): def set_sparse(self): pass + # taken from https://stackoverflow.com/questions/14822184/is-there-a-ceiling-equivalent-of-operator-in-python + def ceil_div(self, n, d): + return -(n // -d) + + @abstractmethod + def init_mask(self, bm, v_size, tempreg, maskreg): + pass + @abstractmethod def use_broadcast(self): pass diff --git a/pspamm/codegen/precision.py b/pspamm/codegen/precision.py index e0b5546..e88349e 100644 --- a/pspamm/codegen/precision.py +++ b/pspamm/codegen/precision.py @@ -4,9 +4,19 @@ class Precision(Enum): DOUBLE = 8 SINGLE = 4 HALF = 2 + BFLOAT16 = 2.1 @classmethod def getCType(cls, precision): - ctype = {cls.DOUBLE: 'double', cls.SINGLE: 'float', cls.HALF: 'half'} + ctype = {cls.DOUBLE: 'double', cls.SINGLE: 'float', cls.HALF: 'uint16_t', cls.BFLOAT16: 'uint16_t'} return ctype[precision] + def size(self): + return { + self.DOUBLE: 8, + 
self.SINGLE: 4, + self.HALF: 2, + self.BFLOAT16: 2 + }[self] + raise NotImplementedError() + diff --git a/pspamm/cursors/blockcursor.py b/pspamm/cursors/blockcursor.py index 65e14b4..de620e9 100644 --- a/pspamm/cursors/blockcursor.py +++ b/pspamm/cursors/blockcursor.py @@ -194,7 +194,7 @@ def sparse_mask(A_regs: Matrix[Register], B_ptr: CursorLocation, B_block_offset: Coords, v_size: int, - is_sve: bool = False + has_mask: bool = False ) -> Matrix[bool]: Vr, Vc = A_regs.shape @@ -202,7 +202,7 @@ def sparse_mask(A_regs: Matrix[Register], A_br, A_bc, A_idx, A_pat = A.get_block(A_ptr, A_block_offset) B_br, B_bc, B_idx, B_pat = B.get_block(B_ptr, B_block_offset) - if not is_sve: + if not has_mask: assert (Vr * v_size == A_br) # bm must tile m exactly for now in NEON and AVX512 assert(Vc >= A_bc) # Matrix block must fit in register block assert(A_bc == B_br) # Matrix blocks are compatible diff --git a/pspamm/matmul.py b/pspamm/matmul.py index 2adba30..ee38964 100644 --- a/pspamm/matmul.py +++ b/pspamm/matmul.py @@ -216,11 +216,10 @@ def __init__(self, if not self.masks: assert(self.m % self.v_size == 0) - self.A_regs, self.B_regs, self.C_regs, self.starting_regs, self.alpha_reg, self.beta_reg, self.loop_reg, self.additional_regs = self.generator.make_reg_blocks(self.bm, self.bn, self.bk, self.v_size, self.nnz, self.m, self.n, self.k) + self.A_regs, self.B_regs, self.C_regs, self.starting_regs, self.alpha_reg, self.beta_reg, self.loop_reg, self.additional_regs, self.mask_regs = self.generator.make_reg_blocks(self.bm, self.bn, self.bk, self.v_size, self.nnz, self.m, self.n, self.k) self.alpha_bcst_reg, self.beta_bcst_reg = self.starting_regs[3], self.starting_regs[4] - self.A = DenseCursor("A", self.starting_regs[0], self.m, self.k, self.lda, self.bm, self.bk, self.precision.value) self.B = BlockCursor("B", self.starting_regs[1], self.k, self.n, self.ldb, self.bk, self.bn, self.precision.value, blocks, patterns,mtx_overhead) self.C = DenseCursor("C", 
self.starting_regs[2], self.m, self.n, self.ldc, self.bm, self.bn, self.precision.value) @@ -342,6 +341,7 @@ def make(self): asm = block("unrolled_{}x{}x{}".format(self.m,self.n,self.k), self.generator.bcst_alpha_beta(self.alpha_reg, self.beta_reg), self.generator.make_scaling_offsets(self.additional_regs, self.nnz), + self.generator.init_mask(self.bm, self.v_size, self.loop_reg, self.mask_regs), loop(self.loop_reg, 0, Bm, 1).body(*loopBody) ) @@ -355,5 +355,4 @@ def make(self): self.A.r = self.m asm.add(self.make_nk_unroll()) - return asm diff --git a/pspamm/pspamm.py b/pspamm/pspamm.py index c1b5a68..3c6d3b4 100755 --- a/pspamm/pspamm.py +++ b/pspamm/pspamm.py @@ -17,7 +17,7 @@ def generate(alg: MatMul) -> None: block = alg.make() - text = make_cfunc(alg.output_funcname, alg.generator.get_template(), block, alg.flop, alg.starting_regs, alg.generator.get_precision()) + text = make_cfunc(alg.output_funcname, alg.generator.get_template(), block, alg.flop, alg.starting_regs + alg.mask_regs, alg.generator.get_precision()) if alg.output_filename is None: print(text) diff --git a/pspamm/scripts/max_arm.py b/pspamm/scripts/max_arm.py index 6cfd5c1..5e35680 100755 --- a/pspamm/scripts/max_arm.py +++ b/pspamm/scripts/max_arm.py @@ -1,4 +1,4 @@ -def getBlocksize(m , n, bk): +def getBlocksize(m , n, bk, v_size=2): bm = 2 bn = 1 @@ -6,7 +6,7 @@ def getBlocksize(m , n, bk): for i in range(2, m+1, 2): for j in range(1, n+1): - if ARM_condition(i, j, bk): + if ARM_condition(i, j, bk, v_size): if i*j > maxval: maxval = i*j bm = i @@ -15,8 +15,7 @@ def getBlocksize(m , n, bk): return (bm, bn) -def ARM_condition(bm, bn, bk): - v_size = 2 +def ARM_condition(bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) return (bn+bk) * vm + bn <= 32 diff --git a/pspamm/scripts/max_hsw.py b/pspamm/scripts/max_hsw.py index 156cd5b..ac4e094 100755 --- a/pspamm/scripts/max_hsw.py +++ b/pspamm/scripts/max_hsw.py @@ -1,4 +1,4 @@ -def getBlocksize(m , n, bk): +def getBlocksize(m , n, bk, 
v_size=4): bm = 4 bn = 1 @@ -6,7 +6,7 @@ def getBlocksize(m , n, bk): for i in range(4, m+1, 4): for j in range(1, n+1): - if HSW_condition(i, j, bk): + if HSW_condition(i, j, bk, v_size): if i*j > maxval: maxval = i*j bm = i @@ -15,8 +15,7 @@ def getBlocksize(m , n, bk): return (bm, bn) -def HSW_condition(bm, bn, bk): - v_size = 4 +def HSW_condition(bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) return (bn + bk) * vm + bn * bk <= 16 diff --git a/pspamm/scripts/max_knl.py b/pspamm/scripts/max_knl.py index e3e3ea7..b6b7f57 100755 --- a/pspamm/scripts/max_knl.py +++ b/pspamm/scripts/max_knl.py @@ -1,4 +1,4 @@ -def getBlocksize(m , n, bk): +def getBlocksize(m , n, bk, v_size=8): bm = 8 bn = 1 @@ -6,7 +6,7 @@ def getBlocksize(m , n, bk): for i in range(8, m+1, 8): for j in range(1, n+1): - if KNL_condition(i, j, bk): + if KNL_condition(i, j, bk, v_size): if i*j > maxval: maxval = i*j bm = i @@ -15,8 +15,7 @@ def getBlocksize(m , n, bk): return (bm, bn) -def KNL_condition(bm, bn, bk): - v_size = 8 +def KNL_condition(bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) return (bn+bk) * vm <= 32 diff --git a/pspamm/scripts/old_hsw.py b/pspamm/scripts/old_hsw.py index 7394c93..5f8ce6d 100755 --- a/pspamm/scripts/old_hsw.py +++ b/pspamm/scripts/old_hsw.py @@ -1,22 +1,22 @@ -def getBlocksize(m , n, bk): +def getBlocksize(m , n, bk, v_size=4): bm = m bn = n - if HSW_condition(bm, bn, bk): + if HSW_condition(bm, bn, bk, v_size): return (bm, bn) - while not HSW_condition(bm, bn, bk): - bm, bn = lowerToNextDiv(m, n, bm, bn) + while not HSW_condition(bm, bn, bk, v_size): + bm, bn = lowerToNextDiv(m, n, bm, bn, v_size) return (bm, bn) -def lowerToNextDiv(m, n, bm, bn): - if bm > bn and bm > 4: - bm -= 4 +def lowerToNextDiv(m, n, bm, bn, v_size): + if bm > bn and bm > v_size: + bm -= v_size while m % bm != 0: - bm -= 4 + bm -= v_size else: bn -= 1 while n % bn != 0: @@ -25,8 +25,7 @@ def lowerToNextDiv(m, n, bm, bn): return bm, bn -def HSW_condition(bm, bn, 
bk): - v_size = 4 +def HSW_condition(bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) return (bn + bk) * vm + bn * bk <= 16 diff --git a/pspamm/scripts/old_knl.py b/pspamm/scripts/old_knl.py index 3686b4c..d6ab5c2 100755 --- a/pspamm/scripts/old_knl.py +++ b/pspamm/scripts/old_knl.py @@ -1,22 +1,22 @@ -def getBlocksize(m , n, bk): +def getBlocksize(m , n, bk, v_size=8): bm = m bn = n - if KNL_condition(bm, bn, bk): + if KNL_condition(bm, bn, bk, v_size): return (bm, bn) - while not KNL_condition(bm, bn, bk): - bm, bn = lowerToNextDiv(m, n, bm, bn) + while not KNL_condition(bm, bn, bk, v_size): + bm, bn = lowerToNextDiv(m, n, bm, bn, v_size) return (bm, bn) -def lowerToNextDiv(m, n, bm, bn): - if bm > bn and bm > 8: - bm -= 8 +def lowerToNextDiv(m, n, bm, bn, v_size): + if bm > bn and bm > v_size: + bm -= v_size while m % bm != 0: - bm -= 8 + bm -= v_size else: bn -= 1 while n % bn != 0: @@ -25,8 +25,7 @@ def lowerToNextDiv(m, n, bm, bn): return bm, bn -def KNL_condition(bm, bn, bk): - v_size = 8 +def KNL_condition(bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) return (bn+bk) * vm <= 32 From 9265e897bc75954274a6e79a535bfe49e4671c4d Mon Sep 17 00:00:00 2001 From: David Schneller Date: Tue, 6 Aug 2024 14:59:43 +0200 Subject: [PATCH 06/64] Fix build --- pspamm/codegen/architectures/arm/generator.py | 2 +- pspamm/codegen/architectures/hsw/generator.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pspamm/codegen/architectures/arm/generator.py b/pspamm/codegen/architectures/arm/generator.py index f87da77..5931f47 100644 --- a/pspamm/codegen/architectures/arm/generator.py +++ b/pspamm/codegen/architectures/arm/generator.py @@ -46,7 +46,7 @@ def has_masks(self): return False def init_mask(self, bm, v_size, tempreg, maskregs): - return block() + return block("") def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int): assert(bm % v_size == 0) diff --git 
a/pspamm/codegen/architectures/hsw/generator.py b/pspamm/codegen/architectures/hsw/generator.py index 679ba8e..be40f4d 100644 --- a/pspamm/codegen/architectures/hsw/generator.py +++ b/pspamm/codegen/architectures/hsw/generator.py @@ -47,7 +47,7 @@ def has_masks(self): return False def init_mask(self, bm, v_size, tempreg, maskregs): - return block() + return block("") def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int): assert(bm % v_size == 0) From 0b51b2fdb178916ffbd9d10ed6d8a3fe6ba56122 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Tue, 6 Aug 2024 15:00:18 +0200 Subject: [PATCH 07/64] Autoselect bk --- pspamm/matmul.py | 18 +++++++++--------- pspamm/scripts/max_arm.py | 7 +++++-- pspamm/scripts/max_arm_sve.py | 5 ++++- pspamm/scripts/max_bn_hsw.py | 16 ---------------- pspamm/scripts/max_bn_knl.py | 5 ++++- pspamm/scripts/max_hsw.py | 7 +++++-- pspamm/scripts/max_knl.py | 14 +++++++++++--- pspamm/scripts/old_arm.py | 5 ++++- pspamm/scripts/old_hsw.py | 3 +++ pspamm/scripts/old_knl.py | 3 +++ 10 files changed, 48 insertions(+), 35 deletions(-) delete mode 100755 pspamm/scripts/max_bn_hsw.py diff --git a/pspamm/matmul.py b/pspamm/matmul.py index ee38964..b6aab73 100644 --- a/pspamm/matmul.py +++ b/pspamm/matmul.py @@ -7,7 +7,7 @@ import pspamm.scripts.old_arm import pspamm.scripts.max_bn_knl -import pspamm.scripts.max_bn_hsw +import pspamm.scripts.max_hsw import pspamm.scripts.max_arm_sve from pspamm.cursors import * @@ -128,11 +128,12 @@ def __init__(self, arch = 'hsw' self.arch = arch - assert precision.lower() in ['h', 's', 'd'] + assert precision.lower() in ['bf16', 'h', 's', 'd'] self.precision = { 'h' : Precision.HALF, 's' : Precision.SINGLE, - 'd' : Precision.DOUBLE + 'd' : Precision.DOUBLE, + 'bf16' : Precision.BFLOAT16 }[precision.lower()] pspamm.architecture.init() @@ -157,18 +158,17 @@ def __init__(self, if bm == None or bn == None: if arch == 'knl': - (self.bm, self.bn) = 
pspamm.scripts.max_bn_knl.getBlocksize(m, n, bk, self.v_size) + (self.bm, self.bn, self.bk) = pspamm.scripts.max_bn_knl.getBlocksize(m, n, bk, self.v_size) elif arch == 'hsw': - (self.bm, self.bn) = pspamm.scripts.max_bn_hsw.getBlocksize(m, n, bk, self.v_size) + (self.bm, self.bn, self.bk) = pspamm.scripts.max_hsw.getBlocksize(m, n, bk, self.v_size) elif arch == 'arm': - (self.bm, self.bn) = pspamm.scripts.old_arm.getBlocksize(m, n, bk, self.v_size) + (self.bm, self.bn, self.bk) = pspamm.scripts.old_arm.getBlocksize(m, n, bk, self.v_size) elif arch == 'arm_sve': - (self.bm, self.bn) = pspamm.scripts.max_arm_sve.getBlocksize(m, n, bk, self.v_size) + (self.bm, self.bn, self.bk) = pspamm.scripts.max_arm_sve.getBlocksize(m, n, bk, self.v_size) else: self.bm = bm self.bn = bn - - self.bk = bk + self.bk = bk self.prefetching = prefetching diff --git a/pspamm/scripts/max_arm.py b/pspamm/scripts/max_arm.py index 5e35680..a47a344 100755 --- a/pspamm/scripts/max_arm.py +++ b/pspamm/scripts/max_arm.py @@ -4,7 +4,7 @@ def getBlocksize(m , n, bk, v_size=2): bn = 1 maxval = 0 - for i in range(2, m+1, 2): + for i in range(v_size, m+1, v_size): for j in range(1, n+1): if ARM_condition(i, j, bk, v_size): if i*j > maxval: @@ -12,7 +12,10 @@ def getBlocksize(m , n, bk, v_size=2): bm = i bn = j - return (bm, bn) + while ARM_condition(bm, bn, bk+1, v_size): + bk += 1 + + return (bm, bn, bk) def ARM_condition(bm, bn, bk, v_size): diff --git a/pspamm/scripts/max_arm_sve.py b/pspamm/scripts/max_arm_sve.py index c8064a5..f57132a 100644 --- a/pspamm/scripts/max_arm_sve.py +++ b/pspamm/scripts/max_arm_sve.py @@ -18,7 +18,10 @@ def getBlocksize(m, n, bk, v_size=2): if maxval == 0: raise RuntimeError("Could not find an appropriate block size. 
We suggest padding the matrix dimensions") - return (bm, bn) + while ARM_condition(bm, bn, bk+1, v_size): + bk += 1 + + return (bm, bn, bk) def ARM_condition(bm, bn, bk, v_size): diff --git a/pspamm/scripts/max_bn_hsw.py b/pspamm/scripts/max_bn_hsw.py deleted file mode 100755 index 7cc6284..0000000 --- a/pspamm/scripts/max_bn_hsw.py +++ /dev/null @@ -1,16 +0,0 @@ -def getBlocksize(m, n, bk, v_size=4): - - bm = v_size - bn = 1 - - for j in range(1, n+1): - if HSW_condition(bm, j, bk, v_size): - bn = j - - return (bm, bn) - - -def HSW_condition(bm, bn, bk, v_size): - # ceiling division - vm = -(bm // -v_size) - return (bn + bk) * vm + bn * bk <= 16 diff --git a/pspamm/scripts/max_bn_knl.py b/pspamm/scripts/max_bn_knl.py index 2074d83..2483484 100755 --- a/pspamm/scripts/max_bn_knl.py +++ b/pspamm/scripts/max_bn_knl.py @@ -7,7 +7,10 @@ def getBlocksize(m, n, bk, v_size=8): if KNL_condition(bm, j, bk, v_size): bn = j - return (bm, bn) + while KNL_condition(bm, bn, bk+1, v_size): + bk += 1 + + return (bm, bn, bk) def KNL_condition(bm, bn, bk, v_size): diff --git a/pspamm/scripts/max_hsw.py b/pspamm/scripts/max_hsw.py index ac4e094..f131c26 100755 --- a/pspamm/scripts/max_hsw.py +++ b/pspamm/scripts/max_hsw.py @@ -4,7 +4,7 @@ def getBlocksize(m , n, bk, v_size=4): bn = 1 maxval = 0 - for i in range(4, m+1, 4): + for i in range(v_size, m+1, v_size): for j in range(1, n+1): if HSW_condition(i, j, bk, v_size): if i*j > maxval: @@ -12,7 +12,10 @@ def getBlocksize(m , n, bk, v_size=4): bm = i bn = j - return (bm, bn) + while HSW_condition(bm, bn, bk+1, v_size): + bk += 1 + + return (bm, bn, bk) def HSW_condition(bm, bn, bk, v_size): diff --git a/pspamm/scripts/max_knl.py b/pspamm/scripts/max_knl.py index b6b7f57..d75b1ab 100755 --- a/pspamm/scripts/max_knl.py +++ b/pspamm/scripts/max_knl.py @@ -4,18 +4,26 @@ def getBlocksize(m , n, bk, v_size=8): bn = 1 maxval = 0 - for i in range(8, m+1, 8): + for i in range(1, m+1): + next_multiple = -(bm // -v_size) for j in range(1, n+1): 
- if KNL_condition(i, j, bk, v_size): + if KNL_condition(next_multiple, j, bk, v_size) and tileable(m, bm): if i*j > maxval: maxval = i*j bm = i bn = j + + while KNL_condition(bm, bn, bk+1, v_size): + bk += 1 - return (bm, bn) + return (bm, bn, bk) def KNL_condition(bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) return (bn+bk) * vm <= 32 + +def tileable(m, bm): + return m % bm == 0 + diff --git a/pspamm/scripts/old_arm.py b/pspamm/scripts/old_arm.py index 48622f0..de1ba2e 100755 --- a/pspamm/scripts/old_arm.py +++ b/pspamm/scripts/old_arm.py @@ -9,7 +9,10 @@ def getBlocksize(m , n, bk, v_size=2): while not ARM_condition(bm, bn, bk, v_size): bm, bn = lowerToNextDiv(m, n, bm, bn, v_size) - return (bm, bn) + while ARM_condition(bm, bn, bk+1, v_size): + bk += 1 + + return (bm, bn, bk) def lowerToNextDiv(m, n, bm, bn, v_size): diff --git a/pspamm/scripts/old_hsw.py b/pspamm/scripts/old_hsw.py index 5f8ce6d..0e4c3a9 100755 --- a/pspamm/scripts/old_hsw.py +++ b/pspamm/scripts/old_hsw.py @@ -9,6 +9,9 @@ def getBlocksize(m , n, bk, v_size=4): while not HSW_condition(bm, bn, bk, v_size): bm, bn = lowerToNextDiv(m, n, bm, bn, v_size) + while HSW_condition(bm, bn, bk+1, v_size): + bk += 1 + return (bm, bn) diff --git a/pspamm/scripts/old_knl.py b/pspamm/scripts/old_knl.py index d6ab5c2..8aff4a7 100755 --- a/pspamm/scripts/old_knl.py +++ b/pspamm/scripts/old_knl.py @@ -9,6 +9,9 @@ def getBlocksize(m , n, bk, v_size=8): while not KNL_condition(bm, bn, bk, v_size): bm, bn = lowerToNextDiv(m, n, bm, bn, v_size) + while KNL_condition(bm, bn, bk+1, v_size): + bk += 1 + return (bm, bn) From 3dbc2764942b1eb92dd3b5141a40d97d29312eca Mon Sep 17 00:00:00 2001 From: David Schneller Date: Tue, 6 Aug 2024 15:02:06 +0200 Subject: [PATCH 08/64] Fix register counting --- pspamm/scripts/max_arm.py | 2 +- pspamm/scripts/max_arm_sve.py | 2 +- pspamm/scripts/old_arm.py | 6 ++++-- pspamm/scripts/old_hsw.py | 2 ++ pspamm/scripts/old_knl.py | 2 ++ 5 files changed, 10 insertions(+), 4 
deletions(-) diff --git a/pspamm/scripts/max_arm.py b/pspamm/scripts/max_arm.py index a47a344..a8e4cf6 100755 --- a/pspamm/scripts/max_arm.py +++ b/pspamm/scripts/max_arm.py @@ -21,4 +21,4 @@ def getBlocksize(m , n, bk, v_size=2): def ARM_condition(bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) - return (bn+bk) * vm + bn <= 32 + return (bn+bk) * vm + bn*bk <= 32 diff --git a/pspamm/scripts/max_arm_sve.py b/pspamm/scripts/max_arm_sve.py index f57132a..f1df1f8 100644 --- a/pspamm/scripts/max_arm_sve.py +++ b/pspamm/scripts/max_arm_sve.py @@ -27,7 +27,7 @@ def getBlocksize(m, n, bk, v_size=2): def ARM_condition(bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) - return (bn + bk) * vm + bn <= 32 + return (bn + bk) * vm + bn*bk <= 32 def tileable(m, bm): diff --git a/pspamm/scripts/old_arm.py b/pspamm/scripts/old_arm.py index de1ba2e..e7bb884 100755 --- a/pspamm/scripts/old_arm.py +++ b/pspamm/scripts/old_arm.py @@ -4,7 +4,9 @@ def getBlocksize(m , n, bk, v_size=2): bn = n if ARM_condition(bm, bn, bk, v_size): - return (bm, bn) + while ARM_condition(bm, bn, bk+1, v_size): + bk += 1 + return (bm, bn, bk) while not ARM_condition(bm, bn, bk, v_size): bm, bn = lowerToNextDiv(m, n, bm, bn, v_size) @@ -31,4 +33,4 @@ def lowerToNextDiv(m, n, bm, bn, v_size): def ARM_condition(bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) - return (bn+bk) * vm + bn <= 32 + return (bn+bk) * vm + bn*bk <= 32 diff --git a/pspamm/scripts/old_hsw.py b/pspamm/scripts/old_hsw.py index 0e4c3a9..9e2ff6f 100755 --- a/pspamm/scripts/old_hsw.py +++ b/pspamm/scripts/old_hsw.py @@ -4,6 +4,8 @@ def getBlocksize(m , n, bk, v_size=4): bn = n if HSW_condition(bm, bn, bk, v_size): + while HSW_condition(bm, bn, bk+1, v_size): + bk += 1 return (bm, bn) while not HSW_condition(bm, bn, bk, v_size): diff --git a/pspamm/scripts/old_knl.py b/pspamm/scripts/old_knl.py index 8aff4a7..dd075f4 100755 --- a/pspamm/scripts/old_knl.py +++ b/pspamm/scripts/old_knl.py @@ -4,6 +4,8 @@ 
def getBlocksize(m , n, bk, v_size=8): bn = n if KNL_condition(bm, bn, bk, v_size): + while KNL_condition(bm, bn, bk+1, v_size): + bk += 1 return (bm, bn) while not KNL_condition(bm, bn, bk, v_size): From 3ad7718eecf9ab0bc7275474344edc258c96e417 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Tue, 6 Aug 2024 15:25:38 +0200 Subject: [PATCH 09/64] Add extra strategy for hsw --- pspamm/codegen/architectures/hsw/generator.py | 53 ++++++++++++++++--- pspamm/scripts/max_hsw.py | 2 +- 2 files changed, 46 insertions(+), 9 deletions(-) diff --git a/pspamm/codegen/architectures/hsw/generator.py b/pspamm/codegen/architectures/hsw/generator.py index be40f4d..8196f96 100644 --- a/pspamm/codegen/architectures/hsw/generator.py +++ b/pspamm/codegen/architectures/hsw/generator.py @@ -52,21 +52,29 @@ def init_mask(self, bm, v_size, tempreg, maskregs): def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int): assert(bm % v_size == 0) vm = bm//v_size - assert((bn + bk) * vm + bn * bk <= 16) # Needs to fit in AVX/AVX2 ymm registers + + # Needs to fit in AVX/AVX2 ymm registers + if (bn + bk) * vm + bn * bk <= 16: + self.preloadA = True + else: + self.preloadA = False + assert(bn * vm + bn * bk + 1 <= 16) vmm = { 1: xmm, 2: ymm }[self.v_len] - A_regs = Matrix([[vmm(vm*c + r) for c in range(bk)] for r in range(vm)]) - B_regs = Matrix([[vmm(vm*bk + bn * r + c) for c in range(bn)] for r in range(bk)]) + if self.preloadA: + A_regs = Matrix([[vmm(vm*c + r) for c in range(bk)] for r in range(vm)]) + Aoffset = vm*bk + else: + A_regs = Matrix([[vmm(0) for c in range(bk)] for r in range(vm)]) + Aoffset = 1 + + B_regs = Matrix([[vmm(Aoffset + bn * r + c) for c in range(bn)] for r in range(bk)]) C_regs = Matrix([[vmm(16 - vm*bn + vm*c + r) for c in range(bn)] for r in range(vm)]) - print([[vmm(vm*c + r ).ugly for c in range(bk)] for r in range(vm)]) - print([[vmm(vm*bk + bn * r + c).ugly for c in range(bn)] for r in range(bk)]) - print([[vmm(16 - vm*bn + vm*c 
+ r).ugly for c in range(bn)] - for r in range(vm)]) starting_regs = [rdi, rsi, rdx, rbx, rcx] b_reg = vm*bk @@ -184,6 +192,30 @@ def move_register_block(self, asm.add(mov(addr, registers[ir,ic], True, comment)) return asm + def move_register_single(self, + cursor: Cursor, + cursor_ptr: CursorLocation, + block_offset: Coords, + registers: Matrix[Register], + v_size: int, + additional_regs, + ir, + ic, + mask: Matrix[bool] = None, + store: bool = False, + prefetching: str = None, + load_offset: int = 0 + ) -> Block: + + asm = block("") + + if (mask is None) or (mask[ir,ic]): + cell_offset = Coords(down=ir*v_size, right=ic) + addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) + addr.disp += self.precision.value * load_offset + asm.add(mov(addr, registers[ir,ic], True, comment)) + return asm + def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block: rows, cols = registers.shape @@ -222,7 +254,10 @@ def make_microkernel(self, assert(bm % v_size == 0) mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size) - asm.add(self.move_register_block(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False)) + if self.preloadA: + asm.add(self.move_register_block(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False)) + else: + asm.add(self.move_register_single(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, 0, 0, mask, store=False)) bs = [] bsv = [] @@ -243,6 +278,8 @@ def make_microkernel(self, for Vmi in range(bm//v_size): for bki in range(bk): # inside this k-block + if not self.preloadA and not (Vmi, bki) == (0,0): + asm.add(self.move_register_single(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, Vmi, bki, mask, store=False)) for bni in range(bn): # inside this n-block to_cell = Coords(down=bki, right=bni) if B.has_nonzero_cell(B_ptr, to_B_block, to_cell): diff --git a/pspamm/scripts/max_hsw.py b/pspamm/scripts/max_hsw.py index f131c26..ec2f59e 100755 --- 
a/pspamm/scripts/max_hsw.py +++ b/pspamm/scripts/max_hsw.py @@ -21,4 +21,4 @@ def getBlocksize(m , n, bk, v_size=4): def HSW_condition(bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) - return (bn + bk) * vm + bn * bk <= 16 + return bn * vm + bn * bk + 1 <= 16 From 8e604f2e9b67f3c160bcd74f707f8e7a8d9b5545 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Tue, 6 Aug 2024 15:32:00 +0200 Subject: [PATCH 10/64] Adjust block condition --- pspamm/scripts/max_hsw.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pspamm/scripts/max_hsw.py b/pspamm/scripts/max_hsw.py index ec2f59e..14f7b74 100755 --- a/pspamm/scripts/max_hsw.py +++ b/pspamm/scripts/max_hsw.py @@ -7,7 +7,7 @@ def getBlocksize(m , n, bk, v_size=4): for i in range(v_size, m+1, v_size): for j in range(1, n+1): if HSW_condition(i, j, bk, v_size): - if i*j > maxval: + if i*j > maxval and (HSW_condition_safe(i, j, bk, v_size) or j > 1): maxval = i*j bm = i bn = j @@ -17,6 +17,10 @@ def getBlocksize(m , n, bk, v_size=4): return (bm, bn, bk) +def HSW_condition_safe(bm, bn, bk, v_size): + # ceiling division + vm = -(bm // -v_size) + return (bn + bk) * vm + bn * bk <= 16 def HSW_condition(bm, bn, bk, v_size): # ceiling division From c51fa7c4b77ec6ccd985978f85ff7515ac4dd23f Mon Sep 17 00:00:00 2001 From: David Schneller Date: Tue, 6 Aug 2024 15:34:26 +0200 Subject: [PATCH 11/64] Fix register offset --- pspamm/codegen/architectures/hsw/generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pspamm/codegen/architectures/hsw/generator.py b/pspamm/codegen/architectures/hsw/generator.py index 8196f96..3a7917b 100644 --- a/pspamm/codegen/architectures/hsw/generator.py +++ b/pspamm/codegen/architectures/hsw/generator.py @@ -77,7 +77,7 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: for r in range(vm)]) starting_regs = [rdi, rsi, rdx, rbx, rcx] - b_reg = vm*bk + b_reg = Aoffset alpha_reg = [xmm(b_reg), vmm(b_reg)] beta_reg = 
[xmm(b_reg + 1), vmm(b_reg + 1)] From 7152b9ce8c2bd2532cba4bd3900e35bca8ef3f77 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Tue, 6 Aug 2024 17:01:13 +0200 Subject: [PATCH 12/64] Compact dense matrix multiplication --- pspamm/codegen/architectures/hsw/generator.py | 6 +- pspamm/codegen/forms.py | 25 ++++--- pspamm/cursors/densecursor.py | 12 +++- pspamm/matmul.py | 66 +++++++++++++------ pspamm/scripts/max_hsw.py | 2 +- 5 files changed, 79 insertions(+), 32 deletions(-) diff --git a/pspamm/codegen/architectures/hsw/generator.py b/pspamm/codegen/architectures/hsw/generator.py index 3a7917b..47bceae 100644 --- a/pspamm/codegen/architectures/hsw/generator.py +++ b/pspamm/codegen/architectures/hsw/generator.py @@ -81,7 +81,7 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: alpha_reg = [xmm(b_reg), vmm(b_reg)] beta_reg = [xmm(b_reg + 1), vmm(b_reg + 1)] - available_regs = [r(9),r(10),r(11),r(13),r(14),r(15),rax] + available_regs = [r(9),r(10),r(11),r(15),rax,r(13),r(14)] additional_regs = [r(8)] @@ -95,9 +95,9 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: additional_regs.append(available_regs[reg_count]) reg_count += 1 - loop_reg = r(12) + loop_regs = [r(12), r(13), r(14)] - return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_reg, additional_regs, [] + return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, [] def bcst_alpha_beta(self, diff --git a/pspamm/codegen/forms.py b/pspamm/codegen/forms.py index 4b87fa8..ddd5991 100644 --- a/pspamm/codegen/forms.py +++ b/pspamm/codegen/forms.py @@ -12,7 +12,8 @@ def __init__(self, initial_val: int, final_val: int, increment: int = 1, - body_contents: Block = None + body_contents: Block = None, + unroll: int = 1 ) -> None: self.iteration_var = iteration_var @@ -20,6 +21,8 @@ def __init__(self, self.final_val = final_val self.increment = increment self.body_contents = body_contents + 
self.unroll = unroll + assert self.unroll == 1 or self.initial_val == 0 self.label = "loop_top_" + str(len(Loop._labels)) Loop._labels.append(self.label) @@ -29,16 +32,22 @@ def __init__(self, @property def contents(self): + onestep = [*(self.body_contents.contents), + add(self.increment, self.iteration_var)] + body = [] + rest = [] + for _ in range(self.unroll): + body += onestep + for _ in range(self.final_val % self.unroll): + rest += onestep + corrected_final_val = (self.final_val // self.unroll) * self.unroll return [mov(self.initial_val, self.iteration_var, vector=False), - label(self.label), - *(self.body_contents.contents), - add(self.increment, self.iteration_var), - cmp(self.final_val, self.iteration_var), - jump(self.label, backwards=True)] + label(self.label)] + body + [cmp(corrected_final_val, self.iteration_var), + jump(self.label, backwards=True)] + rest def body(self, *args): self.body_contents = block("Loop body", *args) return self -def loop(iter_var, initial_val, final_val, increment): - return Loop(iter_var, initial_val, final_val, increment) +def loop(iter_var, initial_val, final_val, increment, unroll=1): + return Loop(iter_var, initial_val, final_val, increment, unroll=unroll) diff --git a/pspamm/cursors/densecursor.py b/pspamm/cursors/densecursor.py index ad1f3a7..841cb58 100644 --- a/pspamm/cursors/densecursor.py +++ b/pspamm/cursors/densecursor.py @@ -119,4 +119,14 @@ def get_block(self, src: CursorLocation=None, dest_block: Coords=None) -> BlockI pattern = cast(Matrix[bool], pattern) return BlockInfo(br, bc, index, pattern) - + def has_nonzero_block(self, src: CursorLocation, dest_block: Coords) -> bool: + return True + + def has_nonzero_cell(self, + src_loc: CursorLocation, + dest_block: Coords, + dest_cell: Coords) -> bool: + return True + + def start(self) -> CursorLocation: + return CursorLocation() diff --git a/pspamm/matmul.py b/pspamm/matmul.py index b6aab73..eca8907 100644 --- a/pspamm/matmul.py +++ b/pspamm/matmul.py @@ -216,20 
+216,25 @@ def __init__(self, if not self.masks: assert(self.m % self.v_size == 0) - self.A_regs, self.B_regs, self.C_regs, self.starting_regs, self.alpha_reg, self.beta_reg, self.loop_reg, self.additional_regs, self.mask_regs = self.generator.make_reg_blocks(self.bm, self.bn, self.bk, self.v_size, self.nnz, self.m, self.n, self.k) + self.A_regs, self.B_regs, self.C_regs, self.starting_regs, self.alpha_reg, self.beta_reg, self.loop_regs, self.additional_regs, self.mask_regs = self.generator.make_reg_blocks(self.bm, self.bn, self.bk, self.v_size, self.nnz, self.m, self.n, self.k) self.alpha_bcst_reg, self.beta_bcst_reg = self.starting_regs[3], self.starting_regs[4] self.A = DenseCursor("A", self.starting_regs[0], self.m, self.k, self.lda, self.bm, self.bk, self.precision.value) - self.B = BlockCursor("B", self.starting_regs[1], self.k, self.n, self.ldb, self.bk, self.bn, self.precision.value, blocks, patterns,mtx_overhead) + if ldb == 0: + self.B = BlockCursor("B", self.starting_regs[1], self.k, self.n, self.ldb, self.bk, self.bn, self.precision.value, blocks, patterns,mtx_overhead) + else: + self.B = DenseCursor("B", self.starting_regs[1], self.k, self.n, self.ldb, self.bk, self.bn, self.precision.value) self.C = DenseCursor("C", self.starting_regs[2], self.m, self.n, self.ldc, self.bm, self.bn, self.precision.value) self.C_pf = DenseCursor("C_pf", prefetchReg, self.m, self.n, self.ldc, self.bm, self.bn, self.precision.value) if prefetchReg else None + self.unroll = ldb == 0 - def make_nk_unroll(self): + + def make_nk_unroll(self, unroll=True): asm = block("Unrolling over bn and bk") - A_ptr = CursorLocation() + A_ptr = self.A.start() B_ptr = self.B.start() C_ptr = CursorLocation() C_pf_ptr = CursorLocation() @@ -249,8 +254,7 @@ def make_nk_unroll(self): asm.add(self.generator.make_b_pointers(self.starting_regs[1], self.additional_regs, self.nnz)) - for Bni in range(0, Bn): - + def kernelN(asm, Bni, C_ptr): regs = self.C_regs if Bni + 1 == Bn and n_overhead > 0: @@ 
-268,14 +272,30 @@ def make_nk_unroll(self): else: asm.add(self.generator.make_zero_block(regs, self.additional_regs)) - for Bki in range(0,Bk): - - to_A = Coords(right=Bki) - to_B = Coords(right=Bni, down=Bki, absolute=True) - - if self.B.has_nonzero_block(B_ptr, to_B): + def kernelK(asm, Bki): + if unroll: + to_A = Coords(right=Bki) + to_B = Coords(right=Bni, down=Bki, absolute=True) + keep = self.B.has_nonzero_block(B_ptr, to_B) + else: + to_A = Coords() + to_B = Coords() + keep = True + + if keep: asm.add(self.generator.make_microkernel(self.A, self.B, A_ptr, B_ptr, self.A_regs, self.B_regs, regs, self.v_size, self.additional_regs, to_A, to_B)) + if unroll: + for Bki in range(Bk): + kernelK(asm, Bki) + else: + loopblock = block("microkernel") + kernelK(loopblock, 0) + loopblock.add(self.B.move(B_ptr, Coords(down=1))[0]) + loopblock.add(self.A.move(B_ptr, Coords(right=1))[0]) + asm.add(loop(self.loop_regs[2], 0, Bk, 1, unroll=4).body(loopblock)) + asm.add(self.A.move(B_ptr, Coords(right=1-Bk))[0]) + if self.alpha != 1.0: store_block = block("") @@ -312,13 +332,21 @@ def make_nk_unroll(self): move_C_pf, C_pf_ptr = self.C_pf.move(C_pf_ptr, Coords(right=1)) asm.add(move_C_pf) + if unroll: + for Bni in range(0, Bn): + kernelN(asm, Bni, C_ptr) + else: + if Bn > 1: + loopblock = block("microkernel") + kernelN(loopblock, 0, C_ptr) + asm.add(loop(self.loop_regs[1], 0, Bn-1, 1).body(loopblock)) + asm.add(self.B.move(B_ptr, Coords(right=1))[0]) + kernelN(asm, Bn-1, C_ptr) + asm.add(self.B.move(B_ptr, Coords(right=1-Bn))[0]) return asm - - def make(self): - A_ptr = CursorLocation() C_ptr = CursorLocation() C_pf_ptr = CursorLocation() @@ -331,7 +359,7 @@ def make(self): Bn += 1 loopBody = [ - self.make_nk_unroll(), + self.make_nk_unroll(self.unroll), self.A.move(A_ptr, Coords(down=1))[0], self.C.move(C_ptr, Coords(down=1, right=1-Bn))[0] ] @@ -341,8 +369,8 @@ def make(self): asm = block("unrolled_{}x{}x{}".format(self.m,self.n,self.k), 
self.generator.bcst_alpha_beta(self.alpha_reg, self.beta_reg), self.generator.make_scaling_offsets(self.additional_regs, self.nnz), - self.generator.init_mask(self.bm, self.v_size, self.loop_reg, self.mask_regs), - loop(self.loop_reg, 0, Bm, 1).body(*loopBody) + self.generator.init_mask(self.bm, self.v_size, self.loop_regs[0], self.mask_regs), + loop(self.loop_regs[0], 0, Bm, 1).body(*loopBody) ) vm_overhead = (self.m % self.bm) // self.v_size @@ -353,6 +381,6 @@ def make(self): self.A_regs = self.A_regs[0:self.bm // self.v_size, 0:self.bk] self.C_regs = self.C_regs[0:self.bm // self.v_size, 0:self.bn] self.A.r = self.m - asm.add(self.make_nk_unroll()) + asm.add(self.make_nk_unroll(self.unroll)) return asm diff --git a/pspamm/scripts/max_hsw.py b/pspamm/scripts/max_hsw.py index 14f7b74..d433e8a 100755 --- a/pspamm/scripts/max_hsw.py +++ b/pspamm/scripts/max_hsw.py @@ -12,7 +12,7 @@ def getBlocksize(m , n, bk, v_size=4): bm = i bn = j - while HSW_condition(bm, bn, bk+1, v_size): + while HSW_condition_safe(bm, bn, bk+1, v_size): bk += 1 return (bm, bn, bk) From e8fb559a41603ee613705f3d8d7b108abca007c3 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Tue, 6 Aug 2024 19:36:33 +0200 Subject: [PATCH 13/64] Bugfixing --- pspamm/matmul.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/pspamm/matmul.py b/pspamm/matmul.py index eca8907..f782e92 100644 --- a/pspamm/matmul.py +++ b/pspamm/matmul.py @@ -254,7 +254,7 @@ def make_nk_unroll(self, unroll=True): asm.add(self.generator.make_b_pointers(self.starting_regs[1], self.additional_regs, self.nnz)) - def kernelN(asm, Bni, C_ptr): + def kernelN(asm, Bni, A_ptr, C_ptr): regs = self.C_regs if Bni + 1 == Bn and n_overhead > 0: @@ -278,6 +278,9 @@ def kernelK(asm, Bki): to_B = Coords(right=Bni, down=Bki, absolute=True) keep = self.B.has_nonzero_block(B_ptr, to_B) else: + # setting A_ptr, B_ptr here may be a bit too hacky... 
+ A_ptr = CursorLocation(Coords(right=Bki, absolute=True)) + B_ptr = CursorLocation(Coords(right=Bni, down=Bki, absolute=True)) to_A = Coords() to_B = Coords() keep = True @@ -292,9 +295,11 @@ def kernelK(asm, Bki): loopblock = block("microkernel") kernelK(loopblock, 0) loopblock.add(self.B.move(B_ptr, Coords(down=1))[0]) - loopblock.add(self.A.move(B_ptr, Coords(right=1))[0]) - asm.add(loop(self.loop_regs[2], 0, Bk, 1, unroll=4).body(loopblock)) - asm.add(self.A.move(B_ptr, Coords(right=1-Bk))[0]) + loopblock.add(self.A.move(A_ptr, Coords(right=1))[0]) + asm.add(loop(self.loop_regs[2], 0, Bk-1, 1, unroll=4).body(loopblock)) + kernelK(asm, Bk-1) + asm.add(self.B.move(B_ptr, Coords(down=1-Bk))[0]) + asm.add(self.A.move(A_ptr, Coords(right=1-Bk))[0]) if self.alpha != 1.0: store_block = block("") @@ -338,10 +343,10 @@ def kernelK(asm, Bki): else: if Bn > 1: loopblock = block("microkernel") - kernelN(loopblock, 0, C_ptr) + kernelN(loopblock, 0, A_ptr, C_ptr) + loopblock.add(self.B.move(B_ptr, Coords(right=1))[0]) asm.add(loop(self.loop_regs[1], 0, Bn-1, 1).body(loopblock)) - asm.add(self.B.move(B_ptr, Coords(right=1))[0]) - kernelN(asm, Bn-1, C_ptr) + kernelN(asm, Bn-1, A_ptr, C_ptr) asm.add(self.B.move(B_ptr, Coords(right=1-Bn))[0]) return asm From 16921875b8bbd6937d0343eab2b967774baa0a10 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Tue, 6 Aug 2024 19:41:48 +0200 Subject: [PATCH 14/64] Fix pointers --- pspamm/matmul.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pspamm/matmul.py b/pspamm/matmul.py index f782e92..d2c265e 100644 --- a/pspamm/matmul.py +++ b/pspamm/matmul.py @@ -254,7 +254,7 @@ def make_nk_unroll(self, unroll=True): asm.add(self.generator.make_b_pointers(self.starting_regs[1], self.additional_regs, self.nnz)) - def kernelN(asm, Bni, A_ptr, C_ptr): + def kernelN(asm, Bni, A_ptr, B_ptr, C_ptr): regs = self.C_regs if Bni + 1 == Bn and n_overhead > 0: @@ -272,7 +272,7 @@ def kernelN(asm, Bni, A_ptr, C_ptr): else: 
asm.add(self.generator.make_zero_block(regs, self.additional_regs)) - def kernelK(asm, Bki): + def kernelK(asm, Bki, A_ptr, B_ptr): if unroll: to_A = Coords(right=Bki) to_B = Coords(right=Bni, down=Bki, absolute=True) @@ -293,11 +293,11 @@ def kernelK(asm, Bki): kernelK(asm, Bki) else: loopblock = block("microkernel") - kernelK(loopblock, 0) + kernelK(loopblock, 0, A_ptr, B_ptr) loopblock.add(self.B.move(B_ptr, Coords(down=1))[0]) loopblock.add(self.A.move(A_ptr, Coords(right=1))[0]) asm.add(loop(self.loop_regs[2], 0, Bk-1, 1, unroll=4).body(loopblock)) - kernelK(asm, Bk-1) + kernelK(asm, Bk-1, A_ptr, B_ptr) asm.add(self.B.move(B_ptr, Coords(down=1-Bk))[0]) asm.add(self.A.move(A_ptr, Coords(right=1-Bk))[0]) @@ -339,14 +339,14 @@ def kernelK(asm, Bki): if unroll: for Bni in range(0, Bn): - kernelN(asm, Bni, C_ptr) + kernelN(asm, Bni, A_ptr, B_ptr, C_ptr) else: if Bn > 1: loopblock = block("microkernel") - kernelN(loopblock, 0, A_ptr, C_ptr) + kernelN(loopblock, 0, A_ptr, B_ptr, C_ptr) loopblock.add(self.B.move(B_ptr, Coords(right=1))[0]) asm.add(loop(self.loop_regs[1], 0, Bn-1, 1).body(loopblock)) - kernelN(asm, Bn-1, A_ptr, C_ptr) + kernelN(asm, Bn-1, A_ptr, B_ptr, C_ptr) asm.add(self.B.move(B_ptr, Coords(right=1-Bn))[0]) return asm From cefc13ab3f9fefdacf29d327567c14b4ca9a1c17 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Tue, 6 Aug 2024 19:51:08 +0200 Subject: [PATCH 15/64] Bugfixes --- pspamm/codegen/architectures/arm/generator.py | 4 ++-- pspamm/codegen/architectures/arm_sve/generator.py | 4 ++-- pspamm/codegen/architectures/knl/generator.py | 6 +++--- pspamm/matmul.py | 2 +- pspamm/scripts/max_arm.py | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pspamm/codegen/architectures/arm/generator.py b/pspamm/codegen/architectures/arm/generator.py index 5931f47..89de2e3 100644 --- a/pspamm/codegen/architectures/arm/generator.py +++ b/pspamm/codegen/architectures/arm/generator.py @@ -74,9 +74,9 @@ def make_reg_blocks(self, bm:int, bn:int, 
bk:int, v_size:int, nnz:int, m:int, n: additional_regs = [r(11), xzr] - loop_reg = r(12) + loop_regs = [r(12), r(13), r(14)] - return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_reg, additional_regs, [] + return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, [] def bcst_alpha_beta(self, diff --git a/pspamm/codegen/architectures/arm_sve/generator.py b/pspamm/codegen/architectures/arm_sve/generator.py index ee467e4..ac56177 100644 --- a/pspamm/codegen/architectures/arm_sve/generator.py +++ b/pspamm/codegen/architectures/arm_sve/generator.py @@ -96,13 +96,13 @@ def make_reg_blocks(self, bm: int, bn: int, bk: int, v_size: int, nnz: int, m: i additional_regs = [r(11), l("0.0"), r(10), r(8)] # r10 used for scaling offsets - loop_reg = r(12) + loop_regs = [r(12), r(13), r(14)] mask_regs = [p(0), p(7)] self.init_registers(bm, v_size) - return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_reg, additional_regs, mask_regs + return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, mask_regs def bcst_alpha_beta(self, alpha_reg: Register, diff --git a/pspamm/codegen/architectures/knl/generator.py b/pspamm/codegen/architectures/knl/generator.py index a928916..ad8a6ae 100644 --- a/pspamm/codegen/architectures/knl/generator.py +++ b/pspamm/codegen/architectures/knl/generator.py @@ -66,7 +66,7 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: alpha_reg = [rbx, rbx] beta_reg = [rcx, rcx] - available_regs = [r(9),r(10),r(11),r(13),r(14),r(15),rax] + available_regs = [r(9),r(10),r(11),r(15),rax,r(13),r(14)] additional_regs = [r(8)] @@ -82,9 +82,9 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: additional_regs.append(available_regs[reg_count]) reg_count += 1 - loop_reg = r(12) + loop_reg = [r(12), r(13), r(14)] - return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_reg, 
additional_regs, mask_regs + return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, mask_regs def init_mask(self, bm, v_size, tempreg, maskregs): rest = bm % v_size diff --git a/pspamm/matmul.py b/pspamm/matmul.py index d2c265e..fcace55 100644 --- a/pspamm/matmul.py +++ b/pspamm/matmul.py @@ -290,7 +290,7 @@ def kernelK(asm, Bki, A_ptr, B_ptr): if unroll: for Bki in range(Bk): - kernelK(asm, Bki) + kernelK(asm, Bki, A_ptr, B_ptr) else: loopblock = block("microkernel") kernelK(loopblock, 0, A_ptr, B_ptr) diff --git a/pspamm/scripts/max_arm.py b/pspamm/scripts/max_arm.py index a8e4cf6..9d7bba0 100755 --- a/pspamm/scripts/max_arm.py +++ b/pspamm/scripts/max_arm.py @@ -13,7 +13,7 @@ def getBlocksize(m , n, bk, v_size=2): bn = j while ARM_condition(bm, bn, bk+1, v_size): - bk += 1 + bk += 1 return (bm, bn, bk) From 3f67d54b9d2114169785cefd3b28eb34af9a7b17 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Tue, 6 Aug 2024 19:53:07 +0200 Subject: [PATCH 16/64] Rollback condition extension --- pspamm/scripts/max_hsw.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pspamm/scripts/max_hsw.py b/pspamm/scripts/max_hsw.py index d433e8a..80c5825 100755 --- a/pspamm/scripts/max_hsw.py +++ b/pspamm/scripts/max_hsw.py @@ -6,23 +6,24 @@ def getBlocksize(m , n, bk, v_size=4): for i in range(v_size, m+1, v_size): for j in range(1, n+1): + # can be replaced by HSW_condition_extended here if HSW_condition(i, j, bk, v_size): - if i*j > maxval and (HSW_condition_safe(i, j, bk, v_size) or j > 1): + if i*j > maxval and (HSW_condition(i, j, bk, v_size) or j > 1): maxval = i*j bm = i bn = j - while HSW_condition_safe(bm, bn, bk+1, v_size): + while HSW_condition(bm, bn, bk+1, v_size): bk += 1 return (bm, bn, bk) -def HSW_condition_safe(bm, bn, bk, v_size): +def HSW_condition(bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) return (bn + bk) * vm + bn * bk <= 16 -def HSW_condition(bm, bn, bk, v_size): +def 
HSW_condition_extended(bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) return bn * vm + bn * bk + 1 <= 16 From f19ea1b218b374f9cdccbc1fd2ffa9664863de9f Mon Sep 17 00:00:00 2001 From: David Schneller Date: Tue, 6 Aug 2024 20:50:00 +0200 Subject: [PATCH 17/64] Begin adjusting tests --- tests/sve_testsuite_generator.py | 21 ++++---- tests/testsuite_generator.py | 4 +- tests/unit_tests_arm.py | 42 +++++++-------- tests/unit_tests_arm_sve.py | 92 ++++++++++++++++++-------------- tests/unit_tests_hsw.py | 41 +++++++------- tests/unit_tests_knl.py | 38 ++++++------- 6 files changed, 122 insertions(+), 116 deletions(-) diff --git a/tests/sve_testsuite_generator.py b/tests/sve_testsuite_generator.py index 6b7cfe3..a3830ee 100644 --- a/tests/sve_testsuite_generator.py +++ b/tests/sve_testsuite_generator.py @@ -9,11 +9,8 @@ BASEDIR = 'build' -SparseKernel = namedtuple('SparseKernel', 'name m n k lda ldb ldc alpha beta block_sizes mtx delta') -DenseKernel = namedtuple('DenseKernel', 'name m n k lda ldb ldc alpha beta block_sizes delta') - -SparseKernelS = namedtuple('SparseKernelS', 'name m n k lda ldb ldc alpha beta block_sizes mtx delta') -DenseKernelS = namedtuple('DenseKernelS', 'name m n k lda ldb ldc alpha beta block_sizes delta') +SparseKernel = test_generator.SparseKernel +DenseKernel = test_generator.DenseKernel setup_prefetching = """ template @@ -39,10 +36,10 @@ def make(kernels, arch): arguments = ['pspamm-generator', str(kern.m), str(kern.n), str(kern.k), str(kern.lda), str(kern.ldb), str(kern.ldc), str(kern.alpha), str(kern.beta)] - if isinstance(kern, SparseKernel) or isinstance(kern, SparseKernelS): + if isinstance(kern, SparseKernel): arguments += ['--mtx_filename', kern.mtx] - prec = 's' if isinstance(kern, SparseKernelS) or isinstance(kern, DenseKernelS) else 'd' + prec = 's' if kern.precision == Precision.SINGLE else 'd' arguments += ['--precision', prec] if prec == 's': include_single_prec = True @@ -61,7 +58,7 @@ def make(kernels, arch): 
veclen = int(arch[7:]) assert veclen % 128 == 0 and veclen <= 2048 reglen = veclen // 128 - v_len = 2 * reglen if prec == 'd' else 4 * reglen + v_len = (16 // kern.precision.size()) * reglen # this should be the same assertion as in ../scripts/max_arm_sve.py bk = 1 # ceiling division @@ -103,13 +100,13 @@ def make(kernels, arch): bm = bs[0] bn = bs[1] - prec = 's' if isinstance(kern, SparseKernelS) or isinstance(kern, DenseKernelS) else 'd' + prec = 's' if kern.precision == Precision.SINGLE else 'd' if arch.startswith("arm_sve"): veclen = int(arch[7:]) assert veclen % 128 == 0 and veclen <= 2048 reglen = veclen // 128 - v_len = 2 * reglen if prec == 'd' else 4 * reglen + v_len = (16 // kern.precision.size()) * reglen # this should be the same assertion as in ../scripts/max_arm_sve.py bk = 1 # ceiling division @@ -120,12 +117,12 @@ def make(kernels, arch): name = kern.name + '_' + str(bm) + '_' + str(bn) - if isinstance(kern, SparseKernel) or isinstance(kern, SparseKernelS): + if isinstance(kern, SparseKernel): mtx = kern.mtx else: mtx = "" # for double precision: set prec to '' to conform to test_generator.function_definitions - prec = 'f' if isinstance(kern, SparseKernelS) or isinstance(kern, DenseKernelS) else '' + prec = 'f' if kern.precision == Precision.SINGLE else '' f.write(""" {p}alpha = {alpha}; {p}beta = {beta}; ldb = {ldb}; diff --git a/tests/testsuite_generator.py b/tests/testsuite_generator.py index 9457038..dfc39a9 100755 --- a/tests/testsuite_generator.py +++ b/tests/testsuite_generator.py @@ -7,8 +7,8 @@ BASEDIR = 'build' -SparseKernel = namedtuple('SparseKernel', 'name m n k lda ldb ldc alpha beta block_sizes mtx delta') -DenseKernel = namedtuple('DenseKernel', 'name m n k lda ldb ldc alpha beta block_sizes delta') +SparseKernel = namedtuple('SparseKernel', 'name precision m n k lda ldb ldc alpha beta block_sizes mtx delta') +DenseKernel = namedtuple('DenseKernel', 'name precision m n k lda ldb ldc alpha beta block_sizes delta') head_of_testsuite = 
"""#include #include diff --git a/tests/unit_tests_arm.py b/tests/unit_tests_arm.py index 3d39dcd..2b1ff04 100755 --- a/tests/unit_tests_arm.py +++ b/tests/unit_tests_arm.py @@ -9,27 +9,27 @@ kernels = [] -kernels.append(generator.DenseKernel("test3", 4, 4, 4, 4, 4, 4, 2.0, 2.0, [(4, 4)], 0.0000001)) - -kernels.append(generator.SparseKernel("test1", 8, 56, 56, 8, 0, 8, 1.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 1) for x in blocksize_algs], generator.generateMTX(56, 56, 30), 0.0000001)) -kernels.append(generator.DenseKernel("test2", 8, 40, 40, 8, 40, 8, 3.0, 2.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 1) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("test3", 8, 56, 56, 8, 56, 8, 0.0, 0.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 1) for x in blocksize_algs], 0.0000001)) - -kernels.append(generator.SparseKernel("arm_only_test1", 2, 3, 4, 2, 0, 2, 1.1233, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], generator.generateMTX(4, 3, 5), 0.0000001)) -kernels.append(generator.SparseKernel("arm_only_test2", 2, 3, 4, 20, 0, 14, 1.0, 1.0, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], generator.generateMTX(4, 3, 5), 0.0000001)) -kernels.append(generator.SparseKernel("arm_only_test3", 32, 80, 50, 32, 0, 32, 1.0, 3.0, [(8, 5)] + [x.getBlocksize(32, 80, 1) for x in blocksize_algs], generator.generateMTX(50, 80, 294), 0.0000001)) -kernels.append(generator.SparseKernel("arm_only_test4", 32, 32, 32, 34, 0, 32, 1.0, 0.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1) for x in blocksize_algs], generator.generateMTX(32, 32, 24), 0.0000001)) -kernels.append(generator.SparseKernel("arm_only_test5", 2, 1, 1, 2, 0, 8, 1.0, -1.0, [(2, 1)] + [x.getBlocksize(2, 1, 1) for x in blocksize_algs], generator.generateMTX(1, 1, 1), 0.0000001)) -kernels.append(generator.SparseKernel("arm_only_test6", 2, 2, 2, 2, 0, 2, 2.0, 234234.123, [(2, 1)] + [x.getBlocksize(2, 2, 1) for x in blocksize_algs], 
generator.generateMTX(2, 2, 1), 0.0000001)) -kernels.append(generator.SparseKernel("arm_only_test7", 16, 5, 7, 16, 0, 16, 0.0, -1.123, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1) for x in blocksize_algs], generator.generateMTX(7, 5, 35), 0.0000001)) - -kernels.append(generator.DenseKernel("arm_only_test8", 2, 3, 4, 2, 4, 2, 1.0, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("arm_only_test9", 2, 3, 4, 20, 12, 14, 2.0, 1.123, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("arm_only_test10", 32, 80, 50, 32, 50, 32, 0.0, 0.2, [(8, 5)] + [x.getBlocksize(32, 80, 1) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("arm_only_test11", 32, 32, 32, 33, 68, 32, 1231.0, 14443.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("arm_only_test12", 2, 1, 1, 2, 1, 8, 1.0, 3.0, [(2, 1)] + [x.getBlocksize(2, 1, 1) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("arm_only_test13", 2, 3, 3, 2, 3, 2, 1.0, 0.0, [(2, 1)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("arm_only_test14", 16, 5, 7, 16, 7, 16, 1.0, 1.0, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1) for x in blocksize_algs], 0.0000001)) +kernels.append(generator.DenseKernel("test3", Precision.DOUBLE, 4, 4, 4, 4, 4, 4, 2.0, 2.0, [(4, 4)], 0.0000001)) + +kernels.append(generator.SparseKernel("test1", Precision.DOUBLE, 8, 56, 56, 8, 0, 8, 1.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 1) for x in blocksize_algs], generator.generateMTX(56, 56, 30), 0.0000001)) +kernels.append(generator.DenseKernel("test2", Precision.DOUBLE, 8, 40, 40, 8, 40, 8, 3.0, 2.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 1) for x in blocksize_algs], 0.0000001)) +kernels.append(generator.DenseKernel("test3", Precision.DOUBLE, 8, 
56, 56, 8, 56, 8, 0.0, 0.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 1) for x in blocksize_algs], 0.0000001)) + +kernels.append(generator.SparseKernel("arm_only_test1", Precision.DOUBLE, 2, 3, 4, 2, 0, 2, 1.1233, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], generator.generateMTX(4, 3, 5), 0.0000001)) +kernels.append(generator.SparseKernel("arm_only_test2", Precision.DOUBLE, 2, 3, 4, 20, 0, 14, 1.0, 1.0, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], generator.generateMTX(4, 3, 5), 0.0000001)) +kernels.append(generator.SparseKernel("arm_only_test3", Precision.DOUBLE, 32, 80, 50, 32, 0, 32, 1.0, 3.0, [(8, 5)] + [x.getBlocksize(32, 80, 1) for x in blocksize_algs], generator.generateMTX(50, 80, 294), 0.0000001)) +kernels.append(generator.SparseKernel("arm_only_test4", Precision.DOUBLE, 32, 32, 32, 34, 0, 32, 1.0, 0.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1) for x in blocksize_algs], generator.generateMTX(32, 32, 24), 0.0000001)) +kernels.append(generator.SparseKernel("arm_only_test5", Precision.DOUBLE, 2, 1, 1, 2, 0, 8, 1.0, -1.0, [(2, 1)] + [x.getBlocksize(2, 1, 1) for x in blocksize_algs], generator.generateMTX(1, 1, 1), 0.0000001)) +kernels.append(generator.SparseKernel("arm_only_test6", Precision.DOUBLE, 2, 2, 2, 2, 0, 2, 2.0, 234234.123, [(2, 1)] + [x.getBlocksize(2, 2, 1) for x in blocksize_algs], generator.generateMTX(2, 2, 1), 0.0000001)) +kernels.append(generator.SparseKernel("arm_only_test7", Precision.DOUBLE, 16, 5, 7, 16, 0, 16, 0.0, -1.123, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1) for x in blocksize_algs], generator.generateMTX(7, 5, 35), 0.0000001)) + +kernels.append(generator.DenseKernel("arm_only_test8", Precision.DOUBLE, 2, 3, 4, 2, 4, 2, 1.0, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], 0.0000001)) +kernels.append(generator.DenseKernel("arm_only_test9", Precision.DOUBLE, 2, 3, 4, 20, 12, 14, 2.0, 1.123, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1) for x in 
blocksize_algs], 0.0000001)) +kernels.append(generator.DenseKernel("arm_only_test10", Precision.DOUBLE, 32, 80, 50, 32, 50, 32, 0.0, 0.2, [(8, 5)] + [x.getBlocksize(32, 80, 1) for x in blocksize_algs], 0.0000001)) +kernels.append(generator.DenseKernel("arm_only_test11", Precision.DOUBLE, 32, 32, 32, 33, 68, 32, 1231.0, 14443.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1) for x in blocksize_algs], 0.0000001)) +kernels.append(generator.DenseKernel("arm_only_test12", Precision.DOUBLE, 2, 1, 1, 2, 1, 8, 1.0, 3.0, [(2, 1)] + [x.getBlocksize(2, 1, 1) for x in blocksize_algs], 0.0000001)) +kernels.append(generator.DenseKernel("arm_only_test13", Precision.DOUBLE, 2, 3, 3, 2, 3, 2, 1.0, 0.0, [(2, 1)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], 0.0000001)) +kernels.append(generator.DenseKernel("arm_only_test14", Precision.DOUBLE, 16, 5, 7, 16, 7, 16, 1.0, 1.0, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1) for x in blocksize_algs], 0.0000001)) generator.make(kernels, "arm") diff --git a/tests/unit_tests_arm_sve.py b/tests/unit_tests_arm_sve.py index bf147ba..d0a8a53 100644 --- a/tests/unit_tests_arm_sve.py +++ b/tests/unit_tests_arm_sve.py @@ -12,8 +12,9 @@ v_len = int(sys.argv[1]) // 128 blocksize_algs = [max_sve] -v_size = 2 * v_len -v_size_s = 4 * v_len +v_size = lambda prec: (16 // prec.size()) * v_len +v_size_d = v_size(Precision.DOUBLE) +v_size_s = v_size(Precision.SINGLE) bitlen = v_len * 128 kernels = [] @@ -23,46 +24,55 @@ delta_dp = 1e-7 # epsilon is around e-15 => /2 # test cases for double precision multiplication -kernels.append(generator.DenseKernel("sve_mixed_test1", 9, 9, 9, 9, 9, 9, 1.0, 0.0, [(3, 3)] + [x.getBlocksize(9, 9, 1, v_size) for x in blocksize_algs], delta_dp)) -kernels.append(generator.SparseKernel("sve_mixed_test2", 9, 9, 9, 9, 0, 9, 4.0, 2.5, [(3, 3)] + [x.getBlocksize(9, 9, 1, v_size) for x in blocksize_algs], generator.generateMTX(9, 9, 20), delta_dp)) -kernels.append(generator.SparseKernel("sve_mixed_test3", 18, 18, 18, 18, 0, 18, 
3.4, -2.5, [(1, 1), (3, 3), (6, 6)] + [x.getBlocksize(18, 18, 1, v_size) for x in blocksize_algs], generator.generateMTX(18, 18, 59), delta_dp)) -kernels.append(generator.SparseKernel("sve_mixed_test4", 80, 80, 80, 80, 0, 80, 0.0, -2.5, [(4, 4), (8, 8)] + [x.getBlocksize(80, 80, 1, v_size) for x in blocksize_algs], generator.generateMTX(80, 80, 312), delta_dp)) -kernels.append(generator.SparseKernel("sve_mixed_test5", 8, 8, 8, 10, 0, 8, 3.0, -0.9, [(2, 2), (4, 4)] + [x.getBlocksize(8, 8, 1, v_size) for x in blocksize_algs], generator.generateMTX(8, 8, 6), delta_dp)) -kernels.append(generator.DenseKernel("sve_mixed_test6", 8, 8, 8, 10, 8, 8, 3.0, -0.9, [(2, 2), (4, 4)] + [x.getBlocksize(8, 8, 1, v_size) for x in blocksize_algs], delta_dp)) - -kernels.append(generator.DenseKernel("sve_test3", 4, 4, 4, 4, 4, 4, 2.0, 2.0, [(4, 4)], delta_dp)) - -kernels.append(generator.SparseKernel("sve_test1", 8, 56, 56, 8, 0, 8, 1.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 1, v_size) for x in blocksize_algs], generator.generateMTX(56, 56, 30), delta_dp)) -kernels.append(generator.DenseKernel("sve_test2", 8, 40, 40, 8, 40, 8, 3.0, 2.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 1, v_size) for x in blocksize_algs], delta_dp)) -kernels.append(generator.DenseKernel("sve_test3", 8, 56, 56, 8, 56, 8, 0.0, 0.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 1, v_size) for x in blocksize_algs], delta_dp)) - -kernels.append(generator.SparseKernel("sve_arm_only_test1", 2, 3, 4, 2, 0, 2, 1.1233, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1, v_size) for x in blocksize_algs], generator.generateMTX(4, 3, 5), delta_dp)) -kernels.append(generator.SparseKernel("sve_arm_only_test2", 2, 3, 4, 20, 0, 14, 1.0, 1.0, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1, v_size) for x in blocksize_algs], generator.generateMTX(4, 3, 5), delta_dp)) -kernels.append(generator.SparseKernel("sve_arm_only_test3", 32, 80, 50, 32, 0, 32, 1.0, 3.0, [(8, 5)] + [x.getBlocksize(32, 80, 1, v_size) for x in blocksize_algs], 
generator.generateMTX(50, 80, 294), delta_dp)) -kernels.append(generator.SparseKernel("sve_arm_only_test4", 32, 32, 32, 34, 0, 32, 1.0, 0.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1, v_size) for x in blocksize_algs], generator.generateMTX(32, 32, 24), delta_dp)) -kernels.append(generator.SparseKernel("sve_arm_only_test5", 2, 1, 1, 2, 0, 8, 1.0, -1.0, [(2, 1)] + [x.getBlocksize(2, 1, 1, v_size) for x in blocksize_algs], generator.generateMTX(1, 1, 1), delta_dp)) -kernels.append(generator.SparseKernel("sve_arm_only_test6", 2, 2, 2, 2, 0, 2, 2.0, 234234.123, [(2, 1)] + [x.getBlocksize(2, 2, 1, v_size) for x in blocksize_algs], generator.generateMTX(2, 2, 1), delta_dp)) -kernels.append(generator.SparseKernel("sve_arm_only_test7", 16, 5, 7, 16, 0, 16, 0.0, -1.123, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1, v_size) for x in blocksize_algs], generator.generateMTX(7, 5, 35), delta_dp)) - -kernels.append(generator.DenseKernel("sve_arm_only_test8", 2, 3, 4, 2, 4, 2, 1.0, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1, v_size) for x in blocksize_algs], delta_dp)) -kernels.append(generator.DenseKernel("sve_arm_only_test9", 2, 3, 4, 20, 12, 14, 2.0, 1.123, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1, v_size) for x in blocksize_algs], delta_dp)) -kernels.append(generator.DenseKernel("sve_arm_only_test10", 32, 80, 50, 32, 50, 32, 0.0, 0.2, [(8, 5)] + [x.getBlocksize(32, 80, 1, v_size) for x in blocksize_algs], delta_dp)) -kernels.append(generator.DenseKernel("sve_arm_only_test11", 32, 32, 32, 33, 68, 32, 1231.0, 14443.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1, v_size) for x in blocksize_algs], delta_dp)) -kernels.append(generator.DenseKernel("sve_arm_only_test12", 2, 1, 1, 2, 1, 8, 1.0, 3.0, [(2, 1)] + [x.getBlocksize(2, 1, 1, v_size) for x in blocksize_algs], delta_dp)) -kernels.append(generator.DenseKernel("sve_arm_only_test13", 2, 3, 3, 2, 3, 2, 1.0, 0.0, [(2, 1)] + [x.getBlocksize(2, 3, 1, v_size) for x in blocksize_algs], delta_dp)) 
-kernels.append(generator.DenseKernel("sve_arm_only_test14", 16, 5, 7, 16, 7, 16, 1.0, 1.0, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1, v_size) for x in blocksize_algs], delta_dp)) - -kernels.append(generator.DenseKernel("sve_arm_only_test15", 23, 29, 31, 23, 31, 23, 1.32, 0.96, [x.getBlocksize(23, 29, 1, v_size) for x in blocksize_algs], delta_dp)) -kernels.append(generator.SparseKernel("sve_arm_only_test16", 23, 29, 31, 23, 0, 23, 1.32, 0.96, [x.getBlocksize(23, 29, 1, v_size) for x in blocksize_algs], generator.generateMTX(31, 29, 61), delta_dp)) +kernels.append(generator.DenseKernel("sve_mixed_test1", Precision.DOUBLE, 9, 9, 9, 9, 9, 9, 1.0, 0.0, [(3, 3)] + [x.getBlocksize(9, 9, 1, v_size_d) for x in blocksize_algs], delta_dp)) +kernels.append(generator.SparseKernel("sve_mixed_test2", Precision.DOUBLE, 9, 9, 9, 9, 0, 9, 4.0, 2.5, [(3, 3)] + [x.getBlocksize(9, 9, 1, v_size_d) for x in blocksize_algs], generator.generateMTX(9, 9, 20), delta_dp)) +kernels.append(generator.SparseKernel("sve_mixed_test3", Precision.DOUBLE, 18, 18, 18, 18, 0, 18, 3.4, -2.5, [(1, 1), (3, 3), (6, 6)] + [x.getBlocksize(18, 18, 1, v_size_d) for x in blocksize_algs], generator.generateMTX(18, 18, 59), delta_dp)) +kernels.append(generator.SparseKernel("sve_mixed_test4", Precision.DOUBLE, 80, 80, 80, 80, 0, 80, 0.0, -2.5, [(4, 4), (8, 8)] + [x.getBlocksize(80, 80, 1, v_size_d) for x in blocksize_algs], generator.generateMTX(80, 80, 312), delta_dp)) +kernels.append(generator.SparseKernel("sve_mixed_test5", Precision.DOUBLE, 8, 8, 8, 10, 0, 8, 3.0, -0.9, [(2, 2), (4, 4)] + [x.getBlocksize(8, 8, 1, v_size_d) for x in blocksize_algs], generator.generateMTX(8, 8, 6), delta_dp)) +kernels.append(generator.DenseKernel("sve_mixed_test6", Precision.DOUBLE, 8, 8, 8, 10, 8, 8, 3.0, -0.9, [(2, 2), (4, 4)] + [x.getBlocksize(8, 8, 1, v_size_d) for x in blocksize_algs], delta_dp)) + +kernels.append(generator.DenseKernel("sve_test3", Precision.DOUBLE, 4, 4, 4, 4, 4, 4, 2.0, 2.0, [(4, 4)], delta_dp)) + 
+kernels.append(generator.SparseKernel("sve_test1", Precision.DOUBLE, 8, 56, 56, 8, 0, 8, 1.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 1, v_size_d) for x in blocksize_algs], generator.generateMTX(56, 56, 30), delta_dp)) +kernels.append(generator.DenseKernel("sve_test2", Precision.DOUBLE, 8, 40, 40, 8, 40, 8, 3.0, 2.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 1, v_size_d) for x in blocksize_algs], delta_dp)) +kernels.append(generator.DenseKernel("sve_test3", Precision.DOUBLE, 8, 56, 56, 8, 56, 8, 0.0, 0.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 1, v_size_d) for x in blocksize_algs], delta_dp)) + +kernels.append(generator.SparseKernel("sve_arm_only_test1", Precision.DOUBLE, 2, 3, 4, 2, 0, 2, 1.1233, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1, v_size_d) for x in blocksize_algs], generator.generateMTX(4, 3, 5), delta_dp)) +kernels.append(generator.SparseKernel("sve_arm_only_test2", Precision.DOUBLE, 2, 3, 4, 20, 0, 14, 1.0, 1.0, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1, v_size_d) for x in blocksize_algs], generator.generateMTX(4, 3, 5), delta_dp)) +kernels.append(generator.SparseKernel("sve_arm_only_test3", Precision.DOUBLE, 32, 80, 50, 32, 0, 32, 1.0, 3.0, [(8, 5)] + [x.getBlocksize(32, 80, 1, v_size_d) for x in blocksize_algs], generator.generateMTX(50, 80, 294), delta_dp)) +kernels.append(generator.SparseKernel("sve_arm_only_test4", Precision.DOUBLE, 32, 32, 32, 34, 0, 32, 1.0, 0.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1, v_size_d) for x in blocksize_algs], generator.generateMTX(32, 32, 24), delta_dp)) +kernels.append(generator.SparseKernel("sve_arm_only_test5", Precision.DOUBLE, 2, 1, 1, 2, 0, 8, 1.0, -1.0, [(2, 1)] + [x.getBlocksize(2, 1, 1, v_size_d) for x in blocksize_algs], generator.generateMTX(1, 1, 1), delta_dp)) +kernels.append(generator.SparseKernel("sve_arm_only_test6", Precision.DOUBLE, 2, 2, 2, 2, 0, 2, 2.0, 234234.123, [(2, 1)] + [x.getBlocksize(2, 2, 1, v_size_d) for x in blocksize_algs], generator.generateMTX(2, 2, 1), delta_dp)) 
+kernels.append(generator.SparseKernel("sve_arm_only_test7", Precision.DOUBLE, 16, 5, 7, 16, 0, 16, 0.0, -1.123, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1, v_size_d) for x in blocksize_algs], generator.generateMTX(7, 5, 35), delta_dp)) + +kernels.append(generator.DenseKernel("sve_arm_only_test8", Precision.DOUBLE, 2, 3, 4, 2, 4, 2, 1.0, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1, v_size_d) for x in blocksize_algs], delta_dp)) +kernels.append(generator.DenseKernel("sve_arm_only_test9", Precision.DOUBLE, 2, 3, 4, 20, 12, 14, 2.0, 1.123, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1, v_size_d) for x in blocksize_algs], delta_dp)) +kernels.append(generator.DenseKernel("sve_arm_only_test10", Precision.DOUBLE, 32, 80, 50, 32, 50, 32, 0.0, 0.2, [(8, 5)] + [x.getBlocksize(32, 80, 1, v_size_d) for x in blocksize_algs], delta_dp)) +kernels.append(generator.DenseKernel("sve_arm_only_test11", Precision.DOUBLE, 32, 32, 32, 33, 68, 32, 1231.0, 14443.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1, v_size_d) for x in blocksize_algs], delta_dp)) +kernels.append(generator.DenseKernel("sve_arm_only_test12", Precision.DOUBLE, 2, 1, 1, 2, 1, 8, 1.0, 3.0, [(2, 1)] + [x.getBlocksize(2, 1, 1, v_size_d) for x in blocksize_algs], delta_dp)) +kernels.append(generator.DenseKernel("sve_arm_only_test13", Precision.DOUBLE, 2, 3, 3, 2, 3, 2, 1.0, 0.0, [(2, 1)] + [x.getBlocksize(2, 3, 1, v_size_d) for x in blocksize_algs], delta_dp)) +kernels.append(generator.DenseKernel("sve_arm_only_test14", Precision.DOUBLE, 16, 5, 7, 16, 7, 16, 1.0, 1.0, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1, v_size_d) for x in blocksize_algs], delta_dp)) + +kernels.append(generator.DenseKernel("sve_arm_only_test15", Precision.DOUBLE, 23, 29, 31, 23, 31, 23, 1.32, 0.96, [x.getBlocksize(23, 29, 1, v_size_d) for x in blocksize_algs], delta_dp)) +kernels.append(generator.SparseKernel("sve_arm_only_test16", Precision.DOUBLE, 23, 29, 31, 23, 0, 23, 1.32, 0.96, [x.getBlocksize(23, 29, 1, v_size_d) for x in blocksize_algs], 
generator.generateMTX(31, 29, 61), delta_dp)) # test cases for single precision multiplication -kernels.append(generator.DenseKernelS("sve_single_prec_test_S1", 9, 9, 9, 9, 9, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size_s) for x in blocksize_algs], delta_sp)) -kernels.append(generator.DenseKernelS("sve_single_prec_test_S2", 15, 15, 15, 15, 15, 15, -3.14, 6.28, [x.getBlocksize(15, 15, 1, v_size_s) for x in blocksize_algs], delta_sp)) -kernels.append(generator.DenseKernelS("sve_single_prec_test_S3", 23, 23, 23, 23, 23, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size_s) for x in blocksize_algs], delta_sp)) -kernels.append(generator.DenseKernelS("sve_single_prec_test_S4", 23, 31, 13, 23, 13, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size_s) for x in blocksize_algs], delta_sp)) -kernels.append(generator.SparseKernelS("sve_single_prec_test_S5", 9, 9, 9, 9, 0, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(9, 9, 8), delta_sp)) -kernels.append(generator.SparseKernelS("sve_single_prec_test_S6", 15, 15, 15, 15, 0, 15, -3.14, 6.28, [x.getBlocksize(15, 15, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(15, 15, 22), delta_sp)) -kernels.append(generator.SparseKernelS("sve_single_prec_test_S7", 23, 23, 23, 23, 0, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(23, 23, 52), delta_sp)) -kernels.append(generator.SparseKernelS("sve_single_prec_test_S8", 23, 31, 13, 23, 0, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(13, 31, 40), delta_sp)) +kernels.append(generator.DenseKernel("sve_single_prec_test_S1", Precision.SINGLE, 9, 9, 9, 9, 9, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size_s) for x in blocksize_algs], delta_sp)) +kernels.append(generator.DenseKernel("sve_single_prec_test_S2", Precision.SINGLE, 15, 15, 15, 15, 15, 15, -3.14, 6.28, [x.getBlocksize(15, 15, 1, v_size_s) for x in blocksize_algs], 
delta_sp)) +kernels.append(generator.DenseKernel("sve_single_prec_test_S3", Precision.SINGLE, 23, 23, 23, 23, 23, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size_s) for x in blocksize_algs], delta_sp)) +kernels.append(generator.DenseKernel("sve_single_prec_test_S4", Precision.SINGLE, 23, 31, 13, 23, 13, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size_s) for x in blocksize_algs], delta_sp)) +kernels.append(generator.SparseKernel("sve_single_prec_test_S5", Precision.SINGLE, 9, 9, 9, 9, 0, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(9, 9, 8), delta_sp)) +kernels.append(generator.SparseKernel("sve_single_prec_test_S6", Precision.SINGLE, 15, 15, 15, 15, 0, 15, -3.14, 6.28, [x.getBlocksize(15, 15, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(15, 15, 22), delta_sp)) +kernels.append(generator.SparseKernel("sve_single_prec_test_S7", Precision.SINGLE, 23, 23, 23, 23, 0, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(23, 23, 52), delta_sp)) +kernels.append(generator.SparseKernel("sve_single_prec_test_S8", Precision.SINGLE, 23, 31, 13, 23, 0, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(13, 31, 40), delta_sp)) + +kernels.append(generator.DenseKernel("sve_half_prec_test_S1", Precision.HALF, 9, 9, 9, 9, 9, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size_s) for x in blocksize_algs], delta_sp)) +kernels.append(generator.DenseKernel("sve_half_prec_test_S2", Precision.HALF, 15, 15, 15, 15, 15, 15, -3.14, 6.28, [x.getBlocksize(15, 15, 1, v_size_s) for x in blocksize_algs], delta_sp)) +kernels.append(generator.DenseKernel("sve_half_prec_test_S3", Precision.HALF, 23, 23, 23, 23, 23, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size_s) for x in blocksize_algs], delta_sp)) +kernels.append(generator.DenseKernel("sve_half_prec_test_S4", Precision.HALF, 23, 31, 13, 23, 13, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, 
v_size_s) for x in blocksize_algs], delta_sp)) +kernels.append(generator.SparseKernel("sve_half_prec_test_S5", Precision.HALF, 9, 9, 9, 9, 0, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(9, 9, 8), delta_sp)) +kernels.append(generator.SparseKernel("sve_half_prec_test_S6", Precision.HALF, 15, 15, 15, 15, 0, 15, -3.14, 6.28, [x.getBlocksize(15, 15, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(15, 15, 22), delta_sp)) +kernels.append(generator.SparseKernel("sve_half_prec_test_S7", Precision.HALF, 23, 23, 23, 23, 0, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(23, 23, 52), delta_sp)) +kernels.append(generator.SparseKernel("sve_half_prec_test_S8", Precision.HALF, 23, 31, 13, 23, 0, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(13, 31, 40), delta_sp)) generator.make(kernels, f"arm_sve{bitlen}") diff --git a/tests/unit_tests_hsw.py b/tests/unit_tests_hsw.py index d9a60c7..a2368c4 100755 --- a/tests/unit_tests_hsw.py +++ b/tests/unit_tests_hsw.py @@ -3,31 +3,30 @@ import testsuite_generator as generator import pspamm.scripts.max_hsw as max_square -import pspamm.scripts.max_bn_hsw as max_bn import pspamm.scripts.old_hsw as old -blocksize_algs = [max_square, max_bn, old] +blocksize_algs = [max_square, old] kernels = [] -kernels.append(generator.SparseKernel("test1", 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], generator.generateMTX(56, 56, 30), 0.0000001)) -kernels.append(generator.DenseKernel("test2", 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8,2)] + [x.getBlocksize(8, 40, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("test3", 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.SparseKernel("hsw_only_test1", 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1)] + 
[x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 1), 0.0000001)) -kernels.append(generator.SparseKernel("hsw_only_test2", 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2)] + [x.getBlocksize(24, 40, 2) for x in blocksize_algs], generator.generateMTX(40, 40, 20), 0.0000001)) - -kernels.append(generator.SparseKernel("hsw_only_test3", 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 2), 0.0000001)) -kernels.append(generator.SparseKernel("hsw_only_test4", 24, 20, 10, 40, 0, 24, 35.222, 0.0, [] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], generator.generateMTX(10, 20, 1), 0.0000001)) -kernels.append(generator.SparseKernel("hsw_only_test5", 64, 5, 10, 64, 0, 64, 2.3, 0.0, [] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], generator.generateMTX(10, 5, 1), 0.0000001)) -kernels.append(generator.SparseKernel("hsw_only_test6", 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], generator.generateMTX(1, 1, 1), 0.0000001)) -kernels.append(generator.SparseKernel("hsw_only_test7", 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8,1)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], generator.generateMTX(40, 24, 1), 0.0000001)) - -kernels.append(generator.DenseKernel("hsw_only_test8", 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("hsw_only_test9", 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2)] + [x.getBlocksize(32, 40, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("hsw_only_test10", 56, 28, 56, 56, 56, 56, 0.1, 3.0, [x.getBlocksize(56, 28, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("hsw_only_test11", 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8,3)] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("hsw_only_test12", 64, 5, 10, 
64, 12, 64, 1.0, 1.0, [] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("hsw_only_test13", 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("hsw_only_test14", 8, 24, 40, 8, 41, 8, 2.0, 1.0, [] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], 0.0000001)) +kernels.append(generator.SparseKernel("test1", Precision.DOUBLE, 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], generator.generateMTX(56, 56, 30), 0.0000001)) +kernels.append(generator.DenseKernel("test2", Precision.DOUBLE, 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8,2)] + [x.getBlocksize(8, 40, 2) for x in blocksize_algs], 0.0000001)) +kernels.append(generator.DenseKernel("test3", Precision.DOUBLE, 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], 0.0000001)) +kernels.append(generator.SparseKernel("hsw_only_test1", Precision.DOUBLE, 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 1), 0.0000001)) +kernels.append(generator.SparseKernel("hsw_only_test2", Precision.DOUBLE, 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2)] + [x.getBlocksize(24, 40, 2) for x in blocksize_algs], generator.generateMTX(40, 40, 20), 0.0000001)) + +kernels.append(generator.SparseKernel("hsw_only_test3", Precision.DOUBLE, 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 2), 0.0000001)) +kernels.append(generator.SparseKernel("hsw_only_test4", Precision.DOUBLE, 24, 20, 10, 40, 0, 24, 35.222, 0.0, [] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], generator.generateMTX(10, 20, 1), 0.0000001)) +kernels.append(generator.SparseKernel("hsw_only_test5", Precision.DOUBLE, 64, 5, 10, 64, 0, 64, 2.3, 0.0, [] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], 
generator.generateMTX(10, 5, 1), 0.0000001)) +kernels.append(generator.SparseKernel("hsw_only_test6", Precision.DOUBLE, 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], generator.generateMTX(1, 1, 1), 0.0000001)) +kernels.append(generator.SparseKernel("hsw_only_test7", Precision.DOUBLE, 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8,1)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], generator.generateMTX(40, 24, 1), 0.0000001)) + +kernels.append(generator.DenseKernel("hsw_only_test8", Precision.DOUBLE, 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], 0.0000001)) +kernels.append(generator.DenseKernel("hsw_only_test9", Precision.DOUBLE, 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2)] + [x.getBlocksize(32, 40, 2) for x in blocksize_algs], 0.0000001)) +kernels.append(generator.DenseKernel("hsw_only_test10", Precision.DOUBLE, 56, 28, 56, 56, 56, 56, 0.1, 3.0, [x.getBlocksize(56, 28, 2) for x in blocksize_algs], 0.0000001)) +kernels.append(generator.DenseKernel("hsw_only_test11", Precision.DOUBLE, 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8,3)] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], 0.0000001)) +kernels.append(generator.DenseKernel("hsw_only_test12", Precision.DOUBLE, 64, 5, 10, 64, 12, 64, 1.0, 1.0, [] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], 0.0000001)) +kernels.append(generator.DenseKernel("hsw_only_test13", Precision.DOUBLE, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], 0.0000001)) +kernels.append(generator.DenseKernel("hsw_only_test14", Precision.DOUBLE, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], 0.0000001)) generator.make(kernels, "hsw") diff --git a/tests/unit_tests_knl.py b/tests/unit_tests_knl.py index 713f58e..c57ad6d 100755 --- a/tests/unit_tests_knl.py +++ b/tests/unit_tests_knl.py @@ -10,25 +10,25 @@ kernels = [] 
-kernels.append(generator.SparseKernel("test1", 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], generator.generateMTX(56, 56, 30), 0.0000001)) -kernels.append(generator.DenseKernel("test2", 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("test3", 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.SparseKernel("knl_only_test1", 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 1), 0.0000001)) -kernels.append(generator.SparseKernel("knl_only_test2", 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2), (16,7)] + [x.getBlocksize(24, 40, 2) for x in blocksize_algs], generator.generateMTX(40, 40, 20), 0.0000001)) - -kernels.append(generator.SparseKernel("knl_only_test3", 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 2), 0.0000001)) -kernels.append(generator.SparseKernel("knl_only_test4", 24, 20, 10, 40, 0, 24, 35.222, 0.0, [(8, 20), (24,3)] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], generator.generateMTX(10, 20, 1), 0.0000001)) -kernels.append(generator.SparseKernel("knl_only_test5", 64, 5, 10, 64, 0, 64, 2.3, 0.0, [(32, 2), (8,14)] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], generator.generateMTX(10, 5, 1), 0.0000001)) -kernels.append(generator.SparseKernel("knl_only_test6", 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], generator.generateMTX(1, 1, 1), 0.0000001)) -kernels.append(generator.SparseKernel("knl_only_test7", 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8, 24), (8,1)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], generator.generateMTX(40, 24, 1), 0.0000001)) - -kernels.append(generator.DenseKernel("knl_only_test8", 8, 2, 
1, 8, 1, 8, 2.5, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("knl_only_test9", 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2), (16,7)] + [x.getBlocksize(32, 40, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("knl_only_test10", 56, 28, 56, 56, 56, 56, 0.1, 3.0, [(8, 28)], 0.0000001)) -kernels.append(generator.DenseKernel("knl_only_test11", 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8, 20), (8,3)] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("knl_only_test12", 64, 5, 10, 64, 12, 64, 1.0, 1.0, [(32, 2), (8,14)] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("knl_only_test13", 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("knl_only_test14", 8, 24, 40, 8, 41, 8, 2.0, 1.0, [(8, 24)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], 0.0000001)) +kernels.append(generator.SparseKernel("test1", Precision.DOUBLE, 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], generator.generateMTX(56, 56, 30), 0.0000001)) +kernels.append(generator.DenseKernel("test2", Precision.DOUBLE, 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 2) for x in blocksize_algs], 0.0000001)) +kernels.append(generator.DenseKernel("test3", Precision.DOUBLE, 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], 0.0000001)) +kernels.append(generator.SparseKernel("knl_only_test1", Precision.DOUBLE, 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 1), 0.0000001)) +kernels.append(generator.SparseKernel("knl_only_test2", Precision.DOUBLE, 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2), (16,7)] + [x.getBlocksize(24, 40, 2) 
for x in blocksize_algs], generator.generateMTX(40, 40, 20), 0.0000001)) + +kernels.append(generator.SparseKernel("knl_only_test3", Precision.DOUBLE, 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 2), 0.0000001)) +kernels.append(generator.SparseKernel("knl_only_test4", Precision.DOUBLE, 24, 20, 10, 40, 0, 24, 35.222, 0.0, [(8, 20), (24,3)] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], generator.generateMTX(10, 20, 1), 0.0000001)) +kernels.append(generator.SparseKernel("knl_only_test5", Precision.DOUBLE, 64, 5, 10, 64, 0, 64, 2.3, 0.0, [(32, 2), (8,14)] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], generator.generateMTX(10, 5, 1), 0.0000001)) +kernels.append(generator.SparseKernel("knl_only_test6", Precision.DOUBLE, 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], generator.generateMTX(1, 1, 1), 0.0000001)) +kernels.append(generator.SparseKernel("knl_only_test7", Precision.DOUBLE, 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8, 24), (8,1)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], generator.generateMTX(40, 24, 1), 0.0000001)) + +kernels.append(generator.DenseKernel("knl_only_test8", Precision.DOUBLE, 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], 0.0000001)) +kernels.append(generator.DenseKernel("knl_only_test9", Precision.DOUBLE, 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2), (16,7)] + [x.getBlocksize(32, 40, 2) for x in blocksize_algs], 0.0000001)) +kernels.append(generator.DenseKernel("knl_only_test10", Precision.DOUBLE, 56, 28, 56, 56, 56, 56, 0.1, 3.0, [(8, 28)], 0.0000001)) +kernels.append(generator.DenseKernel("knl_only_test11", Precision.DOUBLE, 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8, 20), (8,3)] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], 0.0000001)) +kernels.append(generator.DenseKernel("knl_only_test12", Precision.DOUBLE, 64, 5, 10, 64, 12, 64, 1.0, 1.0, [(32, 
2), (8,14)] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], 0.0000001)) +kernels.append(generator.DenseKernel("knl_only_test13", Precision.DOUBLE, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], 0.0000001)) +kernels.append(generator.DenseKernel("knl_only_test14", Precision.DOUBLE, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [(8, 24)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], 0.0000001)) generator.make(kernels, "knl") From 23cbc2c76b80e451554c0351d2bffb1292fdf142 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Thu, 15 Aug 2024 14:14:45 +0200 Subject: [PATCH 18/64] Prepare for better sparsity loading --- pspamm/codegen/forms.py | 12 ++++++++++-- pspamm/matmul.py | 32 +++++++++++++++++--------------- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/pspamm/codegen/forms.py b/pspamm/codegen/forms.py index ddd5991..8a232fe 100644 --- a/pspamm/codegen/forms.py +++ b/pspamm/codegen/forms.py @@ -41,9 +41,17 @@ def contents(self): for _ in range(self.final_val % self.unroll): rest += onestep corrected_final_val = (self.final_val // self.unroll) * self.unroll - return [mov(self.initial_val, self.iteration_var, vector=False), + + allcode = [] + if corrected_final_val == self.initial_val + self.unroll: + allcode += body + elif corrected_final_val > self.initial_val: + allcode += [mov(self.initial_val, self.iteration_var, vector=False), label(self.label)] + body + [cmp(corrected_final_val, self.iteration_var), - jump(self.label, backwards=True)] + rest + jump(self.label, backwards=True)] + allcode += rest + + return allcode def body(self, *args): self.body_contents = block("Loop body", *args) diff --git a/pspamm/matmul.py b/pspamm/matmul.py index fcace55..ba06479 100644 --- a/pspamm/matmul.py +++ b/pspamm/matmul.py @@ -180,17 +180,14 @@ def __init__(self, self.output_overwrite = output_overwrite if ldb == 0: - pattern = Matrix.load(mtx_filename) + bpattern = Matrix.load(mtx_filename) + if self.masks: + 
self.generator.set_sparse() + + if lda == 0: + apattern = Matrix.load(mtx_filename) if self.masks: self.generator.set_sparse() - else: - mtx = numpy.zeros((k, n)) - for i in range(k): - for j in range(n): - mtx[i, j] = 1 - pattern = Matrix(mtx) - - blocks,patterns,mtx_overhead = decompose_pattern(self.k, self.n, pattern, self.bk, self.bn) self.nnz = 0 self.flop = 0 @@ -198,10 +195,9 @@ def __init__(self, if ldb == 0: for i in range(n): for j in range(k): - if pattern[j,i]: + if bpattern[j,i]: self.nnz += 1 self.flop = self.nnz * m * 2 - self.nnz += sum(mtx_overhead) else: self.nnz = ldb * self.n self.flop = m * n * k * 2 @@ -220,9 +216,15 @@ def __init__(self, self.alpha_bcst_reg, self.beta_bcst_reg = self.starting_regs[3], self.starting_regs[4] - self.A = DenseCursor("A", self.starting_regs[0], self.m, self.k, self.lda, self.bm, self.bk, self.precision.value) + if lda == 0: + blocks, patterns, mtx_overhead = decompose_pattern(self.m, self.k, apattern, self.bm, self.bk) + self.A = BlockCursor("A", self.starting_regs[0], self.m, self.k, self.lda, self.bm, self.bk, self.precision.value, blocks, patterns, mtx_overhead) + else: + self.A = DenseCursor("A", self.starting_regs[0], self.m, self.k, self.lda, self.bm, self.bk, self.precision.value) if ldb == 0: - self.B = BlockCursor("B", self.starting_regs[1], self.k, self.n, self.ldb, self.bk, self.bn, self.precision.value, blocks, patterns,mtx_overhead) + blocks, patterns, mtx_overhead = decompose_pattern(self.k, self.n, bpattern, self.bk, self.bn) + self.B = BlockCursor("B", self.starting_regs[1], self.k, self.n, self.ldb, self.bk, self.bn, self.precision.value, blocks, patterns, mtx_overhead) + self.nnz += sum(mtx_overhead) else: self.B = DenseCursor("B", self.starting_regs[1], self.k, self.n, self.ldb, self.bk, self.bn, self.precision.value) self.C = DenseCursor("C", self.starting_regs[2], self.m, self.n, self.ldc, self.bm, self.bn, self.precision.value) @@ -276,7 +278,7 @@ def kernelK(asm, Bki, A_ptr, B_ptr): if 
unroll: to_A = Coords(right=Bki) to_B = Coords(right=Bni, down=Bki, absolute=True) - keep = self.B.has_nonzero_block(B_ptr, to_B) + keep = self.B.has_nonzero_block(B_ptr, to_B) and self.A.has_nonzero_block(A_ptr, to_A) else: # setting A_ptr, B_ptr here may be a bit too hacky... A_ptr = CursorLocation(Coords(right=Bki, absolute=True)) @@ -296,7 +298,7 @@ def kernelK(asm, Bki, A_ptr, B_ptr): kernelK(loopblock, 0, A_ptr, B_ptr) loopblock.add(self.B.move(B_ptr, Coords(down=1))[0]) loopblock.add(self.A.move(A_ptr, Coords(right=1))[0]) - asm.add(loop(self.loop_regs[2], 0, Bk-1, 1, unroll=4).body(loopblock)) + asm.add(loop(self.loop_regs[2], 0, Bk-1, 1).body(loopblock)) kernelK(asm, Bk-1, A_ptr, B_ptr) asm.add(self.B.move(B_ptr, Coords(down=1-Bk))[0]) asm.add(self.A.move(A_ptr, Coords(right=1-Bk))[0]) From 27e934dfe2ed523fa0d91e269602fa987b5202e5 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Thu, 15 Aug 2024 14:22:20 +0200 Subject: [PATCH 19/64] Unroll again --- pspamm/matmul.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pspamm/matmul.py b/pspamm/matmul.py index ba06479..e64f2e9 100644 --- a/pspamm/matmul.py +++ b/pspamm/matmul.py @@ -298,7 +298,7 @@ def kernelK(asm, Bki, A_ptr, B_ptr): kernelK(loopblock, 0, A_ptr, B_ptr) loopblock.add(self.B.move(B_ptr, Coords(down=1))[0]) loopblock.add(self.A.move(A_ptr, Coords(right=1))[0]) - asm.add(loop(self.loop_regs[2], 0, Bk-1, 1).body(loopblock)) + asm.add(loop(self.loop_regs[2], 0, Bk-1, 1, unroll=4).body(loopblock)) kernelK(asm, Bk-1, A_ptr, B_ptr) asm.add(self.B.move(B_ptr, Coords(down=1-Bk))[0]) asm.add(self.A.move(A_ptr, Coords(right=1-Bk))[0]) From 927e4f6a85ceeea3f6e96c3937ff9f33b082c301 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Mon, 19 Aug 2024 13:22:09 +0200 Subject: [PATCH 20/64] Update and extend tests --- pspamm/matmul.py | 2 +- tests/unit_tests_arm.py | 44 +++++++++++++++++++------------------ tests/unit_tests_arm_sve.py | 1 + tests/unit_tests_hsw.py | 40 
+++++++++++++++++---------------- tests/unit_tests_knl.py | 40 +++++++++++++++++---------------- 5 files changed, 67 insertions(+), 60 deletions(-) diff --git a/pspamm/matmul.py b/pspamm/matmul.py index e64f2e9..0d5d6e1 100644 --- a/pspamm/matmul.py +++ b/pspamm/matmul.py @@ -230,7 +230,7 @@ def __init__(self, self.C = DenseCursor("C", self.starting_regs[2], self.m, self.n, self.ldc, self.bm, self.bn, self.precision.value) self.C_pf = DenseCursor("C_pf", prefetchReg, self.m, self.n, self.ldc, self.bm, self.bn, self.precision.value) if prefetchReg else None - self.unroll = ldb == 0 + self.unroll = ldb == 0 or lda == 0 def make_nk_unroll(self, unroll=True): diff --git a/tests/unit_tests_arm.py b/tests/unit_tests_arm.py index 2b1ff04..9fd5e54 100755 --- a/tests/unit_tests_arm.py +++ b/tests/unit_tests_arm.py @@ -4,32 +4,34 @@ import pspamm.scripts.max_arm as max_square import pspamm.scripts.old_arm as old +from pspamm.codegen.precision import * blocksize_algs = [max_square, old] kernels = [] -kernels.append(generator.DenseKernel("test3", Precision.DOUBLE, 4, 4, 4, 4, 4, 4, 2.0, 2.0, [(4, 4)], 0.0000001)) - -kernels.append(generator.SparseKernel("test1", Precision.DOUBLE, 8, 56, 56, 8, 0, 8, 1.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 1) for x in blocksize_algs], generator.generateMTX(56, 56, 30), 0.0000001)) -kernels.append(generator.DenseKernel("test2", Precision.DOUBLE, 8, 40, 40, 8, 40, 8, 3.0, 2.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 1) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("test3", Precision.DOUBLE, 8, 56, 56, 8, 56, 8, 0.0, 0.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 1) for x in blocksize_algs], 0.0000001)) - -kernels.append(generator.SparseKernel("arm_only_test1", Precision.DOUBLE, 2, 3, 4, 2, 0, 2, 1.1233, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], generator.generateMTX(4, 3, 5), 0.0000001)) -kernels.append(generator.SparseKernel("arm_only_test2", Precision.DOUBLE, 2, 3, 4, 20, 
0, 14, 1.0, 1.0, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], generator.generateMTX(4, 3, 5), 0.0000001)) -kernels.append(generator.SparseKernel("arm_only_test3", Precision.DOUBLE, 32, 80, 50, 32, 0, 32, 1.0, 3.0, [(8, 5)] + [x.getBlocksize(32, 80, 1) for x in blocksize_algs], generator.generateMTX(50, 80, 294), 0.0000001)) -kernels.append(generator.SparseKernel("arm_only_test4", Precision.DOUBLE, 32, 32, 32, 34, 0, 32, 1.0, 0.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1) for x in blocksize_algs], generator.generateMTX(32, 32, 24), 0.0000001)) -kernels.append(generator.SparseKernel("arm_only_test5", Precision.DOUBLE, 2, 1, 1, 2, 0, 8, 1.0, -1.0, [(2, 1)] + [x.getBlocksize(2, 1, 1) for x in blocksize_algs], generator.generateMTX(1, 1, 1), 0.0000001)) -kernels.append(generator.SparseKernel("arm_only_test6", Precision.DOUBLE, 2, 2, 2, 2, 0, 2, 2.0, 234234.123, [(2, 1)] + [x.getBlocksize(2, 2, 1) for x in blocksize_algs], generator.generateMTX(2, 2, 1), 0.0000001)) -kernels.append(generator.SparseKernel("arm_only_test7", Precision.DOUBLE, 16, 5, 7, 16, 0, 16, 0.0, -1.123, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1) for x in blocksize_algs], generator.generateMTX(7, 5, 35), 0.0000001)) - -kernels.append(generator.DenseKernel("arm_only_test8", Precision.DOUBLE, 2, 3, 4, 2, 4, 2, 1.0, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("arm_only_test9", Precision.DOUBLE, 2, 3, 4, 20, 12, 14, 2.0, 1.123, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("arm_only_test10", Precision.DOUBLE, 32, 80, 50, 32, 50, 32, 0.0, 0.2, [(8, 5)] + [x.getBlocksize(32, 80, 1) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("arm_only_test11", Precision.DOUBLE, 32, 32, 32, 33, 68, 32, 1231.0, 14443.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1) for x in blocksize_algs], 0.0000001)) 
-kernels.append(generator.DenseKernel("arm_only_test12", Precision.DOUBLE, 2, 1, 1, 2, 1, 8, 1.0, 3.0, [(2, 1)] + [x.getBlocksize(2, 1, 1) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("arm_only_test13", Precision.DOUBLE, 2, 3, 3, 2, 3, 2, 1.0, 0.0, [(2, 1)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("arm_only_test14", Precision.DOUBLE, 16, 5, 7, 16, 7, 16, 1.0, 1.0, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1) for x in blocksize_algs], 0.0000001)) +for precision in (Precision.SINGLE, Precision.DOUBLE): + kernels.append(generator.DenseKernel("test4", precision, 4, 4, 4, 4, 4, 4, 2.0, 2.0, [(4, 4)], 0.0000001)) + + kernels.append(generator.SparseKernel("test1", precision, 8, 56, 56, 8, 0, 8, 1.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 1) for x in blocksize_algs], generator.generateMTX(56, 56, 30), 0.0000001)) + kernels.append(generator.DenseKernel("test2", precision, 8, 40, 40, 8, 40, 8, 3.0, 2.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 1) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel("test3", precision, 8, 56, 56, 8, 56, 8, 0.0, 0.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 1) for x in blocksize_algs], 0.0000001)) + + kernels.append(generator.SparseKernel("arm_only_test1", precision, 2, 3, 4, 2, 0, 2, 1.1233, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], generator.generateMTX(4, 3, 5), 0.0000001)) + kernels.append(generator.SparseKernel("arm_only_test2", precision, 2, 3, 4, 20, 0, 14, 1.0, 1.0, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], generator.generateMTX(4, 3, 5), 0.0000001)) + kernels.append(generator.SparseKernel("arm_only_test3", precision, 32, 80, 50, 32, 0, 32, 1.0, 3.0, [(8, 5)] + [x.getBlocksize(32, 80, 1) for x in blocksize_algs], generator.generateMTX(50, 80, 294), 0.0000001)) + kernels.append(generator.SparseKernel("arm_only_test4", precision, 32, 32, 32, 34, 0, 32, 1.0, 
0.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1) for x in blocksize_algs], generator.generateMTX(32, 32, 24), 0.0000001)) + kernels.append(generator.SparseKernel("arm_only_test5", precision, 2, 1, 1, 2, 0, 8, 1.0, -1.0, [(2, 1)] + [x.getBlocksize(2, 1, 1) for x in blocksize_algs], generator.generateMTX(1, 1, 1), 0.0000001)) + kernels.append(generator.SparseKernel("arm_only_test6", precision, 2, 2, 2, 2, 0, 2, 2.0, 234234.123, [(2, 1)] + [x.getBlocksize(2, 2, 1) for x in blocksize_algs], generator.generateMTX(2, 2, 1), 0.0000001)) + kernels.append(generator.SparseKernel("arm_only_test7", precision, 16, 5, 7, 16, 0, 16, 0.0, -1.123, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1) for x in blocksize_algs], generator.generateMTX(7, 5, 35), 0.0000001)) + + kernels.append(generator.DenseKernel("arm_only_test8", precision, 2, 3, 4, 2, 4, 2, 1.0, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel("arm_only_test9", precision, 2, 3, 4, 20, 12, 14, 2.0, 1.123, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel("arm_only_test10", precision, 32, 80, 50, 32, 50, 32, 0.0, 0.2, [(8, 5)] + [x.getBlocksize(32, 80, 1) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel("arm_only_test11", precision, 32, 32, 32, 33, 68, 32, 1231.0, 14443.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel("arm_only_test12", precision, 2, 1, 1, 2, 1, 8, 1.0, 3.0, [(2, 1)] + [x.getBlocksize(2, 1, 1) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel("arm_only_test13", precision, 2, 3, 3, 2, 3, 2, 1.0, 0.0, [(2, 1)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel("arm_only_test14", precision, 16, 5, 7, 16, 7, 16, 1.0, 1.0, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1) for x in blocksize_algs], 
0.0000001)) generator.make(kernels, "arm") diff --git a/tests/unit_tests_arm_sve.py b/tests/unit_tests_arm_sve.py index d0a8a53..ad4a0fb 100644 --- a/tests/unit_tests_arm_sve.py +++ b/tests/unit_tests_arm_sve.py @@ -3,6 +3,7 @@ import sve_testsuite_generator as generator import pspamm.scripts.max_arm_sve as max_sve +from pspamm.codegen.precision import * import sys diff --git a/tests/unit_tests_hsw.py b/tests/unit_tests_hsw.py index a2368c4..8e16ea8 100755 --- a/tests/unit_tests_hsw.py +++ b/tests/unit_tests_hsw.py @@ -4,29 +4,31 @@ import pspamm.scripts.max_hsw as max_square import pspamm.scripts.old_hsw as old +from pspamm.codegen.precision import * blocksize_algs = [max_square, old] kernels = [] -kernels.append(generator.SparseKernel("test1", Precision.DOUBLE, 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], generator.generateMTX(56, 56, 30), 0.0000001)) -kernels.append(generator.DenseKernel("test2", Precision.DOUBLE, 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8,2)] + [x.getBlocksize(8, 40, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("test3", Precision.DOUBLE, 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.SparseKernel("hsw_only_test1", Precision.DOUBLE, 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 1), 0.0000001)) -kernels.append(generator.SparseKernel("hsw_only_test2", Precision.DOUBLE, 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2)] + [x.getBlocksize(24, 40, 2) for x in blocksize_algs], generator.generateMTX(40, 40, 20), 0.0000001)) - -kernels.append(generator.SparseKernel("hsw_only_test3", Precision.DOUBLE, 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 2), 0.0000001)) -kernels.append(generator.SparseKernel("hsw_only_test4", Precision.DOUBLE, 24, 20, 10, 40, 0, 
24, 35.222, 0.0, [] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], generator.generateMTX(10, 20, 1), 0.0000001)) -kernels.append(generator.SparseKernel("hsw_only_test5", Precision.DOUBLE, 64, 5, 10, 64, 0, 64, 2.3, 0.0, [] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], generator.generateMTX(10, 5, 1), 0.0000001)) -kernels.append(generator.SparseKernel("hsw_only_test6", Precision.DOUBLE, 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], generator.generateMTX(1, 1, 1), 0.0000001)) -kernels.append(generator.SparseKernel("hsw_only_test7", Precision.DOUBLE, 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8,1)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], generator.generateMTX(40, 24, 1), 0.0000001)) - -kernels.append(generator.DenseKernel("hsw_only_test8", Precision.DOUBLE, 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("hsw_only_test9", Precision.DOUBLE, 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2)] + [x.getBlocksize(32, 40, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("hsw_only_test10", Precision.DOUBLE, 56, 28, 56, 56, 56, 56, 0.1, 3.0, [x.getBlocksize(56, 28, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("hsw_only_test11", Precision.DOUBLE, 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8,3)] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("hsw_only_test12", Precision.DOUBLE, 64, 5, 10, 64, 12, 64, 1.0, 1.0, [] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("hsw_only_test13", Precision.DOUBLE, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("hsw_only_test14", Precision.DOUBLE, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [] + [x.getBlocksize(8, 24, 2) for x in 
blocksize_algs], 0.0000001)) +for precision in (Precision.SINGLE, Precision.DOUBLE): + kernels.append(generator.SparseKernel("test1", precision, 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], generator.generateMTX(56, 56, 30), 0.0000001)) + kernels.append(generator.DenseKernel("test2", precision, 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8,2)] + [x.getBlocksize(8, 40, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel("test3", precision, 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.SparseKernel("hsw_only_test1", precision, 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 1), 0.0000001)) + kernels.append(generator.SparseKernel("hsw_only_test2", precision, 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2)] + [x.getBlocksize(24, 40, 2) for x in blocksize_algs], generator.generateMTX(40, 40, 20), 0.0000001)) + + kernels.append(generator.SparseKernel("hsw_only_test3", precision, 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 2), 0.0000001)) + kernels.append(generator.SparseKernel("hsw_only_test4", precision, 24, 20, 10, 40, 0, 24, 35.222, 0.0, [] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], generator.generateMTX(10, 20, 1), 0.0000001)) + kernels.append(generator.SparseKernel("hsw_only_test5", precision, 64, 5, 10, 64, 0, 64, 2.3, 0.0, [] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], generator.generateMTX(10, 5, 1), 0.0000001)) + kernels.append(generator.SparseKernel("hsw_only_test6", precision, 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], generator.generateMTX(1, 1, 1), 0.0000001)) + kernels.append(generator.SparseKernel("hsw_only_test7", precision, 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8,1)] + [x.getBlocksize(8, 
24, 2) for x in blocksize_algs], generator.generateMTX(40, 24, 1), 0.0000001)) + + kernels.append(generator.DenseKernel("hsw_only_test8", precision, 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel("hsw_only_test9", precision, 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2)] + [x.getBlocksize(32, 40, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel("hsw_only_test10", precision, 56, 28, 56, 56, 56, 56, 0.1, 3.0, [x.getBlocksize(56, 28, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel("hsw_only_test11", precision, 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8,3)] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel("hsw_only_test12", precision, 64, 5, 10, 64, 12, 64, 1.0, 1.0, [] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel("hsw_only_test13", precision, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel("hsw_only_test14", precision, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], 0.0000001)) generator.make(kernels, "hsw") diff --git a/tests/unit_tests_knl.py b/tests/unit_tests_knl.py index c57ad6d..6c0d2a4 100755 --- a/tests/unit_tests_knl.py +++ b/tests/unit_tests_knl.py @@ -5,30 +5,32 @@ import pspamm.scripts.max_knl as max_square import pspamm.scripts.max_bn_knl as max_bn import pspamm.scripts.old_knl as old +from pspamm.codegen.precision import * blocksize_algs = [max_square, max_bn, old] kernels = [] -kernels.append(generator.SparseKernel("test1", Precision.DOUBLE, 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], generator.generateMTX(56, 56, 30), 0.0000001)) -kernels.append(generator.DenseKernel("test2", Precision.DOUBLE, 8, 
40, 40, 8, 40, 8, 2.5, 1.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("test3", Precision.DOUBLE, 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.SparseKernel("knl_only_test1", Precision.DOUBLE, 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 1), 0.0000001)) -kernels.append(generator.SparseKernel("knl_only_test2", Precision.DOUBLE, 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2), (16,7)] + [x.getBlocksize(24, 40, 2) for x in blocksize_algs], generator.generateMTX(40, 40, 20), 0.0000001)) - -kernels.append(generator.SparseKernel("knl_only_test3", Precision.DOUBLE, 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 2), 0.0000001)) -kernels.append(generator.SparseKernel("knl_only_test4", Precision.DOUBLE, 24, 20, 10, 40, 0, 24, 35.222, 0.0, [(8, 20), (24,3)] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], generator.generateMTX(10, 20, 1), 0.0000001)) -kernels.append(generator.SparseKernel("knl_only_test5", Precision.DOUBLE, 64, 5, 10, 64, 0, 64, 2.3, 0.0, [(32, 2), (8,14)] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], generator.generateMTX(10, 5, 1), 0.0000001)) -kernels.append(generator.SparseKernel("knl_only_test6", Precision.DOUBLE, 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], generator.generateMTX(1, 1, 1), 0.0000001)) -kernels.append(generator.SparseKernel("knl_only_test7", Precision.DOUBLE, 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8, 24), (8,1)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], generator.generateMTX(40, 24, 1), 0.0000001)) - -kernels.append(generator.DenseKernel("knl_only_test8", Precision.DOUBLE, 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], 
0.0000001)) -kernels.append(generator.DenseKernel("knl_only_test9", Precision.DOUBLE, 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2), (16,7)] + [x.getBlocksize(32, 40, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("knl_only_test10", Precision.DOUBLE, 56, 28, 56, 56, 56, 56, 0.1, 3.0, [(8, 28)], 0.0000001)) -kernels.append(generator.DenseKernel("knl_only_test11", Precision.DOUBLE, 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8, 20), (8,3)] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("knl_only_test12", Precision.DOUBLE, 64, 5, 10, 64, 12, 64, 1.0, 1.0, [(32, 2), (8,14)] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("knl_only_test13", Precision.DOUBLE, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], 0.0000001)) -kernels.append(generator.DenseKernel("knl_only_test14", Precision.DOUBLE, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [(8, 24)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], 0.0000001)) +for precision in (Precision.SINGLE, Precision.DOUBLE): + kernels.append(generator.SparseKernel("test1", precision, 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], generator.generateMTX(56, 56, 30), 0.0000001)) + kernels.append(generator.DenseKernel("test2", precision, 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel("test3", precision, 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.SparseKernel("knl_only_test1", precision, 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 1), 0.0000001)) + kernels.append(generator.SparseKernel("knl_only_test2", precision, 24, 40, 40, 32, 0, 24, 1000, 
1.0, [(8, 2), (16,7)] + [x.getBlocksize(24, 40, 2) for x in blocksize_algs], generator.generateMTX(40, 40, 20), 0.0000001)) + + kernels.append(generator.SparseKernel("knl_only_test3", precision, 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 2), 0.0000001)) + kernels.append(generator.SparseKernel("knl_only_test4", precision, 24, 20, 10, 40, 0, 24, 35.222, 0.0, [(8, 20), (24,3)] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], generator.generateMTX(10, 20, 1), 0.0000001)) + kernels.append(generator.SparseKernel("knl_only_test5", precision, 64, 5, 10, 64, 0, 64, 2.3, 0.0, [(32, 2), (8,14)] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], generator.generateMTX(10, 5, 1), 0.0000001)) + kernels.append(generator.SparseKernel("knl_only_test6", precision, 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], generator.generateMTX(1, 1, 1), 0.0000001)) + kernels.append(generator.SparseKernel("knl_only_test7", precision, 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8, 24), (8,1)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], generator.generateMTX(40, 24, 1), 0.0000001)) + + kernels.append(generator.DenseKernel("knl_only_test8", precision, 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel("knl_only_test9", precision, 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2), (16,7)] + [x.getBlocksize(32, 40, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel("knl_only_test10", precision, 56, 28, 56, 56, 56, 56, 0.1, 3.0, [(8, 28)], 0.0000001)) + kernels.append(generator.DenseKernel("knl_only_test11", precision, 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8, 20), (8,3)] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel("knl_only_test12", precision, 64, 5, 10, 64, 12, 64, 1.0, 1.0, [(32, 2), 
(8,14)] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel("knl_only_test13", precision, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel("knl_only_test14", precision, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [(8, 24)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], 0.0000001)) generator.make(kernels, "knl") From 56f9d3d6a4627fa46b93706ecc042f4933a5c8af Mon Sep 17 00:00:00 2001 From: David Schneller Date: Tue, 1 Oct 2024 23:28:10 +0200 Subject: [PATCH 21/64] Test cleanup --- pspamm/codegen/precision.py | 6 +++++ tests/sve_testsuite_generator.py | 7 +----- tests/unit_tests_arm.py | 42 ++++++++++++++++---------------- tests/unit_tests_hsw.py | 38 ++++++++++++++--------------- tests/unit_tests_knl.py | 38 ++++++++++++++--------------- 5 files changed, 66 insertions(+), 65 deletions(-) diff --git a/pspamm/codegen/precision.py b/pspamm/codegen/precision.py index e88349e..df5153e 100644 --- a/pspamm/codegen/precision.py +++ b/pspamm/codegen/precision.py @@ -19,4 +19,10 @@ def size(self): self.BFLOAT16: 2 }[self] raise NotImplementedError() + + def __repr__(self): + return self.getCType(self) + + def __str__(self): + return self.getCType(self) diff --git a/tests/sve_testsuite_generator.py b/tests/sve_testsuite_generator.py index a3830ee..48fe477 100644 --- a/tests/sve_testsuite_generator.py +++ b/tests/sve_testsuite_generator.py @@ -6,6 +6,7 @@ import sys import os import testsuite_generator as test_generator +from pspamm.codegen.precision import * BASEDIR = 'build' @@ -30,8 +31,6 @@ def make(kernels, arch): f.write(test_generator.head_of_testsuite) - include_single_prec = False - for kern in kernels: arguments = ['pspamm-generator', str(kern.m), str(kern.n), str(kern.k), str(kern.lda), str(kern.ldb), str(kern.ldc), str(kern.alpha), str(kern.beta)] @@ -41,8 +40,6 @@ def make(kernels, arch): prec = 's' if kern.precision == 
Precision.SINGLE else 'd' arguments += ['--precision', prec] - if prec == 's': - include_single_prec = True block_sizes = list(set(kern.block_sizes)) @@ -100,8 +97,6 @@ def make(kernels, arch): bm = bs[0] bn = bs[1] - prec = 's' if kern.precision == Precision.SINGLE else 'd' - if arch.startswith("arm_sve"): veclen = int(arch[7:]) assert veclen % 128 == 0 and veclen <= 2048 diff --git a/tests/unit_tests_arm.py b/tests/unit_tests_arm.py index 9fd5e54..50ba6c4 100755 --- a/tests/unit_tests_arm.py +++ b/tests/unit_tests_arm.py @@ -11,27 +11,27 @@ kernels = [] for precision in (Precision.SINGLE, Precision.DOUBLE): - kernels.append(generator.DenseKernel("test4", precision, 4, 4, 4, 4, 4, 4, 2.0, 2.0, [(4, 4)], 0.0000001)) - - kernels.append(generator.SparseKernel("test1", precision, 8, 56, 56, 8, 0, 8, 1.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 1) for x in blocksize_algs], generator.generateMTX(56, 56, 30), 0.0000001)) - kernels.append(generator.DenseKernel("test2", precision, 8, 40, 40, 8, 40, 8, 3.0, 2.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 1) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel("test3", precision, 8, 56, 56, 8, 56, 8, 0.0, 0.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 1) for x in blocksize_algs], 0.0000001)) - - kernels.append(generator.SparseKernel("arm_only_test1", precision, 2, 3, 4, 2, 0, 2, 1.1233, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], generator.generateMTX(4, 3, 5), 0.0000001)) - kernels.append(generator.SparseKernel("arm_only_test2", precision, 2, 3, 4, 20, 0, 14, 1.0, 1.0, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], generator.generateMTX(4, 3, 5), 0.0000001)) - kernels.append(generator.SparseKernel("arm_only_test3", precision, 32, 80, 50, 32, 0, 32, 1.0, 3.0, [(8, 5)] + [x.getBlocksize(32, 80, 1) for x in blocksize_algs], generator.generateMTX(50, 80, 294), 0.0000001)) - kernels.append(generator.SparseKernel("arm_only_test4", precision, 32, 32, 
32, 34, 0, 32, 1.0, 0.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1) for x in blocksize_algs], generator.generateMTX(32, 32, 24), 0.0000001)) - kernels.append(generator.SparseKernel("arm_only_test5", precision, 2, 1, 1, 2, 0, 8, 1.0, -1.0, [(2, 1)] + [x.getBlocksize(2, 1, 1) for x in blocksize_algs], generator.generateMTX(1, 1, 1), 0.0000001)) - kernels.append(generator.SparseKernel("arm_only_test6", precision, 2, 2, 2, 2, 0, 2, 2.0, 234234.123, [(2, 1)] + [x.getBlocksize(2, 2, 1) for x in blocksize_algs], generator.generateMTX(2, 2, 1), 0.0000001)) - kernels.append(generator.SparseKernel("arm_only_test7", precision, 16, 5, 7, 16, 0, 16, 0.0, -1.123, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1) for x in blocksize_algs], generator.generateMTX(7, 5, 35), 0.0000001)) - - kernels.append(generator.DenseKernel("arm_only_test8", precision, 2, 3, 4, 2, 4, 2, 1.0, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel("arm_only_test9", precision, 2, 3, 4, 20, 12, 14, 2.0, 1.123, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel("arm_only_test10", precision, 32, 80, 50, 32, 50, 32, 0.0, 0.2, [(8, 5)] + [x.getBlocksize(32, 80, 1) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel("arm_only_test11", precision, 32, 32, 32, 33, 68, 32, 1231.0, 14443.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel("arm_only_test12", precision, 2, 1, 1, 2, 1, 8, 1.0, 3.0, [(2, 1)] + [x.getBlocksize(2, 1, 1) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel("arm_only_test13", precision, 2, 3, 3, 2, 3, 2, 1.0, 0.0, [(2, 1)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel("arm_only_test14", precision, 16, 5, 7, 16, 7, 16, 1.0, 1.0, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1) for x 
in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel(f"test4_{precision}", precision, 4, 4, 4, 4, 4, 4, 2.0, 2.0, [(4, 4)], 0.0000001)) + + kernels.append(generator.SparseKernel(f"test1_{precision}", precision, 8, 56, 56, 8, 0, 8, 1.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 1) for x in blocksize_algs], generator.generateMTX(56, 56, 30), 0.0000001)) + kernels.append(generator.DenseKernel(f"test2_{precision}", precision, 8, 40, 40, 8, 40, 8, 3.0, 2.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 1) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel(f"test3_{precision}", precision, 8, 56, 56, 8, 56, 8, 0.0, 0.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 1) for x in blocksize_algs], 0.0000001)) + + kernels.append(generator.SparseKernel(f"arm_only_test1_{precision}", precision, 2, 3, 4, 2, 0, 2, 1.1233, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], generator.generateMTX(4, 3, 5), 0.0000001)) + kernels.append(generator.SparseKernel(f"arm_only_test2_{precision}", precision, 2, 3, 4, 20, 0, 14, 1.0, 1.0, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], generator.generateMTX(4, 3, 5), 0.0000001)) + kernels.append(generator.SparseKernel(f"arm_only_test3_{precision}", precision, 32, 80, 50, 32, 0, 32, 1.0, 3.0, [(8, 5)] + [x.getBlocksize(32, 80, 1) for x in blocksize_algs], generator.generateMTX(50, 80, 294), 0.0000001)) + kernels.append(generator.SparseKernel(f"arm_only_test4_{precision}", precision, 32, 32, 32, 34, 0, 32, 1.0, 0.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1) for x in blocksize_algs], generator.generateMTX(32, 32, 24), 0.0000001)) + kernels.append(generator.SparseKernel(f"arm_only_test5_{precision}", precision, 2, 1, 1, 2, 0, 8, 1.0, -1.0, [(2, 1)] + [x.getBlocksize(2, 1, 1) for x in blocksize_algs], generator.generateMTX(1, 1, 1), 0.0000001)) + kernels.append(generator.SparseKernel(f"arm_only_test6_{precision}", precision, 2, 2, 2, 2, 0, 2, 2.0, 234234.123, [(2, 
1)] + [x.getBlocksize(2, 2, 1) for x in blocksize_algs], generator.generateMTX(2, 2, 1), 0.0000001)) + kernels.append(generator.SparseKernel(f"arm_only_test7_{precision}", precision, 16, 5, 7, 16, 0, 16, 0.0, -1.123, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1) for x in blocksize_algs], generator.generateMTX(7, 5, 35), 0.0000001)) + + kernels.append(generator.DenseKernel(f"arm_only_test8_{precision}", precision, 2, 3, 4, 2, 4, 2, 1.0, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel(f"arm_only_test9_{precision}", precision, 2, 3, 4, 20, 12, 14, 2.0, 1.123, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel(f"arm_only_test10_{precision}", precision, 32, 80, 50, 32, 50, 32, 0.0, 0.2, [(8, 5)] + [x.getBlocksize(32, 80, 1) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel(f"arm_only_test11_{precision}", precision, 32, 32, 32, 33, 68, 32, 1231.0, 14443.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel(f"arm_only_test12_{precision}", precision, 2, 1, 1, 2, 1, 8, 1.0, 3.0, [(2, 1)] + [x.getBlocksize(2, 1, 1) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel(f"arm_only_test13_{precision}", precision, 2, 3, 3, 2, 3, 2, 1.0, 0.0, [(2, 1)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel(f"arm_only_test14_{precision}", precision, 16, 5, 7, 16, 7, 16, 1.0, 1.0, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1) for x in blocksize_algs], 0.0000001)) generator.make(kernels, "arm") diff --git a/tests/unit_tests_hsw.py b/tests/unit_tests_hsw.py index 8e16ea8..47316e5 100755 --- a/tests/unit_tests_hsw.py +++ b/tests/unit_tests_hsw.py @@ -10,25 +10,25 @@ kernels = [] for precision in (Precision.SINGLE, Precision.DOUBLE): - 
kernels.append(generator.SparseKernel("test1", precision, 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], generator.generateMTX(56, 56, 30), 0.0000001)) - kernels.append(generator.DenseKernel("test2", precision, 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8,2)] + [x.getBlocksize(8, 40, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel("test3", precision, 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.SparseKernel("hsw_only_test1", precision, 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 1), 0.0000001)) - kernels.append(generator.SparseKernel("hsw_only_test2", precision, 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2)] + [x.getBlocksize(24, 40, 2) for x in blocksize_algs], generator.generateMTX(40, 40, 20), 0.0000001)) - - kernels.append(generator.SparseKernel("hsw_only_test3", precision, 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 2), 0.0000001)) - kernels.append(generator.SparseKernel("hsw_only_test4", precision, 24, 20, 10, 40, 0, 24, 35.222, 0.0, [] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], generator.generateMTX(10, 20, 1), 0.0000001)) - kernels.append(generator.SparseKernel("hsw_only_test5", precision, 64, 5, 10, 64, 0, 64, 2.3, 0.0, [] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], generator.generateMTX(10, 5, 1), 0.0000001)) - kernels.append(generator.SparseKernel("hsw_only_test6", precision, 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], generator.generateMTX(1, 1, 1), 0.0000001)) - kernels.append(generator.SparseKernel("hsw_only_test7", precision, 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8,1)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], generator.generateMTX(40, 24, 1), 0.0000001)) - - 
kernels.append(generator.DenseKernel("hsw_only_test8", precision, 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel("hsw_only_test9", precision, 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2)] + [x.getBlocksize(32, 40, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel("hsw_only_test10", precision, 56, 28, 56, 56, 56, 56, 0.1, 3.0, [x.getBlocksize(56, 28, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel("hsw_only_test11", precision, 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8,3)] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel("hsw_only_test12", precision, 64, 5, 10, 64, 12, 64, 1.0, 1.0, [] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel("hsw_only_test13", precision, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel("hsw_only_test14", precision, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.SparseKernel(f"test1_{precision}", precision, 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], generator.generateMTX(56, 56, 30), 0.0000001)) + kernels.append(generator.DenseKernel(f"test2_{precision}", precision, 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8,2)] + [x.getBlocksize(8, 40, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel(f"test3_{precision}", precision, 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.SparseKernel(f"hsw_only_test1_{precision}", precision, 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 1), 
0.0000001)) + kernels.append(generator.SparseKernel(f"hsw_only_test2_{precision}", precision, 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2)] + [x.getBlocksize(24, 40, 2) for x in blocksize_algs], generator.generateMTX(40, 40, 20), 0.0000001)) + + kernels.append(generator.SparseKernel(f"hsw_only_test3_{precision}", precision, 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 2), 0.0000001)) + kernels.append(generator.SparseKernel(f"hsw_only_test4_{precision}", precision, 24, 20, 10, 40, 0, 24, 35.222, 0.0, [] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], generator.generateMTX(10, 20, 1), 0.0000001)) + kernels.append(generator.SparseKernel(f"hsw_only_test5_{precision}", precision, 64, 5, 10, 64, 0, 64, 2.3, 0.0, [] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], generator.generateMTX(10, 5, 1), 0.0000001)) + kernels.append(generator.SparseKernel(f"hsw_only_test6_{precision}", precision, 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], generator.generateMTX(1, 1, 1), 0.0000001)) + kernels.append(generator.SparseKernel(f"hsw_only_test7_{precision}", precision, 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8,1)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], generator.generateMTX(40, 24, 1), 0.0000001)) + + kernels.append(generator.DenseKernel(f"hsw_only_test8_{precision}", precision, 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel(f"hsw_only_test9_{precision}", precision, 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2)] + [x.getBlocksize(32, 40, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel(f"hsw_only_test10_{precision}", precision, 56, 28, 56, 56, 56, 56, 0.1, 3.0, [x.getBlocksize(56, 28, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel(f"hsw_only_test11_{precision}", precision, 8, 20, 8, 40, 
10, 8, 234234.123123, 0.0, [(8,3)] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel(f"hsw_only_test12_{precision}", precision, 64, 5, 10, 64, 12, 64, 1.0, 1.0, [] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel(f"hsw_only_test13_{precision}", precision, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel(f"hsw_only_test14_{precision}", precision, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], 0.0000001)) generator.make(kernels, "hsw") diff --git a/tests/unit_tests_knl.py b/tests/unit_tests_knl.py index 6c0d2a4..1795301 100755 --- a/tests/unit_tests_knl.py +++ b/tests/unit_tests_knl.py @@ -12,25 +12,25 @@ kernels = [] for precision in (Precision.SINGLE, Precision.DOUBLE): - kernels.append(generator.SparseKernel("test1", precision, 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], generator.generateMTX(56, 56, 30), 0.0000001)) - kernels.append(generator.DenseKernel("test2", precision, 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel("test3", precision, 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.SparseKernel("knl_only_test1", precision, 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 1), 0.0000001)) - kernels.append(generator.SparseKernel("knl_only_test2", precision, 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2), (16,7)] + [x.getBlocksize(24, 40, 2) for x in blocksize_algs], generator.generateMTX(40, 40, 20), 0.0000001)) - - kernels.append(generator.SparseKernel("knl_only_test3", precision, 8, 2, 1, 8, 0, 16, -2.0, 
0.0, [(8, 1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 2), 0.0000001)) - kernels.append(generator.SparseKernel("knl_only_test4", precision, 24, 20, 10, 40, 0, 24, 35.222, 0.0, [(8, 20), (24,3)] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], generator.generateMTX(10, 20, 1), 0.0000001)) - kernels.append(generator.SparseKernel("knl_only_test5", precision, 64, 5, 10, 64, 0, 64, 2.3, 0.0, [(32, 2), (8,14)] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], generator.generateMTX(10, 5, 1), 0.0000001)) - kernels.append(generator.SparseKernel("knl_only_test6", precision, 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], generator.generateMTX(1, 1, 1), 0.0000001)) - kernels.append(generator.SparseKernel("knl_only_test7", precision, 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8, 24), (8,1)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], generator.generateMTX(40, 24, 1), 0.0000001)) - - kernels.append(generator.DenseKernel("knl_only_test8", precision, 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel("knl_only_test9", precision, 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2), (16,7)] + [x.getBlocksize(32, 40, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel("knl_only_test10", precision, 56, 28, 56, 56, 56, 56, 0.1, 3.0, [(8, 28)], 0.0000001)) - kernels.append(generator.DenseKernel("knl_only_test11", precision, 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8, 20), (8,3)] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel("knl_only_test12", precision, 64, 5, 10, 64, 12, 64, 1.0, 1.0, [(32, 2), (8,14)] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel("knl_only_test13", precision, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in 
blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel("knl_only_test14", precision, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [(8, 24)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.SparseKernel(f"test1_{precision}", precision, 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], generator.generateMTX(56, 56, 30), 0.0000001)) + kernels.append(generator.DenseKernel(f"test2_{precision}", precision, 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel(f"test3_{precision}", precision, 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.SparseKernel(f"knl_only_test1_{precision}", precision, 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 1), 0.0000001)) + kernels.append(generator.SparseKernel(f"knl_only_test2_{precision}", precision, 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2), (16,7)] + [x.getBlocksize(24, 40, 2) for x in blocksize_algs], generator.generateMTX(40, 40, 20), 0.0000001)) + + kernels.append(generator.SparseKernel(f"knl_only_test3_{precision}", precision, 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 2), 0.0000001)) + kernels.append(generator.SparseKernel(f"knl_only_test4_{precision}", precision, 24, 20, 10, 40, 0, 24, 35.222, 0.0, [(8, 20), (24,3)] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], generator.generateMTX(10, 20, 1), 0.0000001)) + kernels.append(generator.SparseKernel(f"knl_only_test5_{precision}", precision, 64, 5, 10, 64, 0, 64, 2.3, 0.0, [(32, 2), (8,14)] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], generator.generateMTX(10, 5, 1), 0.0000001)) + 
kernels.append(generator.SparseKernel(f"knl_only_test6_{precision}", precision, 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], generator.generateMTX(1, 1, 1), 0.0000001)) + kernels.append(generator.SparseKernel(f"knl_only_test7_{precision}", precision, 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8, 24), (8,1)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], generator.generateMTX(40, 24, 1), 0.0000001)) + + kernels.append(generator.DenseKernel(f"knl_only_test8_{precision}", precision, 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel(f"knl_only_test9_{precision}", precision, 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2), (16,7)] + [x.getBlocksize(32, 40, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel(f"knl_only_test10_{precision}", precision, 56, 28, 56, 56, 56, 56, 0.1, 3.0, [(8, 28)], 0.0000001)) + kernels.append(generator.DenseKernel(f"knl_only_test11_{precision}", precision, 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8, 20), (8,3)] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel(f"knl_only_test12_{precision}", precision, 64, 5, 10, 64, 12, 64, 1.0, 1.0, [(32, 2), (8,14)] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel(f"knl_only_test13_{precision}", precision, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], 0.0000001)) + kernels.append(generator.DenseKernel(f"knl_only_test14_{precision}", precision, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [(8, 24)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], 0.0000001)) generator.make(kernels, "knl") From dae3e37de41f67b24a86833679215a8cd70f7013 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Thu, 10 Oct 2024 15:03:44 +0200 Subject: [PATCH 22/64] Fix test names --- .github/workflows/codegen.yml | 1 
+ tests/sve_testsuite_generator.py | 10 ++++++---- tests/testsuite_generator.py | 10 ++++++---- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/.github/workflows/codegen.yml b/.github/workflows/codegen.yml index 69f06de..5a3a9ba 100644 --- a/.github/workflows/codegen.yml +++ b/.github/workflows/codegen.yml @@ -159,6 +159,7 @@ jobs: needs: install-pspamm # include vector lengths for SVE manually (for now) strategy: + fail-fast: false matrix: vectorlen: - 128 diff --git a/tests/sve_testsuite_generator.py b/tests/sve_testsuite_generator.py index 48fe477..d3832bf 100644 --- a/tests/sve_testsuite_generator.py +++ b/tests/sve_testsuite_generator.py @@ -46,6 +46,7 @@ def make(kernels, arch): for bs in block_sizes: bm = bs[0] bn = bs[1] + bk = bs[2] if len(bs) > 2 else 1 if arch == "knl": assert (bm % 8 == 0 and (bn + 1) * (bm / 8) <= 32) @@ -64,18 +65,18 @@ def make(kernels, arch): print(f'Skipping block size {bm}x{bn} for {arch}') continue - name = kern.name + '_' + str(bm) + '_' + str(bn) + name = kern.name + '_' + str(bm) + '_' + str(bn) + '_' + str(bk) additional_args = ['--output_funcname', name, '--output_filename', os.path.join(BASEDIR, arch, name + '.h'), '--output_overwrite'] - additional_args += ['--bm', str(bm), '--bn', str(bn), '--arch', arch, '--prefetching', 'BL2viaC'] + additional_args += ['--bm', str(bm), '--bn', str(bn), '--bk', str(bk), '--arch', arch, '--prefetching', 'BL2viaC'] try: subprocess.check_output(arguments + additional_args, stderr=subprocess.STDOUT) except subprocess.CalledProcessError as e: raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) - f.write('#include "' + arch + '/' + kern.name + '_' + str(bm) + '_' + str(bn) + '.h"\n') + f.write('#include "' + arch + '/' + name + '.h"\n') f.write('\n') # necessary functions are defined in testsuite_generator.py @@ -96,6 +97,7 @@ def make(kernels, arch): for bs in block_sizes: bm = bs[0] bn = bs[1] + bk = bs[2] if len(bs) > 2 
else 1 if arch.startswith("arm_sve"): veclen = int(arch[7:]) @@ -110,7 +112,7 @@ def make(kernels, arch): # print(f'Skipping block size {bm}x{bn} for {arch}') continue - name = kern.name + '_' + str(bm) + '_' + str(bn) + name = kern.name + '_' + str(bm) + '_' + str(bn) + '_' + str(bk) if isinstance(kern, SparseKernel): mtx = kern.mtx diff --git a/tests/testsuite_generator.py b/tests/testsuite_generator.py index dfc39a9..e4bc9be 100755 --- a/tests/testsuite_generator.py +++ b/tests/testsuite_generator.py @@ -232,17 +232,18 @@ def make(kernels, arch): for bs in block_sizes: bm = bs[0] bn = bs[1] + bk = bs[2] if len(bs) > 2 else 1 if arch == "knl": assert (bm % 8 == 0 and (bn + 1) * (bm / 8) <= 32) elif arch == "arm": assert (bm % 2 == 0 and (bn + 1) * (bm / 2) + bn <= 32) - name = kern.name + '_' + str(bm) + '_' + str(bn) + name = kern.name + '_' + str(bm) + '_' + str(bn) + '_' + str(bk) additional_args = ['--output_funcname', name, '--output_filename', os.path.join(BASEDIR, arch, name + '.h'), '--output_overwrite'] - additional_args += ['--bm', str(bm), '--bn', str(bn), '--arch', arch] + additional_args += ['--bm', str(bm), '--bn', str(bn), '--bk', str(bk), '--arch', arch] try: print(' '.join(arguments + additional_args)) @@ -250,7 +251,7 @@ def make(kernels, arch): except subprocess.CalledProcessError as e: raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) - f.write('#include "' + arch + '/' + kern.name + '_' + str(bm) + '_' + str(bn) + '.h"\n') + f.write('#include "' + arch + '/' + name + '.h"\n') f.write('\n') @@ -264,7 +265,8 @@ def make(kernels, arch): for bs in block_sizes: bm = bs[0] bn = bs[1] - name = kern.name + '_' + str(bm) + '_' + str(bn) + bk = bs[2] if len(bs) > 2 else 1 + name = kern.name + '_' + str(bm) + '_' + str(bn) + '_' + str(bk) if isinstance(kern, SparseKernel): mtx = kern.mtx From 0438eefde97b6d7b3e5d71c2fb7d3b61c4ecdf85 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Thu, 10 Oct 
2024 15:38:26 +0200 Subject: [PATCH 23/64] Fix bugs in generators --- .github/workflows/codegen.yml | 14 ++- pspamm/codegen/architectures/arm/generator.py | 24 +++--- .../architectures/arm_sve/generator.py | 4 +- pspamm/codegen/architectures/hsw/generator.py | 10 +-- pspamm/codegen/architectures/knl/generator.py | 23 +++-- .../architectures/knl/inlineprinter.py | 5 +- pspamm/codegen/precision.py | 3 + pspamm/matmul.py | 13 +-- tests/sve_testsuite_generator.py | 85 +++++++------------ tests/testsuite_generator.py | 84 +++++++++++------- tests/unit_tests_arm_sve.py | 19 +++-- tests/unit_tests_hsw.py | 3 +- tests/unit_tests_knl.py | 3 +- 13 files changed, 155 insertions(+), 135 deletions(-) diff --git a/.github/workflows/codegen.yml b/.github/workflows/codegen.yml index 5a3a9ba..8894641 100644 --- a/.github/workflows/codegen.yml +++ b/.github/workflows/codegen.yml @@ -69,12 +69,14 @@ jobs: - name: pspamm-tests-compile run: | cd tests/ - g++ -static -mavx512f build/hsw_testsuite.cpp -o build/hsw-test + g++ -static -mavx2 build/hsw256_testsuite.cpp -o build/hsw256-test + g++ -static -mavx2 build/hsw128_testsuite.cpp -o build/hsw128-test - name: pspamm-tests-run run: | cd tests/ - qemu-x86_64-static -cpu Haswell build/hsw-test + qemu-x86_64-static -cpu Haswell build/hsw256-test + qemu-x86_64-static -cpu Haswell build/hsw128-test pspamm-codegen-avx512-no-run: name: pspamm-codegen-avx512-no-run @@ -107,13 +109,17 @@ jobs: - name: pspamm-tests-compile run: | cd tests/ - g++ -static -mavx512f build/knl_testsuite.cpp -o build/knl-test + g++ -static -mavx512f build/knl512_testsuite.cpp -o build/knl512-test + g++ -static -mavx512f build/knl256_testsuite.cpp -o build/knl256-test + g++ -static -mavx512f build/knl128_testsuite.cpp -o build/knl128-test # disabled, since qemu doesn't support AVX512F (yet) with of Ubuntu 24.04 # - name: pspamm-tests-run # run: | # cd tests/ - # qemu-x86_64-static -cpu Skylake-Server build/knl-test + # qemu-x86_64-static -cpu Skylake-Server 
build/knl512-test + # qemu-x86_64-static -cpu Skylake-Server build/knl256-test + # qemu-x86_64-static -cpu Skylake-Server build/knl128-test pspamm-codegen-aarch64: name: pspamm-codegen-aarch64 diff --git a/pspamm/codegen/architectures/arm/generator.py b/pspamm/codegen/architectures/arm/generator.py index 89de2e3..fcb6a75 100644 --- a/pspamm/codegen/architectures/arm/generator.py +++ b/pspamm/codegen/architectures/arm/generator.py @@ -32,9 +32,7 @@ class Generator(AbstractGenerator): """ def get_v_size(self): - if self.precision == Precision.DOUBLE: - return 2 - raise NotImplementedError + return 16 // self.precision.size() def get_template(self): return Generator.template @@ -137,7 +135,7 @@ def move_register_block(self, next_offset = [0, 0] if ir+1 < rows: next_offset = [1, 0] - elif ic +1 < rows: + elif ic +1 < cols: next_offset = [0, 1] addr_next, comment_next = cursor.look(cursor_ptr, block_offset, Coords(down=(ir+next_offset[0])*v_size, right=ic+next_offset[1])) @@ -153,16 +151,16 @@ def move_register_block(self, addr.disp = 0 addr.base = additional_regs[0] - if not skipflag: - if store: - asm.add(st(registers[ir,ic], addr, True, comment)) - else: - asm.add(ld(addr, registers[ir,ic], True, comment)) - else: - if store: - asm.add(st(registers[ir,ic], addr, True, comment, registers[ir+next_offset[0],ic+next_offset[1]])) + if not skipflag: + if store: + asm.add(st(registers[ir,ic], addr, True, comment)) + else: + asm.add(ld(addr, registers[ir,ic], True, comment)) else: - asm.add(ld(addr, registers[ir,ic], True, comment, registers[ir+next_offset[0],ic+next_offset[1]])) + if store: + asm.add(st(registers[ir,ic], addr, True, comment, registers[ir+next_offset[0],ic+next_offset[1]])) + else: + asm.add(ld(addr, registers[ir,ic], True, comment, registers[ir+next_offset[0],ic+next_offset[1]])) return asm diff --git a/pspamm/codegen/architectures/arm_sve/generator.py b/pspamm/codegen/architectures/arm_sve/generator.py index ac56177..61a8217 100644 --- 
a/pspamm/codegen/architectures/arm_sve/generator.py +++ b/pspamm/codegen/architectures/arm_sve/generator.py @@ -230,7 +230,7 @@ def move_register_block(self, # addr = base "pointer" + relative offset in bytes addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) - addr.disp += self.precision.value * load_offset + addr.disp += self.precision.size() * load_offset # count how many elements we have processed between last step and this step cont_counter = ((addr.disp - prev_disp) // mul_vl) @@ -313,7 +313,7 @@ def make_microkernel(self, cur11 = -1000 Vm = max(self.ceil_div(bm, v_size), 1) - multiple = self.precision.value + multiple = self.precision.size() # for ld1rw (single prec): immediate offset is multiple of 4 in range of 0 to 252 # for ld1rd (double prec): immediate offset is multiple of 8 in range of 0 to 504 # in both cases: instruction encodes the immediate offset within 6 bits diff --git a/pspamm/codegen/architectures/hsw/generator.py b/pspamm/codegen/architectures/hsw/generator.py index 47bceae..912183a 100644 --- a/pspamm/codegen/architectures/hsw/generator.py +++ b/pspamm/codegen/architectures/hsw/generator.py @@ -87,11 +87,11 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: reg_count = 0 - for i in range(1024, min(max(nnz * self.precision.value, m*k*self.precision.value, m*n*self.precision.value),8000), 2048): + for i in range(1024, min(max(nnz * self.precision.size(), m*k*self.precision.size(), m*n*self.precision.size()),8000), 2048): additional_regs.append(available_regs[reg_count]) reg_count += 1 - for i in range(8192, min(nnz * self.precision.value, 33000), 8192): + for i in range(8192, min(nnz * self.precision.size(), 33000), 8192): additional_regs.append(available_regs[reg_count]) reg_count += 1 @@ -134,7 +134,7 @@ def make_b_pointers(self, reg_count = 5 - for i in range(8192, min(nnz * self.precision.value, 33000), 8192): + for i in range(8192, min(nnz * self.precision.size(), 33000), 8192): 
asm.add(lea(B_reg, additional_regs[reg_count], i)) reg_count += 1 @@ -183,7 +183,7 @@ def move_register_block(self, if (mask is None) or (mask[ir,ic]): cell_offset = Coords(down=ir*v_size, right=ic) addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) - addr.disp += self.precision.value * load_offset + addr.disp += self.precision.size() * load_offset if store: asm.add(mov(registers[ir,ic], addr, True, comment)) if prefetching == 'BL2viaC': @@ -212,7 +212,7 @@ def move_register_single(self, if (mask is None) or (mask[ir,ic]): cell_offset = Coords(down=ir*v_size, right=ic) addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) - addr.disp += self.precision.value * load_offset + addr.disp += self.precision.size() * load_offset asm.add(mov(addr, registers[ir,ic], True, comment)) return asm diff --git a/pspamm/codegen/architectures/knl/generator.py b/pspamm/codegen/architectures/knl/generator.py index ad8a6ae..9530ef8 100644 --- a/pspamm/codegen/architectures/knl/generator.py +++ b/pspamm/codegen/architectures/knl/generator.py @@ -44,7 +44,14 @@ def use_broadcast(self): return False def has_masks(self): - return True + return False # for now + + def pred_n_trues(self, count, v_size, mode): + # a bit hacky at the moment (won't work for all masks) + if count < v_size and count > 0: + return mask(0) + else: + return None def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int): vm = self.ceil_div(bm, v_size) @@ -74,15 +81,15 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: reg_count = 0 - for i in range(1024, min(max(nnz * self.precision.value, m*k*self.precision.value, m*n*self.precision.value),8000), 2048): + for i in range(1024, min(max(nnz * self.precision.size(), m*k*self.precision.size(), m*n*self.precision.size()),8000), 2048): additional_regs.append(available_regs[reg_count]) reg_count += 1 - for i in range(8192, min(nnz * self.precision.value, 33000), 8192): + for i in 
range(8192, min(nnz * self.precision.size(), 33000), 8192): additional_regs.append(available_regs[reg_count]) reg_count += 1 - loop_reg = [r(12), r(13), r(14)] + loop_regs = [r(12), r(13), r(14)] return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, mask_regs @@ -93,8 +100,8 @@ def init_mask(self, bm, v_size, tempreg, maskregs): else: asm = block("Set mask register") restval = (1 << rest) - 1 - asm.add(mov(restval, tempreg)) - asm.add(mov(tempreg, maskreg[0])) + asm.add(mov(restval, tempreg, False)) + asm.add(mov(tempreg, maskregs[0], False)) return asm def bcst_alpha_beta(self, @@ -131,7 +138,7 @@ def make_b_pointers(self, reg_count = 5 - for i in range(8192, min(nnz * self.precision.value, 33000), 8192): + for i in range(8192, min(nnz * self.precision.size(), 33000), 8192): asm.add(lea(B_reg, additional_regs[reg_count], i)) reg_count += 1 @@ -180,7 +187,7 @@ def move_register_block(self, if (mask is None) or (mask[ir,ic]): cell_offset = Coords(down=ir*v_size, right=ic) addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) - addr.disp += self.precision.value * load_offset + addr.disp += self.precision.size() * load_offset if store: asm.add(mov(registers[ir,ic], addr, True, comment)) if prefetching == 'BL2viaC': diff --git a/pspamm/codegen/architectures/knl/inlineprinter.py b/pspamm/codegen/architectures/knl/inlineprinter.py index fe8aa1c..33b313f 100644 --- a/pspamm/codegen/architectures/knl/inlineprinter.py +++ b/pspamm/codegen/architectures/knl/inlineprinter.py @@ -54,7 +54,8 @@ def addLine(self, stmt: str, comment: str): self.output.append(line) - + def maskformat(self, pred): + pass def visitFma(self, stmt: FmaStmt): b = stmt.bcast_src.ugly @@ -75,7 +76,7 @@ def visitMul(self, stmt: MulStmt): b = stmt.src.ugly m = stmt.mult_src.ugly a = stmt.dest.ugly - regsize = stmt.add_dest.size() // 16 + regsize = stmt.dest.size() // 16 if stmt.mult_src.typeinfo == AsmType.i64: # in this case, m is a Register that 
points to alpha/beta; manually format to be a memory address s = "vmulp{} 0({})%{{1to{}%}}, {}, {}".format(self.precision, m, regsize * self.broadcast_multiplier, b, a) diff --git a/pspamm/codegen/precision.py b/pspamm/codegen/precision.py index df5153e..417c9a6 100644 --- a/pspamm/codegen/precision.py +++ b/pspamm/codegen/precision.py @@ -10,6 +10,9 @@ class Precision(Enum): def getCType(cls, precision): ctype = {cls.DOUBLE: 'double', cls.SINGLE: 'float', cls.HALF: 'uint16_t', cls.BFLOAT16: 'uint16_t'} return ctype[precision] + + def ctype(self): + return self.getCType(self) def size(self): return { diff --git a/pspamm/matmul.py b/pspamm/matmul.py index 0d5d6e1..6b43b9f 100644 --- a/pspamm/matmul.py +++ b/pspamm/matmul.py @@ -218,17 +218,18 @@ def __init__(self, if lda == 0: blocks, patterns, mtx_overhead = decompose_pattern(self.m, self.k, apattern, self.bm, self.bk) - self.A = BlockCursor("A", self.starting_regs[0], self.m, self.k, self.lda, self.bm, self.bk, self.precision.value, blocks, patterns, mtx_overhead) + self.A = BlockCursor("A", self.starting_regs[0], self.m, self.k, self.lda, self.bm, self.bk, self.precision.size(), blocks, patterns, mtx_overhead) + self.nnz += sum(mtx_overhead) else: - self.A = DenseCursor("A", self.starting_regs[0], self.m, self.k, self.lda, self.bm, self.bk, self.precision.value) + self.A = DenseCursor("A", self.starting_regs[0], self.m, self.k, self.lda, self.bm, self.bk, self.precision.size()) if ldb == 0: blocks, patterns, mtx_overhead = decompose_pattern(self.k, self.n, bpattern, self.bk, self.bn) - self.B = BlockCursor("B", self.starting_regs[1], self.k, self.n, self.ldb, self.bk, self.bn, self.precision.value, blocks, patterns, mtx_overhead) + self.B = BlockCursor("B", self.starting_regs[1], self.k, self.n, self.ldb, self.bk, self.bn, self.precision.size(), blocks, patterns, mtx_overhead) self.nnz += sum(mtx_overhead) else: - self.B = DenseCursor("B", self.starting_regs[1], self.k, self.n, self.ldb, self.bk, self.bn, 
self.precision.value) - self.C = DenseCursor("C", self.starting_regs[2], self.m, self.n, self.ldc, self.bm, self.bn, self.precision.value) - self.C_pf = DenseCursor("C_pf", prefetchReg, self.m, self.n, self.ldc, self.bm, self.bn, self.precision.value) if prefetchReg else None + self.B = DenseCursor("B", self.starting_regs[1], self.k, self.n, self.ldb, self.bk, self.bn, self.precision.size()) + self.C = DenseCursor("C", self.starting_regs[2], self.m, self.n, self.ldc, self.bm, self.bn, self.precision.size()) + self.C_pf = DenseCursor("C_pf", prefetchReg, self.m, self.n, self.ldc, self.bm, self.bn, self.precision.size()) if prefetchReg else None self.unroll = ldb == 0 or lda == 0 diff --git a/tests/sve_testsuite_generator.py b/tests/sve_testsuite_generator.py index d3832bf..71511b3 100644 --- a/tests/sve_testsuite_generator.py +++ b/tests/sve_testsuite_generator.py @@ -31,6 +31,8 @@ def make(kernels, arch): f.write(test_generator.head_of_testsuite) + testcases = [] + for kern in kernels: arguments = ['pspamm-generator', str(kern.m), str(kern.n), str(kern.k), str(kern.lda), str(kern.ldb), str(kern.ldc), str(kern.alpha), str(kern.beta)] @@ -41,12 +43,12 @@ def make(kernels, arch): prec = 's' if kern.precision == Precision.SINGLE else 'd' arguments += ['--precision', prec] - block_sizes = list(set(kern.block_sizes)) + block_sizes = list(set(bs if len(bs) > 2 else (bs[0], bs[1], 1) for bs in kern.block_sizes)) for bs in block_sizes: bm = bs[0] bn = bs[1] - bk = bs[2] if len(bs) > 2 else 1 + bk = bs[2] if arch == "knl": assert (bm % 8 == 0 and (bn + 1) * (bm / 8) <= 32) @@ -58,11 +60,10 @@ def make(kernels, arch): reglen = veclen // 128 v_len = (16 // kern.precision.size()) * reglen # this should be the same assertion as in ../scripts/max_arm_sve.py - bk = 1 # ceiling division - vm = -(bm // -v_len) + vm = -(bm // -v_len) if not ((bn + bk) * vm + bn * bk <= 32): - print(f'Skipping block size {bm}x{bn} for {arch}') + print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / 
{prec}') continue name = kern.name + '_' + str(bm) + '_' + str(bn) + '_' + str(bk) @@ -78,58 +79,38 @@ def make(kernels, arch): f.write('#include "' + arch + '/' + name + '.h"\n') - f.write('\n') - # necessary functions are defined in testsuite_generator.py - f.write(test_generator.function_definitions) - f.write(setup_prefetching) - f.write(test_generator.setup_main) - # add variable declarations for single precision test cases - f.write(""" std::tuple fpointers; - float falpha; float fbeta; - double* prefetch; - float* fprefetch; - """) - - for kern in kernels: - - block_sizes = list(set(kern.block_sizes)) - - for bs in block_sizes: - bm = bs[0] - bn = bs[1] - bk = bs[2] if len(bs) > 2 else 1 - - if arch.startswith("arm_sve"): - veclen = int(arch[7:]) - assert veclen % 128 == 0 and veclen <= 2048 - reglen = veclen // 128 - v_len = (16 // kern.precision.size()) * reglen - # this should be the same assertion as in ../scripts/max_arm_sve.py - bk = 1 - # ceiling division - vm = -( bm // -v_len) - if not ((bn + bk) * vm + bn * bk <= 32): - # print(f'Skipping block size {bm}x{bn} for {arch}') - continue - - name = kern.name + '_' + str(bm) + '_' + str(bn) + '_' + str(bk) - if isinstance(kern, SparseKernel): mtx = kern.mtx else: mtx = "" - # for double precision: set prec to '' to conform to test_generator.function_definitions - prec = 'f' if kern.precision == Precision.SINGLE else '' - - f.write(""" - {p}alpha = {alpha}; {p}beta = {beta}; ldb = {ldb}; - {p}pointers = pre<{T}>({m}, {n}, {k}, {lda}, ldb, {ldc}, "{mtx}"); - setup_prefetch({p}prefetch, std::get<3>({p}pointers), {n}, {ldc}); - {name}(std::get<0>({p}pointers), std::get<{sparse}>({p}pointers), std::get<3>({p}pointers), {p}alpha, {p}beta, {p}prefetch); - result = post<{T}>({m}, {n}, {k}, {lda}, &ldb, {ldc}, &{p}alpha, &{p}beta, std::get<0>({p}pointers), std::get<1>({p}pointers), std::get<3>({p}pointers), std::get<4>({p}pointers), {delta:.7f}); + + prec2 = 'f' if kern.precision == Precision.SINGLE else '' + + 
testcases += [ + """ +{{ + unsigned ldb = {ldb}; + {T} alpha = {alpha}; + {T} beta = {beta}; + {T}* prefetch = nullptr; + auto pointers = pre<{T}>({m}, {n}, {k}, {lda}, ldb, {ldc}, "{mtx}"); + setup_prefetch(prefetch, std::get<3>(pointers), {n}, {ldc}); + {name}(std::get<0>(pointers), std::get<{sparse}>(pointers), std::get<3>(pointers), alpha, beta, prefetch); + const auto result = post<{T}>({m}, {n}, {k}, {lda}, &ldb, {ldc}, &alpha, &beta, std::get<0>(pointers), std::get<1>(pointers), std::get<3>(pointers), std::get<4>(pointers), {delta:.7f}); results.push_back(std::make_tuple("{name}", result)); - free(std::get<0>({p}pointers)); free(std::get<1>({p}pointers)); free(std::get<2>({p}pointers)); free(std::get<3>({p}pointers)); free(std::get<4>({p}pointers)); free({p}prefetch); + free(std::get<0>(pointers)); free(std::get<1>(pointers)); free(std::get<2>(pointers)); free(std::get<3>(pointers)); free(std::get<4>(pointers)); free(prefetch); +}} """.format(m=kern.m, n=kern.n, k=kern.k, lda=kern.lda, ldb=kern.ldb, ldc=kern.ldc, alpha=kern.alpha, beta=kern.beta, - mtx=mtx, delta=kern.delta, name=name, sparse=2 if kern.ldb == 0 else 1, p=prec, T="float" if prec == 'f' else "double")) + mtx=mtx, delta=kern.delta, name=name, sparse=2 if kern.ldb == 0 else 1, p=prec2, T=kern.precision.ctype()) + ] + + f.write('\n') + # necessary functions are defined in testsuite_generator.py + f.write(test_generator.function_definitions) + f.write(setup_prefetching) + f.write(test_generator.setup_main) + + for testcase in testcases: + f.write(testcase) f.write(test_generator.end_of_testsuite) diff --git a/tests/testsuite_generator.py b/tests/testsuite_generator.py index e4bc9be..2668c1f 100755 --- a/tests/testsuite_generator.py +++ b/tests/testsuite_generator.py @@ -4,6 +4,7 @@ import random import sys import os.path +from pspamm.codegen.precision import * BASEDIR = 'build' @@ -145,22 +146,20 @@ int main() { std::vector> results; - std::tuple pointers; - int result; - - // A compiler related 
issue makes it necessary to store certain values in variables before using them - unsigned ldb; - double alpha; double beta; """ setup_single_testcase = """ - ldb = {ldb}; alpha = {alpha}; beta = {beta}; - pointers = pre({m}, {n}, {k}, {lda}, ldb, {ldc}, "{mtx}"); +{{ + unsigned ldb = {ldb}; + {precision} alpha = {alpha}; + {precision} beta = {beta}; + auto pointers = pre<{precision}>({m}, {n}, {k}, {lda}, ldb, {ldc}, "{mtx}"); {name}(std::get<0>(pointers), std::get<{sparse}>(pointers), std::get<3>(pointers), {alpha}, {beta}, nullptr); - result = post({m}, {n}, {k}, {lda}, &ldb, {ldc}, &alpha, &beta, std::get<0>(pointers), std::get<1>(pointers), std::get<3>(pointers), std::get<4>(pointers), {delta:.7f}); + const auto result = post<{precision}>({m}, {n}, {k}, {lda}, &ldb, {ldc}, &alpha, &beta, std::get<0>(pointers), std::get<1>(pointers), std::get<3>(pointers), std::get<4>(pointers), {delta:.7f}); results.push_back(std::make_tuple("{name}", result)); free(std::get<0>(pointers)); free(std::get<1>(pointers)); free(std::get<2>(pointers)); free(std::get<3>(pointers)); free(std::get<4>(pointers)); +}} """ end_of_testsuite = """ @@ -219,6 +218,8 @@ def make(kernels, arch): f.write(head_of_testsuite) + testcases = [] + for kern in kernels: arguments = ['pspamm-generator', str(kern.m), str(kern.n), str(kern.k), str(kern.lda), str(kern.ldb), @@ -227,17 +228,38 @@ def make(kernels, arch): if isinstance(kern, SparseKernel): arguments += ['--mtx_filename', kern.mtx] - block_sizes = list(set(kern.block_sizes)) + prec = 's' if kern.precision == Precision.SINGLE else 'd' + arguments += ['--precision', prec] + + block_sizes = list(set(bs if len(bs) > 2 else (bs[0], bs[1], 1) for bs in kern.block_sizes)) for bs in block_sizes: bm = bs[0] bn = bs[1] - bk = bs[2] if len(bs) > 2 else 1 - - if arch == "knl": - assert (bm % 8 == 0 and (bn + 1) * (bm / 8) <= 32) - elif arch == "arm": - assert (bm % 2 == 0 and (bn + 1) * (bm / 2) + bn <= 32) + bk = bs[2] + + veclen = int(arch[3:]) if 
arch[3:] != '' else 128 + assert veclen % 128 == 0 + reglen = veclen // 128 + v_len = (16 // kern.precision.size()) * reglen + # this should be the same assertion as in ../scripts/max_arm_sve.py + # ceiling division + vm = -(bm // -v_len) + v_size = v_len + + if arch.startswith("knl"): + print(f'{bn} {bk} {vm} {bm} {v_size}') + if not ((bn+bk) * vm <= 32) or not (kern.m % v_size) == 0 or not (bm % v_size) == 0: + print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') + continue + elif arch.startswith("hsw"): + if not ((bn+bk) * vm + bn * bk <= 16) or not (kern.m % v_size) == 0 or not (bm % v_size) == 0: + print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') + continue + elif arch.startswith("arm"): + if not ((bn+bk) * vm + bn * bk <= 32) or not (kern.m % v_size) == 0 or not (bm % v_size) == 0: + print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') + continue name = kern.name + '_' + str(bm) + '_' + str(bn) + '_' + str(bk) @@ -253,28 +275,24 @@ def make(kernels, arch): f.write('#include "' + arch + '/' + name + '.h"\n') - f.write('\n') - - f.write(function_definitions) - f.write(setup_main) - - for kern in kernels: - - block_sizes = list(set(kern.block_sizes)) - - for bs in block_sizes: - bm = bs[0] - bn = bs[1] - bk = bs[2] if len(bs) > 2 else 1 - name = kern.name + '_' + str(bm) + '_' + str(bn) + '_' + str(bk) - if isinstance(kern, SparseKernel): mtx = kern.mtx else: mtx = "" - f.write(setup_single_testcase.format( + testcases += [ + setup_single_testcase.format( m=kern.m, n=kern.n, k=kern.k, lda=kern.lda, ldb=kern.ldb, ldc=kern.ldc, alpha=kern.alpha, - beta=kern.beta, mtx=mtx, delta=kern.delta, name=name, sparse=2 if kern.ldb == 0 else 1)) + beta=kern.beta, mtx=mtx, delta=kern.delta, name=name, sparse=2 if kern.ldb == 0 else 1, + precision=kern.precision.ctype()) + ] + + f.write('\n') + + f.write(function_definitions) + f.write(setup_main) + + for testcase in testcases: + f.write(testcase) f.write(end_of_testsuite) diff --git 
a/tests/unit_tests_arm_sve.py b/tests/unit_tests_arm_sve.py index ad4a0fb..b57b2f9 100644 --- a/tests/unit_tests_arm_sve.py +++ b/tests/unit_tests_arm_sve.py @@ -16,6 +16,7 @@ v_size = lambda prec: (16 // prec.size()) * v_len v_size_d = v_size(Precision.DOUBLE) v_size_s = v_size(Precision.SINGLE) +v_size_h = v_size(Precision.HALF) bitlen = v_len * 128 kernels = [] @@ -67,13 +68,15 @@ kernels.append(generator.SparseKernel("sve_single_prec_test_S7", Precision.SINGLE, 23, 23, 23, 23, 0, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(23, 23, 52), delta_sp)) kernels.append(generator.SparseKernel("sve_single_prec_test_S8", Precision.SINGLE, 23, 31, 13, 23, 0, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(13, 31, 40), delta_sp)) -kernels.append(generator.DenseKernel("sve_half_prec_test_S1", Precision.HALF, 9, 9, 9, 9, 9, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size_s) for x in blocksize_algs], delta_sp)) -kernels.append(generator.DenseKernel("sve_half_prec_test_S2", Precision.HALF, 15, 15, 15, 15, 15, 15, -3.14, 6.28, [x.getBlocksize(15, 15, 1, v_size_s) for x in blocksize_algs], delta_sp)) -kernels.append(generator.DenseKernel("sve_half_prec_test_S3", Precision.HALF, 23, 23, 23, 23, 23, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size_s) for x in blocksize_algs], delta_sp)) -kernels.append(generator.DenseKernel("sve_half_prec_test_S4", Precision.HALF, 23, 31, 13, 23, 13, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size_s) for x in blocksize_algs], delta_sp)) -kernels.append(generator.SparseKernel("sve_half_prec_test_S5", Precision.HALF, 9, 9, 9, 9, 0, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(9, 9, 8), delta_sp)) -kernels.append(generator.SparseKernel("sve_half_prec_test_S6", Precision.HALF, 15, 15, 15, 15, 0, 15, -3.14, 6.28, [x.getBlocksize(15, 15, 1, v_size_s) for x in blocksize_algs], 
generator.generateMTX(15, 15, 22), delta_sp)) -kernels.append(generator.SparseKernel("sve_half_prec_test_S7", Precision.HALF, 23, 23, 23, 23, 0, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(23, 23, 52), delta_sp)) -kernels.append(generator.SparseKernel("sve_half_prec_test_S8", Precision.HALF, 23, 31, 13, 23, 0, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(13, 31, 40), delta_sp)) +""" +kernels.append(generator.DenseKernel("sve_half_prec_test_S1", Precision.HALF, 9, 9, 9, 9, 9, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size_h) for x in blocksize_algs], delta_sp)) +kernels.append(generator.DenseKernel("sve_half_prec_test_S2", Precision.HALF, 15, 15, 15, 15, 15, 15, -3.14, 6.28, [x.getBlocksize(15, 15, 1, v_size_h) for x in blocksize_algs], delta_sp)) +kernels.append(generator.DenseKernel("sve_half_prec_test_S3", Precision.HALF, 23, 23, 23, 23, 23, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size_h) for x in blocksize_algs], delta_sp)) +kernels.append(generator.DenseKernel("sve_half_prec_test_S4", Precision.HALF, 23, 31, 13, 23, 13, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size_h) for x in blocksize_algs], delta_sp)) +kernels.append(generator.SparseKernel("sve_half_prec_test_S5", Precision.HALF, 9, 9, 9, 9, 0, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size_h) for x in blocksize_algs], generator.generateMTX(9, 9, 8), delta_sp)) +kernels.append(generator.SparseKernel("sve_half_prec_test_S6", Precision.HALF, 15, 15, 15, 15, 0, 15, -3.14, 6.28, [x.getBlocksize(15, 15, 1, v_size_h) for x in blocksize_algs], generator.generateMTX(15, 15, 22), delta_sp)) +kernels.append(generator.SparseKernel("sve_half_prec_test_S7", Precision.HALF, 23, 23, 23, 23, 0, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size_h) for x in blocksize_algs], generator.generateMTX(23, 23, 52), delta_sp)) +kernels.append(generator.SparseKernel("sve_half_prec_test_S8", Precision.HALF, 23, 31, 13, 
23, 0, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size_h) for x in blocksize_algs], generator.generateMTX(13, 31, 40), delta_sp)) +""" generator.make(kernels, f"arm_sve{bitlen}") diff --git a/tests/unit_tests_hsw.py b/tests/unit_tests_hsw.py index 47316e5..92fa298 100755 --- a/tests/unit_tests_hsw.py +++ b/tests/unit_tests_hsw.py @@ -30,6 +30,7 @@ kernels.append(generator.DenseKernel(f"hsw_only_test13_{precision}", precision, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], 0.0000001)) kernels.append(generator.DenseKernel(f"hsw_only_test14_{precision}", precision, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], 0.0000001)) -generator.make(kernels, "hsw") +for arch in ('hsw256', 'hsw128'): + generator.make(kernels, arch) diff --git a/tests/unit_tests_knl.py b/tests/unit_tests_knl.py index 1795301..b673be3 100755 --- a/tests/unit_tests_knl.py +++ b/tests/unit_tests_knl.py @@ -32,6 +32,7 @@ kernels.append(generator.DenseKernel(f"knl_only_test13_{precision}", precision, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], 0.0000001)) kernels.append(generator.DenseKernel(f"knl_only_test14_{precision}", precision, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [(8, 24)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], 0.0000001)) -generator.make(kernels, "knl") +for arch in ('knl512', 'knl256', 'knl128'): + generator.make(kernels, arch) From 2a64dfd037f53d51d13a5b53b35cb11d8254eeeb Mon Sep 17 00:00:00 2001 From: David Schneller Date: Fri, 11 Oct 2024 13:54:40 +0200 Subject: [PATCH 24/64] Fix codegen bugs, prepare for AVX512 mask codegen --- pspamm/codegen/architectures/arm/generator.py | 6 +- .../architectures/arm/inlineprinter.py | 57 ++++++++------- pspamm/codegen/architectures/arm/operands.py | 7 +- .../architectures/arm_sve/inlineprinter.py | 73 ++++++++++--------- .../architectures/hsw/inlineprinter.py | 29 ++++---- 
.../architectures/knl/inlineprinter.py | 55 +++++++++----- pspamm/codegen/architectures/knl/operands.py | 5 ++ pspamm/codegen/ast.py | 5 ++ 8 files changed, 142 insertions(+), 95 deletions(-) diff --git a/pspamm/codegen/architectures/arm/generator.py b/pspamm/codegen/architectures/arm/generator.py index fcb6a75..3546955 100644 --- a/pspamm/codegen/architectures/arm/generator.py +++ b/pspamm/codegen/architectures/arm/generator.py @@ -52,9 +52,9 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: assert((bn+bk) * vm + bn * bk <= 32) # Needs to fit in NEON v registers prec = { - Precision.DOUBLE: "2D", - Precision.SINGLE: "4S", - Precision.HALF: "8H", + Precision.DOUBLE: "2d", + Precision.SINGLE: "4s", + Precision.HALF: "8h", }[self.get_precision()] A_regs = Matrix([[v(vm*c + r, prec) for c in range(bk)] for r in range(vm)]) diff --git a/pspamm/codegen/architectures/arm/inlineprinter.py b/pspamm/codegen/architectures/arm/inlineprinter.py index 7ecd7ba..1fba1d9 100644 --- a/pspamm/codegen/architectures/arm/inlineprinter.py +++ b/pspamm/codegen/architectures/arm/inlineprinter.py @@ -20,6 +20,7 @@ class InlinePrinter(Visitor): def __init__(self, precision: Precision): self.output = [] self.stack = [] + self.precision = precision assert precision in (Precision.HALF, Precision.SINGLE, Precision.DOUBLE) def show(self): @@ -49,69 +50,71 @@ def visitFma(self, stmt: FmaStmt): m = stmt.mult_src.ugly a = stmt.add_dest.ugly if stmt.bcast: - s = "fmla {}, {}, {}[0]".format(a,m,b) + s = f"fmla {a}, {m}, {b}[0]" else: - s = "fmla {}, {}, {}".format(a,m,b) + s = f"fmla {a}, {m}, {b}" self.addLine(s, stmt.comment) def visitMul(self, stmt: MulStmt): b = stmt.src.ugly m = stmt.mult_src.ugly a = stmt.dest.ugly - s = "fmul {}, {}, {}".format(a,m,b) + s = f"fmul {a}, {m}, {b}" self.addLine(s, stmt.comment) def visitBcst(self, stmt: BcstStmt): - b = stmt.bcast_src.ugly + b = stmt.bcast_src.ugly if self.precision == Precision.DOUBLE else 
stmt.bcast_src.ugly_b32 a = stmt.dest.ugly - s = "dup {}, {}".format(a, b) + s = f"dup {a}, {b}" self.addLine(s, stmt.comment) def visitAdd(self, stmt: AddStmt): if isinstance(stmt.src, Constant) and (stmt.src.value > 4095 or stmt.src.value < -4095): if (stmt.src.value >> 16) & 0xFFFF > 0 and stmt.src.value < 0: s = "mov x11, #-1" - s1 = "movk x11, #{}".format((stmt.src.value) & 0xFFFF) - val = ((stmt.src.value >> 16) & 0xFFFF) - s2 = "movk x11, #{}, lsl #16".format(val) + val1 = (stmt.src.value) & 0xFFFF + s1 = f"movk x11, #{val1}" + val2 = ((stmt.src.value >> 16) & 0xFFFF) + s2 = f"movk x11, #{val2}, lsl #16" self.addLine(s, "") self.addLine(s1, "load lower 16 bit of immediate that requires more than 16 bit") self.addLine(s2, "load upper 16 bit of immediate that requires more than 16 bit") elif (stmt.src.value >> 16) & 0xFFFF: - s1 = "mov x11, #{}".format((stmt.src.value) & 0xFFFF) - val = ((stmt.src.value >> 16) & 0xFFFF) - s2 = "movk x11, #{}, lsl #16".format(val) + val1 = (stmt.src.value) & 0xFFFF + s1 = f"mov x11, #{val1}" + val2 = ((stmt.src.value >> 16) & 0xFFFF) + s2 = f"movk x11, #{val2}, lsl #16" self.addLine(s1, "load lower 16 bit of immediate that requires more than 16 bit") self.addLine(s2, "load upper 16 bit of immediate that requires more than 16 bit") else: - s = "mov x11, {}".format(stmt.src.ugly) + s = f"mov x11, {stmt.src.ugly}" self.addLine(s, "load lower 16 bit of immediate ") if stmt.dest.ugly != "x11": - s = "add {}, {}, x11".format(stmt.dest.ugly,stmt.dest.ugly) + s = f"add {stmt.dest.ugly}, {stmt.dest.ugly}, x11" self.addLine(s, stmt.comment) if stmt.additional is not None: - s = "add {}, {}, {}".format(stmt.dest.ugly,stmt.dest.ugly,stmt.additional.ugly) + s = f"add {stmt.dest.ugly}, {stmt.dest.ugly}, {stmt.additional.ugly}" self.addLine(s, stmt.comment) else: if stmt.additional is not None: - s = "add {}, {}, {}".format(stmt.dest.ugly,stmt.additional.ugly,stmt.src.ugly) + s = f"add {stmt.dest.ugly}, {stmt.additional.ugly}, 
{stmt.src.ugly}" else: - s = "add {}, {}, {}".format(stmt.dest.ugly,stmt.dest.ugly,stmt.src.ugly) + s = f"add {stmt.dest.ugly}, {stmt.dest.ugly}, {stmt.src.ugly}" self.addLine(s, stmt.comment) def visitLabel(self, stmt: LabelStmt): - s = "{}:".format(stmt.label.ugly) + s = f"{stmt.label.ugly}:" self.addLine(s, stmt.comment) def visitCmp(self, stmt: CmpStmt): - s = "cmp {}, {}".format(stmt.rhs.ugly,stmt.lhs.ugly) + s = f"cmp {stmt.rhs.ugly}, {stmt.lhs.ugly}" self.addLine(s, stmt.comment) def visitJump(self, stmt: JumpStmt): - s = "b.lo {}".format(stmt.destination.ugly) + s = f"b.lo {stmt.destination.ugly}" self.addLine(s, stmt.comment) def visitMov(self, stmt: MovStmt): @@ -120,9 +123,9 @@ def visitMov(self, stmt: MovStmt): else: src_str = stmt.src.ugly if stmt.typ == AsmType.f64x8: - s = "fmov {}, {}".format(stmt.dest.ugly_scalar_1d,src_str) + s = f"fmov {stmt.dest.ugly_scalar_1d}, {src_str}" else: - s = "mov {}, {}".format(stmt.dest.ugly,src_str) + s = f"mov {stmt.dest.ugly}, {src_str}" self.addLine(s, stmt.comment) @@ -133,12 +136,12 @@ def visitLoad(self, stmt: LoadStmt): src_str = stmt.src.ugly if stmt.typ == AsmType.i64: - s = "add {}, {}, {}".format(stmt.dest.ugly,stmt.dest.ugly,src_str) + s = f"add {stmt.dest.ugly}, {stmt.dest.ugly}, {src_str}" elif stmt.typ == AsmType.f64x8 and stmt.aligned: if stmt.dest2 is not None: - s = "ldp {}, {}, {}".format(stmt.dest.ugly_scalar,stmt.dest2.ugly_scalar,src_str) + s = f"ldp {stmt.dest.ugly_scalar}, {stmt.dest2.ugly_scalar}, {src_str}" else: - s = "ldr {}, {}".format(stmt.dest.ugly_scalar,src_str) + s = f"ldr {stmt.dest.ugly_scalar}, {src_str}" else: raise NotImplementedError() self.addLine(s, stmt.comment) @@ -151,12 +154,12 @@ def visitStore(self, stmt: StoreStmt): src_str = stmt.src.ugly if stmt.typ == AsmType.i64: - s = "add {}, {}, {}".format(stmt.dest.ugly,stmt.dest.ugly,src_str) + s = f"add {stmt.dest.ugly}, {stmt.dest.ugly}, {src_str}" elif stmt.typ == AsmType.f64x8 and stmt.aligned: if stmt.src2 is not None: - 
s = "stp {}, {}, {}".format(stmt.src.ugly_scalar,stmt.src2.ugly_scalar,stmt.dest.ugly) + s = f"stp {stmt.src.ugly_scalar}, {stmt.src2.ugly_scalar}, {stmt.dest.ugly}" else: - s = "str {}, {}".format(stmt.src.ugly_scalar,stmt.dest.ugly) + s = f"str {stmt.src.ugly_scalar}, {stmt.dest.ugly}" else: raise NotImplementedError() self.addLine(s, stmt.comment) diff --git a/pspamm/codegen/architectures/arm/operands.py b/pspamm/codegen/architectures/arm/operands.py index 5d2c643..2642c70 100644 --- a/pspamm/codegen/architectures/arm/operands.py +++ b/pspamm/codegen/architectures/arm/operands.py @@ -63,7 +63,8 @@ def ugly_lsl_shift(self): @property def clobbered(self): - return (self.value.split(".")[0]).replace("x", "r") + # removed [this comment should stay here for now---in case there's some compiler expecting it]: .replace("x", "r") + return (self.value.split(".")[0]) @property def ugly_scalar(self): @@ -72,6 +73,10 @@ def ugly_scalar(self): @property def ugly_scalar_1d(self): return (self.value.split(".")[0]).replace("v", "d") + + @property + def ugly_b32(self): + return (self.value.split(".")[0]).replace("x", "w") r = lambda n: Register_ARM(AsmType.i64, "x" + str(n)) diff --git a/pspamm/codegen/architectures/arm_sve/inlineprinter.py b/pspamm/codegen/architectures/arm_sve/inlineprinter.py index 73f9768..7e32526 100644 --- a/pspamm/codegen/architectures/arm_sve/inlineprinter.py +++ b/pspamm/codegen/architectures/arm_sve/inlineprinter.py @@ -19,7 +19,12 @@ def __init__(self, precision: Precision): self.output = [] self.stack = [] self.precision = precision - self.ugly_precision = "w" if self.precision.size() <= 4 else "x" + self.ugly_precision ={ + Precision.DOUBLE: "d", + Precision.SINGLE: "w", + Precision.HALF: "h", + Precision.BFLOAT16: "h", + }[self.precision] assert precision in (Precision.BFLOAT16, Precision.HALF, Precision.SINGLE, Precision.DOUBLE) @@ -46,7 +51,7 @@ def visitFma(self, stmt: FmaStmt): m = stmt.mult_src.ugly a = stmt.add_dest.ugly p = 
self.p_string(stmt.pred) - s = "fmla {}, {}{}, {}".format(a, p, m, b) + s = f"fmla {a}, {p}{m}, {b}" self.addLine(s, stmt.comment) @@ -56,12 +61,12 @@ def visitMul(self, stmt: MulStmt): a = stmt.dest.ugly if a != b: - s1 = "movprfx {}, {}".format(a.split(".")[0], b.split(".")[0]) + s1 = f"movprfx {a.split('.')[0]}, {b.split('.')[0]}" self.addLine(s1, "move {} into {}".format(b, a)) b = a p = self.p_string(stmt.pred) - s = "fmul {}, {}{}, {}".format(a, p, b, m) + s = f"fmul {a}, {p}{b}, {m}" self.addLine(s, stmt.comment) def visitBcst(self, stmt: BcstStmt): @@ -71,7 +76,7 @@ def visitBcst(self, stmt: BcstStmt): # make sure the src register is a W register when using single/half precision if self.precision.size() <= 4: b = "w" + b[1:] - s = "dup {}, {}".format(a, b) + s = f"dup {a}, {b}" self.addLine(s, stmt.comment) def visitAdd(self, stmt: AddStmt): @@ -81,48 +86,50 @@ def visitAdd(self, stmt: AddStmt): # https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/ADD--immediate---Add--immediate-- if (stmt.src.value >> 16) & 0xFFFF > 0 and stmt.src.value < 0: s = "mov x11, #-1" - s1 = "movk x11, #{}".format((stmt.src.value) & 0xFFFF) - val = ((stmt.src.value >> 16) & 0xFFFF) - s2 = "movk x11, #{}, lsl #16".format(val) + val1 = (stmt.src.value) & 0xFFFF + s1 = f"movk x11, #{val1}" + val2 = ((stmt.src.value >> 16) & 0xFFFF) + s2 = f"movk x11, #{val2}, lsl #16" self.addLine(s, "") self.addLine(s1, "load lower 16 bit of immediate that requires more than 16 bit") self.addLine(s2, "load upper 16 bit of immediate that requires more than 16 bit") elif (stmt.src.value >> 16) != 0: - s1 = "mov x11, #{}".format((stmt.src.value) & 0xFFFF) - val = ((stmt.src.value >> 16) & 0xFFFF) - s2 = "movk x11, #{}, lsl #16".format(val) + val1 = (stmt.src.value) & 0xFFFF + s1 = f"mov x11, #{val1}" + val2 = ((stmt.src.value >> 16) & 0xFFFF) + s2 = f"movk x11, #{val2}, lsl #16" self.addLine(s1, "load lower 16 bit of immediate that requires more than 16 bit") self.addLine(s2, "load 
upper 16 bit of immediate that requires more than 16 bit") else: - s = "mov x11, {}".format(stmt.src.ugly) + s = f"mov x11, {stmt.src.ugly}" self.addLine(s, "load lower 16 bit of immediate ") if stmt.dest.ugly != "x11": - s = "add {}, {}, x11".format(stmt.dest.ugly, stmt.dest.ugly) + s = f"add {stmt.dest.ugly}, {stmt.dest.ugly}, x11" self.addLine(s, stmt.comment) if stmt.additional is not None: - s = "add {}, {}, {}".format(stmt.dest.ugly, stmt.dest.ugly, stmt.additional.ugly) + s = f"add {stmt.dest.ugly}, {stmt.dest.ugly}, {stmt.additional.ugly}" self.addLine(s, stmt.comment) else: # if stmt.src is a Constant but outside of the above range of value < -4095 or value > 4095 # we can simply add the Constant to a register if stmt.additional is not None: - s = "add {}, {}, {}".format(stmt.dest.ugly, stmt.additional.ugly, stmt.src.ugly) + s = f"add {stmt.dest.ugly}, {stmt.additional.ugly}, {stmt.src.ugly}" else: - s = "add {}, {}, {}".format(stmt.dest.ugly, stmt.dest.ugly, stmt.src.ugly) + s = f"add {stmt.dest.ugly}, {stmt.dest.ugly}, {stmt.src.ugly}" self.addLine(s, stmt.comment) def visitLabel(self, stmt: LabelStmt): - s = "{}:".format(stmt.label.ugly) + s = f"{stmt.label.ugly}:" self.addLine(s, stmt.comment) def visitCmp(self, stmt: CmpStmt): - s = "cmp {}, {}".format(stmt.rhs.ugly, stmt.lhs.ugly) + s = f"cmp {stmt.rhs.ugly}, {stmt.lhs.ugly}" self.addLine(s, stmt.comment) def visitJump(self, stmt: JumpStmt): - s = "b.lo {}".format(stmt.destination.ugly) + s = f"b.lo {stmt.destination.ugly}" self.addLine(s, stmt.comment) def visitMov(self, stmt: MovStmt): @@ -131,18 +138,18 @@ def visitMov(self, stmt: MovStmt): else: src_str = stmt.src.ugly if stmt.typ == AsmType.f64x8: - s = "fmov {}, {}".format(stmt.dest.ugly, src_str) + s = f"fmov {stmt.dest.ugly}, {src_str}" else: - s = "mov {}, {}".format(stmt.dest.ugly, src_str) + s = f"mov {stmt.dest.ugly}, {src_str}" self.addLine(s, stmt.comment) def visitLoad(self, stmt: LoadStmt): if isinstance(stmt.src, Label): src_str = 
"#" + stmt.src.ugly elif stmt.src.ugly_offset != "0" and stmt.scalar_offs: - self.addLine("mov {}, #{}".format(stmt.add_reg.ugly, stmt.src.ugly_offset), "move immediate offset into {}".format(stmt.add_reg.ugly)) + self.addLine(f"mov {stmt.add_reg.ugly}, #{stmt.src.ugly_offset}", f"move immediate offset into {stmt.add_reg.ugly}") # TODO: adapt ugly_lsl_shift to account for possible single precision instead of double precision - src_str = "[{}, {}, LSL #{}]".format(stmt.src.ugly_base, stmt.add_reg.ugly, stmt.dest.ugly_lsl_shift) + src_str = f"[{stmt.src.ugly_base}, {stmt.add_reg.ugly}, LSL #{stmt.dest.ugly_lsl_shift}]" else: src_str = stmt.src.ugly if not stmt.is_B else stmt.src.ugly_no_vl_scaling @@ -150,12 +157,12 @@ def visitLoad(self, stmt: LoadStmt): prec = self.ugly_precision if stmt.typ == AsmType.i64: - s = "add {}, {}, {}".format(stmt.dest.ugly, stmt.dest.ugly, src_str) + s = f"add {stmt.dest.ugly}, {stmt.dest.ugly}, {src_str}" elif stmt.typ == AsmType.f64x8 and stmt.aligned: if stmt.is_B: - s = "ld1r{} {}, {}{}".format(prec, stmt.dest.ugly, p, src_str) + s = f"ld1r{prec} {stmt.dest.ugly}, {p}{src_str}" else: - s = "ld1{} {}, {}{}".format(prec, stmt.dest.ugly, p, src_str) + s = f"ld1{prec} {stmt.dest.ugly}, {p}{src_str}" else: raise NotImplementedError() self.addLine(s, stmt.comment) @@ -164,10 +171,10 @@ def visitStore(self, stmt: StoreStmt): if isinstance(stmt.src, Label): src_str = "#" + stmt.src.ugly elif stmt.dest.ugly_offset != "0" and stmt.scalar_offs: - self.addLine("mov {}, #{}".format(stmt.add_reg.ugly, stmt.dest.ugly_offset), - "move immediate offset into {}".format(stmt.add_reg.ugly)) + self.addLine(f"mov {stmt.add_reg.ugly}, #{stmt.dest.ugly_offset}", + f"move immediate offset into {stmt.add_reg.ugly}") # TODO: adapt ugly_lsl_shift to account for possible single precision instead of double precision - dest_str = "[{}, {}, LSL #{}]".format(stmt.dest.ugly_base, stmt.add_reg.ugly, stmt.src.ugly_lsl_shift) + dest_str = f"[{stmt.dest.ugly_base}, 
{stmt.add_reg.ugly}, LSL #{stmt.src.ugly_lsl_shift}]" else: dest_str = stmt.dest.ugly @@ -175,9 +182,9 @@ def visitStore(self, stmt: StoreStmt): prec = self.ugly_precision if stmt.typ == AsmType.i64: - s = "add {}, {}, {}".format(stmt.dest.ugly, stmt.dest.ugly, src_str) + s = f"add {stmt.dest.ugly}, {stmt.dest.ugly}, {src_str}" elif stmt.typ == AsmType.f64x8 and stmt.aligned: - s = "st1{} {}, {}{}".format(prec, stmt.src.ugly, p, dest_str) + s = f"st1{prec} {stmt.src.ugly}, {p}{dest_str}" else: raise NotImplementedError() self.addLine(s, stmt.comment) @@ -188,10 +195,10 @@ def visitPrefetch(self, stmt: PrefetchStmt): temporality = "KEEP" # could use "STRM" for non-temporal prefetching if needed xn = stmt.dest.ugly_base offset = stmt.dest.ugly_offset - src_string = "[{}, {}, MUL VL]".format(xn, offset) + src_string = f"[{xn}, {offset}, MUL VL]" p = self.p_string(stmt.pred) prec = self.ugly_precision - s = "prf{} P{}{}{}, {}{}".format(prec, stmt.access_type, cache_level, temporality, p.split('/')[0], src_string) + s = f"prf{prec} P{stmt.access_type}{cache_level}{temporality}, {p.split('/')[0]}{src_string}" self.addLine(s, "prefetch from memory") def visitBlock(self, block: Block): diff --git a/pspamm/codegen/architectures/hsw/inlineprinter.py b/pspamm/codegen/architectures/hsw/inlineprinter.py index f39ab34..54d98de 100644 --- a/pspamm/codegen/architectures/hsw/inlineprinter.py +++ b/pspamm/codegen/architectures/hsw/inlineprinter.py @@ -21,7 +21,8 @@ def __init__(self, precision: Precision): self.output = [] self.stack = [] assert precision in (Precision.SINGLE, Precision.DOUBLE) - self.precision = { + self.precision = precision + self.psuffix = { Precision.DOUBLE: "d", Precision.SINGLE: "s" }[precision] @@ -51,14 +52,14 @@ def visitFma(self, stmt: FmaStmt): a = stmt.add_dest.ugly # no broadcasting supported inside the instruction (unlike AVX-512) - s = "vfmadd231p{} {}, {}, {}".format(self.precision, b,m,a) + s = f"vfmadd231p{self.psuffix} {b}, {m}, {a}" 
self.addLine(s, stmt.comment) def visitMul(self, stmt: MulStmt): b = stmt.src.ugly m = stmt.mult_src.ugly a = stmt.dest.ugly - s = "vmulp{} {}, {}, {}".format(self.precision, b,m,a) + s = f"vmulp{self.psuffix} {b}, {m}, {a}" self.addLine(s, stmt.comment) def visitBcst(self, stmt: BcstStmt): @@ -68,23 +69,25 @@ def visitBcst(self, stmt: BcstStmt): if isinstance(stmt.bcast_src, Register): # reformat bcast_src to be a memory address b = "0({})".format(b) - s = "vbroadcasts{} {}, {}".format(self.precision, b, a) + regsize = stmt.dest.size() + instruction = "vmovddup" if self.precision == Precision.DOUBLE and regsize == 16 else f"vbroadcasts{self.psuffix}" + s = f"{instruction} {b}, {a}" self.addLine(s, stmt.comment) def visitAdd(self, stmt: AddStmt): - s = "addq {}, {}".format(stmt.src.ugly,stmt.dest.ugly) + s = f"addq {stmt.src.ugly}, {stmt.dest.ugly}" self.addLine(s, stmt.comment) def visitLabel(self, stmt: LabelStmt): - s = "{}:".format(stmt.label.ugly) + s = f"{stmt.label.ugly}:" self.addLine(s, stmt.comment) def visitCmp(self, stmt: CmpStmt): - s = "cmp {}, {}".format(stmt.lhs.ugly,stmt.rhs.ugly) + s = f"cmp {stmt.lhs.ugly}, {stmt.rhs.ugly}" self.addLine(s, stmt.comment) def visitJump(self, stmt: JumpStmt): - s = "jl {}".format(stmt.destination.ugly) + s = f"jl {stmt.destination.ugly}" self.addLine(s, stmt.comment) def visitMov(self, stmt: MovStmt): @@ -94,22 +97,22 @@ def visitMov(self, stmt: MovStmt): src_str = stmt.src.ugly if stmt.typ == AsmType.i64: - s = "movq {}, {}".format(src_str,stmt.dest.ugly) + s = f"movq {src_str}, {stmt.dest.ugly}" elif stmt.typ == AsmType.f64x8 and stmt.aligned: if isinstance(stmt.src, Constant) and stmt.src.value == 0: - s = "vpxor {}, {}, {}".format(stmt.dest.ugly,stmt.dest.ugly,stmt.dest.ugly) + s = f"vpxor {stmt.dest.ugly}, {stmt.dest.ugly}, {stmt.dest.ugly}" else: - s = "vmovup{} {}, {}".format(self.precision, src_str,stmt.dest.ugly) + s = f"vmovup{self.psuffix} {src_str}, {stmt.dest.ugly}" else: raise NotImplementedError() 
self.addLine(s, stmt.comment) def visitLea(self, stmt: LeaStmt): - s = "leaq {}({}), {}".format(stmt.offset,stmt.src.ugly,stmt.dest.ugly) + s = f"leaq {stmt.offset}({stmt.src.ugly}), {stmt.dest.ugly}" self.addLine(s, stmt.comment) def visitPrefetch(self, stmt: PrefetchStmt): - s = "prefetcht1 {}".format(stmt.dest.ugly) + s = f"prefetcht1 {stmt.dest.ugly}" self.addLine(s, stmt.comment) def visitBlock(self, block: Block): diff --git a/pspamm/codegen/architectures/knl/inlineprinter.py b/pspamm/codegen/architectures/knl/inlineprinter.py index 33b313f..01403c7 100644 --- a/pspamm/codegen/architectures/knl/inlineprinter.py +++ b/pspamm/codegen/architectures/knl/inlineprinter.py @@ -21,7 +21,8 @@ def __init__(self, precision: Precision): self.output = [] self.stack = [] assert precision in (Precision.BFLOAT16, Precision.HALF, Precision.SINGLE, Precision.DOUBLE) - self.precision = { + self.precision = precision + self.psuffix = { Precision.DOUBLE: 'd', Precision.SINGLE: 's', Precision.HALF: 'h', @@ -55,83 +56,101 @@ def addLine(self, stmt: str, comment: str): self.output.append(line) def maskformat(self, pred): - pass + if pred is None: + return '' + elif pred.zero: + return f'{{{pred.register.ugly}}}{{z}}' + else: + return f'{{{pred.register.ugly}}}' def visitFma(self, stmt: FmaStmt): + mask = self.maskformat(stmt.pred) b = stmt.bcast_src.ugly m = stmt.mult_src.ugly a = stmt.add_dest.ugly regsize = stmt.add_dest.size() // 16 + extent = regsize * self.broadcast_multiplier if stmt.bcast: - s = "vfmadd231p{} {}%{{1to{}%}}, {}, {}".format(self.precision, b, regsize * self.broadcast_multiplier, m, a) + s = f"vfmadd231p{self.psuffix} {b}%{{1to{extent}%}} {mask}, {m}, {a}" else: if stmt.mult_src.typeinfo == AsmType.i64: # in this case, m is a Register that points to alpha; manually format to be a memory address - s = "vfmadd231p{} 0({})%{{1to{}%}}, {}, {}".format(self.precision, m, regsize * self.broadcast_multiplier, b, a) + s = f"vfmadd231p{self.psuffix} 
0({m})%{{1to{extent}%}} {mask}, {b}, {a}" else: - s = "vfmadd231p{} {}, {}, {}".format(self.precision, b,m,a) + s = f"vfmadd231p{self.psuffix} {b} {mask}, {m}, {a}" self.addLine(s, stmt.comment) def visitMul(self, stmt: MulStmt): + mask = self.maskformat(stmt.pred) b = stmt.src.ugly m = stmt.mult_src.ugly a = stmt.dest.ugly regsize = stmt.dest.size() // 16 + extent = regsize * self.broadcast_multiplier if stmt.mult_src.typeinfo == AsmType.i64: # in this case, m is a Register that points to alpha/beta; manually format to be a memory address - s = "vmulp{} 0({})%{{1to{}%}}, {}, {}".format(self.precision, m, regsize * self.broadcast_multiplier, b, a) + s = f"vmulp{self.psuffix} 0({m})%{{1to{extent}%}} {mask}, {b}, {a}" else: - s = "vmulp{} {}, {}, {}".format(self.precision, b,m,a) + s = f"vmulp{self.psuffix} {b} {mask}, {m}, {a}" self.addLine(s, stmt.comment) def visitBcst(self, stmt: BcstStmt): + mask = self.maskformat(stmt.pred) b = stmt.bcast_src.ugly a = stmt.dest.ugly - s = "vbroadcasts{} {}, {}".format(self.precision, b,a) + regsize = stmt.dest.size() + instruction = "vmovddup" if self.precision == Precision.DOUBLE and regsize == 16 else f"vbroadcasts{self.psuffix}" + s = f"{instruction} {b} {mask}, {a}" self.addLine(s, stmt.comment) def visitAdd(self, stmt: AddStmt): - s = "addq {}, {}".format(stmt.src.ugly,stmt.dest.ugly) + mask = self.maskformat(stmt.pred) + s = f"addq {stmt.src.ugly} {mask}, {stmt.dest.ugly}" self.addLine(s, stmt.comment) def visitLabel(self, stmt: LabelStmt): - s = "{}:".format(stmt.label.ugly) + s = f"{stmt.label.ugly}:" self.addLine(s, stmt.comment) def visitCmp(self, stmt: CmpStmt): - s = "cmp {}, {}".format(stmt.lhs.ugly,stmt.rhs.ugly) + mask = self.maskformat(stmt.pred) + s = f"cmp {stmt.lhs.ugly} {mask}, {stmt.rhs.ugly}" self.addLine(s, stmt.comment) def visitJump(self, stmt: JumpStmt): - s = "jl {}".format(stmt.destination.ugly) + s = f"jl {stmt.destination.ugly}" self.addLine(s, stmt.comment) def visitMov(self, stmt: MovStmt): + 
mask = self.maskformat(stmt.pred) + if isinstance(stmt.src, Label): src_str = "$" + stmt.src.ugly else: src_str = stmt.src.ugly if stmt.typ == AsmType.i64: + assert(stmt.pred == None) if stmt.dest.ugly[0] == 'k': - s = "kmovq {}, {}".format(src_str,stmt.dest.ugly) + s = f"kmovq {src_str}, {stmt.dest.ugly}" else: - s = "movq {}, {}".format(src_str,stmt.dest.ugly) + s = f"movq {src_str}, {stmt.dest.ugly}" elif stmt.typ == AsmType.f64x8 and stmt.aligned: if isinstance(stmt.src, Constant) and stmt.src.value == 0: - s = "vpxord {}, {}, {}".format(stmt.dest.ugly,stmt.dest.ugly,stmt.dest.ugly) + s = f"vpxord {stmt.dest.ugly} {mask}, {stmt.dest.ugly}, {stmt.dest.ugly}" else: - s = "vmovup{} {}, {}".format(self.precision, src_str,stmt.dest.ugly) + s = f"vmovup{self.psuffix} {src_str} {mask}, {stmt.dest.ugly}" else: raise NotImplementedError() self.addLine(s, stmt.comment) def visitLea(self, stmt: LeaStmt): - s = "leaq {}({}), {}".format(stmt.offset,stmt.src.ugly,stmt.dest.ugly) + mask = self.maskformat(stmt.pred) + s = f"leaq {stmt.offset}({stmt.src.ugly}) {mask}, {stmt.dest.ugly}" self.addLine(s, stmt.comment) def visitPrefetch(self, stmt: PrefetchStmt): - s = "prefetcht1 {}".format(stmt.dest.ugly) + s = f"prefetcht1 {stmt.dest.ugly}" self.addLine(s, stmt.comment) def visitBlock(self, block: Block): diff --git a/pspamm/codegen/architectures/knl/operands.py b/pspamm/codegen/architectures/knl/operands.py index b7c450f..b0fe394 100644 --- a/pspamm/codegen/architectures/knl/operands.py +++ b/pspamm/codegen/architectures/knl/operands.py @@ -76,3 +76,8 @@ def mem(base, offset, index=None, scaling=None): ymm = lambda n: Register_KNL(AsmType.f64x4, "ymm"+str(n)) zmm = lambda n: Register_KNL(AsmType.f64x8, "zmm"+str(n)) mask = lambda n: Register_KNL(AsmType.i64, "k"+str(n)) + +class Predicate: + def __init__(self, register: Register_KNL, zero: bool): + self.register = register + self.zero = zero diff --git a/pspamm/codegen/ast.py b/pspamm/codegen/ast.py index 9287930..356478c 
100644 --- a/pspamm/codegen/ast.py +++ b/pspamm/codegen/ast.py @@ -29,6 +29,7 @@ class MovStmt(AsmStmt): dest = None typ = None aligned = False + pred = None def accept(self, visitor: "Visitor"): visitor.visitMov(self) @@ -39,6 +40,7 @@ class LeaStmt(AsmStmt): offset = None typ = None aligned = False + pred = None def accept(self, visitor: "Visitor"): visitor.visitLea(self) @@ -105,6 +107,7 @@ def accept(self, visitor: "Visitor"): class BcstStmt(AsmStmt): bcast_src = None dest = None + pred = None def accept(self, visitor: "Visitor"): visitor.visitBcst(self) @@ -114,6 +117,7 @@ class AddStmt(AsmStmt): dest = None typ = None additional = None + pred = None def accept(self, visitor: "Visitor"): visitor.visitAdd(self) @@ -121,6 +125,7 @@ def accept(self, visitor: "Visitor"): class CmpStmt(AsmStmt): lhs = None rhs = None + pred = None def accept(self, visitor: "Visitor"): visitor.visitCmp(self) From 17f23f9f956e9c33c06ff13899b746c0a95a2469 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 12 Oct 2024 12:11:25 +0200 Subject: [PATCH 25/64] Merge test scripts --- tests/sve_testsuite_generator.py | 116 ------------------------------- tests/testsuite_generator.py | 17 ++++- tests/unit_tests_arm_sve.py | 2 +- 3 files changed, 17 insertions(+), 118 deletions(-) delete mode 100644 tests/sve_testsuite_generator.py diff --git a/tests/sve_testsuite_generator.py b/tests/sve_testsuite_generator.py deleted file mode 100644 index 71511b3..0000000 --- a/tests/sve_testsuite_generator.py +++ /dev/null @@ -1,116 +0,0 @@ -#!/usr/bin/python3 -from collections import namedtuple -import subprocess -import numpy as np -import random -import sys -import os -import testsuite_generator as test_generator -from pspamm.codegen.precision import * - -BASEDIR = 'build' - -SparseKernel = test_generator.SparseKernel -DenseKernel = test_generator.DenseKernel - -setup_prefetching = """ -template -void setup_prefetch(T*& prefetch, T* matrix, unsigned n, unsigned ldc) { - 
posix_memalign(reinterpret_cast(&prefetch), 64, ldc*n*sizeof(T)); - std::memcpy(prefetch, matrix, ldc*n*sizeof(T)); -} -""" - -def generateMTX(k, n, nnz): - return test_generator.generateMTX(k, n, nnz) - -def make(kernels, arch): - os.makedirs(os.path.join(BASEDIR, arch), exist_ok=True) - - f = open(os.path.join(BASEDIR, f'{arch}_testsuite.cpp'), 'w') - - f.write(test_generator.head_of_testsuite) - - testcases = [] - - for kern in kernels: - arguments = ['pspamm-generator', str(kern.m), str(kern.n), str(kern.k), str(kern.lda), - str(kern.ldb), str(kern.ldc), str(kern.alpha), str(kern.beta)] - - if isinstance(kern, SparseKernel): - arguments += ['--mtx_filename', kern.mtx] - - prec = 's' if kern.precision == Precision.SINGLE else 'd' - arguments += ['--precision', prec] - - block_sizes = list(set(bs if len(bs) > 2 else (bs[0], bs[1], 1) for bs in kern.block_sizes)) - - for bs in block_sizes: - bm = bs[0] - bn = bs[1] - bk = bs[2] - - if arch == "knl": - assert (bm % 8 == 0 and (bn + 1) * (bm / 8) <= 32) - elif arch == "arm": - assert (bm % 2 == 0 and (bn + 1) * (bm / 2) + bn <= 32) - elif arch.startswith("arm_sve"): - veclen = int(arch[7:]) - assert veclen % 128 == 0 and veclen <= 2048 - reglen = veclen // 128 - v_len = (16 // kern.precision.size()) * reglen - # this should be the same assertion as in ../scripts/max_arm_sve.py - # ceiling division - vm = -(bm // -v_len) - if not ((bn + bk) * vm + bn * bk <= 32): - print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') - continue - - name = kern.name + '_' + str(bm) + '_' + str(bn) + '_' + str(bk) - - additional_args = ['--output_funcname', name, '--output_filename', os.path.join(BASEDIR, arch, name + '.h'), - '--output_overwrite'] - additional_args += ['--bm', str(bm), '--bn', str(bn), '--bk', str(bk), '--arch', arch, '--prefetching', 'BL2viaC'] - - try: - subprocess.check_output(arguments + additional_args, stderr=subprocess.STDOUT) - except subprocess.CalledProcessError as e: - raise 
RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) - - f.write('#include "' + arch + '/' + name + '.h"\n') - - if isinstance(kern, SparseKernel): - mtx = kern.mtx - else: - mtx = "" - - prec2 = 'f' if kern.precision == Precision.SINGLE else '' - - testcases += [ - """ -{{ - unsigned ldb = {ldb}; - {T} alpha = {alpha}; - {T} beta = {beta}; - {T}* prefetch = nullptr; - auto pointers = pre<{T}>({m}, {n}, {k}, {lda}, ldb, {ldc}, "{mtx}"); - setup_prefetch(prefetch, std::get<3>(pointers), {n}, {ldc}); - {name}(std::get<0>(pointers), std::get<{sparse}>(pointers), std::get<3>(pointers), alpha, beta, prefetch); - const auto result = post<{T}>({m}, {n}, {k}, {lda}, &ldb, {ldc}, &alpha, &beta, std::get<0>(pointers), std::get<1>(pointers), std::get<3>(pointers), std::get<4>(pointers), {delta:.7f}); - results.push_back(std::make_tuple("{name}", result)); - free(std::get<0>(pointers)); free(std::get<1>(pointers)); free(std::get<2>(pointers)); free(std::get<3>(pointers)); free(std::get<4>(pointers)); free(prefetch); -}} -""".format(m=kern.m, n=kern.n, k=kern.k, lda=kern.lda, ldb=kern.ldb, ldc=kern.ldc, alpha=kern.alpha, beta=kern.beta, - mtx=mtx, delta=kern.delta, name=name, sparse=2 if kern.ldb == 0 else 1, p=prec2, T=kern.precision.ctype()) - ] - - f.write('\n') - # necessary functions are defined in testsuite_generator.py - f.write(test_generator.function_definitions) - f.write(setup_prefetching) - f.write(test_generator.setup_main) - - for testcase in testcases: - f.write(testcase) - - f.write(test_generator.end_of_testsuite) diff --git a/tests/testsuite_generator.py b/tests/testsuite_generator.py index 2668c1f..d57f730 100755 --- a/tests/testsuite_generator.py +++ b/tests/testsuite_generator.py @@ -50,6 +50,12 @@ } } +template +void setup_prefetch(T*& prefetch, T* matrix, unsigned n, unsigned ldc) { + posix_memalign(reinterpret_cast(&prefetch), 64, ldc*n*sizeof(T)); + std::memcpy(prefetch, matrix, ldc*n*sizeof(T)); +} + 
template <typename T> std::tuple<T*, T*, T*, T*, T*> pre(unsigned M, unsigned N, unsigned K, unsigned LDA, unsigned LDB, unsigned LDC, std::string MTX) { @@ -154,7 +160,9 @@ unsigned ldb = {ldb}; {precision} alpha = {alpha}; {precision} beta = {beta}; + {precision}* prefetch = nullptr; auto pointers = pre<{precision}>({m}, {n}, {k}, {lda}, ldb, {ldc}, "{mtx}"); + setup_prefetch(prefetch, std::get<3>(pointers), {n}, {ldc}); {name}(std::get<0>(pointers), std::get<{sparse}>(pointers), std::get<3>(pointers), {alpha}, {beta}, nullptr); const auto result = post<{precision}>({m}, {n}, {k}, {lda}, &ldb, {ldc}, &alpha, &beta, std::get<0>(pointers), std::get<1>(pointers), std::get<3>(pointers), std::get<4>(pointers), {delta:.7f}); results.push_back(std::make_tuple("{name}", result)); @@ -238,7 +246,10 @@ def make(kernels, arch): bn = bs[1] bk = bs[2] - veclen = int(arch[3:]) if arch[3:] != '' else 128 + if arch.startswith("arm_sve"): + veclen = int(arch[7:]) if arch[7:] != '' else 128 + else: + veclen = int(arch[3:]) if arch[3:] != '' else 128 assert veclen % 128 == 0 reglen = veclen // 128 v_len = (16 // kern.precision.size()) * reglen @@ -256,6 +267,10 @@ def make(kernels, arch): if not ((bn+bk) * vm + bn * bk <= 16) or not (kern.m % v_size) == 0 or not (bm % v_size) == 0: print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') continue + elif arch.startswith("arm_sve"): + if not ((bn+bk) * vm + bn * bk <= 32): + print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') + continue + elif arch.startswith("arm"): if not ((bn+bk) * vm + bn * bk <= 32) or not (kern.m % v_size) == 0 or not (bm % v_size) == 0: print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') continue diff --git a/tests/unit_tests_arm_sve.py b/tests/unit_tests_arm_sve.py index b57b2f9..3d5203d 100644 --- a/tests/unit_tests_arm_sve.py +++ b/tests/unit_tests_arm_sve.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -import sve_testsuite_generator as generator +import testsuite_generator as generator import pspamm.scripts.max_arm_sve 
as max_sve from pspamm.codegen.precision import * From 1d2656a1a5a3df64cd06f140b51d35dffaf7c245 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Fri, 18 Oct 2024 16:50:01 +0200 Subject: [PATCH 26/64] Adjust tests a bit more, and correct instructions --- .../architectures/knl/inlineprinter.py | 31 ++++++++----- pspamm/matmul.py | 4 -- tests/unit_tests_hsw.py | 45 ++++++++++--------- 3 files changed, 44 insertions(+), 36 deletions(-) diff --git a/pspamm/codegen/architectures/knl/inlineprinter.py b/pspamm/codegen/architectures/knl/inlineprinter.py index 01403c7..f21ddfb 100644 --- a/pspamm/codegen/architectures/knl/inlineprinter.py +++ b/pspamm/codegen/architectures/knl/inlineprinter.py @@ -26,7 +26,13 @@ def __init__(self, precision: Precision): Precision.DOUBLE: 'd', Precision.SINGLE: 's', Precision.HALF: 'h', - Precision.BFLOAT16: 'bf16' + Precision.BFLOAT16: 'h' + }[precision] + self.alupsuffix = { + Precision.DOUBLE: 'pd', + Precision.SINGLE: 'ps', + Precision.HALF: 'ph', + Precision.BFLOAT16: 'nepbf16' }[precision] self.broadcast_multiplier = { Precision.DOUBLE: 2, @@ -71,13 +77,13 @@ def visitFma(self, stmt: FmaStmt): regsize = stmt.add_dest.size() // 16 extent = regsize * self.broadcast_multiplier if stmt.bcast: - s = f"vfmadd231p{self.psuffix} {b}%{{1to{extent}%}} {mask}, {m}, {a}" + s = f"vfmadd231{self.alupsuffix} {b}%{{1to{extent}%}} {mask}, {m}, {a}" else: if stmt.mult_src.typeinfo == AsmType.i64: # in this case, m is a Register that points to alpha; manually format to be a memory address - s = f"vfmadd231p{self.psuffix} 0({m})%{{1to{extent}%}} {mask}, {b}, {a}" + s = f"vfmadd231{self.alupsuffix} 0({m})%{{1to{extent}%}} {mask}, {b}, {a}" else: - s = f"vfmadd231p{self.psuffix} {b} {mask}, {m}, {a}" + s = f"vfmadd231{self.alupsuffix} {b} {mask}, {m}, {a}" self.addLine(s, stmt.comment) def visitMul(self, stmt: MulStmt): @@ -89,9 +95,9 @@ def visitMul(self, stmt: MulStmt): extent = regsize * self.broadcast_multiplier if stmt.mult_src.typeinfo == 
AsmType.i64: # in this case, m is a Register that points to alpha/beta; manually format to be a memory address - s = f"vmulp{self.psuffix} 0({m})%{{1to{extent}%}} {mask}, {b}, {a}" + s = f"vmul{self.alupsuffix} 0({m})%{{1to{extent}%}} {mask}, {b}, {a}" else: - s = f"vmulp{self.psuffix} {b} {mask}, {m}, {a}" + s = f"vmul{self.alupsuffix} {b} {mask}, {m}, {a}" self.addLine(s, stmt.comment) def visitBcst(self, stmt: BcstStmt): @@ -99,13 +105,18 @@ def visitBcst(self, stmt: BcstStmt): b = stmt.bcast_src.ugly a = stmt.dest.ugly regsize = stmt.dest.size() - instruction = "vmovddup" if self.precision == Precision.DOUBLE and regsize == 16 else f"vbroadcasts{self.psuffix}" + if self.precision == Precision.HALF or self.precision == Precision.BFLOAT16: + instruction = 'vpbroadcastw' + elif self.precision == Precision.DOUBLE and regsize == 16: + instruction = 'vmovddup' + else: + instruction = f"vbroadcasts{self.psuffix}" s = f"{instruction} {b} {mask}, {a}" self.addLine(s, stmt.comment) def visitAdd(self, stmt: AddStmt): - mask = self.maskformat(stmt.pred) - s = f"addq {stmt.src.ugly} {mask}, {stmt.dest.ugly}" + # only used for scalar addition right now + s = f"addq {stmt.src.ugly}, {stmt.dest.ugly}" self.addLine(s, stmt.comment) def visitLabel(self, stmt: LabelStmt): @@ -139,7 +150,7 @@ def visitMov(self, stmt: MovStmt): if isinstance(stmt.src, Constant) and stmt.src.value == 0: s = f"vpxord {stmt.dest.ugly} {mask}, {stmt.dest.ugly}, {stmt.dest.ugly}" else: - s = f"vmovup{self.psuffix} {src_str} {mask}, {stmt.dest.ugly}" + s = f"vmovupd {src_str} {mask}, {stmt.dest.ugly}" else: raise NotImplementedError() self.addLine(s, stmt.comment) diff --git a/pspamm/matmul.py b/pspamm/matmul.py index 6b43b9f..0fe8bdc 100644 --- a/pspamm/matmul.py +++ b/pspamm/matmul.py @@ -202,10 +202,6 @@ def __init__(self, self.nnz = ldb * self.n self.flop = m * n * k * 2 - #if prefetching is not None: - # prefetchReg = self.generator.init_prefetching(self.prefetching) - #else: - # prefetchReg = None 
prefetchReg = self.generator.init_prefetching(self.prefetching) # if matrices are always padded to multiple of v_size, we can remove the if-part and execute the assert for SVE too diff --git a/tests/unit_tests_hsw.py b/tests/unit_tests_hsw.py index 92fa298..4c30fb1 100755 --- a/tests/unit_tests_hsw.py +++ b/tests/unit_tests_hsw.py @@ -8,29 +8,30 @@ blocksize_algs = [max_square, old] +delta_sp = 1e-4 # epsilon is around e-7 => /2 ... For most cases, 1e-6 is enough +delta_dp = 1e-7 # epsilon is around e-15 => /2 + kernels = [] -for precision in (Precision.SINGLE, Precision.DOUBLE): - kernels.append(generator.SparseKernel(f"test1_{precision}", precision, 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], generator.generateMTX(56, 56, 30), 0.0000001)) - kernels.append(generator.DenseKernel(f"test2_{precision}", precision, 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8,2)] + [x.getBlocksize(8, 40, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel(f"test3_{precision}", precision, 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.SparseKernel(f"hsw_only_test1_{precision}", precision, 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 1), 0.0000001)) - kernels.append(generator.SparseKernel(f"hsw_only_test2_{precision}", precision, 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2)] + [x.getBlocksize(24, 40, 2) for x in blocksize_algs], generator.generateMTX(40, 40, 20), 0.0000001)) - - kernels.append(generator.SparseKernel(f"hsw_only_test3_{precision}", precision, 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 2), 0.0000001)) - kernels.append(generator.SparseKernel(f"hsw_only_test4_{precision}", precision, 24, 20, 10, 40, 0, 24, 35.222, 0.0, [] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], 
generator.generateMTX(10, 20, 1), 0.0000001)) - kernels.append(generator.SparseKernel(f"hsw_only_test5_{precision}", precision, 64, 5, 10, 64, 0, 64, 2.3, 0.0, [] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], generator.generateMTX(10, 5, 1), 0.0000001)) - kernels.append(generator.SparseKernel(f"hsw_only_test6_{precision}", precision, 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], generator.generateMTX(1, 1, 1), 0.0000001)) - kernels.append(generator.SparseKernel(f"hsw_only_test7_{precision}", precision, 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8,1)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], generator.generateMTX(40, 24, 1), 0.0000001)) - - kernels.append(generator.DenseKernel(f"hsw_only_test8_{precision}", precision, 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel(f"hsw_only_test9_{precision}", precision, 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2)] + [x.getBlocksize(32, 40, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel(f"hsw_only_test10_{precision}", precision, 56, 28, 56, 56, 56, 56, 0.1, 3.0, [x.getBlocksize(56, 28, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel(f"hsw_only_test11_{precision}", precision, 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8,3)] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel(f"hsw_only_test12_{precision}", precision, 64, 5, 10, 64, 12, 64, 1.0, 1.0, [] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel(f"hsw_only_test13_{precision}", precision, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel(f"hsw_only_test14_{precision}", precision, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [] + [x.getBlocksize(8, 24, 2) for x in 
blocksize_algs], 0.0000001)) +for precision, delta in zip((Precision.SINGLE, Precision.DOUBLE), (delta_sp, delta_dp)): + kernels.append(generator.SparseKernel(f"test1_{precision}", precision, 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], generator.generateMTX(56, 56, 30), delta)) + kernels.append(generator.DenseKernel(f"test2_{precision}", precision, 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8,2)] + [x.getBlocksize(8, 40, 2) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel(f"test3_{precision}", precision, 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], delta)) + kernels.append(generator.SparseKernel(f"hsw_only_test1_{precision}", precision, 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 1), delta)) + kernels.append(generator.SparseKernel(f"hsw_only_test2_{precision}", precision, 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2)] + [x.getBlocksize(24, 40, 2) for x in blocksize_algs], generator.generateMTX(40, 40, 20), delta)) + + kernels.append(generator.SparseKernel(f"hsw_only_test3_{precision}", precision, 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 2), delta)) + kernels.append(generator.SparseKernel(f"hsw_only_test4_{precision}", precision, 24, 20, 10, 40, 0, 24, 35.222, 0.0, [] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], generator.generateMTX(10, 20, 1), delta)) + kernels.append(generator.SparseKernel(f"hsw_only_test5_{precision}", precision, 64, 5, 10, 64, 0, 64, 2.3, 0.0, [] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], generator.generateMTX(10, 5, 1), delta)) + kernels.append(generator.SparseKernel(f"hsw_only_test6_{precision}", precision, 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], generator.generateMTX(1, 1, 1), delta)) + 
kernels.append(generator.SparseKernel(f"hsw_only_test7_{precision}", precision, 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8,1)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], generator.generateMTX(40, 24, 1), delta)) + + kernels.append(generator.DenseKernel(f"hsw_only_test8_{precision}", precision, 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel(f"hsw_only_test9_{precision}", precision, 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2)] + [x.getBlocksize(32, 40, 2) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel(f"hsw_only_test10_{precision}", precision, 56, 28, 56, 56, 56, 56, 0.1, 3.0, [x.getBlocksize(56, 28, 2) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel(f"hsw_only_test11_{precision}", precision, 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8,3)] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel(f"hsw_only_test12_{precision}", precision, 64, 5, 10, 64, 12, 64, 1.0, 1.0, [] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel(f"hsw_only_test13_{precision}", precision, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel(f"hsw_only_test14_{precision}", precision, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], delta)) for arch in ('hsw256', 'hsw128'): generator.make(kernels, arch) - - From 1edd02c35fbc0871004ccbedb40ee038706e1dbb Mon Sep 17 00:00:00 2001 From: David Schneller Date: Fri, 18 Oct 2024 17:04:36 +0200 Subject: [PATCH 27/64] Output more error infos --- tests/testsuite_generator.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/testsuite_generator.py b/tests/testsuite_generator.py index d57f730..ea1d979 100755 --- a/tests/testsuite_generator.py +++ 
b/tests/testsuite_generator.py @@ -16,7 +16,8 @@ #include #include #include -#include +#include +#include #include @@ -139,6 +140,7 @@ // we use the relative error instead of the absolute error because of an issue we found for sparse single precision // kernels presumably due to limited precision of floats if(std::abs((C[i + j * LDC] - Cref[i + j * LDC])) / Cref[i + j * LDC] > DELTA) { + std::cout << i << " " << j << " " << C[i + j * LDC] << " " << Cref[i + j * LDC] << std::endl; return 0; } } From ad887eb9cfe9b6077607557f48914bb3ae3aec4a Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 19 Oct 2024 14:14:18 +0200 Subject: [PATCH 28/64] Generalize broadcasting interface a tiny bit --- pspamm/codegen/architectures/arm/generator.py | 2 +- pspamm/codegen/architectures/arm/inlineprinter.py | 4 ++-- pspamm/codegen/architectures/arm_sve/generator.py | 2 +- pspamm/codegen/architectures/arm_sve/inlineprinter.py | 5 ++++- pspamm/codegen/architectures/hsw/generator.py | 2 +- pspamm/codegen/architectures/knl/generator.py | 2 +- pspamm/codegen/architectures/knl/inlineprinter.py | 2 +- pspamm/codegen/sugar.py | 2 +- pspamm/matmul.py | 2 +- 9 files changed, 13 insertions(+), 10 deletions(-) diff --git a/pspamm/codegen/architectures/arm/generator.py b/pspamm/codegen/architectures/arm/generator.py index 3546955..9ed2799 100644 --- a/pspamm/codegen/architectures/arm/generator.py +++ b/pspamm/codegen/architectures/arm/generator.py @@ -236,7 +236,7 @@ def make_microkernel(self, if B.has_nonzero_cell(B_ptr, to_B_block, to_cell): B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_cell) comment = "C[{}:{},{}] += A[{}:{},{}]*{}".format(Vmi*v_size, Vmi*v_size+v_size, bni, Vmi*v_size, Vmi*v_size+v_size, bki, B_comment) - asm.add(fma(B_regs[bki, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment)) + asm.add(fma(B_regs[bki, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, bcast=0)) return asm diff --git a/pspamm/codegen/architectures/arm/inlineprinter.py 
b/pspamm/codegen/architectures/arm/inlineprinter.py index 1fba1d9..efb93f7 100644 --- a/pspamm/codegen/architectures/arm/inlineprinter.py +++ b/pspamm/codegen/architectures/arm/inlineprinter.py @@ -49,8 +49,8 @@ def visitFma(self, stmt: FmaStmt): b = stmt.bcast_src.ugly m = stmt.mult_src.ugly a = stmt.add_dest.ugly - if stmt.bcast: - s = f"fmla {a}, {m}, {b}[0]" + if stmt.bcast is not None: + s = f"fmla {a}, {m}, {b}[{stmt.bcast}]" else: s = f"fmla {a}, {m}, {b}" self.addLine(s, stmt.comment) diff --git a/pspamm/codegen/architectures/arm_sve/generator.py b/pspamm/codegen/architectures/arm_sve/generator.py index 61a8217..cf9d331 100644 --- a/pspamm/codegen/architectures/arm_sve/generator.py +++ b/pspamm/codegen/architectures/arm_sve/generator.py @@ -350,7 +350,7 @@ def make_microkernel(self, B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_cell) comment = "C[{}:{},{}] += A[{}:{},{}]*{}".format(Vmi * v_size, end_index, bni, Vmi * v_size, end_index, bki, B_comment) - asm.add(fma(B_regs[bki, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, pred=p_merging)) + asm.add(fma(B_regs[bki, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, pred=p_merging, bcast=None)) return asm def init_prefetching(self, prefetching): diff --git a/pspamm/codegen/architectures/arm_sve/inlineprinter.py b/pspamm/codegen/architectures/arm_sve/inlineprinter.py index 7e32526..dd456f8 100644 --- a/pspamm/codegen/architectures/arm_sve/inlineprinter.py +++ b/pspamm/codegen/architectures/arm_sve/inlineprinter.py @@ -51,7 +51,10 @@ def visitFma(self, stmt: FmaStmt): m = stmt.mult_src.ugly a = stmt.add_dest.ugly p = self.p_string(stmt.pred) - s = f"fmla {a}, {p}{m}, {b}" + if stmt.bcast is not None: + s = f"fmla {a}, {p}{m}, {b}[{stmt.bcast}]" + else: + s = f"fmla {a}, {p}{m}, {b}" self.addLine(s, stmt.comment) diff --git a/pspamm/codegen/architectures/hsw/generator.py b/pspamm/codegen/architectures/hsw/generator.py index 912183a..ac09192 100644 --- 
a/pspamm/codegen/architectures/hsw/generator.py +++ b/pspamm/codegen/architectures/hsw/generator.py @@ -286,7 +286,7 @@ def make_microkernel(self, B_addr, B_comment = B.look(B_ptr, to_B_block, to_cell) self.reg_based_scaling(B_addr, additional_regs, True) comment = "C[{}:{},{}] += A[{}:{},{}]*{}".format(Vmi*v_size,Vmi*v_size+v_size,bni,Vmi*v_size,Vmi*v_size+v_size,bki,B_comment) - asm.add(fma(B_regs[bki, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, bcast=False)) + asm.add(fma(B_regs[bki, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, bcast=None)) return asm def init_prefetching(self, prefetching): diff --git a/pspamm/codegen/architectures/knl/generator.py b/pspamm/codegen/architectures/knl/generator.py index 9530ef8..2a7e3a5 100644 --- a/pspamm/codegen/architectures/knl/generator.py +++ b/pspamm/codegen/architectures/knl/generator.py @@ -243,7 +243,7 @@ def make_microkernel(self, B_addr, B_comment = B.look(B_ptr, to_B_block, to_cell) self.reg_based_scaling(B_addr, additional_regs, True) comment = "C[{}:{},{}] += A[{}:{},{}]*{}".format(Vmi*v_size,Vmi*v_size+v_size,bni,Vmi*v_size,Vmi*v_size+v_size,bki,B_comment) - asm.add(fma(B_addr, A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment)) + asm.add(fma(B_addr, A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, bcast=0)) return asm def init_prefetching(self, prefetching): diff --git a/pspamm/codegen/architectures/knl/inlineprinter.py b/pspamm/codegen/architectures/knl/inlineprinter.py index f21ddfb..333ea31 100644 --- a/pspamm/codegen/architectures/knl/inlineprinter.py +++ b/pspamm/codegen/architectures/knl/inlineprinter.py @@ -76,7 +76,7 @@ def visitFma(self, stmt: FmaStmt): a = stmt.add_dest.ugly regsize = stmt.add_dest.size() // 16 extent = regsize * self.broadcast_multiplier - if stmt.bcast: + if stmt.bcast is not None: s = f"vfmadd231{self.alupsuffix} {b}%{{1to{extent}%}} {mask}, {m}, {a}" else: if stmt.mult_src.typeinfo == AsmType.i64: diff --git a/pspamm/codegen/sugar.py 
b/pspamm/codegen/sugar.py index 2014ab2..70387c4 100644 --- a/pspamm/codegen/sugar.py +++ b/pspamm/codegen/sugar.py @@ -19,7 +19,7 @@ def label(name: str): stmt.label = pspamm.architecture.operands.l(name) return stmt -def fma(bcast_src: Register, mult_src: Register, add_dest: Register, comment: str = None, bcast: bool = True, pred: Register = None): +def fma(bcast_src: Register, mult_src: Register, add_dest: Register, comment: str = None, bcast: Union[int, None] = None, pred: Register = None): stmt = FmaStmt() stmt.bcast_src = bcast_src stmt.mult_src = mult_src diff --git a/pspamm/matmul.py b/pspamm/matmul.py index 0fe8bdc..6601978 100644 --- a/pspamm/matmul.py +++ b/pspamm/matmul.py @@ -322,7 +322,7 @@ def kernelK(asm, Bki, A_ptr, B_ptr): if self.beta == 0.0: store_block.add(mul(regs[ir, x + ic], self.alpha_reg[1], A_regs_cut[ir, ic], "C = C + alpha * AB", pred=pred_m)) else: - store_block.add(fma(regs[ir, x + ic], self.alpha_reg[1], A_regs_cut[ir, ic], "C = C + alpha * AB", False, pred=pred_m)) + store_block.add(fma(regs[ir, x + ic], self.alpha_reg[1], A_regs_cut[ir, ic], "C = C + alpha * AB", None, pred=pred_m)) store_block.add(self.generator.move_register_block(self.C, C_ptr, Coords(), A_regs_cut, self.v_size, self.additional_regs, None, True, self.prefetching, self.ldc * x)) asm.add(store_block) From b98e8f0a9c5c490acd63447a29ca5d717bc9726f Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 19 Oct 2024 14:52:25 +0200 Subject: [PATCH 29/64] Avoid 0-adds --- pspamm/codegen/architectures/arm/inlineprinter.py | 3 +++ pspamm/codegen/architectures/arm_sve/inlineprinter.py | 5 ++++- pspamm/codegen/architectures/hsw/inlineprinter.py | 3 +++ pspamm/codegen/architectures/knl/inlineprinter.py | 4 ++++ 4 files changed, 14 insertions(+), 1 deletion(-) diff --git a/pspamm/codegen/architectures/arm/inlineprinter.py b/pspamm/codegen/architectures/arm/inlineprinter.py index efb93f7..7b3e68b 100644 --- a/pspamm/codegen/architectures/arm/inlineprinter.py +++ 
b/pspamm/codegen/architectures/arm/inlineprinter.py @@ -69,6 +69,9 @@ def visitBcst(self, stmt: BcstStmt): self.addLine(s, stmt.comment) def visitAdd(self, stmt: AddStmt): + if isinstance(stmt.src, Constant) and stmt.src.value == 0: + # avoid 0 instructions + return if isinstance(stmt.src, Constant) and (stmt.src.value > 4095 or stmt.src.value < -4095): if (stmt.src.value >> 16) & 0xFFFF > 0 and stmt.src.value < 0: s = "mov x11, #-1" diff --git a/pspamm/codegen/architectures/arm_sve/inlineprinter.py b/pspamm/codegen/architectures/arm_sve/inlineprinter.py index dd456f8..89e0e71 100644 --- a/pspamm/codegen/architectures/arm_sve/inlineprinter.py +++ b/pspamm/codegen/architectures/arm_sve/inlineprinter.py @@ -83,7 +83,10 @@ def visitBcst(self, stmt: BcstStmt): self.addLine(s, stmt.comment) def visitAdd(self, stmt: AddStmt): - if isinstance(stmt.src, Constant) and (stmt.src.value > 4095 or stmt.src.value < -4095): + if isinstance(stmt.src, Constant) and stmt.src.value == 0: + # avoid 0 instructions + return + if isinstance(stmt.src, Constant) and (stmt.src.value > 4095 or stmt.src.value < -4095): # This condition is probably related to immediate values being restricted to 12 bits for add instructions # https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/ADD--immediate- # https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/ADD--immediate---Add--immediate-- diff --git a/pspamm/codegen/architectures/hsw/inlineprinter.py b/pspamm/codegen/architectures/hsw/inlineprinter.py index 54d98de..7ac5ce7 100644 --- a/pspamm/codegen/architectures/hsw/inlineprinter.py +++ b/pspamm/codegen/architectures/hsw/inlineprinter.py @@ -75,6 +75,9 @@ def visitBcst(self, stmt: BcstStmt): self.addLine(s, stmt.comment) def visitAdd(self, stmt: AddStmt): + if isinstance(stmt.src, Constant) and stmt.src.value == 0: + # avoid 0 instructions + return s = f"addq {stmt.src.ugly}, {stmt.dest.ugly}" self.addLine(s, stmt.comment) diff --git 
a/pspamm/codegen/architectures/knl/inlineprinter.py b/pspamm/codegen/architectures/knl/inlineprinter.py index 333ea31..a776d25 100644 --- a/pspamm/codegen/architectures/knl/inlineprinter.py +++ b/pspamm/codegen/architectures/knl/inlineprinter.py @@ -115,6 +115,10 @@ def visitBcst(self, stmt: BcstStmt): self.addLine(s, stmt.comment) def visitAdd(self, stmt: AddStmt): + if isinstance(stmt.src, Constant) and stmt.src.value == 0: + # avoid 0 instructions + return + # only used for scalar addition right now s = f"addq {stmt.src.ugly}, {stmt.dest.ugly}" self.addLine(s, stmt.comment) From 386266baba90325ccbecba479d21857f715aa399 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 19 Oct 2024 14:52:41 +0200 Subject: [PATCH 30/64] Unique register clobbering --- pspamm/codegen/ccode.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pspamm/codegen/ccode.py b/pspamm/codegen/ccode.py index a691b4e..00c5721 100644 --- a/pspamm/codegen/ccode.py +++ b/pspamm/codegen/ccode.py @@ -15,9 +15,8 @@ def make_cfunc(funcName:str, template:str, body:Block, flop:int, starting_regs:L analyzer = Analyzer(starting_regs) body.accept(analyzer) - regs = ['"{}"'.format(reg.clobbered) for reg in analyzer.clobbered_registers] - regs.sort() - clobbered = ",".join(regs) + regs = set('"{}"'.format(reg.clobbered) for reg in analyzer.clobbered_registers) + clobbered = ",".join(sorted(regs)) return template.format(funcName = funcName, body_text = body_text, clobbered = clobbered, From fb70f324791bd6e42e6166de814528958c0a0607 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 19 Oct 2024 16:22:40 +0200 Subject: [PATCH 31/64] Update blocksize args --- pspamm/matmul.py | 8 ++++---- pspamm/scripts/max_arm.py | 2 +- pspamm/scripts/max_arm_sve.py | 2 +- pspamm/scripts/max_bn_knl.py | 2 +- pspamm/scripts/max_hsw.py | 2 +- pspamm/scripts/max_knl.py | 2 +- pspamm/scripts/old_arm.py | 2 +- pspamm/scripts/old_hsw.py | 2 +- pspamm/scripts/old_knl.py | 2 +- 9 files changed, 12 
insertions(+), 12 deletions(-) diff --git a/pspamm/matmul.py b/pspamm/matmul.py index 6601978..ff805a1 100644 --- a/pspamm/matmul.py +++ b/pspamm/matmul.py @@ -158,13 +158,13 @@ def __init__(self, if bm == None or bn == None: if arch == 'knl': - (self.bm, self.bn, self.bk) = pspamm.scripts.max_bn_knl.getBlocksize(m, n, bk, self.v_size) + (self.bm, self.bn, self.bk) = pspamm.scripts.max_bn_knl.getBlocksize(m, n, bk, self.v_size, self.precision) elif arch == 'hsw': - (self.bm, self.bn, self.bk) = pspamm.scripts.max_hsw.getBlocksize(m, n, bk, self.v_size) + (self.bm, self.bn, self.bk) = pspamm.scripts.max_hsw.getBlocksize(m, n, bk, self.v_size, self.precision) elif arch == 'arm': - (self.bm, self.bn, self.bk) = pspamm.scripts.old_arm.getBlocksize(m, n, bk, self.v_size) + (self.bm, self.bn, self.bk) = pspamm.scripts.old_arm.getBlocksize(m, n, bk, self.v_size, self.precision) elif arch == 'arm_sve': - (self.bm, self.bn, self.bk) = pspamm.scripts.max_arm_sve.getBlocksize(m, n, bk, self.v_size) + (self.bm, self.bn, self.bk) = pspamm.scripts.max_arm_sve.getBlocksize(m, n, bk, self.v_size, self.precision) else: self.bm = bm self.bn = bn diff --git a/pspamm/scripts/max_arm.py b/pspamm/scripts/max_arm.py index 9d7bba0..060fe2b 100755 --- a/pspamm/scripts/max_arm.py +++ b/pspamm/scripts/max_arm.py @@ -1,4 +1,4 @@ -def getBlocksize(m , n, bk, v_size=2): +def getBlocksize(m , n, bk, v_size, prec): bm = 2 bn = 1 diff --git a/pspamm/scripts/max_arm_sve.py b/pspamm/scripts/max_arm_sve.py index f1df1f8..10fa262 100644 --- a/pspamm/scripts/max_arm_sve.py +++ b/pspamm/scripts/max_arm_sve.py @@ -1,4 +1,4 @@ -def getBlocksize(m, n, bk, v_size=2): +def getBlocksize(m, n, bk, v_size, prec): # v_size default is 2, however for SVE that parameter will always be larger bm = 2 bn = 1 diff --git a/pspamm/scripts/max_bn_knl.py b/pspamm/scripts/max_bn_knl.py index 2483484..a936c56 100755 --- a/pspamm/scripts/max_bn_knl.py +++ b/pspamm/scripts/max_bn_knl.py @@ -1,4 +1,4 @@ -def getBlocksize(m, n, 
bk, v_size=8): +def getBlocksize(m, n, bk, v_size, prec): bm = v_size bn = 1 diff --git a/pspamm/scripts/max_hsw.py b/pspamm/scripts/max_hsw.py index 80c5825..a896dcd 100755 --- a/pspamm/scripts/max_hsw.py +++ b/pspamm/scripts/max_hsw.py @@ -1,4 +1,4 @@ -def getBlocksize(m , n, bk, v_size=4): +def getBlocksize(m , n, bk, v_size, prec): bm = 4 bn = 1 diff --git a/pspamm/scripts/max_knl.py b/pspamm/scripts/max_knl.py index d75b1ab..76c3581 100755 --- a/pspamm/scripts/max_knl.py +++ b/pspamm/scripts/max_knl.py @@ -1,4 +1,4 @@ -def getBlocksize(m , n, bk, v_size=8): +def getBlocksize(m , n, bk, v_size, prec): bm = 8 bn = 1 diff --git a/pspamm/scripts/old_arm.py b/pspamm/scripts/old_arm.py index e7bb884..16ccad8 100755 --- a/pspamm/scripts/old_arm.py +++ b/pspamm/scripts/old_arm.py @@ -1,4 +1,4 @@ -def getBlocksize(m , n, bk, v_size=2): +def getBlocksize(m , n, bk, v_size, prec): bm = m bn = n diff --git a/pspamm/scripts/old_hsw.py b/pspamm/scripts/old_hsw.py index 9e2ff6f..1cc0a7e 100755 --- a/pspamm/scripts/old_hsw.py +++ b/pspamm/scripts/old_hsw.py @@ -1,4 +1,4 @@ -def getBlocksize(m , n, bk, v_size=4): +def getBlocksize(m , n, bk, v_size, prec): bm = m bn = n diff --git a/pspamm/scripts/old_knl.py b/pspamm/scripts/old_knl.py index dd075f4..24913a2 100755 --- a/pspamm/scripts/old_knl.py +++ b/pspamm/scripts/old_knl.py @@ -1,4 +1,4 @@ -def getBlocksize(m , n, bk, v_size=8): +def getBlocksize(m , n, bk, v_size, prec): bm = m bn = n From 535c44e65d3874a1f1cda21315208f819caaeeff Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 19 Oct 2024 16:54:00 +0200 Subject: [PATCH 32/64] Clean up block sizes --- pspamm/codegen/architectures/arm/blocksize.py | 68 ++++++++++++++ .../architectures/arm_sve/blocksize.py | 38 ++++++++ pspamm/codegen/architectures/hsw/blocksize.py | 72 ++++++++++++++ pspamm/codegen/architectures/knl/blocksize.py | 94 +++++++++++++++++++ pspamm/matmul.py | 13 ++- pspamm/scripts/__init__.py | 0 pspamm/scripts/max_arm.py | 24 ----- 
pspamm/scripts/max_arm_sve.py | 34 ------- pspamm/scripts/max_bn_knl.py | 19 ---- pspamm/scripts/max_hsw.py | 29 ------ pspamm/scripts/max_knl.py | 29 ------ pspamm/scripts/old_arm.py | 36 ------- pspamm/scripts/old_hsw.py | 36 ------- pspamm/scripts/old_knl.py | 36 ------- 14 files changed, 281 insertions(+), 247 deletions(-) create mode 100644 pspamm/codegen/architectures/arm/blocksize.py create mode 100644 pspamm/codegen/architectures/arm_sve/blocksize.py create mode 100644 pspamm/codegen/architectures/hsw/blocksize.py create mode 100644 pspamm/codegen/architectures/knl/blocksize.py delete mode 100644 pspamm/scripts/__init__.py delete mode 100755 pspamm/scripts/max_arm.py delete mode 100644 pspamm/scripts/max_arm_sve.py delete mode 100755 pspamm/scripts/max_bn_knl.py delete mode 100755 pspamm/scripts/max_hsw.py delete mode 100755 pspamm/scripts/max_knl.py delete mode 100755 pspamm/scripts/old_arm.py delete mode 100755 pspamm/scripts/old_hsw.py delete mode 100755 pspamm/scripts/old_knl.py diff --git a/pspamm/codegen/architectures/arm/blocksize.py b/pspamm/codegen/architectures/arm/blocksize.py new file mode 100644 index 0000000..c9b4352 --- /dev/null +++ b/pspamm/codegen/architectures/arm/blocksize.py @@ -0,0 +1,68 @@ + +class Old: + @classmethod + def getBlocksize(cls, m , n, bk, v_size, prec): + + bm = m + bn = n + + if cls.ARM_condition(bm, bn, bk, v_size): + while cls.ARM_condition(bm, bn, bk+1, v_size): + bk += 1 + return (bm, bn, bk) + + while not cls.ARM_condition(bm, bn, bk, v_size): + bm, bn = cls.lowerToNextDiv(m, n, bm, bn, v_size) + + while cls.ARM_condition(bm, bn, bk+1, v_size): + bk += 1 + + return (bm, bn, bk) + + @classmethod + def lowerToNextDiv(cls, m, n, bm, bn, v_size): + if bm > bn and bm > v_size: + bm -= v_size + while m % bm != 0: + bm -= v_size + else: + bn -= 1 + while n % bn != 0: + bn -= 1 + + return bm, bn + + @classmethod + def ARM_condition(cls, bm, bn, bk, v_size): + # ceiling division + vm = -(bm // -v_size) + return (bn+bk) * vm 
+ bn*bk <= 32 + +class Max: + @classmethod + def getBlocksize(cls, m, n, bk, v_size, prec): + bm = 2 + bn = 1 + maxval = 0 + + for i in range(v_size, m+1, v_size): + for j in range(1, n+1): + if cls.ARM_condition(i, j, bk, v_size): + if i*j > maxval: + maxval = i*j + bm = i + bn = j + + while cls.ARM_condition(bm, bn, bk+1, v_size): + bk += 1 + + return (bm, bn, bk) + + + @classmethod + def ARM_condition(cls, bm, bn, bk, v_size): + # ceiling division + vm = -(bm // -v_size) + return (bn+bk) * vm + bn*bk <= 32 + +Default = Max diff --git a/pspamm/codegen/architectures/arm_sve/blocksize.py b/pspamm/codegen/architectures/arm_sve/blocksize.py new file mode 100644 index 0000000..f7bf65d --- /dev/null +++ b/pspamm/codegen/architectures/arm_sve/blocksize.py @@ -0,0 +1,38 @@ +class Max: + @classmethod + def getBlocksize(cls, m, n, bk, v_size, prec): + # v_size default is 2, however for SVE that parameter will always be larger + bm = 2 + bn = 1 + maxval = 0 + + for i in range(1, m + 1, 1): + next_multiple = i + while next_multiple % v_size != 0: + next_multiple += 1 + for j in range(1, n + 1): + if cls.ARM_condition(next_multiple, j, bk, v_size) and cls.tileable(m, i): + if i * j >= maxval: + maxval = i * j + bm = i + bn = j + + if maxval == 0: + raise RuntimeError("Could not find an appropriate block size. 
We suggest padding the matrix dimensions") + + while cls.ARM_condition(bm, bn, bk+1, v_size): + bk += 1 + + return (bm, bn, bk) + + + def ARM_condition(cls, bm, bn, bk, v_size): + # ceiling division + vm = -(bm // -v_size) + return (bn + bk) * vm + bn*bk <= 32 + + + def tileable(cls, m, bm): + return m % bm == 0 + +Default = Max diff --git a/pspamm/codegen/architectures/hsw/blocksize.py b/pspamm/codegen/architectures/hsw/blocksize.py new file mode 100644 index 0000000..4d65441 --- /dev/null +++ b/pspamm/codegen/architectures/hsw/blocksize.py @@ -0,0 +1,72 @@ +class Old: + @classmethod + def getBlocksize(cls, m, n, bk, v_size, prec): + + bm = m + bn = n + + if cls.HSW_condition(bm, bn, bk, v_size): + while cls.HSW_condition(bm, bn, bk+1, v_size): + bk += 1 + return (bm, bn) + + while not cls.HSW_condition(bm, bn, bk, v_size): + bm, bn = cls.lowerToNextDiv(m, n, bm, bn, v_size) + + while cls.HSW_condition(bm, bn, bk+1, v_size): + bk += 1 + + return (bm, bn) + + @classmethod + def lowerToNextDiv(cls, m, n, bm, bn, v_size): + if bm > bn and bm > v_size: + bm -= v_size + while m % bm != 0: + bm -= v_size + else: + bn -= 1 + while n % bn != 0: + bn -= 1 + + return bm, bn + + @classmethod + def HSW_condition(cls, bm, bn, bk, v_size): + # ceiling division + vm = -(bm // -v_size) + return (bn + bk) * vm + bn * bk <= 16 + +class Max: + def getBlocksize(cls, m, n, bk, v_size, prec): + bm = 4 + bn = 1 + maxval = 0 + + for i in range(v_size, m+1, v_size): + for j in range(1, n+1): + # can be replaced by cls.HSW_condition_extended here + # (but that seemed to be slower in the end) + if cls.HSW_condition(i, j, bk, v_size): + if i*j > maxval and (cls.HSW_condition(i, j, bk, v_size) or j > 1): + maxval = i*j + bm = i + bn = j + + while cls.HSW_condition(bm, bn, bk+1, v_size): + bk += 1 + + return (bm, bn, bk) + + def cls.HSW_condition(cls, bm, bn, bk, v_size): + # ceiling division + vm = -(bm // -v_size) + return (bn + bk) * vm + bn * bk <= 16 + + def 
cls.HSW_condition_extended(cls, bm, bn, bk, v_size): + # ceiling division + vm = -(bm // -v_size) + return bn * vm + bn * bk + 1 <= 16 + + +Default = Max diff --git a/pspamm/codegen/architectures/knl/blocksize.py b/pspamm/codegen/architectures/knl/blocksize.py new file mode 100644 index 0000000..21a9c2b --- /dev/null +++ b/pspamm/codegen/architectures/knl/blocksize.py @@ -0,0 +1,94 @@ +class Old: + @classmethod + def getBlocksize(cls, m, n, bk, v_size, prec): + + bm = m + bn = n + + if cls.KNL_condition(bm, bn, bk, v_size): + while cls.KNL_condition(bm, bn, bk+1, v_size): + bk += 1 + return (bm, bn) + + while not cls.KNL_condition(bm, bn, bk, v_size): + bm, bn = cls.lowerToNextDiv(m, n, bm, bn, v_size) + + while cls.KNL_condition(bm, bn, bk+1, v_size): + bk += 1 + + return (bm, bn) + + @classmethod + def lowerToNextDiv(cls, m, n, bm, bn, v_size): + if bm > bn and bm > v_size: + bm -= v_size + while m % bm != 0: + bm -= v_size + else: + bn -= 1 + while n % bn != 0: + bn -= 1 + + return bm, bn + + @classmethod + def KNL_condition(cls, bm, bn, bk, v_size): + # ceiling division + vm = -(bm // -v_size) + return (bn+bk) * vm <= 32 + +class Max: + @classmethod + def getBlocksize(cls, m, n, bk, v_size, prec): + + bm = 8 + bn = 1 + maxval = 0 + + for i in range(1, m+1): + next_multiple = -(bm // -v_size) + for j in range(1, n+1): + if cls.KNL_condition(next_multiple, j, bk, v_size) and cls.tileable(m, bm): + if i*j > maxval: + maxval = i*j + bm = i + bn = j + + while cls.KNL_condition(bm, bn, bk+1, v_size): + bk += 1 + + return (bm, bn, bk) + + @classmethod + def KNL_condition(cls, bm, bn, bk, v_size): + # ceiling division + vm = -(bm // -v_size) + return (bn+bk) * vm <= 32 + + @classmethod + def tileable(cls, m, bm): + return m % bm == 0 + +class MaxBn: + @classmethod + def getBlocksize(cls, m, n, bk, v_size, prec): + + bm = v_size + bn = 1 + + for j in range(1, n+1): + if cls.KNL_condition(bm, j, bk, v_size): + bn = j + + while cls.KNL_condition(bm, bn, bk+1, v_size): + 
bk += 1 + + return (bm, bn, bk) + + @classmethod + def KNL_condition(cls, bm, bn, bk, v_size): + # ceiling division + vm = -(bm // -v_size) + return (bn+bk) * vm <= 32 + +Default = MaxBn diff --git a/pspamm/matmul.py b/pspamm/matmul.py index ff805a1..b9b3996 100644 --- a/pspamm/matmul.py +++ b/pspamm/matmul.py @@ -126,6 +126,10 @@ def __init__(self, assert v_len_bits in (128, 256) v_len_regs = v_len_bits // 128 arch = 'hsw' + + if arch.startswith('arm'): + # only 128 supported + arch = 'arm' self.arch = arch assert precision.lower() in ['bf16', 'h', 's', 'd'] @@ -140,6 +144,7 @@ def __init__(self, pspamm.architecture.arch = arch pspamm.architecture.Generator = pspamm.architecture.get_class("pspamm.codegen.architectures." + arch + ".generator").Generator pspamm.architecture.operands = pspamm.architecture.get_class("pspamm.codegen.architectures." + arch + ".operands") + pspamm.architecture.blocksize = pspamm.architecture.get_class("pspamm.codegen.architectures." + arch + ".blocksize").Default self.generator = pspamm.architecture.Generator(self.precision) @@ -158,13 +163,13 @@ def __init__(self, if bm == None or bn == None: if arch == 'knl': - (self.bm, self.bn, self.bk) = pspamm.scripts.max_bn_knl.getBlocksize(m, n, bk, self.v_size, self.precision) + (self.bm, self.bn, self.bk) = pspamm.architecture.blocksize.getBlocksize(m, n, bk, self.v_size, self.precision) elif arch == 'hsw': - (self.bm, self.bn, self.bk) = pspamm.scripts.max_hsw.getBlocksize(m, n, bk, self.v_size, self.precision) + (self.bm, self.bn, self.bk) = pspamm.architecture.blocksize.getBlocksize(m, n, bk, self.v_size, self.precision) elif arch == 'arm': - (self.bm, self.bn, self.bk) = pspamm.scripts.old_arm.getBlocksize(m, n, bk, self.v_size, self.precision) + (self.bm, self.bn, self.bk) = pspamm.architecture.blocksize.getBlocksize(m, n, bk, self.v_size, self.precision) elif arch == 'arm_sve': - (self.bm, self.bn, self.bk) = pspamm.scripts.max_arm_sve.getBlocksize(m, n, bk, self.v_size, self.precision) + 
(self.bm, self.bn, self.bk) = pspamm.architecture.blocksize.getBlocksize(m, n, bk, self.v_size, self.precision) else: self.bm = bm self.bn = bn diff --git a/pspamm/scripts/__init__.py b/pspamm/scripts/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/pspamm/scripts/max_arm.py b/pspamm/scripts/max_arm.py deleted file mode 100755 index 060fe2b..0000000 --- a/pspamm/scripts/max_arm.py +++ /dev/null @@ -1,24 +0,0 @@ -def getBlocksize(m , n, bk, v_size, prec): - - bm = 2 - bn = 1 - maxval = 0 - - for i in range(v_size, m+1, v_size): - for j in range(1, n+1): - if ARM_condition(i, j, bk, v_size): - if i*j > maxval: - maxval = i*j - bm = i - bn = j - - while ARM_condition(bm, bn, bk+1, v_size): - bk += 1 - - return (bm, bn, bk) - - -def ARM_condition(bm, bn, bk, v_size): - # ceiling division - vm = -(bm // -v_size) - return (bn+bk) * vm + bn*bk <= 32 diff --git a/pspamm/scripts/max_arm_sve.py b/pspamm/scripts/max_arm_sve.py deleted file mode 100644 index 10fa262..0000000 --- a/pspamm/scripts/max_arm_sve.py +++ /dev/null @@ -1,34 +0,0 @@ -def getBlocksize(m, n, bk, v_size, prec): - # v_size default is 2, however for SVE that parameter will always be larger - bm = 2 - bn = 1 - maxval = 0 - - for i in range(1, m + 1, 1): - next_multiple = i - while next_multiple % v_size != 0: - next_multiple += 1 - for j in range(1, n + 1): - if ARM_condition(next_multiple, j, bk, v_size) and tileable(m, i): - if i * j >= maxval: - maxval = i * j - bm = i - bn = j - - if maxval == 0: - raise RuntimeError("Could not find an appropriate block size. 
We suggest padding the matrix dimensions") - - while ARM_condition(bm, bn, bk+1, v_size): - bk += 1 - - return (bm, bn, bk) - - -def ARM_condition(bm, bn, bk, v_size): - # ceiling division - vm = -(bm // -v_size) - return (bn + bk) * vm + bn*bk <= 32 - - -def tileable(m, bm): - return m % bm == 0 diff --git a/pspamm/scripts/max_bn_knl.py b/pspamm/scripts/max_bn_knl.py deleted file mode 100755 index a936c56..0000000 --- a/pspamm/scripts/max_bn_knl.py +++ /dev/null @@ -1,19 +0,0 @@ -def getBlocksize(m, n, bk, v_size, prec): - - bm = v_size - bn = 1 - - for j in range(1, n+1): - if KNL_condition(bm, j, bk, v_size): - bn = j - - while KNL_condition(bm, bn, bk+1, v_size): - bk += 1 - - return (bm, bn, bk) - - -def KNL_condition(bm, bn, bk, v_size): - # ceiling division - vm = -(bm // -v_size) - return (bn+bk) * vm <= 32 diff --git a/pspamm/scripts/max_hsw.py b/pspamm/scripts/max_hsw.py deleted file mode 100755 index a896dcd..0000000 --- a/pspamm/scripts/max_hsw.py +++ /dev/null @@ -1,29 +0,0 @@ -def getBlocksize(m , n, bk, v_size, prec): - - bm = 4 - bn = 1 - maxval = 0 - - for i in range(v_size, m+1, v_size): - for j in range(1, n+1): - # can be replaced by HSW_condition_extended here - if HSW_condition(i, j, bk, v_size): - if i*j > maxval and (HSW_condition(i, j, bk, v_size) or j > 1): - maxval = i*j - bm = i - bn = j - - while HSW_condition(bm, bn, bk+1, v_size): - bk += 1 - - return (bm, bn, bk) - -def HSW_condition(bm, bn, bk, v_size): - # ceiling division - vm = -(bm // -v_size) - return (bn + bk) * vm + bn * bk <= 16 - -def HSW_condition_extended(bm, bn, bk, v_size): - # ceiling division - vm = -(bm // -v_size) - return bn * vm + bn * bk + 1 <= 16 diff --git a/pspamm/scripts/max_knl.py b/pspamm/scripts/max_knl.py deleted file mode 100755 index 76c3581..0000000 --- a/pspamm/scripts/max_knl.py +++ /dev/null @@ -1,29 +0,0 @@ -def getBlocksize(m , n, bk, v_size, prec): - - bm = 8 - bn = 1 - maxval = 0 - - for i in range(1, m+1): - next_multiple = -(bm // -v_size) - 
for j in range(1, n+1): - if KNL_condition(next_multiple, j, bk, v_size) and tileable(m, bm): - if i*j > maxval: - maxval = i*j - bm = i - bn = j - - while KNL_condition(bm, bn, bk+1, v_size): - bk += 1 - - return (bm, bn, bk) - - -def KNL_condition(bm, bn, bk, v_size): - # ceiling division - vm = -(bm // -v_size) - return (bn+bk) * vm <= 32 - -def tileable(m, bm): - return m % bm == 0 - diff --git a/pspamm/scripts/old_arm.py b/pspamm/scripts/old_arm.py deleted file mode 100755 index 16ccad8..0000000 --- a/pspamm/scripts/old_arm.py +++ /dev/null @@ -1,36 +0,0 @@ -def getBlocksize(m , n, bk, v_size, prec): - - bm = m - bn = n - - if ARM_condition(bm, bn, bk, v_size): - while ARM_condition(bm, bn, bk+1, v_size): - bk += 1 - return (bm, bn, bk) - - while not ARM_condition(bm, bn, bk, v_size): - bm, bn = lowerToNextDiv(m, n, bm, bn, v_size) - - while ARM_condition(bm, bn, bk+1, v_size): - bk += 1 - - return (bm, bn, bk) - - -def lowerToNextDiv(m, n, bm, bn, v_size): - if bm > bn and bm > v_size: - bm -= v_size - while m % bm != 0: - bm -= v_size - else: - bn -= 1 - while n % bn != 0: - bn -= 1 - - return bm, bn - - -def ARM_condition(bm, bn, bk, v_size): - # ceiling division - vm = -(bm // -v_size) - return (bn+bk) * vm + bn*bk <= 32 diff --git a/pspamm/scripts/old_hsw.py b/pspamm/scripts/old_hsw.py deleted file mode 100755 index 1cc0a7e..0000000 --- a/pspamm/scripts/old_hsw.py +++ /dev/null @@ -1,36 +0,0 @@ -def getBlocksize(m , n, bk, v_size, prec): - - bm = m - bn = n - - if HSW_condition(bm, bn, bk, v_size): - while HSW_condition(bm, bn, bk+1, v_size): - bk += 1 - return (bm, bn) - - while not HSW_condition(bm, bn, bk, v_size): - bm, bn = lowerToNextDiv(m, n, bm, bn, v_size) - - while HSW_condition(bm, bn, bk+1, v_size): - bk += 1 - - return (bm, bn) - - -def lowerToNextDiv(m, n, bm, bn, v_size): - if bm > bn and bm > v_size: - bm -= v_size - while m % bm != 0: - bm -= v_size - else: - bn -= 1 - while n % bn != 0: - bn -= 1 - - return bm, bn - - -def 
HSW_condition(bm, bn, bk, v_size): - # ceiling division - vm = -(bm // -v_size) - return (bn + bk) * vm + bn * bk <= 16 diff --git a/pspamm/scripts/old_knl.py b/pspamm/scripts/old_knl.py deleted file mode 100755 index 24913a2..0000000 --- a/pspamm/scripts/old_knl.py +++ /dev/null @@ -1,36 +0,0 @@ -def getBlocksize(m , n, bk, v_size, prec): - - bm = m - bn = n - - if KNL_condition(bm, bn, bk, v_size): - while KNL_condition(bm, bn, bk+1, v_size): - bk += 1 - return (bm, bn) - - while not KNL_condition(bm, bn, bk, v_size): - bm, bn = lowerToNextDiv(m, n, bm, bn, v_size) - - while KNL_condition(bm, bn, bk+1, v_size): - bk += 1 - - return (bm, bn) - - -def lowerToNextDiv(m, n, bm, bn, v_size): - if bm > bn and bm > v_size: - bm -= v_size - while m % bm != 0: - bm -= v_size - else: - bn -= 1 - while n % bn != 0: - bn -= 1 - - return bm, bn - - -def KNL_condition(bm, bn, bk, v_size): - # ceiling division - vm = -(bm // -v_size) - return (bn+bk) * vm <= 32 From 97fe82cc253da7722dfa3d6ae99ceecddc8bc689 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 19 Oct 2024 16:54:08 +0200 Subject: [PATCH 33/64] Clean up module finder --- pspamm/architecture.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/pspamm/architecture.py b/pspamm/architecture.py index d0f504c..c6f8f35 100755 --- a/pspamm/architecture.py +++ b/pspamm/architecture.py @@ -8,17 +8,5 @@ def init(): generator = None operands = None - - -#https://stackoverflow.com/questions/452969/does-python-have-an-equivalent-to-java-class-forname - def get_class( kls ): return import_module(kls) - parts = kls.split('.') - module = ".".join(parts[:-1]) - m = __import__( module ) - for comp in parts[1:]: - m = getattr(m, comp) - return m - - From 5d922501a5383b78640d65d36aede11c8f605100 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 19 Oct 2024 16:54:56 +0200 Subject: [PATCH 34/64] Clean up unit tests --- .github/workflows/codegen.yml | 15 ++-- tests/runall-sve.sh | 2 +- 
tests/testsuite_generator.py | 2 +- tests/unit_test.py | 147 ++++++++++++++++++++++++++++++++++ tests/unit_tests_arm.py | 37 --------- tests/unit_tests_arm_sve.py | 82 ------------------- tests/unit_tests_hsw.py | 37 --------- tests/unit_tests_knl.py | 38 --------- 8 files changed, 158 insertions(+), 202 deletions(-) create mode 100644 tests/unit_test.py delete mode 100755 tests/unit_tests_arm.py delete mode 100644 tests/unit_tests_arm_sve.py delete mode 100755 tests/unit_tests_hsw.py delete mode 100755 tests/unit_tests_knl.py diff --git a/.github/workflows/codegen.yml b/.github/workflows/codegen.yml index 8894641..0029004 100644 --- a/.github/workflows/codegen.yml +++ b/.github/workflows/codegen.yml @@ -64,7 +64,8 @@ jobs: - name: pspamm-tests-generate run: | cd tests/ - python unit_tests_hsw.py + python unit_test.py hsw256 + python unit_test.py hsw128 - name: pspamm-tests-compile run: | @@ -104,7 +105,9 @@ jobs: - name: pspamm-tests-generate run: | cd tests/ - python unit_tests_knl.py + python unit_test.py knl512 + python unit_test.py knl256 + python unit_test.py knl128 - name: pspamm-tests-compile run: | @@ -147,17 +150,17 @@ jobs: - name: pspamm-tests-generate run: | cd tests/ - python unit_tests_arm.py + python unit_test.py arm128 - name: pspamm-tests-compile run: | cd tests/ - aarch64-linux-gnu-g++ -static -march=armv8.2-a build/arm_testsuite.cpp -o build/arm-test + aarch64-linux-gnu-g++ -static -march=armv8.2-a build/arm128_testsuite.cpp -o build/arm128-test - name: pspamm-tests-run run: | cd tests/ - qemu-aarch64-static -cpu max build/arm-test + qemu-aarch64-static -cpu max build/arm128-test pspamm-codegen-armsve: name: pspamm-codegen-armsve @@ -195,7 +198,7 @@ jobs: - name: pspamm-tests-generate run: | cd tests/ - python unit_tests_arm_sve.py ${{matrix.vectorlen}} + python unit_test.py arm_sve${{matrix.vectorlen}} - name: pspamm-tests-compile run: | diff --git a/tests/runall-sve.sh b/tests/runall-sve.sh index 1d85fde..da38854 100755 --- 
a/tests/runall-sve.sh +++ b/tests/runall-sve.sh @@ -8,7 +8,7 @@ do echo "" echo "" echo "Testing $BITLEN bit SVE register GEMM" - python unit_tests_arm_sve.py $BITLEN + python unit_test.py arm_sve$BITLEN aarch64-linux-gnu-g++ -static -march=armv8.2-a+sve -msve-vector-bits=${BITLEN} build/arm_sve${BITLEN}_testsuite.cpp -o build/sve${BITLEN}-test qemu-aarch64-static -cpu max,sve${BITLEN}=on,sve-default-vector-length=-1 build/sve${BITLEN}-test done diff --git a/tests/testsuite_generator.py b/tests/testsuite_generator.py index ea1d979..122a764 100755 --- a/tests/testsuite_generator.py +++ b/tests/testsuite_generator.py @@ -278,7 +278,7 @@ def make(kernels, arch): print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') continue - name = kern.name + '_' + str(bm) + '_' + str(bn) + '_' + str(bk) + name = f'{kern.name}_{kern.precision}_{bm}_{bn}_{bk}' additional_args = ['--output_funcname', name, '--output_filename', os.path.join(BASEDIR, arch, name + '.h'), '--output_overwrite'] diff --git a/tests/unit_test.py b/tests/unit_test.py new file mode 100644 index 0000000..9c14388 --- /dev/null +++ b/tests/unit_test.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 + +import testsuite_generator as generator +from importlib import import_module + +import pspamm.scripts +from pspamm.codegen.precision import * + +import sys +import re + +arch = sys.argv[1] + +parsedarch = re.fullmatch(r'(?P[a-zA-Z_]+)(?P\d+)', arch) + +archname = parsedarch.group('name') +archprec = parsedarch.group('prec') + +blocksize = import_module("pspamm.codegen.architectures." 
+ archname + ".blocksize") + +scripts = { + "arm": lambda blocksize: [blocksize.Old, blocksize.Max], + "arm_sve": lambda blocksize: [blocksize.Max], + "knl": lambda blocksize: [blocksize.Old, blocksize.Max, blocksize.MaxBn], + "hsw": lambda blocksize: [blocksize.Old, blocksize.Max], +} + +blocksize_algs = scripts[archname](blocksize) + [blocksize.Default] + +bitlen = int(archprec) +v_len = bitlen // 128 +v_size_fun = lambda prec: (16 // prec.size()) * v_len + +# define the maximum allowed difference between elements of our solution and the reference solution for +# double and single precision +delta_hp = 1e-2 +delta_sp = 1e-4 # epsilon is around e-7 => /2 ... For most cases, 1e-6 is enough +delta_dp = 1e-7 # epsilon is around e-15 => /2 + +kernels = [] + +for precision, delta in zip((Precision.SINGLE, Precision.DOUBLE), (delta_sp, delta_dp)): + v_size = v_size_fun(precision) + kernels.append(generator.SparseKernel("test1", precision, 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(56, 56, 30), delta)) + kernels.append(generator.DenseKernel("test2", precision, 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("test3", precision, 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.SparseKernel("knl_only_test1", precision, 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(1, 2, 1), delta)) + kernels.append(generator.SparseKernel("knl_only_test2", precision, 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2), (16,7)] + [x.getBlocksize(24, 40, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(40, 40, 20), delta)) + + kernels.append(generator.SparseKernel("knl_only_test3", 
precision, 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(1, 2, 2), delta)) + kernels.append(generator.SparseKernel("knl_only_test4", precision, 24, 20, 10, 40, 0, 24, 35.222, 0.0, [(8, 20), (24,3)] + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(10, 20, 1), delta)) + kernels.append(generator.SparseKernel("knl_only_test5", precision, 64, 5, 10, 64, 0, 64, 2.3, 0.0, [(32, 2), (8,14)] + [x.getBlocksize(64, 5, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(10, 5, 1), delta)) + kernels.append(generator.SparseKernel("knl_only_test6", precision, 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(1, 1, 1), delta)) + kernels.append(generator.SparseKernel("knl_only_test7", precision, 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8, 24), (8,1)] + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(40, 24, 1), delta)) + + kernels.append(generator.DenseKernel("knl_only_test8", precision, 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("knl_only_test9", precision, 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2), (16,7)] + [x.getBlocksize(32, 40, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("knl_only_test10", precision, 56, 28, 56, 56, 56, 56, 0.1, 3.0, [(8, 28)], delta)) + kernels.append(generator.DenseKernel("knl_only_test11", precision, 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8, 20), (8,3)] + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("knl_only_test12", precision, 64, 5, 10, 64, 12, 64, 1.0, 1.0, [(32, 2), (8,14)] + [x.getBlocksize(64, 5, 2, v_size, precision) for x in 
blocksize_algs], delta)) + kernels.append(generator.DenseKernel("knl_only_test13", precision, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("knl_only_test14", precision, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [(8, 24)] + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs], delta)) + + kernels.append(generator.SparseKernel("test1", precision, 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(56, 56, 30), delta)) + kernels.append(generator.DenseKernel("test2", precision, 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8,2)] + [x.getBlocksize(8, 40, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("test3", precision, 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3)] + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.SparseKernel("hsw_only_test1", precision, 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(1, 2, 1), delta)) + kernels.append(generator.SparseKernel("hsw_only_test2", precision, 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2)] + [x.getBlocksize(24, 40, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(40, 40, 20), delta)) + + kernels.append(generator.SparseKernel("hsw_only_test3", precision, 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(1, 2, 2), delta)) + kernels.append(generator.SparseKernel("hsw_only_test4", precision, 24, 20, 10, 40, 0, 24, 35.222, 0.0, [] + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(10, 20, 1), delta)) + kernels.append(generator.SparseKernel("hsw_only_test5", precision, 64, 5, 10, 64, 0, 64, 2.3, 0.0, [] + 
[x.getBlocksize(64, 5, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(10, 5, 1), delta)) + kernels.append(generator.SparseKernel("hsw_only_test6", precision, 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(1, 1, 1), delta)) + kernels.append(generator.SparseKernel("hsw_only_test7", precision, 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8,1)] + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(40, 24, 1), delta)) + + kernels.append(generator.DenseKernel("hsw_only_test8", precision, 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("hsw_only_test9", precision, 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2)] + [x.getBlocksize(32, 40, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("hsw_only_test10", precision, 56, 28, 56, 56, 56, 56, 0.1, 3.0, [x.getBlocksize(56, 28, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("hsw_only_test11", precision, 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8,3)] + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("hsw_only_test12", precision, 64, 5, 10, 64, 12, 64, 1.0, 1.0, [] + [x.getBlocksize(64, 5, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("hsw_only_test13", precision, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("hsw_only_test14", precision, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [] + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs], delta)) + + kernels.append(generator.DenseKernel("itest4", precision, 4, 4, 4, 4, 4, 4, 2.0, 2.0, [(4, 4), (4,4,2), 
(4,4,4), (4,4,8)], delta)) + + kernels.append(generator.SparseKernel("itest1", precision, 8, 56, 56, 8, 0, 8, 1.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(56, 56, 30), delta)) + kernels.append(generator.DenseKernel("itest2", precision, 8, 40, 40, 8, 40, 8, 3.0, 2.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 1, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("itest3", precision, 8, 56, 56, 8, 56, 8, 0.0, 0.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 1, v_size, precision) for x in blocksize_algs], delta)) + + kernels.append(generator.SparseKernel("arm_only_test1", precision, 2, 3, 4, 2, 0, 2, 1.1233, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(4, 3, 5), delta)) + kernels.append(generator.SparseKernel("arm_only_test2", precision, 2, 3, 4, 20, 0, 14, 1.0, 1.0, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(4, 3, 5), delta)) + kernels.append(generator.SparseKernel("arm_only_test3", precision, 32, 80, 50, 32, 0, 32, 1.0, 3.0, [(8, 5)] + [x.getBlocksize(32, 80, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(50, 80, 294), delta)) + kernels.append(generator.SparseKernel("arm_only_test4", precision, 32, 32, 32, 34, 0, 32, 1.0, 0.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(32, 32, 24), delta)) + kernels.append(generator.SparseKernel("arm_only_test5", precision, 2, 1, 1, 2, 0, 8, 1.0, -1.0, [(2, 1)] + [x.getBlocksize(2, 1, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(1, 1, 1), delta)) + kernels.append(generator.SparseKernel("arm_only_test6", precision, 2, 2, 2, 2, 0, 2, 2.0, 234234.123, [(2, 1)] + [x.getBlocksize(2, 2, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(2, 2, 1), delta)) + 
kernels.append(generator.SparseKernel("arm_only_test7", precision, 16, 5, 7, 16, 0, 16, 0.0, -1.123, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(7, 5, 35), delta)) + + kernels.append(generator.DenseKernel("arm_only_test8", precision, 2, 3, 4, 2, 4, 2, 1.0, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("arm_only_test9", precision, 2, 3, 4, 20, 12, 14, 2.0, 1.123, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("arm_only_test10", precision, 32, 80, 50, 32, 50, 32, 0.0, 0.2, [(8, 5)] + [x.getBlocksize(32, 80, 1, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("arm_only_test11", precision, 32, 32, 32, 33, 68, 32, 1231.0, 14443.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("arm_only_test12", precision, 2, 1, 1, 2, 1, 8, 1.0, 3.0, [(2, 1)] + [x.getBlocksize(2, 1, 1, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("arm_only_test13", precision, 2, 3, 3, 2, 3, 2, 1.0, 0.0, [(2, 1)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("arm_only_test14", precision, 16, 5, 7, 16, 7, 16, 1.0, 1.0, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1, v_size, precision) for x in blocksize_algs], delta)) + + kernels.append(generator.DenseKernel("sve_mixed_test1", precision, 9, 9, 9, 9, 9, 9, 1.0, 0.0, [(3, 3)] + [x.getBlocksize(9, 9, 1, v_size, precision) for x in blocksize_algs], delta_dp)) + kernels.append(generator.SparseKernel("sve_mixed_test2", precision, 9, 9, 9, 9, 0, 9, 4.0, 2.5, [(3, 3)] + [x.getBlocksize(9, 9, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(9, 9, 20), 
delta_dp)) + kernels.append(generator.SparseKernel("sve_mixed_test3", precision, 18, 18, 18, 18, 0, 18, 3.4, -2.5, [(1, 1), (3, 3), (6, 6)] + [x.getBlocksize(18, 18, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(18, 18, 59), delta_dp)) + kernels.append(generator.SparseKernel("sve_mixed_test4", precision, 80, 80, 80, 80, 0, 80, 0.0, -2.5, [(4, 4), (8, 8)] + [x.getBlocksize(80, 80, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(80, 80, 312), delta_dp)) + kernels.append(generator.SparseKernel("sve_mixed_test5", precision, 8, 8, 8, 10, 0, 8, 3.0, -0.9, [(2, 2), (4, 4)] + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(8, 8, 6), delta_dp)) + kernels.append(generator.DenseKernel("sve_mixed_test6", precision, 8, 8, 8, 10, 8, 8, 3.0, -0.9, [(2, 2), (4, 4)] + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], delta_dp)) + + kernels.append(generator.DenseKernel("sve_test3", precision, 4, 4, 4, 4, 4, 4, 2.0, 2.0, [(4, 4)], delta_dp)) + + kernels.append(generator.SparseKernel("sve_test1", precision, 8, 56, 56, 8, 0, 8, 1.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(56, 56, 30), delta_dp)) + kernels.append(generator.DenseKernel("sve_test2", precision, 8, 40, 40, 8, 40, 8, 3.0, 2.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 1, v_size, precision) for x in blocksize_algs], delta_dp)) + kernels.append(generator.DenseKernel("sve_test3", precision, 8, 56, 56, 8, 56, 8, 0.0, 0.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 1, v_size, precision) for x in blocksize_algs], delta_dp)) + + kernels.append(generator.SparseKernel("sve_arm_only_test1", precision, 2, 3, 4, 2, 0, 2, 1.1233, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(4, 3, 5), delta_dp)) + kernels.append(generator.SparseKernel("sve_arm_only_test2", precision, 2, 3, 4, 20, 0, 14, 
1.0, 1.0, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(4, 3, 5), delta_dp)) + kernels.append(generator.SparseKernel("sve_arm_only_test3", precision, 32, 80, 50, 32, 0, 32, 1.0, 3.0, [(8, 5)] + [x.getBlocksize(32, 80, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(50, 80, 294), delta_dp)) + kernels.append(generator.SparseKernel("sve_arm_only_test4", precision, 32, 32, 32, 34, 0, 32, 1.0, 0.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(32, 32, 24), delta_dp)) + kernels.append(generator.SparseKernel("sve_arm_only_test5", precision, 2, 1, 1, 2, 0, 8, 1.0, -1.0, [(2, 1)] + [x.getBlocksize(2, 1, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(1, 1, 1), delta_dp)) + kernels.append(generator.SparseKernel("sve_arm_only_test6", precision, 2, 2, 2, 2, 0, 2, 2.0, 234234.123, [(2, 1)] + [x.getBlocksize(2, 2, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(2, 2, 1), delta_dp)) + kernels.append(generator.SparseKernel("sve_arm_only_test7", precision, 16, 5, 7, 16, 0, 16, 0.0, -1.123, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(7, 5, 35), delta_dp)) + + kernels.append(generator.DenseKernel("sve_arm_only_test8", precision, 2, 3, 4, 2, 4, 2, 1.0, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], delta_dp)) + kernels.append(generator.DenseKernel("sve_arm_only_test9", precision, 2, 3, 4, 20, 12, 14, 2.0, 1.123, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], delta_dp)) + kernels.append(generator.DenseKernel("sve_arm_only_test10", precision, 32, 80, 50, 32, 50, 32, 0.0, 0.2, [(8, 5)] + [x.getBlocksize(32, 80, 1, v_size, precision) for x in blocksize_algs], delta_dp)) + kernels.append(generator.DenseKernel("sve_arm_only_test11", precision, 32, 
32, 32, 33, 68, 32, 1231.0, 14443.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1, v_size, precision) for x in blocksize_algs], delta_dp)) + kernels.append(generator.DenseKernel("sve_arm_only_test12", precision, 2, 1, 1, 2, 1, 8, 1.0, 3.0, [(2, 1)] + [x.getBlocksize(2, 1, 1, v_size, precision) for x in blocksize_algs], delta_dp)) + kernels.append(generator.DenseKernel("sve_arm_only_test13", precision, 2, 3, 3, 2, 3, 2, 1.0, 0.0, [(2, 1)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], delta_dp)) + kernels.append(generator.DenseKernel("sve_arm_only_test14", precision, 16, 5, 7, 16, 7, 16, 1.0, 1.0, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1, v_size, precision) for x in blocksize_algs], delta_dp)) + + kernels.append(generator.DenseKernel("sve_arm_only_test15", precision, 23, 29, 31, 23, 31, 23, 1.32, 0.96, [x.getBlocksize(23, 29, 1, v_size, precision) for x in blocksize_algs], delta_dp)) + kernels.append(generator.SparseKernel("sve_arm_only_test16", precision, 23, 29, 31, 23, 0, 23, 1.32, 0.96, [x.getBlocksize(23, 29, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(31, 29, 61), delta_dp)) + + kernels.append(generator.DenseKernel("sve_single_prec_test_S1", precision, 9, 9, 9, 9, 9, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size, precision) for x in blocksize_algs], delta_sp)) + kernels.append(generator.DenseKernel("sve_single_prec_test_S2", precision, 15, 15, 15, 15, 15, 15, -3.14, 6.28, [x.getBlocksize(15, 15, 1, v_size, precision) for x in blocksize_algs], delta_sp)) + kernels.append(generator.DenseKernel("sve_single_prec_test_S3", precision, 23, 23, 23, 23, 23, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size, precision) for x in blocksize_algs], delta_sp)) + kernels.append(generator.DenseKernel("sve_single_prec_test_S4", precision, 23, 31, 13, 23, 13, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size, precision) for x in blocksize_algs], delta_sp)) + kernels.append(generator.SparseKernel("sve_single_prec_test_S5", 
precision, 9, 9, 9, 9, 0, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(9, 9, 8), delta_sp)) + kernels.append(generator.SparseKernel("sve_single_prec_test_S6", precision, 15, 15, 15, 15, 0, 15, -3.14, 6.28, [x.getBlocksize(15, 15, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(15, 15, 22), delta_sp)) + kernels.append(generator.SparseKernel("sve_single_prec_test_S7", precision, 23, 23, 23, 23, 0, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(23, 23, 52), delta_sp)) + kernels.append(generator.SparseKernel("sve_single_prec_test_S8", precision, 23, 31, 13, 23, 0, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(13, 31, 40), delta_sp)) + +generator.make(kernels, arch) diff --git a/tests/unit_tests_arm.py b/tests/unit_tests_arm.py deleted file mode 100755 index 50ba6c4..0000000 --- a/tests/unit_tests_arm.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 - -import testsuite_generator as generator - -import pspamm.scripts.max_arm as max_square -import pspamm.scripts.old_arm as old -from pspamm.codegen.precision import * - -blocksize_algs = [max_square, old] - -kernels = [] - -for precision in (Precision.SINGLE, Precision.DOUBLE): - kernels.append(generator.DenseKernel(f"test4_{precision}", precision, 4, 4, 4, 4, 4, 4, 2.0, 2.0, [(4, 4)], 0.0000001)) - - kernels.append(generator.SparseKernel(f"test1_{precision}", precision, 8, 56, 56, 8, 0, 8, 1.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 1) for x in blocksize_algs], generator.generateMTX(56, 56, 30), 0.0000001)) - kernels.append(generator.DenseKernel(f"test2_{precision}", precision, 8, 40, 40, 8, 40, 8, 3.0, 2.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 1) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel(f"test3_{precision}", precision, 8, 56, 56, 8, 56, 8, 0.0, 0.0, [(8, 3), (8, 
5)] + [x.getBlocksize(8, 56, 1) for x in blocksize_algs], 0.0000001)) - - kernels.append(generator.SparseKernel(f"arm_only_test1_{precision}", precision, 2, 3, 4, 2, 0, 2, 1.1233, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], generator.generateMTX(4, 3, 5), 0.0000001)) - kernels.append(generator.SparseKernel(f"arm_only_test2_{precision}", precision, 2, 3, 4, 20, 0, 14, 1.0, 1.0, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], generator.generateMTX(4, 3, 5), 0.0000001)) - kernels.append(generator.SparseKernel(f"arm_only_test3_{precision}", precision, 32, 80, 50, 32, 0, 32, 1.0, 3.0, [(8, 5)] + [x.getBlocksize(32, 80, 1) for x in blocksize_algs], generator.generateMTX(50, 80, 294), 0.0000001)) - kernels.append(generator.SparseKernel(f"arm_only_test4_{precision}", precision, 32, 32, 32, 34, 0, 32, 1.0, 0.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1) for x in blocksize_algs], generator.generateMTX(32, 32, 24), 0.0000001)) - kernels.append(generator.SparseKernel(f"arm_only_test5_{precision}", precision, 2, 1, 1, 2, 0, 8, 1.0, -1.0, [(2, 1)] + [x.getBlocksize(2, 1, 1) for x in blocksize_algs], generator.generateMTX(1, 1, 1), 0.0000001)) - kernels.append(generator.SparseKernel(f"arm_only_test6_{precision}", precision, 2, 2, 2, 2, 0, 2, 2.0, 234234.123, [(2, 1)] + [x.getBlocksize(2, 2, 1) for x in blocksize_algs], generator.generateMTX(2, 2, 1), 0.0000001)) - kernels.append(generator.SparseKernel(f"arm_only_test7_{precision}", precision, 16, 5, 7, 16, 0, 16, 0.0, -1.123, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1) for x in blocksize_algs], generator.generateMTX(7, 5, 35), 0.0000001)) - - kernels.append(generator.DenseKernel(f"arm_only_test8_{precision}", precision, 2, 3, 4, 2, 4, 2, 1.0, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel(f"arm_only_test9_{precision}", precision, 2, 3, 4, 20, 12, 14, 2.0, 1.123, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 
1) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel(f"arm_only_test10_{precision}", precision, 32, 80, 50, 32, 50, 32, 0.0, 0.2, [(8, 5)] + [x.getBlocksize(32, 80, 1) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel(f"arm_only_test11_{precision}", precision, 32, 32, 32, 33, 68, 32, 1231.0, 14443.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel(f"arm_only_test12_{precision}", precision, 2, 1, 1, 2, 1, 8, 1.0, 3.0, [(2, 1)] + [x.getBlocksize(2, 1, 1) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel(f"arm_only_test13_{precision}", precision, 2, 3, 3, 2, 3, 2, 1.0, 0.0, [(2, 1)] + [x.getBlocksize(2, 3, 1) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel(f"arm_only_test14_{precision}", precision, 16, 5, 7, 16, 7, 16, 1.0, 1.0, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1) for x in blocksize_algs], 0.0000001)) - - -generator.make(kernels, "arm") diff --git a/tests/unit_tests_arm_sve.py b/tests/unit_tests_arm_sve.py deleted file mode 100644 index 3d5203d..0000000 --- a/tests/unit_tests_arm_sve.py +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env python3 - -import testsuite_generator as generator - -import pspamm.scripts.max_arm_sve as max_sve -from pspamm.codegen.precision import * - -import sys - -v_len = 4 - -if len(sys.argv) == 2: - v_len = int(sys.argv[1]) // 128 - -blocksize_algs = [max_sve] -v_size = lambda prec: (16 // prec.size()) * v_len -v_size_d = v_size(Precision.DOUBLE) -v_size_s = v_size(Precision.SINGLE) -v_size_h = v_size(Precision.HALF) -bitlen = v_len * 128 -kernels = [] - -# define the maximum allowed difference between elements of our solution and the reference solution for -# double and single precision -delta_sp = 1e-4 # epsilon is around e-7 => /2 ... 
For most cases, 1e-6 is enough -delta_dp = 1e-7 # epsilon is around e-15 => /2 - -# test cases for double precision multiplication -kernels.append(generator.DenseKernel("sve_mixed_test1", Precision.DOUBLE, 9, 9, 9, 9, 9, 9, 1.0, 0.0, [(3, 3)] + [x.getBlocksize(9, 9, 1, v_size_d) for x in blocksize_algs], delta_dp)) -kernels.append(generator.SparseKernel("sve_mixed_test2", Precision.DOUBLE, 9, 9, 9, 9, 0, 9, 4.0, 2.5, [(3, 3)] + [x.getBlocksize(9, 9, 1, v_size_d) for x in blocksize_algs], generator.generateMTX(9, 9, 20), delta_dp)) -kernels.append(generator.SparseKernel("sve_mixed_test3", Precision.DOUBLE, 18, 18, 18, 18, 0, 18, 3.4, -2.5, [(1, 1), (3, 3), (6, 6)] + [x.getBlocksize(18, 18, 1, v_size_d) for x in blocksize_algs], generator.generateMTX(18, 18, 59), delta_dp)) -kernels.append(generator.SparseKernel("sve_mixed_test4", Precision.DOUBLE, 80, 80, 80, 80, 0, 80, 0.0, -2.5, [(4, 4), (8, 8)] + [x.getBlocksize(80, 80, 1, v_size_d) for x in blocksize_algs], generator.generateMTX(80, 80, 312), delta_dp)) -kernels.append(generator.SparseKernel("sve_mixed_test5", Precision.DOUBLE, 8, 8, 8, 10, 0, 8, 3.0, -0.9, [(2, 2), (4, 4)] + [x.getBlocksize(8, 8, 1, v_size_d) for x in blocksize_algs], generator.generateMTX(8, 8, 6), delta_dp)) -kernels.append(generator.DenseKernel("sve_mixed_test6", Precision.DOUBLE, 8, 8, 8, 10, 8, 8, 3.0, -0.9, [(2, 2), (4, 4)] + [x.getBlocksize(8, 8, 1, v_size_d) for x in blocksize_algs], delta_dp)) - -kernels.append(generator.DenseKernel("sve_test3", Precision.DOUBLE, 4, 4, 4, 4, 4, 4, 2.0, 2.0, [(4, 4)], delta_dp)) - -kernels.append(generator.SparseKernel("sve_test1", Precision.DOUBLE, 8, 56, 56, 8, 0, 8, 1.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 1, v_size_d) for x in blocksize_algs], generator.generateMTX(56, 56, 30), delta_dp)) -kernels.append(generator.DenseKernel("sve_test2", Precision.DOUBLE, 8, 40, 40, 8, 40, 8, 3.0, 2.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 1, v_size_d) for x in blocksize_algs], delta_dp)) 
-kernels.append(generator.DenseKernel("sve_test3", Precision.DOUBLE, 8, 56, 56, 8, 56, 8, 0.0, 0.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 1, v_size_d) for x in blocksize_algs], delta_dp)) - -kernels.append(generator.SparseKernel("sve_arm_only_test1", Precision.DOUBLE, 2, 3, 4, 2, 0, 2, 1.1233, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1, v_size_d) for x in blocksize_algs], generator.generateMTX(4, 3, 5), delta_dp)) -kernels.append(generator.SparseKernel("sve_arm_only_test2", Precision.DOUBLE, 2, 3, 4, 20, 0, 14, 1.0, 1.0, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1, v_size_d) for x in blocksize_algs], generator.generateMTX(4, 3, 5), delta_dp)) -kernels.append(generator.SparseKernel("sve_arm_only_test3", Precision.DOUBLE, 32, 80, 50, 32, 0, 32, 1.0, 3.0, [(8, 5)] + [x.getBlocksize(32, 80, 1, v_size_d) for x in blocksize_algs], generator.generateMTX(50, 80, 294), delta_dp)) -kernels.append(generator.SparseKernel("sve_arm_only_test4", Precision.DOUBLE, 32, 32, 32, 34, 0, 32, 1.0, 0.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1, v_size_d) for x in blocksize_algs], generator.generateMTX(32, 32, 24), delta_dp)) -kernels.append(generator.SparseKernel("sve_arm_only_test5", Precision.DOUBLE, 2, 1, 1, 2, 0, 8, 1.0, -1.0, [(2, 1)] + [x.getBlocksize(2, 1, 1, v_size_d) for x in blocksize_algs], generator.generateMTX(1, 1, 1), delta_dp)) -kernels.append(generator.SparseKernel("sve_arm_only_test6", Precision.DOUBLE, 2, 2, 2, 2, 0, 2, 2.0, 234234.123, [(2, 1)] + [x.getBlocksize(2, 2, 1, v_size_d) for x in blocksize_algs], generator.generateMTX(2, 2, 1), delta_dp)) -kernels.append(generator.SparseKernel("sve_arm_only_test7", Precision.DOUBLE, 16, 5, 7, 16, 0, 16, 0.0, -1.123, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1, v_size_d) for x in blocksize_algs], generator.generateMTX(7, 5, 35), delta_dp)) - -kernels.append(generator.DenseKernel("sve_arm_only_test8", Precision.DOUBLE, 2, 3, 4, 2, 4, 2, 1.0, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1, v_size_d) for x in 
blocksize_algs], delta_dp)) -kernels.append(generator.DenseKernel("sve_arm_only_test9", Precision.DOUBLE, 2, 3, 4, 20, 12, 14, 2.0, 1.123, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1, v_size_d) for x in blocksize_algs], delta_dp)) -kernels.append(generator.DenseKernel("sve_arm_only_test10", Precision.DOUBLE, 32, 80, 50, 32, 50, 32, 0.0, 0.2, [(8, 5)] + [x.getBlocksize(32, 80, 1, v_size_d) for x in blocksize_algs], delta_dp)) -kernels.append(generator.DenseKernel("sve_arm_only_test11", Precision.DOUBLE, 32, 32, 32, 33, 68, 32, 1231.0, 14443.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1, v_size_d) for x in blocksize_algs], delta_dp)) -kernels.append(generator.DenseKernel("sve_arm_only_test12", Precision.DOUBLE, 2, 1, 1, 2, 1, 8, 1.0, 3.0, [(2, 1)] + [x.getBlocksize(2, 1, 1, v_size_d) for x in blocksize_algs], delta_dp)) -kernels.append(generator.DenseKernel("sve_arm_only_test13", Precision.DOUBLE, 2, 3, 3, 2, 3, 2, 1.0, 0.0, [(2, 1)] + [x.getBlocksize(2, 3, 1, v_size_d) for x in blocksize_algs], delta_dp)) -kernels.append(generator.DenseKernel("sve_arm_only_test14", Precision.DOUBLE, 16, 5, 7, 16, 7, 16, 1.0, 1.0, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1, v_size_d) for x in blocksize_algs], delta_dp)) - -kernels.append(generator.DenseKernel("sve_arm_only_test15", Precision.DOUBLE, 23, 29, 31, 23, 31, 23, 1.32, 0.96, [x.getBlocksize(23, 29, 1, v_size_d) for x in blocksize_algs], delta_dp)) -kernels.append(generator.SparseKernel("sve_arm_only_test16", Precision.DOUBLE, 23, 29, 31, 23, 0, 23, 1.32, 0.96, [x.getBlocksize(23, 29, 1, v_size_d) for x in blocksize_algs], generator.generateMTX(31, 29, 61), delta_dp)) - -# test cases for single precision multiplication -kernels.append(generator.DenseKernel("sve_single_prec_test_S1", Precision.SINGLE, 9, 9, 9, 9, 9, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size_s) for x in blocksize_algs], delta_sp)) -kernels.append(generator.DenseKernel("sve_single_prec_test_S2", Precision.SINGLE, 15, 15, 15, 15, 15, 15, -3.14, 6.28, 
[x.getBlocksize(15, 15, 1, v_size_s) for x in blocksize_algs], delta_sp)) -kernels.append(generator.DenseKernel("sve_single_prec_test_S3", Precision.SINGLE, 23, 23, 23, 23, 23, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size_s) for x in blocksize_algs], delta_sp)) -kernels.append(generator.DenseKernel("sve_single_prec_test_S4", Precision.SINGLE, 23, 31, 13, 23, 13, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size_s) for x in blocksize_algs], delta_sp)) -kernels.append(generator.SparseKernel("sve_single_prec_test_S5", Precision.SINGLE, 9, 9, 9, 9, 0, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(9, 9, 8), delta_sp)) -kernels.append(generator.SparseKernel("sve_single_prec_test_S6", Precision.SINGLE, 15, 15, 15, 15, 0, 15, -3.14, 6.28, [x.getBlocksize(15, 15, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(15, 15, 22), delta_sp)) -kernels.append(generator.SparseKernel("sve_single_prec_test_S7", Precision.SINGLE, 23, 23, 23, 23, 0, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(23, 23, 52), delta_sp)) -kernels.append(generator.SparseKernel("sve_single_prec_test_S8", Precision.SINGLE, 23, 31, 13, 23, 0, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(13, 31, 40), delta_sp)) - -""" -kernels.append(generator.DenseKernel("sve_half_prec_test_S1", Precision.HALF, 9, 9, 9, 9, 9, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size_h) for x in blocksize_algs], delta_sp)) -kernels.append(generator.DenseKernel("sve_half_prec_test_S2", Precision.HALF, 15, 15, 15, 15, 15, 15, -3.14, 6.28, [x.getBlocksize(15, 15, 1, v_size_h) for x in blocksize_algs], delta_sp)) -kernels.append(generator.DenseKernel("sve_half_prec_test_S3", Precision.HALF, 23, 23, 23, 23, 23, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size_h) for x in blocksize_algs], delta_sp)) -kernels.append(generator.DenseKernel("sve_half_prec_test_S4", 
Precision.HALF, 23, 31, 13, 23, 13, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size_h) for x in blocksize_algs], delta_sp)) -kernels.append(generator.SparseKernel("sve_half_prec_test_S5", Precision.HALF, 9, 9, 9, 9, 0, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size_h) for x in blocksize_algs], generator.generateMTX(9, 9, 8), delta_sp)) -kernels.append(generator.SparseKernel("sve_half_prec_test_S6", Precision.HALF, 15, 15, 15, 15, 0, 15, -3.14, 6.28, [x.getBlocksize(15, 15, 1, v_size_h) for x in blocksize_algs], generator.generateMTX(15, 15, 22), delta_sp)) -kernels.append(generator.SparseKernel("sve_half_prec_test_S7", Precision.HALF, 23, 23, 23, 23, 0, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size_h) for x in blocksize_algs], generator.generateMTX(23, 23, 52), delta_sp)) -kernels.append(generator.SparseKernel("sve_half_prec_test_S8", Precision.HALF, 23, 31, 13, 23, 0, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size_h) for x in blocksize_algs], generator.generateMTX(13, 31, 40), delta_sp)) -""" - -generator.make(kernels, f"arm_sve{bitlen}") diff --git a/tests/unit_tests_hsw.py b/tests/unit_tests_hsw.py deleted file mode 100755 index 4c30fb1..0000000 --- a/tests/unit_tests_hsw.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 - -import testsuite_generator as generator - -import pspamm.scripts.max_hsw as max_square -import pspamm.scripts.old_hsw as old -from pspamm.codegen.precision import * - -blocksize_algs = [max_square, old] - -delta_sp = 1e-4 # epsilon is around e-7 => /2 ... 
For most cases, 1e-6 is enough -delta_dp = 1e-7 # epsilon is around e-15 => /2 - -kernels = [] -for precision, delta in zip((Precision.SINGLE, Precision.DOUBLE), (delta_sp, delta_dp)): - kernels.append(generator.SparseKernel(f"test1_{precision}", precision, 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], generator.generateMTX(56, 56, 30), delta)) - kernels.append(generator.DenseKernel(f"test2_{precision}", precision, 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8,2)] + [x.getBlocksize(8, 40, 2) for x in blocksize_algs], delta)) - kernels.append(generator.DenseKernel(f"test3_{precision}", precision, 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], delta)) - kernels.append(generator.SparseKernel(f"hsw_only_test1_{precision}", precision, 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 1), delta)) - kernels.append(generator.SparseKernel(f"hsw_only_test2_{precision}", precision, 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2)] + [x.getBlocksize(24, 40, 2) for x in blocksize_algs], generator.generateMTX(40, 40, 20), delta)) - - kernels.append(generator.SparseKernel(f"hsw_only_test3_{precision}", precision, 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 2), delta)) - kernels.append(generator.SparseKernel(f"hsw_only_test4_{precision}", precision, 24, 20, 10, 40, 0, 24, 35.222, 0.0, [] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], generator.generateMTX(10, 20, 1), delta)) - kernels.append(generator.SparseKernel(f"hsw_only_test5_{precision}", precision, 64, 5, 10, 64, 0, 64, 2.3, 0.0, [] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], generator.generateMTX(10, 5, 1), delta)) - kernels.append(generator.SparseKernel(f"hsw_only_test6_{precision}", precision, 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in 
blocksize_algs], generator.generateMTX(1, 1, 1), delta)) - kernels.append(generator.SparseKernel(f"hsw_only_test7_{precision}", precision, 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8,1)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], generator.generateMTX(40, 24, 1), delta)) - - kernels.append(generator.DenseKernel(f"hsw_only_test8_{precision}", precision, 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], delta)) - kernels.append(generator.DenseKernel(f"hsw_only_test9_{precision}", precision, 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2)] + [x.getBlocksize(32, 40, 2) for x in blocksize_algs], delta)) - kernels.append(generator.DenseKernel(f"hsw_only_test10_{precision}", precision, 56, 28, 56, 56, 56, 56, 0.1, 3.0, [x.getBlocksize(56, 28, 2) for x in blocksize_algs], delta)) - kernels.append(generator.DenseKernel(f"hsw_only_test11_{precision}", precision, 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8,3)] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], delta)) - kernels.append(generator.DenseKernel(f"hsw_only_test12_{precision}", precision, 64, 5, 10, 64, 12, 64, 1.0, 1.0, [] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], delta)) - kernels.append(generator.DenseKernel(f"hsw_only_test13_{precision}", precision, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], delta)) - kernels.append(generator.DenseKernel(f"hsw_only_test14_{precision}", precision, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], delta)) - -for arch in ('hsw256', 'hsw128'): - generator.make(kernels, arch) diff --git a/tests/unit_tests_knl.py b/tests/unit_tests_knl.py deleted file mode 100755 index b673be3..0000000 --- a/tests/unit_tests_knl.py +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env python3 - -import testsuite_generator as generator - -import pspamm.scripts.max_knl as max_square -import pspamm.scripts.max_bn_knl as max_bn -import pspamm.scripts.old_knl as 
old -from pspamm.codegen.precision import * - -blocksize_algs = [max_square, max_bn, old] - -kernels = [] - -for precision in (Precision.SINGLE, Precision.DOUBLE): - kernels.append(generator.SparseKernel(f"test1_{precision}", precision, 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], generator.generateMTX(56, 56, 30), 0.0000001)) - kernels.append(generator.DenseKernel(f"test2_{precision}", precision, 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel(f"test3_{precision}", precision, 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.SparseKernel(f"knl_only_test1_{precision}", precision, 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 1), 0.0000001)) - kernels.append(generator.SparseKernel(f"knl_only_test2_{precision}", precision, 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2), (16,7)] + [x.getBlocksize(24, 40, 2) for x in blocksize_algs], generator.generateMTX(40, 40, 20), 0.0000001)) - - kernels.append(generator.SparseKernel(f"knl_only_test3_{precision}", precision, 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], generator.generateMTX(1, 2, 2), 0.0000001)) - kernels.append(generator.SparseKernel(f"knl_only_test4_{precision}", precision, 24, 20, 10, 40, 0, 24, 35.222, 0.0, [(8, 20), (24,3)] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], generator.generateMTX(10, 20, 1), 0.0000001)) - kernels.append(generator.SparseKernel(f"knl_only_test5_{precision}", precision, 64, 5, 10, 64, 0, 64, 2.3, 0.0, [(32, 2), (8,14)] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], generator.generateMTX(10, 5, 1), 0.0000001)) - kernels.append(generator.SparseKernel(f"knl_only_test6_{precision}", precision, 8, 1, 1, 16, 0, 56, 1.0, 
0.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], generator.generateMTX(1, 1, 1), 0.0000001)) - kernels.append(generator.SparseKernel(f"knl_only_test7_{precision}", precision, 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8, 24), (8,1)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], generator.generateMTX(40, 24, 1), 0.0000001)) - - kernels.append(generator.DenseKernel(f"knl_only_test8_{precision}", precision, 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel(f"knl_only_test9_{precision}", precision, 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2), (16,7)] + [x.getBlocksize(32, 40, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel(f"knl_only_test10_{precision}", precision, 56, 28, 56, 56, 56, 56, 0.1, 3.0, [(8, 28)], 0.0000001)) - kernels.append(generator.DenseKernel(f"knl_only_test11_{precision}", precision, 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8, 20), (8,3)] + [x.getBlocksize(8, 20, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel(f"knl_only_test12_{precision}", precision, 64, 5, 10, 64, 12, 64, 1.0, 1.0, [(32, 2), (8,14)] + [x.getBlocksize(64, 5, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel(f"knl_only_test13_{precision}", precision, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], 0.0000001)) - kernels.append(generator.DenseKernel(f"knl_only_test14_{precision}", precision, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [(8, 24)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], 0.0000001)) - -for arch in ('knl512', 'knl256', 'knl128'): - generator.make(kernels, arch) - - From 57d1b475105fb4142c14672489627e84ee9884dc Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 19 Oct 2024 16:59:03 +0200 Subject: [PATCH 35/64] Prepare for larger-k ARM kernels --- pspamm/codegen/architectures/arm/blocksize.py | 29 +++++++++++++++ 
pspamm/codegen/architectures/arm/generator.py | 2 ++ .../architectures/arm_sve/blocksize.py | 36 +++++++++++++++++++ .../architectures/arm_sve/generator.py | 30 +++++++++++----- 4 files changed, 88 insertions(+), 9 deletions(-) diff --git a/pspamm/codegen/architectures/arm/blocksize.py b/pspamm/codegen/architectures/arm/blocksize.py index c9b4352..5fc1c4f 100644 --- a/pspamm/codegen/architectures/arm/blocksize.py +++ b/pspamm/codegen/architectures/arm/blocksize.py @@ -65,4 +65,33 @@ def ARM_condition(cls, bm, bn, bk, v_size): vm = -(bm // -v_size) return (bn+bk) * vm + bn*bk <= 32 +class MaxK: + @classmethod + def getBlocksize(cls, m, n, bk, v_size, prec): + bm = 2 + bn = 1 + maxval = 0 + + elem128 = 16 // prec.size() + + for i in range(v_size, m+1, v_size): + for j in range(1, n+1): + if cls.ARM_condition(i, j, bk, v_size, elem128): + if i*j > maxval: + maxval = i*j + bm = i + bn = j + + while cls.ARM_condition(bm, bn, bk+1, v_size, elem128): + bk += 1 + + return (bm, bn, bk) + + @classmethod + def ARM_condition(cls, bm, bn, bk, v_size, elem128): + # ceiling division + vm = -(bm // -v_size) + vk = -(bk // -elem128) + return (bn+vk) * vm + bn*vk <= 32 + Default = Max diff --git a/pspamm/codegen/architectures/arm/generator.py b/pspamm/codegen/architectures/arm/generator.py index 9ed2799..e559df4 100644 --- a/pspamm/codegen/architectures/arm/generator.py +++ b/pspamm/codegen/architectures/arm/generator.py @@ -49,6 +49,8 @@ def init_mask(self, bm, v_size, tempreg, maskregs): def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int): assert(bm % v_size == 0) vm = bm//v_size + elem128 = 16 // self.get_precision().size() + vk = -(bk // -elem128) assert((bn+bk) * vm + bn * bk <= 32) # Needs to fit in NEON v registers prec = { diff --git a/pspamm/codegen/architectures/arm_sve/blocksize.py b/pspamm/codegen/architectures/arm_sve/blocksize.py index f7bf65d..9f3c287 100644 --- a/pspamm/codegen/architectures/arm_sve/blocksize.py +++ 
b/pspamm/codegen/architectures/arm_sve/blocksize.py @@ -35,4 +35,40 @@ def ARM_condition(cls, bm, bn, bk, v_size): def tileable(cls, m, bm): return m % bm == 0 +class MaxK: + @classmethod + def getBlocksize(cls, m, n, bk, v_size, prec): + # v_size default is 2, however for SVE that parameter will always be larger + bm = 2 + bn = 1 + maxval = 0 + + elem128 = 16 // prec.size() + + for i in range(1, m + 1, 1): + next_multiple = -(i // -v_size) + for j in range(1, n + 1): + if cls.ARM_condition(next_multiple, j, bk, v_size, elem128) and cls.tileable(m, i): + if i * j >= maxval: + maxval = i * j + bm = i + bn = j + + if maxval == 0: + raise RuntimeError("Could not find an appropriate block size. We suggest padding the matrix dimensions") + + while cls.ARM_condition(bm, bn, bk+1, v_size, elem128): + bk += 1 + + return (bm, bn, bk) + + def ARM_condition(cls, bm, bn, bk, v_size, elem128): + # ceiling division + vm = -(bm // -v_size) + vk = -(bk // -elem128) + return (bn + vk) * vm + bn*vk <= 32 + + def tileable(cls, m, bm): + return m % bm == 0 + Default = Max diff --git a/pspamm/codegen/architectures/arm_sve/generator.py b/pspamm/codegen/architectures/arm_sve/generator.py index cf9d331..5a987cc 100644 --- a/pspamm/codegen/architectures/arm_sve/generator.py +++ b/pspamm/codegen/architectures/arm_sve/generator.py @@ -75,7 +75,12 @@ def set_sparse(self): def make_reg_blocks(self, bm: int, bn: int, bk: int, v_size: int, nnz: int, m: int, n: int, k: int): vm = self.ceil_div(bm, v_size) # vm can be 0 if bm < v_size -> makes ceil_div necessary + + # k-broadcasting only works in 128-bit lanes + elem128 = 16 // self.get_precision().size() + vk = -(bk // -elem128) assert ((bn + bk) * vm + bn * bk <= 32) # Needs to fit in SVE z registers + prec = { Precision.DOUBLE: "d", Precision.SINGLE: "s", @@ -100,7 +105,7 @@ def make_reg_blocks(self, bm: int, bn: int, bk: int, v_size: int, nnz: int, m: i mask_regs = [p(0), p(7)] - self.init_registers(bm, v_size) + self.init_registers(bm, bk, 
v_size) return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, mask_regs @@ -141,10 +146,13 @@ def init_mask(self, def init_registers(self, bm: int, + bk: int, v_size: int ) -> None: bmmod = bm % v_size + elem128 = 16 // self.get_precision().size() + bkmod = bk % elem128 eol = "\\n\\t" # define the "end of line" sequence for easy assembly # determine the predicate suffix @@ -158,20 +166,24 @@ def init_registers(self, gen_reg = "w" if self.get_precision().size() <= 4 else "x" overhead_counter = 6 - comment = "//p7 denotes the 'all-true' predicate and, if given, p0 denotes the 'bm % v_size' predicate\n\t" + comment = "// p7 denotes the 'all-true' predicate\n\t" + comment += "// if given, p0 denotes the 'bm % v_size' predicate\n\t" + # comment += "// if given, p1 denotes the 'bk % elem128' predicate\n\t" # specification for ptrue: https://developer.arm.com/documentation/ddi0596/2021-12/SVE-Instructions/PTRUE--Initialise-predicate-from-named-constraint- # search for 'DecodePredCount' for the explanation of how the pattern in 'ptrue p{d}.{suffix}, #pattern' is decoded: # https://developer.arm.com/documentation/ddi0596/2020-12/Shared-Pseudocode/AArch64-Functions?lang=en#impl-aarch64.DecodePredCount.2 # 'ptrue' doesnt work for initialising overhead predicate when using single precision -> see valid patterns from above # overhead = "\"ptrue p0.{suffix}, #{overhead}{eol}\"\n\t" if bm != 0 else "" # define overhead predicate - overhead = "\"mov {gen_reg}{overhead_counter}, #{overhead}{eol}\"\n\t\"whilelo p0.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if bmmod != 0 else "" + overhead_m = "\"mov {gen_reg}{overhead_counter}, #{overhead_m}{eol}\"\n\t\"whilelo p0.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if bmmod != 0 else "" + overhead_k = "" #"\"mov {gen_reg}{overhead_counter}, #{overhead_k}{eol}\"\n\t\"whilelo p1.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if bkmod != 0 else "" 
all_true = "\"ptrue p7.{suffix}, #31{eol}\"" # define all true predicate - init_registers = (comment + overhead + all_true).format(suffix=p_suffix, - gen_reg=gen_reg, - overhead_counter=overhead_counter, - v_size=v_size, - overhead=bmmod, - eol=eol) + init_registers = (comment + overhead_m + overhead_k + all_true).format(suffix=p_suffix, + gen_reg=gen_reg, + overhead_counter=overhead_counter, + v_size=v_size, + overhead_m=bmmod, + overhead_k=bkmod, + eol=eol) # since .format() doesn't allow partial formatting, we need to re-include the # placeholders that are replaced at the end of generating a kernel From a3214800048e68313178583ce9eec56654212bc2 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 19 Oct 2024 20:45:35 +0200 Subject: [PATCH 36/64] Fix ARM kernel register block loading --- pspamm/codegen/architectures/arm/generator.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pspamm/codegen/architectures/arm/generator.py b/pspamm/codegen/architectures/arm/generator.py index e559df4..156e0e3 100644 --- a/pspamm/codegen/architectures/arm/generator.py +++ b/pspamm/codegen/architectures/arm/generator.py @@ -133,7 +133,7 @@ def move_register_block(self, if (mask is None) or (mask[ir,ic]): cell_offset = Coords(down=ir*v_size, right=ic) addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) - addr.disp += 8 * load_offset + addr.disp += self.precision.size() * load_offset next_offset = [0, 0] if ir+1 < rows: next_offset = [1, 0] @@ -141,8 +141,8 @@ def move_register_block(self, next_offset = [0, 1] addr_next, comment_next = cursor.look(cursor_ptr, block_offset, Coords(down=(ir+next_offset[0])*v_size, right=ic+next_offset[1])) - addr_next.disp += 8 * load_offset - if addr_next.disp == addr.disp + 8 * v_size: + addr_next.disp += self.precision.size() * load_offset + if addr_next.disp == addr.disp + 16: skipflag = True if addr.disp > 255: if(addr.disp - cur11 > 0 and addr.disp - cur11 < 256): @@ -152,6 +152,11 @@ def 
move_register_block(self, cur11 = addr.disp addr.disp = 0 addr.base = additional_regs[0] + if skipflag and addr.disp % 16 != 0: + asm.add(add(addr.disp, additional_regs[0], "", addr.base)) + cur11 = addr.disp + addr.disp = 0 + addr.base = additional_regs[0] if not skipflag: if store: @@ -208,7 +213,6 @@ def make_microkernel(self, mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size) asm.add(self.move_register_block(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False)) - x = 0; bs = [] cur11 = -1000 for Vmi in range(bm//v_size): From b2ceb793bedbe0e9cda558e84daf362f2e4ef3a0 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 19 Oct 2024 20:45:42 +0200 Subject: [PATCH 37/64] Add some more comments --- pspamm/matmul.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pspamm/matmul.py b/pspamm/matmul.py index b9b3996..2828875 100644 --- a/pspamm/matmul.py +++ b/pspamm/matmul.py @@ -323,9 +323,9 @@ def kernelK(asm, Bki, A_ptr, B_ptr): for ic in range(A_regs_cut.shape[1]): pred_m = None if not self.masks else self.generator.pred_n_trues(self.bm - ir*self.v_size, self.v_size, "m") if self.beta != 0.0 and self.beta != 1.0: - store_block.add(mul(A_regs_cut[ir,ic], self.beta_reg[1], A_regs_cut[ir,ic], pred=pred_m)) + store_block.add(mul(A_regs_cut[ir,ic], self.beta_reg[1], A_regs_cut[ir,ic], "C = beta * C + alpha * AB", pred=pred_m)) if self.beta == 0.0: - store_block.add(mul(regs[ir, x + ic], self.alpha_reg[1], A_regs_cut[ir, ic], "C = C + alpha * AB", pred=pred_m)) + store_block.add(mul(regs[ir, x + ic], self.alpha_reg[1], A_regs_cut[ir, ic], "C = alpha * AB", pred=pred_m)) else: store_block.add(fma(regs[ir, x + ic], self.alpha_reg[1], A_regs_cut[ir, ic], "C = C + alpha * AB", None, pred=pred_m)) store_block.add(self.generator.move_register_block(self.C, C_ptr, Coords(), A_regs_cut, self.v_size, self.additional_regs, None, True, self.prefetching, self.ldc * x)) From 
2fc28a9102f29cfec73846ec90c6f66ce71bc8d3 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 19 Oct 2024 20:45:55 +0200 Subject: [PATCH 38/64] Fix unit tests --- tests/unit_test.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/tests/unit_test.py b/tests/unit_test.py index 9c14388..f3e012d 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -44,26 +44,26 @@ kernels.append(generator.SparseKernel("test1", precision, 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(56, 56, 30), delta)) kernels.append(generator.DenseKernel("test2", precision, 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 2, v_size, precision) for x in blocksize_algs], delta)) kernels.append(generator.DenseKernel("test3", precision, 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], delta)) - kernels.append(generator.SparseKernel("knl_only_test1", precision, 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(1, 2, 1), delta)) - kernels.append(generator.SparseKernel("knl_only_test2", precision, 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2), (16,7)] + [x.getBlocksize(24, 40, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(40, 40, 20), delta)) - - kernels.append(generator.SparseKernel("knl_only_test3", precision, 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(1, 2, 2), delta)) - kernels.append(generator.SparseKernel("knl_only_test4", precision, 24, 20, 10, 40, 0, 24, 35.222, 0.0, [(8, 20), (24,3)] + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(10, 20, 1), delta)) - kernels.append(generator.SparseKernel("knl_only_test5", 
precision, 64, 5, 10, 64, 0, 64, 2.3, 0.0, [(32, 2), (8,14)] + [x.getBlocksize(64, 5, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(10, 5, 1), delta)) - kernels.append(generator.SparseKernel("knl_only_test6", precision, 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(1, 1, 1), delta)) - kernels.append(generator.SparseKernel("knl_only_test7", precision, 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8, 24), (8,1)] + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(40, 24, 1), delta)) - - kernels.append(generator.DenseKernel("knl_only_test8", precision, 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], delta)) - kernels.append(generator.DenseKernel("knl_only_test9", precision, 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2), (16,7)] + [x.getBlocksize(32, 40, 2, v_size, precision) for x in blocksize_algs], delta)) - kernels.append(generator.DenseKernel("knl_only_test10", precision, 56, 28, 56, 56, 56, 56, 0.1, 3.0, [(8, 28)], delta)) - kernels.append(generator.DenseKernel("knl_only_test11", precision, 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8, 20), (8,3)] + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs], delta)) - kernels.append(generator.DenseKernel("knl_only_test12", precision, 64, 5, 10, 64, 12, 64, 1.0, 1.0, [(32, 2), (8,14)] + [x.getBlocksize(64, 5, 2, v_size, precision) for x in blocksize_algs], delta)) - kernels.append(generator.DenseKernel("knl_only_test13", precision, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs], delta)) - kernels.append(generator.DenseKernel("knl_only_test14", precision, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [(8, 24)] + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs], delta)) - - kernels.append(generator.SparseKernel("test1", precision, 
8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(56, 56, 30), delta)) - kernels.append(generator.DenseKernel("test2", precision, 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8,2)] + [x.getBlocksize(8, 40, 2, v_size, precision) for x in blocksize_algs], delta)) - kernels.append(generator.DenseKernel("test3", precision, 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3)] + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.SparseKernel("knl_only_test1", precision, 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1,2)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(1, 2, 1), delta)) + kernels.append(generator.SparseKernel("knl_only_test2", precision, 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2,2), (16,7,2)] + [x.getBlocksize(24, 40, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(40, 40, 20), delta)) + + kernels.append(generator.SparseKernel("knl_only_test3", precision, 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1,2)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(1, 2, 2), delta)) + kernels.append(generator.SparseKernel("knl_only_test4", precision, 24, 20, 10, 40, 0, 24, 35.222, 0.0, [(8, 20,2), (24,3,2)] + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(10, 20, 1), delta)) + kernels.append(generator.SparseKernel("knl_only_test5", precision, 64, 5, 10, 64, 0, 64, 2.3, 0.0, [(32, 2,2), (8,14,2)] + [x.getBlocksize(64, 5, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(10, 5, 1), delta)) + kernels.append(generator.SparseKernel("knl_only_test6", precision, 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1,2)] + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(1, 1, 1), delta)) + kernels.append(generator.SparseKernel("knl_only_test7", precision, 8, 24, 40, 8, 0, 8, 1.0, 
333333.2222222, [(8, 24,2), (8,1,2)] + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(40, 24, 1), delta)) + + kernels.append(generator.DenseKernel("knl_only_test8", precision, 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1,2)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("knl_only_test9", precision, 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2,2), (16,7,2)] + [x.getBlocksize(32, 40, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("knl_only_test10", precision, 56, 28, 56, 56, 56, 56, 0.1, 3.0, [(8, 28,2)], delta)) + kernels.append(generator.DenseKernel("knl_only_test11", precision, 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8, 20,2), (8,3,2)] + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("knl_only_test12", precision, 64, 5, 10, 64, 12, 64, 1.0, 1.0, [(32, 2,2), (8,14,2)] + [x.getBlocksize(64, 5, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("knl_only_test13", precision, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1,2)] + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("knl_only_test14", precision, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [(8, 24,2)] + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs], delta)) + + kernels.append(generator.SparseKernel("hswtest1", precision, 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(56, 56, 30), delta)) + kernels.append(generator.DenseKernel("hswtest2", precision, 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8,2)] + [x.getBlocksize(8, 40, 2, v_size, precision) for x in blocksize_algs], delta)) + kernels.append(generator.DenseKernel("hswtest3", precision, 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3)] + 
[x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], delta)) kernels.append(generator.SparseKernel("hsw_only_test1", precision, 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(1, 2, 1), delta)) kernels.append(generator.SparseKernel("hsw_only_test2", precision, 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2)] + [x.getBlocksize(24, 40, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(40, 40, 20), delta)) From 73988b2649e694b3b68777ce0724dbd92073b66d Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 19 Oct 2024 21:13:22 +0200 Subject: [PATCH 39/64] Rework test output --- tests/testsuite_generator.py | 56 +++++++++++++++++++----------------- tests/unit_test.py | 2 +- 2 files changed, 31 insertions(+), 27 deletions(-) diff --git a/tests/testsuite_generator.py b/tests/testsuite_generator.py index 122a764..5107c12 100755 --- a/tests/testsuite_generator.py +++ b/tests/testsuite_generator.py @@ -13,12 +13,14 @@ head_of_testsuite = """#include #include +#include #include #include #include #include #include #include +#include long long pspamm_num_total_flops = 0; @@ -128,32 +130,43 @@ } template -int post(unsigned M, unsigned N, unsigned K, unsigned LDA, unsigned* LDB, unsigned LDC, T* ALPHA, T* BETA, T* A, T* B, T* C, T* Cref, T DELTA) { +bool post(const std::string& name, unsigned M, unsigned N, unsigned K, unsigned LDA, unsigned* LDB, unsigned LDC, T* ALPHA, T* BETA, T* A, T* B, T* C, T* Cref, T DELTA) { if(*LDB == 0) *LDB = K; gemm_ref(M, N, K, LDA, *LDB, LDC, *ALPHA, *BETA, A, B, Cref); - + + bool failed = false; + double diffAbsMax = 0; + double diffRelMax = 0; for(int i = 0; i < M; i++) { for(int j = 0; j < N; j++) { // we use the relative error instead of the absolute error because of an issue we found for sparse single precision // kernels presumably due to limited precision of floats - if(std::abs((C[i + j * LDC] - Cref[i + j * LDC])) / Cref[i + 
j * LDC] > DELTA) { - std::cout << i << " " << j << " " << C[i + j * LDC] << " " << Cref[i + j * LDC] << std::endl; - return 0; - } + const double diffAbs = std::abs((static_cast(C[i + j * LDC]) - static_cast(Cref[i + j * LDC]))); + const double diffRel = diffAbs / static_cast(Cref[i + j * LDC]); + + diffAbsMax = std::max(diffAbs, diffAbsMax); + diffRelMax = std::max(diffRel, diffRelMax); + + failed |= diffRel > DELTA; } } - return 1; + const std::string resultString = failed ? "fail" : "success"; + + std::cout << std::scientific << name << ": " << resultString << " (abs: " << diffAbsMax << ", rel: " << diffRelMax << ")" << std::endl; + + return !failed; } """ setup_main = """ int main() { - std::vector> results; + int results = 0; + int correct = 0; """ @@ -166,31 +179,22 @@ auto pointers = pre<{precision}>({m}, {n}, {k}, {lda}, ldb, {ldc}, "{mtx}"); setup_prefetch(prefetch, std::get<3>(pointers), {n}, {ldc}); {name}(std::get<0>(pointers), std::get<{sparse}>(pointers), std::get<3>(pointers), {alpha}, {beta}, nullptr); - const auto result = post<{precision}>({m}, {n}, {k}, {lda}, &ldb, {ldc}, &alpha, &beta, std::get<0>(pointers), std::get<1>(pointers), std::get<3>(pointers), std::get<4>(pointers), {delta:.7f}); - results.push_back(std::make_tuple("{name}", result)); + const auto result = post<{precision}>(\"{name}\", {m}, {n}, {k}, {lda}, &ldb, {ldc}, &alpha, &beta, std::get<0>(pointers), std::get<1>(pointers), std::get<3>(pointers), std::get<4>(pointers), {delta:.15e}); + + if (result) {{ + ++correct; + }} + ++results; + free(std::get<0>(pointers)); free(std::get<1>(pointers)); free(std::get<2>(pointers)); free(std::get<3>(pointers)); free(std::get<4>(pointers)); }} """ end_of_testsuite = """ - int correct = 0; - for(int i = 0; i < results.size(); i++) - { - if(std::get<1>(results[i])) - { - ++correct; - printf("%s succeeded.\\n", (std::get<0>(results[i])).c_str()); - } - else - { - printf("%s failed!\\n", (std::get<0>(results[i])).c_str()); - } - } - - 
printf("\\n%i out of %lu test successful!\\n", correct, results.size()); + std::cout << correct << " out of " << results << " succeeded." << std::endl; - return correct == results.size() ? 0 : 1; + return correct == results ? 0 : 1; } """ diff --git a/tests/unit_test.py b/tests/unit_test.py index f3e012d..713e0f6 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -35,7 +35,7 @@ # double and single precision delta_hp = 1e-2 delta_sp = 1e-4 # epsilon is around e-7 => /2 ... For most cases, 1e-6 is enough -delta_dp = 1e-7 # epsilon is around e-15 => /2 +delta_dp = 1e-6 # epsilon is around e-15 => /2 kernels = [] From 5b14a9dbffa59eafd652fb7959ea29cb26c5892d Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 19 Oct 2024 21:13:43 +0200 Subject: [PATCH 40/64] Add missing classmethod modifiers --- pspamm/codegen/architectures/arm_sve/blocksize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pspamm/codegen/architectures/arm_sve/blocksize.py b/pspamm/codegen/architectures/arm_sve/blocksize.py index 9f3c287..4c972ee 100644 --- a/pspamm/codegen/architectures/arm_sve/blocksize.py +++ b/pspamm/codegen/architectures/arm_sve/blocksize.py @@ -25,13 +25,13 @@ def getBlocksize(cls, m, n, bk, v_size, prec): return (bm, bn, bk) - + @classmethod def ARM_condition(cls, bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) return (bn + bk) * vm + bn*bk <= 32 - + @classmethod def tileable(cls, m, bm): return m % bm == 0 From 985634ccda6e08666ee759172710f651b33c8191 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 19 Oct 2024 21:15:14 +0200 Subject: [PATCH 41/64] Remove old scripts references --- pspamm/matmul.py | 5 ----- tests/unit_test.py | 1 - 2 files changed, 6 deletions(-) diff --git a/pspamm/matmul.py b/pspamm/matmul.py index 2828875..57b1bbf 100644 --- a/pspamm/matmul.py +++ b/pspamm/matmul.py @@ -5,11 +5,6 @@ from pspamm.codegen.forms import * from pspamm.codegen.precision import * -import pspamm.scripts.old_arm -import 
pspamm.scripts.max_bn_knl -import pspamm.scripts.max_hsw -import pspamm.scripts.max_arm_sve - from pspamm.cursors import * import pspamm.architecture diff --git a/tests/unit_test.py b/tests/unit_test.py index 713e0f6..c1de11c 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -3,7 +3,6 @@ import testsuite_generator as generator from importlib import import_module -import pspamm.scripts from pspamm.codegen.precision import * import sys From 9f99416405ab51bfdb685f2fc9cd21c5ba833fde Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 19 Oct 2024 21:17:57 +0200 Subject: [PATCH 42/64] Fix more class-move bugs --- pspamm/codegen/architectures/hsw/blocksize.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pspamm/codegen/architectures/hsw/blocksize.py b/pspamm/codegen/architectures/hsw/blocksize.py index 4d65441..eed8e97 100644 --- a/pspamm/codegen/architectures/hsw/blocksize.py +++ b/pspamm/codegen/architectures/hsw/blocksize.py @@ -58,12 +58,14 @@ def getBlocksize(cls, m, n, bk, v_size, prec): return (bm, bn, bk) - def cls.HSW_condition(cls, bm, bn, bk, v_size): + @classmethod + def HSW_condition(cls, bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) return (bn + bk) * vm + bn * bk <= 16 - def cls.HSW_condition_extended(cls, bm, bn, bk, v_size): + @classmethod + def HSW_condition_extended(cls, bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) return bn * vm + bn * bk + 1 <= 16 From e3a4fc29ef6878c7eaad0e80c758cde79759e254 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 19 Oct 2024 21:22:28 +0200 Subject: [PATCH 43/64] Fix ARM_SVE codegen --- pspamm/matmul.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pspamm/matmul.py b/pspamm/matmul.py index 57b1bbf..f4252f2 100644 --- a/pspamm/matmul.py +++ b/pspamm/matmul.py @@ -122,7 +122,7 @@ def __init__(self, v_len_regs = v_len_bits // 128 arch = 'hsw' - if arch.startswith('arm'): + if arch.startswith('arm') and not 
arch.startswith('arm_sve'): # only 128 supported arch = 'arm' From 07a487b44b0ea7ad070d62fc42ae2c5be6dbd510 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 19 Oct 2024 21:24:55 +0200 Subject: [PATCH 44/64] Yet even more class-move fixes --- pspamm/codegen/architectures/hsw/blocksize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pspamm/codegen/architectures/hsw/blocksize.py b/pspamm/codegen/architectures/hsw/blocksize.py index eed8e97..78d4654 100644 --- a/pspamm/codegen/architectures/hsw/blocksize.py +++ b/pspamm/codegen/architectures/hsw/blocksize.py @@ -38,6 +38,7 @@ def HSW_condition(cls, bm, bn, bk, v_size): return (bn + bk) * vm + bn * bk <= 16 class Max: + @classmethod def getBlocksize(cls, m, n, bk, v_size, prec): bm = 4 bn = 1 From 207a373f42a6884578738c6e14fefb24209d8340 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 19 Oct 2024 22:46:52 +0200 Subject: [PATCH 45/64] Add large matrix support and tests --- pspamm/codegen/architectures/hsw/generator.py | 76 ++++++++++++------- pspamm/codegen/architectures/knl/generator.py | 74 +++++++++++------- pspamm/codegen/regcache.py | 28 +++++++ tests/unit_test.py | 1 + 4 files changed, 122 insertions(+), 57 deletions(-) create mode 100644 pspamm/codegen/regcache.py diff --git a/pspamm/codegen/architectures/hsw/generator.py b/pspamm/codegen/architectures/hsw/generator.py index ac09192..9817c3f 100644 --- a/pspamm/codegen/architectures/hsw/generator.py +++ b/pspamm/codegen/architectures/hsw/generator.py @@ -5,7 +5,7 @@ from pspamm.codegen.sugar import * from pspamm.codegen.generator import * from pspamm.codegen.precision import * - +from pspamm.codegen.regcache import * class Generator(AbstractGenerator): template = """ @@ -81,17 +81,21 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: alpha_reg = [xmm(b_reg), vmm(b_reg)] beta_reg = [xmm(b_reg + 1), vmm(b_reg + 1)] - available_regs = [r(9),r(10),r(11),r(15),rax,r(13),r(14)] + available_regs = 
[r(9),r(10),r(11),r(15),rax] # ,r(13),r(14) additional_regs = [r(8)] reg_count = 0 + self.spontaneous_scaling = False for i in range(1024, min(max(nnz * self.precision.size(), m*k*self.precision.size(), m*n*self.precision.size()),8000), 2048): additional_regs.append(available_regs[reg_count]) reg_count += 1 for i in range(8192, min(nnz * self.precision.size(), 33000), 8192): + if reg_count == len(available_regs): + self.spontaneous_scaling = True + break additional_regs.append(available_regs[reg_count]) reg_count += 1 @@ -119,8 +123,9 @@ def make_scaling_offsets(self, asm = block("Optimize usage of offsets when accessing B Matrix") - for i in range(1, min(len(additional_regs), 5)): - asm.add(mov(c(1024 + (i-1) * 2048), additional_regs[i], False)) + if not self.spontaneous_scaling: + for i in range(1, min(len(additional_regs), 5)): + asm.add(mov(c(1024 + (i-1) * 2048), additional_regs[i], False)) return asm @@ -132,34 +137,45 @@ def make_b_pointers(self, asm = block("Optimize usage of offsets when accessing B Matrix") - reg_count = 5 + if not self.spontaneous_scaling: + reg_count = 5 - for i in range(8192, min(nnz * self.precision.size(), 33000), 8192): - asm.add(lea(B_reg, additional_regs[reg_count], i)) - reg_count += 1 + for i in range(8192, min(nnz * self.precision.size(), 33000), 8192): + asm.add(lea(B_reg, additional_regs[reg_count], i)) + reg_count += 1 return asm - def reg_based_scaling(self, addr: MemoryAddress, additional_regs: List[Register], with_index: bool): - if addr.disp >= 1024 and ((addr.disp < 32768 and with_index) or addr.disp < 8192): - scaling_and_register = { - 1: (1, 1), - 2: (2, 1), - 3: (1, 2), - 4: (4, 1), - 5: (1, 3), - 6: (2, 2), - 7: (1, 4) - } - if addr.disp % 8192 >= 1024: - addr.scaling, reg = scaling_and_register[ (addr.disp % 8192) // 1024 ] - addr.index = additional_regs[reg] - - if addr.disp >= 8192: - addr.base = additional_regs[addr.disp // 8192 + 4] - - addr.disp = addr.disp % 1024 + def reg_based_scaling(self, regcache, asm, 
addr: MemoryAddress, additional_regs: List[Register], with_index: bool): + if addr.disp >= 1024: + if ((addr.disp < 32768 and with_index) or addr.disp < 8192) and not self.spontaneous_scaling: + scaling_and_register = { + 1: (1, 1), + 2: (2, 1), + 3: (1, 2), + 4: (4, 1), + 5: (1, 3), + 6: (2, 2), + 7: (1, 4) + } + if addr.disp % 8192 >= 1024: + addr.scaling, reg = scaling_and_register[ (addr.disp % 8192) // 1024 ] + addr.index = additional_regs[reg] + + if addr.disp >= 8192 and not self.spontaneous_scaling: + addr.base = additional_regs[addr.disp // 8192 + 4] + + addr.disp = addr.disp % 1024 + else: + large_offset = addr.disp // 1024 + + basereg, load = regcache.get(large_offset) + if load: + asm.add(mov(c(large_offset * 1024), basereg, False)) + + addr.base = basereg + addr.disp = addr.disp % 1024 def move_register_block(self, cursor: Cursor, @@ -259,6 +275,8 @@ def make_microkernel(self, else: asm.add(self.move_register_single(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, 0, 0, mask, store=False)) + regcache = RegisterCache(additional_regs) + bs = [] bsv = [] for Vmi in range(bm//v_size): @@ -267,7 +285,7 @@ def make_microkernel(self, to_cell = Coords(down=bki, right=bni) if B.has_nonzero_cell(B_ptr, to_B_block, to_cell): B_addr, B_comment = B.look(B_ptr, to_B_block, to_cell) - self.reg_based_scaling(B_addr, additional_regs, True) + self.reg_based_scaling(regcache, asm, B_addr, additional_regs, True) if B_regs[bki, bni] not in bs: asm.add(bcst(B_addr, B_regs[bki, bni], comment=B_comment)) bs.append(B_regs[bki, bni]) @@ -284,7 +302,7 @@ def make_microkernel(self, to_cell = Coords(down=bki, right=bni) if B.has_nonzero_cell(B_ptr, to_B_block, to_cell): B_addr, B_comment = B.look(B_ptr, to_B_block, to_cell) - self.reg_based_scaling(B_addr, additional_regs, True) + self.reg_based_scaling(regcache, asm, B_addr, additional_regs, True) comment = "C[{}:{},{}] += A[{}:{},{}]*{}".format(Vmi*v_size,Vmi*v_size+v_size,bni,Vmi*v_size,Vmi*v_size+v_size,bki,B_comment) 
asm.add(fma(B_regs[bki, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, bcast=None)) return asm diff --git a/pspamm/codegen/architectures/knl/generator.py b/pspamm/codegen/architectures/knl/generator.py index 2a7e3a5..79bf2e8 100644 --- a/pspamm/codegen/architectures/knl/generator.py +++ b/pspamm/codegen/architectures/knl/generator.py @@ -5,7 +5,7 @@ from pspamm.codegen.sugar import * from pspamm.codegen.generator import * from pspamm.codegen.precision import * - +from pspamm.codegen.regcache import * class Generator(AbstractGenerator): template = """ @@ -73,7 +73,7 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: alpha_reg = [rbx, rbx] beta_reg = [rcx, rcx] - available_regs = [r(9),r(10),r(11),r(15),rax,r(13),r(14)] + available_regs = [r(9),r(10),r(11),r(15),rax] # ,r(13),r(14) additional_regs = [r(8)] @@ -81,11 +81,15 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: reg_count = 0 + self.spontaneous_scaling = False for i in range(1024, min(max(nnz * self.precision.size(), m*k*self.precision.size(), m*n*self.precision.size()),8000), 2048): additional_regs.append(available_regs[reg_count]) reg_count += 1 for i in range(8192, min(nnz * self.precision.size(), 33000), 8192): + if reg_count == len(available_regs): + self.spontaneous_scaling = True + break additional_regs.append(available_regs[reg_count]) reg_count += 1 @@ -123,8 +127,9 @@ def make_scaling_offsets(self, asm = block("Optimize usage of offsets when accessing B Matrix") - for i in range(1, min(len(additional_regs), 5)): - asm.add(mov(c(1024 + (i-1) * 2048), additional_regs[i], False)) + if not self.spontaneous_scaling: + for i in range(1, min(len(additional_regs), 5)): + asm.add(mov(c(1024 + (i-1) * 2048), additional_regs[i], False)) return asm @@ -136,34 +141,45 @@ def make_b_pointers(self, asm = block("Optimize usage of offsets when accessing B Matrix") - reg_count = 5 + if not self.spontaneous_scaling: + reg_count = 5 - for 
i in range(8192, min(nnz * self.precision.size(), 33000), 8192): - asm.add(lea(B_reg, additional_regs[reg_count], i)) - reg_count += 1 + for i in range(8192, min(nnz * self.precision.size(), 33000), 8192): + asm.add(lea(B_reg, additional_regs[reg_count], i)) + reg_count += 1 return asm - def reg_based_scaling(self, addr: MemoryAddress, additional_regs: List[Register], with_index: bool): - if addr.disp >= 1024 and ((addr.disp < 32768 and with_index) or addr.disp < 8192): - scaling_and_register = { - 1: (1, 1), - 2: (2, 1), - 3: (1, 2), - 4: (4, 1), - 5: (1, 3), - 6: (2, 2), - 7: (1, 4) - } - if addr.disp % 8192 >= 1024: - addr.scaling, reg = scaling_and_register[ (addr.disp % 8192) // 1024 ] - addr.index = additional_regs[reg] - - if addr.disp >= 8192: - addr.base = additional_regs[addr.disp // 8192 + 4] - - addr.disp = addr.disp % 1024 + def reg_based_scaling(self, regcache, asm, addr: MemoryAddress, additional_regs: List[Register], with_index: bool): + if addr.disp >= 1024: + if ((addr.disp < 32768 and with_index) or addr.disp < 8192) and not self.spontaneous_scaling: + scaling_and_register = { + 1: (1, 1), + 2: (2, 1), + 3: (1, 2), + 4: (4, 1), + 5: (1, 3), + 6: (2, 2), + 7: (1, 4) + } + if addr.disp % 8192 >= 1024: + addr.scaling, reg = scaling_and_register[ (addr.disp % 8192) // 1024 ] + addr.index = additional_regs[reg] + + if addr.disp >= 8192 and not self.spontaneous_scaling: + addr.base = additional_regs[addr.disp // 8192 + 4] + + addr.disp = addr.disp % 1024 + else: + large_offset = addr.disp // 1024 + + basereg, load = regcache.get(large_offset) + if load: + asm.add(mov(c(large_offset * 1024), basereg, False)) + + addr.base = basereg + addr.disp = addr.disp % 1024 def move_register_block(self, cursor: Cursor, @@ -232,6 +248,8 @@ def make_microkernel(self, bm,bk,aidx,apattern = A.get_block(A_ptr, to_A_block) bk,bn,bidx,bpattern = B.get_block(B_ptr, to_B_block) + regcache = RegisterCache(additional_regs) + mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, 
B_ptr, to_B_block, v_size, True) asm.add(self.move_register_block(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False)) @@ -241,7 +259,7 @@ def make_microkernel(self, to_cell = Coords(down=bki, right=bni) if B.has_nonzero_cell(B_ptr, to_B_block, to_cell): B_addr, B_comment = B.look(B_ptr, to_B_block, to_cell) - self.reg_based_scaling(B_addr, additional_regs, True) + self.reg_based_scaling(regcache, asm, B_addr, additional_regs, True) comment = "C[{}:{},{}] += A[{}:{},{}]*{}".format(Vmi*v_size,Vmi*v_size+v_size,bni,Vmi*v_size,Vmi*v_size+v_size,bki,B_comment) asm.add(fma(B_addr, A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, bcast=0)) return asm diff --git a/pspamm/codegen/regcache.py b/pspamm/codegen/regcache.py new file mode 100644 index 0000000..44bb0ae --- /dev/null +++ b/pspamm/codegen/regcache.py @@ -0,0 +1,28 @@ + +class RegisterCache: + def __init__(self, registers): + self.access = 0 + self.lru = [-1] * len(registers) + self.registers = registers + self.storage = {} + + def get(self, value): + self.access += 1 + + evicted = False + + if value not in self.storage: + evicted = True + minaccess = self.access + minidx = -1 + for i, last in enumerate(self.lru): + if last < minaccess: + minaccess = last + minidx = i + self.storage[value] = minidx + + regidx = self.storage[value] + + self.lru[regidx] = self.access + + return (self.registers[regidx], evicted) diff --git a/tests/unit_test.py b/tests/unit_test.py index c1de11c..76b32ed 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -40,6 +40,7 @@ for precision, delta in zip((Precision.SINGLE, Precision.DOUBLE), (delta_sp, delta_dp)): v_size = v_size_fun(precision) + kernels.append(generator.DenseKernel("testlarge", precision, 40, 100, 100, 100, 100, 100, 2.5, 1.0, [(8, 5), (8,2)] + [x.getBlocksize(10, 10, 1, v_size, precision) for x in blocksize_algs], delta)) kernels.append(generator.SparseKernel("test1", precision, 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + 
[x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], generator.generateMTX(56, 56, 30), delta)) kernels.append(generator.DenseKernel("test2", precision, 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 2, v_size, precision) for x in blocksize_algs], delta)) kernels.append(generator.DenseKernel("test3", precision, 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], delta)) From fddef2eaf45d9ac3ee4c3200410c0726691a7493 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 19 Oct 2024 23:09:14 +0200 Subject: [PATCH 46/64] Update readme --- README.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index fa425cc..efa9cfa 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,10 @@ -# Code Generator for Sparse Matrix Multiplication -Generates inline-Assembly for sparse Matrix Multiplication. +# PSpaMM +A Code Generator For Small Sparse (and Dense) Matrix Multiplications. -Currently Intel Xeon Phi 'Knights Landing' (AVX512), Haswell/Zen2 (AVX2), and ARM Cortex-A53 (ARMv8) are supported. 
+Currently supported: + +* x86_64: AVX2, AVX512/AVX10.1 +* ARM/AARCH64: NEON, SVE (128,256,512,1024,2048 bit) ## Installation From 1dab328b9082ccd85491d581107677c238a360ea Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 19 Oct 2024 23:09:36 +0200 Subject: [PATCH 47/64] Enable AVX512 mask support --- pspamm/codegen/architectures/knl/generator.py | 13 +++++++++---- pspamm/codegen/architectures/knl/inlineprinter.py | 3 ++- pspamm/codegen/sugar.py | 3 ++- tests/testsuite_generator.py | 3 +-- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/pspamm/codegen/architectures/knl/generator.py b/pspamm/codegen/architectures/knl/generator.py index 79bf2e8..096e1ea 100644 --- a/pspamm/codegen/architectures/knl/generator.py +++ b/pspamm/codegen/architectures/knl/generator.py @@ -44,12 +44,12 @@ def use_broadcast(self): return False def has_masks(self): - return False # for now + return True # for now def pred_n_trues(self, count, v_size, mode): # a bit hacky at the moment (won't work for all masks) if count < v_size and count > 0: - return mask(0) + return Predicate(mask(0), mode=='z') else: return None @@ -198,18 +198,23 @@ def move_register_block(self, action = "Store" if store else "Load" asm = block("{} {} register block @ {}".format(action,cursor.name,block_offset)) + b_row, _, _, _ = cursor.get_block(cursor_ptr, block_offset) + for ic in range(cols): for ir in range(rows): if (mask is None) or (mask[ir,ic]): cell_offset = Coords(down=ir*v_size, right=ic) addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) addr.disp += self.precision.size() * load_offset + + processed = ir * v_size + p = self.pred_n_trues(b_row - processed, v_size, 'z') if store: - asm.add(mov(registers[ir,ic], addr, True, comment)) + asm.add(mov(registers[ir,ic], addr, True, comment, pred=p)) if prefetching == 'BL2viaC': asm.add(prefetch(mem(additional_regs[0], addr.disp))) else: - asm.add(mov(addr, registers[ir,ic], True, comment)) + asm.add(mov(addr, registers[ir,ic], 
True, comment, pred=p)) return asm def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block: diff --git a/pspamm/codegen/architectures/knl/inlineprinter.py b/pspamm/codegen/architectures/knl/inlineprinter.py index a776d25..dc17887 100644 --- a/pspamm/codegen/architectures/knl/inlineprinter.py +++ b/pspamm/codegen/architectures/knl/inlineprinter.py @@ -146,7 +146,8 @@ def visitMov(self, stmt: MovStmt): if stmt.typ == AsmType.i64: assert(stmt.pred == None) - if stmt.dest.ugly[0] == 'k': + # FIXME: no hack + if stmt.dest.ugly[2] == 'k': s = f"kmovq {src_str}, {stmt.dest.ugly}" else: s = f"movq {src_str}, {stmt.dest.ugly}" diff --git a/pspamm/codegen/sugar.py b/pspamm/codegen/sugar.py index 70387c4..74ac162 100644 --- a/pspamm/codegen/sugar.py +++ b/pspamm/codegen/sugar.py @@ -58,11 +58,12 @@ def jump(label: str, backwards=True): stmt.destination = pspamm.architecture.operands.l(label) return stmt -def mov(src: Union[Operand, int], dest: Operand, vector: bool, comment:str = None): +def mov(src: Union[Operand, int], dest: Operand, vector: bool, comment:str = None, pred = None): stmt = MovStmt() stmt.src = src if isinstance(src, Operand) else pspamm.architecture.operands.c(src) stmt.dest = dest stmt.comment = comment + stmt.pred = pred if vector: stmt.aligned = True stmt.typ = AsmType.f64x8 diff --git a/tests/testsuite_generator.py b/tests/testsuite_generator.py index 5107c12..2e3bcea 100755 --- a/tests/testsuite_generator.py +++ b/tests/testsuite_generator.py @@ -265,8 +265,7 @@ def make(kernels, arch): v_size = v_len if arch.startswith("knl"): - print(f'{bn} {bk} {vm} {bm} {v_size}') - if not ((bn+bk) * vm <= 32) or not (kern.m % v_size) == 0 or not (bm % v_size) == 0: + if not ((bn+bk) * vm <= 32): print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') continue elif arch.startswith("hsw"): From 9bc983150a36a58856d7dc2b4a01c181fb43b822 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 19 Oct 2024 23:12:22 +0200 Subject: [PATCH 
48/64] Fix tests once more --- tests/unit_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit_test.py b/tests/unit_test.py index 76b32ed..45ef3db 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -110,7 +110,7 @@ kernels.append(generator.SparseKernel("sve_mixed_test5", precision, 8, 8, 8, 10, 0, 8, 3.0, -0.9, [(2, 2), (4, 4)] + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(8, 8, 6), delta_dp)) kernels.append(generator.DenseKernel("sve_mixed_test6", precision, 8, 8, 8, 10, 8, 8, 3.0, -0.9, [(2, 2), (4, 4)] + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], delta_dp)) - kernels.append(generator.DenseKernel("sve_test3", precision, 4, 4, 4, 4, 4, 4, 2.0, 2.0, [(4, 4)], delta_dp)) + kernels.append(generator.DenseKernel("sve_test4", precision, 4, 4, 4, 4, 4, 4, 2.0, 2.0, [(4, 4)], delta_dp)) kernels.append(generator.SparseKernel("sve_test1", precision, 8, 56, 56, 8, 0, 8, 1.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(56, 56, 30), delta_dp)) kernels.append(generator.DenseKernel("sve_test2", precision, 8, 40, 40, 8, 40, 8, 3.0, 2.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 1, v_size, precision) for x in blocksize_algs], delta_dp)) From 8d4f51f7521553f42fec42e218ec18be14b27b2b Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 19 Oct 2024 23:14:18 +0200 Subject: [PATCH 49/64] Bump version number --- pspamm/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pspamm/VERSION b/pspamm/VERSION index ee1372d..0d91a54 100644 --- a/pspamm/VERSION +++ b/pspamm/VERSION @@ -1 +1 @@ -0.2.2 +0.3.0 From 88a47fb02a74c5b8da52ebb1e7f22a0ba0cf62d9 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 19 Oct 2024 23:28:04 +0200 Subject: [PATCH 50/64] Fix large matrix offsetting --- pspamm/codegen/architectures/hsw/generator.py | 4 +++- 
pspamm/codegen/architectures/knl/generator.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pspamm/codegen/architectures/hsw/generator.py b/pspamm/codegen/architectures/hsw/generator.py index 9817c3f..b0c0a2e 100644 --- a/pspamm/codegen/architectures/hsw/generator.py +++ b/pspamm/codegen/architectures/hsw/generator.py @@ -168,11 +168,13 @@ def reg_based_scaling(self, regcache, asm, addr: MemoryAddress, additional_regs: addr.disp = addr.disp % 1024 else: + # TODO: not 100%ly sure about this code here... large_offset = addr.disp // 1024 basereg, load = regcache.get(large_offset) if load: - asm.add(mov(c(large_offset * 1024), basereg, False)) + asm.add(mov(addr.base, basereg, False)) + asm.add(add(c(large_offset * 1024), basereg)) addr.base = basereg addr.disp = addr.disp % 1024 diff --git a/pspamm/codegen/architectures/knl/generator.py b/pspamm/codegen/architectures/knl/generator.py index 096e1ea..e340e4f 100644 --- a/pspamm/codegen/architectures/knl/generator.py +++ b/pspamm/codegen/architectures/knl/generator.py @@ -176,7 +176,8 @@ def reg_based_scaling(self, regcache, asm, addr: MemoryAddress, additional_regs: basereg, load = regcache.get(large_offset) if load: - asm.add(mov(c(large_offset * 1024), basereg, False)) + asm.add(mov(addr.base, basereg, False)) + asm.add(add(c(large_offset * 1024), basereg)) addr.base = basereg addr.disp = addr.disp % 1024 From af3fbda8585a5bceec9682faacb574aa574b872b Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 19 Oct 2024 23:29:11 +0200 Subject: [PATCH 51/64] Fix memory load masking --- pspamm/codegen/architectures/knl/inlineprinter.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pspamm/codegen/architectures/knl/inlineprinter.py b/pspamm/codegen/architectures/knl/inlineprinter.py index dc17887..9b04759 100644 --- a/pspamm/codegen/architectures/knl/inlineprinter.py +++ b/pspamm/codegen/architectures/knl/inlineprinter.py @@ -65,9 +65,9 @@ def maskformat(self, pred): if 
pred is None: return '' elif pred.zero: - return f'{{{pred.register.ugly}}}{{z}}' + return f'{{{{{pred.register.ugly}}}}}{{{{z}}}}' else: - return f'{{{pred.register.ugly}}}' + return f'{{{{{pred.register.ugly}}}}}' def visitFma(self, stmt: FmaStmt): mask = self.maskformat(stmt.pred) @@ -155,7 +155,10 @@ def visitMov(self, stmt: MovStmt): if isinstance(stmt.src, Constant) and stmt.src.value == 0: s = f"vpxord {stmt.dest.ugly} {mask}, {stmt.dest.ugly}, {stmt.dest.ugly}" else: - s = f"vmovupd {src_str} {mask}, {stmt.dest.ugly}" + if isinstance(stmt.src, MemoryAddress): + s = f"vmovupd {src_str}, {stmt.dest.ugly} {mask}" + else: + s = f"vmovupd {src_str} {mask}, {stmt.dest.ugly}" else: raise NotImplementedError() self.addLine(s, stmt.comment) From d87a5af7dc7811e1275826046181b4ea75a4f8e0 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sun, 20 Oct 2024 00:18:34 +0200 Subject: [PATCH 52/64] Working larger B registers --- pspamm/codegen/architectures/arm/blocksize.py | 2 +- pspamm/codegen/architectures/arm/generator.py | 22 ++++++++++++++----- .../architectures/arm_sve/blocksize.py | 2 +- .../architectures/arm_sve/generator.py | 12 +++++++--- 4 files changed, 27 insertions(+), 11 deletions(-) diff --git a/pspamm/codegen/architectures/arm/blocksize.py b/pspamm/codegen/architectures/arm/blocksize.py index 5fc1c4f..72a2e6b 100644 --- a/pspamm/codegen/architectures/arm/blocksize.py +++ b/pspamm/codegen/architectures/arm/blocksize.py @@ -92,6 +92,6 @@ def ARM_condition(cls, bm, bn, bk, v_size, elem128): # ceiling division vm = -(bm // -v_size) vk = -(bk // -elem128) - return (bn+vk) * vm + bn*vk <= 32 + return (bn+bk) * vm + bn*vk <= 32 Default = Max diff --git a/pspamm/codegen/architectures/arm/generator.py b/pspamm/codegen/architectures/arm/generator.py index 156e0e3..eca5d2b 100644 --- a/pspamm/codegen/architectures/arm/generator.py +++ b/pspamm/codegen/architectures/arm/generator.py @@ -51,7 +51,7 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, 
nnz:int, m:int, n: vm = bm//v_size elem128 = 16 // self.get_precision().size() vk = -(bk // -elem128) - assert((bn+bk) * vm + bn * bk <= 32) # Needs to fit in NEON v registers + assert((bn+bk) * vm + bn * vk <= 32) # Needs to fit in NEON v registers prec = { Precision.DOUBLE: "2d", @@ -60,7 +60,7 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: }[self.get_precision()] A_regs = Matrix([[v(vm*c + r, prec) for c in range(bk)] for r in range(vm)]) - B_regs = Matrix([[v(vm*bk + bn * r + c, prec) for c in range(bn)] for r in range(bk)]) + B_regs = Matrix([[v(vm*bk + bn * r + c, prec) for c in range(bn)] for r in range(vk)]) C_regs = Matrix([[v(32 - vm*bn + vm*c + r, prec) for c in range(bn)] for r in range(vm)]) @@ -213,15 +213,19 @@ def make_microkernel(self, mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size) asm.add(self.move_register_block(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False)) + elem128 = 16 // self.get_precision().size() + vk = -(bk // -elem128) + bs = [] cur11 = -1000 for Vmi in range(bm//v_size): for bki in range(bk): # inside this k-block for bni in range(bn): # inside this n-block to_cell = Coords(down=bki, right=bni) + bki_reg = bki // elem128 if B.has_nonzero_cell(B_ptr, to_B_block, to_cell): B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_cell) - if B_regs[bki, bni] not in bs: + if B_regs[bki_reg, bni] not in bs: if B_cell_addr.disp > 255: if(B_cell_addr.disp - cur11 > 0 and B_cell_addr.disp - cur11 < 256): B_cell_addr.disp = B_cell_addr.disp - cur11 @@ -232,17 +236,23 @@ def make_microkernel(self, B_cell_addr.base = additional_regs[0] - asm.add(ld(B_cell_addr, B_regs[bki, bni], True, B_comment)) - bs.append(B_regs[bki, bni]) + asm.add(ld(B_cell_addr, B_regs[bki_reg, bni], True, B_comment)) + bs.append(B_regs[bki_reg, bni]) for Vmi in range(bm//v_size): + # TODO: + cell_indices = {} for bki in range(bk): # inside this k-block for bni in range(bn): # inside 
this n-block to_cell = Coords(down=bki, right=bni) if B.has_nonzero_cell(B_ptr, to_B_block, to_cell): B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_cell) comment = "C[{}:{},{}] += A[{}:{},{}]*{}".format(Vmi*v_size, Vmi*v_size+v_size, bni, Vmi*v_size, Vmi*v_size+v_size, bki, B_comment) - asm.add(fma(B_regs[bki, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, bcast=0)) + bki_reg = bki // elem128 + if (bki_reg, bni) not in cell_indices: + cell_indices[(bki_reg, bni)] = 0 + asm.add(fma(B_regs[bki_reg, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, bcast=cell_indices[(bki_reg, bni)])) + cell_indices[(bki_reg, bni)] += 1 return asm diff --git a/pspamm/codegen/architectures/arm_sve/blocksize.py b/pspamm/codegen/architectures/arm_sve/blocksize.py index 4c972ee..fcc3430 100644 --- a/pspamm/codegen/architectures/arm_sve/blocksize.py +++ b/pspamm/codegen/architectures/arm_sve/blocksize.py @@ -66,7 +66,7 @@ def ARM_condition(cls, bm, bn, bk, v_size, elem128): # ceiling division vm = -(bm // -v_size) vk = -(bk // -elem128) - return (bn + vk) * vm + bn*vk <= 32 + return (bn + bk) * vm + bn*vk <= 32 def tileable(cls, m, bm): return m % bm == 0 diff --git a/pspamm/codegen/architectures/arm_sve/generator.py b/pspamm/codegen/architectures/arm_sve/generator.py index 5a987cc..49c9795 100644 --- a/pspamm/codegen/architectures/arm_sve/generator.py +++ b/pspamm/codegen/architectures/arm_sve/generator.py @@ -300,8 +300,7 @@ def make_microkernel(self, v_size: int, additional_regs, to_A_block: Coords = Coords(), - to_B_block: Coords = Coords(), - is_B: bool = True + to_B_block: Coords = Coords() ) -> Block: """ make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. 
@@ -325,6 +324,9 @@ def make_microkernel(self, cur11 = -1000 Vm = max(self.ceil_div(bm, v_size), 1) + elem128 = 16 // self.get_precision().size() + vk = -(bk // -elem128) + multiple = self.precision.size() # for ld1rw (single prec): immediate offset is multiple of 4 in range of 0 to 252 # for ld1rd (double prec): immediate offset is multiple of 8 in range of 0 to 504 @@ -349,7 +351,8 @@ def make_microkernel(self, B_cell_addr.disp = 0 B_cell_addr.base = additional_regs[0] - asm.add(ld(B_cell_addr, B_regs[bki, bni], True, B_comment, pred=p_zeroing, is_B=is_B)) + + asm.add(ld(B_cell_addr, B_regs[bki, bni], True, B_comment, pred=p_zeroing, is_B=True)) bs.append(B_regs[bki, bni]) for Vmi in range(Vm): @@ -362,6 +365,9 @@ def make_microkernel(self, B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_cell) comment = "C[{}:{},{}] += A[{}:{},{}]*{}".format(Vmi * v_size, end_index, bni, Vmi * v_size, end_index, bki, B_comment) + + bki_reg = bki // elem128 + bki_sub = bki % elem128 asm.add(fma(B_regs[bki, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, pred=p_merging, bcast=None)) return asm From c480249cc477fc7dd22fc1328461b4c85c1e000a Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sun, 20 Oct 2024 02:57:46 +0200 Subject: [PATCH 53/64] Extend inline ARM_SVE broadcast fma --- pspamm/codegen/architectures/arm/generator.py | 2 +- .../architectures/arm_sve/generator.py | 70 ++++++++++++++----- .../architectures/arm_sve/inlineprinter.py | 16 ++++- pspamm/codegen/sugar.py | 7 +- 4 files changed, 72 insertions(+), 23 deletions(-) diff --git a/pspamm/codegen/architectures/arm/generator.py b/pspamm/codegen/architectures/arm/generator.py index eca5d2b..3cc1416 100644 --- a/pspamm/codegen/architectures/arm/generator.py +++ b/pspamm/codegen/architectures/arm/generator.py @@ -240,7 +240,7 @@ def make_microkernel(self, bs.append(B_regs[bki_reg, bni]) for Vmi in range(bm//v_size): - # TODO: + # TODO: refactor cell_indices into the cursors/blocks cell_indices = {} for bki 
in range(bk): # inside this k-block for bni in range(bn): # inside this n-block diff --git a/pspamm/codegen/architectures/arm_sve/generator.py b/pspamm/codegen/architectures/arm_sve/generator.py index 49c9795..3d50bc5 100644 --- a/pspamm/codegen/architectures/arm_sve/generator.py +++ b/pspamm/codegen/architectures/arm_sve/generator.py @@ -78,8 +78,21 @@ def make_reg_blocks(self, bm: int, bn: int, bk: int, v_size: int, nnz: int, m: i # k-broadcasting only works in 128-bit lanes elem128 = 16 // self.get_precision().size() - vk = -(bk // -elem128) - assert ((bn + bk) * vm + bn * bk <= 32) # Needs to fit in SVE z registers + vkext = -(bk // -elem128) + + # inline broadcasting is only allowed for the lower-numbered registers + self.inline_broadcast = False + if bn*vkext < 16 if self.get_precision().size() == 8 else bn*vkext < 8: + self.inline_broadcast = True + if bk == 1: + self.inline_broadcast = False + + if self.inline_broadcast: + vk = vkext + else: + vk = bk + + assert ((bn + bk) * vm + bn * vk <= 32) # Needs to fit in SVE z registers prec = { Precision.DOUBLE: "d", @@ -88,12 +101,15 @@ def make_reg_blocks(self, bm: int, bn: int, bk: int, v_size: int, nnz: int, m: i Precision.BFLOAT16: "h", }[self.get_precision()] - # use max(vm, 1) in case bm < v_size, otherwise we get no A_regs/C_regs - A_regs = Matrix([[z(max(vm, 1) * c + r , prec) for c in range(bk)] for r in range(max(vm, 1))]) - B_regs = Matrix([[z(max(vm, 1) * bk + bn * r + c, prec) for c in range(bn)] for r in range(bk)]) - C_regs = Matrix([[z(32 - max(vm, 1) * bn + max(vm, 1) * c + r, prec) for c in range(bn)] for r in range(max(vm, 1))]) + # make place for the two broadcasting registers + a_offset = 1 if bn * vk == 1 else 0 + assert ((bn + bk) * vm + bn * vk + a_offset <= 32) + + A_regs = Matrix([[z(vm * c + r + bn * vk + a_offset, prec) for c in range(bk)] for r in range(vm)]) + B_regs = Matrix([[z(bn * r + c, prec) for c in range(bn)] for r in range(vk)]) + C_regs = Matrix([[z(32 - vm * bn + vm * c + 
r, prec) for c in range(bn)] for r in range(vm)]) - b_reg = max(vm, 1) * bk + b_reg = 0 alpha_reg = [z(b_reg, prec), z(b_reg, prec)] beta_reg = [z(b_reg + 1, prec), z(b_reg + 1, prec)] @@ -175,7 +191,7 @@ def init_registers(self, # 'ptrue' doesnt work for initialising overhead predicate when using single precision -> see valid patterns from above # overhead = "\"ptrue p0.{suffix}, #{overhead}{eol}\"\n\t" if bm != 0 else "" # define overhead predicate overhead_m = "\"mov {gen_reg}{overhead_counter}, #{overhead_m}{eol}\"\n\t\"whilelo p0.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if bmmod != 0 else "" - overhead_k = "" #"\"mov {gen_reg}{overhead_counter}, #{overhead_k}{eol}\"\n\t\"whilelo p1.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if bkmod != 0 else "" + overhead_k = "" # "\"mov {gen_reg}{overhead_counter}, #{overhead_k}{eol}\"\n\t\"whilelo p1.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if bkmod != 0 else "" all_true = "\"ptrue p7.{suffix}, #31{eol}\"" # define all true predicate init_registers = (comment + overhead_m + overhead_k + all_true).format(suffix=p_suffix, gen_reg=gen_reg, @@ -274,7 +290,7 @@ def move_register_block(self, asm.add(ld(addr, registers[ir, ic], True, comment, pred=p_zeroing, is_B=is_B, scalar_offs=False, add_reg=additional_regs[2])) - prev_overhead = int(p.ugly[1]) == 0 # determine if we previously used p0 (overhead predicate) + prev_overhead = p is None or int(p.ugly[1]) == 0 # determine if we previously used p0 (overhead predicate) return asm @@ -331,20 +347,27 @@ def make_microkernel(self, # for ld1rw (single prec): immediate offset is multiple of 4 in range of 0 to 252 # for ld1rd (double prec): immediate offset is multiple of 8 in range of 0 to 504 # in both cases: instruction encodes the immediate offset within 6 bits - max_offs = (2 ** 6 - 1) * multiple + if not self.inline_broadcast: + max_offs = (2 ** 6 - 1) * multiple + divider = 1 + else: + max_offs = 127 + divider = 16 for 
Vmi in range(Vm): # set to all v_size predicates to true, we want to replicate a B element into a whole vector p_zeroing = self.pred_n_trues(v_size, v_size, "z") for bki in range(bk): # inside this k-block + bki_reg = bki // elem128 for bni in range(bn): # inside this n-block to_cell = Coords(down=bki, right=bni) if B.has_nonzero_cell(B_ptr, to_B_block, to_cell): B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_cell) - if B_regs[bki, bni] not in bs: + if B_regs[bki_reg, bni] not in bs: # max_offs is the maximum allowed immediate offset when using ld1rd/ld1rw to broadcast a scalar value - if B_cell_addr.disp > max_offs: - if B_cell_addr.disp - cur11 > 0 and B_cell_addr.disp - cur11 <= max_offs: - B_cell_addr.disp = B_cell_addr.disp - cur11 + if B_cell_addr.disp > max_offs or B_cell_addr.disp % divider != 0: + moved = B_cell_addr.disp - cur11 + if moved > 0 and moved <= max_offs and moved % divider == 0: + B_cell_addr.disp = moved else: asm.add(add(B_cell_addr.disp, additional_regs[0], "", B_cell_addr.base)) cur11 = B_cell_addr.disp @@ -352,10 +375,15 @@ def make_microkernel(self, B_cell_addr.base = additional_regs[0] - asm.add(ld(B_cell_addr, B_regs[bki, bni], True, B_comment, pred=p_zeroing, is_B=True)) - bs.append(B_regs[bki, bni]) + if not self.inline_broadcast: + asm.add(ld(B_cell_addr, B_regs[bki_reg, bni], True, B_comment, pred=p_zeroing, is_B=True)) + else: + asm.add(ld(B_cell_addr, B_regs[bki_reg, bni], True, B_comment, pred=p_zeroing, sub128=True)) + bs.append(B_regs[bki_reg, bni]) for Vmi in range(Vm): + # TODO: refactor cell_indices into the cursors/blocks + cell_indices = {} p_merging = self.pred_n_trues(bm - Vmi * v_size, v_size, "m") end_index = bm if Vmi + 1 == Vm else Vmi * v_size + v_size # end_index helps us print the right index ranges for bki in range(bk): # inside this k-block @@ -367,8 +395,14 @@ def make_microkernel(self, end_index, bki, B_comment) bki_reg = bki // elem128 - bki_sub = bki % elem128 - asm.add(fma(B_regs[bki, bni], 
A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, pred=p_merging, bcast=None)) + if (bki_reg, bni) not in cell_indices: + cell_indices[(bki_reg, bni)] = 0 + if not self.inline_broadcast: + bcast = None + else: + bcast = cell_indices[(bki_reg, bni)] + asm.add(fma(B_regs[bki_reg, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, pred=p_merging, bcast=bcast)) + cell_indices[(bki_reg, bni)] += 1 return asm def init_prefetching(self, prefetching): diff --git a/pspamm/codegen/architectures/arm_sve/inlineprinter.py b/pspamm/codegen/architectures/arm_sve/inlineprinter.py index 89e0e71..26ce670 100644 --- a/pspamm/codegen/architectures/arm_sve/inlineprinter.py +++ b/pspamm/codegen/architectures/arm_sve/inlineprinter.py @@ -52,7 +52,8 @@ def visitFma(self, stmt: FmaStmt): a = stmt.add_dest.ugly p = self.p_string(stmt.pred) if stmt.bcast is not None: - s = f"fmla {a}, {p}{m}, {b}[{stmt.bcast}]" + # NOTE: ignores predicate + s = f"fmla {a}, {m}, {b}[{stmt.bcast}]" else: s = f"fmla {a}, {p}{m}, {b}" @@ -152,10 +153,16 @@ def visitMov(self, stmt: MovStmt): def visitLoad(self, stmt: LoadStmt): if isinstance(stmt.src, Label): src_str = "#" + stmt.src.ugly - elif stmt.src.ugly_offset != "0" and stmt.scalar_offs: + elif (stmt.src.ugly_offset != "0" and stmt.scalar_offs): self.addLine(f"mov {stmt.add_reg.ugly}, #{stmt.src.ugly_offset}", f"move immediate offset into {stmt.add_reg.ugly}") # TODO: adapt ugly_lsl_shift to account for possible single precision instead of double precision src_str = f"[{stmt.src.ugly_base}, {stmt.add_reg.ugly}, LSL #{stmt.dest.ugly_lsl_shift}]" + elif stmt.typ == AsmType.f64x4 or stmt.typ == AsmType.f64x2: + # (note: the 128-bit and 256-bit broadcasts need the following more rudimentary format here) + if stmt.src.ugly_offset == '0': + src_str = f"[{stmt.src.ugly_base}]" + else: + src_str = f"[{stmt.src.ugly_base}, #{stmt.src.ugly_offset}]" else: src_str = stmt.src.ugly if not stmt.is_B else stmt.src.ugly_no_vl_scaling @@ -169,6 +176,10 @@ def 
visitLoad(self, stmt: LoadStmt): s = f"ld1r{prec} {stmt.dest.ugly}, {p}{src_str}" else: s = f"ld1{prec} {stmt.dest.ugly}, {p}{src_str}" + elif stmt.typ == AsmType.f64x4 and stmt.aligned: + s = f"ld1ro{prec} {stmt.dest.ugly}, {p}{src_str}" + elif stmt.typ == AsmType.f64x2 and stmt.aligned: + s = f"ld1rq{prec} {stmt.dest.ugly}, {p}{src_str}" else: raise NotImplementedError() self.addLine(s, stmt.comment) @@ -180,6 +191,7 @@ def visitStore(self, stmt: StoreStmt): self.addLine(f"mov {stmt.add_reg.ugly}, #{stmt.dest.ugly_offset}", f"move immediate offset into {stmt.add_reg.ugly}") # TODO: adapt ugly_lsl_shift to account for possible single precision instead of double precision + regsize = stmt.add_dest.size() // 16 dest_str = f"[{stmt.dest.ugly_base}, {stmt.add_reg.ugly}, LSL #{stmt.src.ugly_lsl_shift}]" else: dest_str = stmt.dest.ugly diff --git a/pspamm/codegen/sugar.py b/pspamm/codegen/sugar.py index 74ac162..70b052d 100644 --- a/pspamm/codegen/sugar.py +++ b/pspamm/codegen/sugar.py @@ -80,7 +80,7 @@ def lea(src: Register, dest: Operand, offset: int, comment:str = None): stmt.comment = comment return stmt -def ld(src: Union[Operand, int], dest: Operand, vector: bool, comment:str = None, dest2: Operand = None, pred: Register = None, is_B: bool = False, scalar_offs: bool = False, add_reg: AsmType.i64 = None): +def ld(src: Union[Operand, int], dest: Operand, vector: bool, comment:str = None, dest2: Operand = None, pred: Register = None, is_B: bool = False, scalar_offs: bool = False, add_reg: AsmType.i64 = None, sub128: bool = False): stmt = LoadStmt() stmt.src = src if isinstance(src, Operand) else pspamm.architecture.operands.c(src) stmt.dest = dest @@ -94,7 +94,10 @@ def ld(src: Union[Operand, int], dest: Operand, vector: bool, comment:str = None if vector: stmt.aligned = True - stmt.typ = AsmType.f64x8 + if sub128: + stmt.typ = AsmType.f64x2 + else: + stmt.typ = AsmType.f64x8 else: stmt.aligned = False stmt.typ = AsmType.i64 From 
8ee2367944722c39f290091b9bcfd66ea48c679c Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sun, 20 Oct 2024 02:57:57 +0200 Subject: [PATCH 54/64] Add Cube and MaxK scripts --- pspamm/codegen/architectures/arm/blocksize.py | 28 ++++++++++++ .../architectures/arm_sve/blocksize.py | 45 ++++++++++++++++++- pspamm/codegen/architectures/hsw/blocksize.py | 32 +++++++++++++ pspamm/codegen/architectures/knl/blocksize.py | 25 +++++++++++ tests/testsuite_generator.py | 9 +++- tests/unit_test.py | 8 ++-- 6 files changed, 140 insertions(+), 7 deletions(-) diff --git a/pspamm/codegen/architectures/arm/blocksize.py b/pspamm/codegen/architectures/arm/blocksize.py index 72a2e6b..880cced 100644 --- a/pspamm/codegen/architectures/arm/blocksize.py +++ b/pspamm/codegen/architectures/arm/blocksize.py @@ -94,4 +94,32 @@ def ARM_condition(cls, bm, bn, bk, v_size, elem128): vk = -(bk // -elem128) return (bn+bk) * vm + bn*vk <= 32 +class Cube: + @classmethod + def getBlocksize(cls, m, n, bk, v_size, prec): + bm = 2 + bn = 1 + maxval = 0 + + elem128 = 16 // prec.size() + + for i in range(v_size, m+1, v_size): + for j in range(1, n+1): + for k in range(1, 200): + if cls.ARM_condition(i, j, k, v_size, elem128): + if i*j*k > maxval: + maxval = i*j*k + bm = i + bn = j + bk = k + + return (bm, bn, bk) + + @classmethod + def ARM_condition(cls, bm, bn, bk, v_size, elem128): + # ceiling division + vm = -(bm // -v_size) + vk = -(bk // -elem128) + return (bn+bk) * vm + bn*vk <= 32 + Default = Max diff --git a/pspamm/codegen/architectures/arm_sve/blocksize.py b/pspamm/codegen/architectures/arm_sve/blocksize.py index fcc3430..b80bf0a 100644 --- a/pspamm/codegen/architectures/arm_sve/blocksize.py +++ b/pspamm/codegen/architectures/arm_sve/blocksize.py @@ -62,12 +62,55 @@ def getBlocksize(cls, m, n, bk, v_size, prec): return (bm, bn, bk) + @classmethod + def ARM_condition(cls, bm, bn, bk, v_size, elem128): + # ceiling division + vkext = -(bk // -elem128) + isvkext = bn*vkext < 16 if elem128 == 2 else 
bn*vkext < 8 + vm = -(bm // -v_size) + vk = vkext if isvkext else bk + return (bn + bk) * vm + bn*vk <= 32 + + @classmethod + def tileable(cls, m, bm): + return m % bm == 0 + +class Cube: + @classmethod + def getBlocksize(cls, m, n, bk, v_size, prec): + # v_size default is 2, however for SVE that parameter will always be larger + bm = 2 + bn = 1 + maxval = 0 + + elem128 = 16 // prec.size() + + for i in range(1, m + 1, 1): + next_multiple = -(i // -v_size) + for j in range(1, n + 1): + for k in range(1, 200): + if cls.ARM_condition(next_multiple, j, k, v_size, elem128) and cls.tileable(m, i): + if i * j * k >= maxval: + maxval = i * j * k + bm = i + bn = j + bk = k + + if maxval == 0: + raise RuntimeError("Could not find an appropriate block size. We suggest padding the matrix dimensions") + + return (bm, bn, bk) + + @classmethod def ARM_condition(cls, bm, bn, bk, v_size, elem128): # ceiling division + vkext = -(bk // -elem128) + isvkext = bn*vkext < 16 if elem128 == 2 else bn*vkext < 8 vm = -(bm // -v_size) - vk = -(bk // -elem128) + vk = vkext if isvkext else bk return (bn + bk) * vm + bn*vk <= 32 + @classmethod def tileable(cls, m, bm): return m % bm == 0 diff --git a/pspamm/codegen/architectures/hsw/blocksize.py b/pspamm/codegen/architectures/hsw/blocksize.py index 78d4654..b2611db 100644 --- a/pspamm/codegen/architectures/hsw/blocksize.py +++ b/pspamm/codegen/architectures/hsw/blocksize.py @@ -71,5 +71,37 @@ def HSW_condition_extended(cls, bm, bn, bk, v_size): vm = -(bm // -v_size) return bn * vm + bn * bk + 1 <= 16 +class Cube: + @classmethod + def getBlocksize(cls, m, n, bk, v_size, prec): + bm = 4 + bn = 1 + maxval = 0 + + for i in range(v_size, m+1, v_size): + for j in range(1, n+1): + for k in range(1, 200): + # can be replaced by cls.HSW_condition_extended here + # (but that seemed to be slower in the end) + if cls.HSW_condition(i, j, bk, v_size): + if i*j*k > maxval and (cls.HSW_condition(i, j, k, v_size) or j > 1): + maxval = i*j*k + bm = i + bn = j + 
bk = k + + return (bm, bn, bk) + + @classmethod + def HSW_condition(cls, bm, bn, bk, v_size): + # ceiling division + vm = -(bm // -v_size) + return (bn + bk) * vm + bn * bk <= 16 + + @classmethod + def HSW_condition_extended(cls, bm, bn, bk, v_size): + # ceiling division + vm = -(bm // -v_size) + return bn * vm + bn * bk + 1 <= 16 Default = Max diff --git a/pspamm/codegen/architectures/knl/blocksize.py b/pspamm/codegen/architectures/knl/blocksize.py index 21a9c2b..5302413 100644 --- a/pspamm/codegen/architectures/knl/blocksize.py +++ b/pspamm/codegen/architectures/knl/blocksize.py @@ -91,4 +91,29 @@ def KNL_condition(cls, bm, bn, bk, v_size): vm = -(bm // -v_size) return (bn+bk) * vm <= 32 +class CubeBn: + @classmethod + def getBlocksize(cls, m, n, bk, v_size, prec): + + bm = v_size + bn = 1 + + maxval = 0 + + for j in range(1, n+1): + for k in range(1, 200): + if cls.KNL_condition(bm, j, k, v_size): + if j*k > maxval: + maxval = j*k + bn = j + bk = k + + return (bm, bn, bk) + + @classmethod + def KNL_condition(cls, bm, bn, bk, v_size): + # ceiling division + vm = -(bm // -v_size) + return (bn+bk) * vm <= 32 + Default = MaxBn diff --git a/tests/testsuite_generator.py b/tests/testsuite_generator.py index 2e3bcea..13cf7d1 100755 --- a/tests/testsuite_generator.py +++ b/tests/testsuite_generator.py @@ -263,6 +263,7 @@ def make(kernels, arch): # ceiling division vm = -(bm // -v_len) v_size = v_len + elem128 = (16 // kern.precision.size()) if arch.startswith("knl"): if not ((bn+bk) * vm <= 32): @@ -273,11 +274,15 @@ def make(kernels, arch): print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') continue elif arch.startswith("arm_sve"): - if not ((bn+bk) * vm + bn * bk <= 32): + vkext = -(bk // -elem128) + isvkext = bn*vkext < 16 if elem128 == 2 else bn*vkext < 8 + vk = vkext if isvkext else bk + if not ((bn+bk) * vm + bn * vk <= 32): print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') continue elif arch.startswith("arm"): - if not ((bn+bk) * vm + 
bn * bk <= 32) or not (kern.m % v_size) == 0 or not (bm % v_size) == 0: + vk = -(bk // -elem128) + if not ((bn+bk) * vm + bn * vk <= 32) or not (kern.m % v_size) == 0 or not (bm % v_size) == 0: print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') continue diff --git a/tests/unit_test.py b/tests/unit_test.py index 45ef3db..2ddeebc 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -18,10 +18,10 @@ blocksize = import_module("pspamm.codegen.architectures." + archname + ".blocksize") scripts = { - "arm": lambda blocksize: [blocksize.Old, blocksize.Max], - "arm_sve": lambda blocksize: [blocksize.Max], - "knl": lambda blocksize: [blocksize.Old, blocksize.Max, blocksize.MaxBn], - "hsw": lambda blocksize: [blocksize.Old, blocksize.Max], + "arm": lambda blocksize: [blocksize.Old, blocksize.Max, blocksize.MaxK, blocksize.Cube], + "arm_sve": lambda blocksize: [blocksize.Max, blocksize.MaxK, blocksize.Cube], + "knl": lambda blocksize: [blocksize.Old, blocksize.Max, blocksize.MaxBn, blocksize.CubeBn], + "hsw": lambda blocksize: [blocksize.Old, blocksize.Max, blocksize.Cube], } blocksize_algs = scripts[archname](blocksize) + [blocksize.Default] From a8a1a628c5aa511942a5bf2329d6846fde93e2fd Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sun, 20 Oct 2024 06:16:46 +0200 Subject: [PATCH 55/64] Towards masking for k --- .../architectures/arm_sve/generator.py | 58 ++++++++++++++----- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/pspamm/codegen/architectures/arm_sve/generator.py b/pspamm/codegen/architectures/arm_sve/generator.py index 3d50bc5..6fed922 100644 --- a/pspamm/codegen/architectures/arm_sve/generator.py +++ b/pspamm/codegen/architectures/arm_sve/generator.py @@ -121,7 +121,7 @@ def make_reg_blocks(self, bm: int, bn: int, bk: int, v_size: int, nnz: int, m: i mask_regs = [p(0), p(7)] - self.init_registers(bm, bk, v_size) + self.init_registers(bm, k, bk, v_size, nnz) return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, 
loop_regs, additional_regs, mask_regs @@ -162,13 +162,16 @@ def init_mask(self, def init_registers(self, bm: int, + k: int, bk: int, - v_size: int + v_size: int, + nnz: int ) -> None: bmmod = bm % v_size elem128 = 16 // self.get_precision().size() - bkmod = bk % elem128 + bkmod = bk % elem128 if self.inline_broadcast else 0 + kmod = (k % bk) % elem128 if self.inline_broadcast else 0 eol = "\\n\\t" # define the "end of line" sequence for easy assembly # determine the predicate suffix @@ -184,22 +187,32 @@ def init_registers(self, comment = "// p7 denotes the 'all-true' predicate\n\t" comment += "// if given, p0 denotes the 'bm % v_size' predicate\n\t" - # comment += "// if given, p1 denotes the 'bk % elem128' predicate\n\t" + comment += "// if given, p1 denotes the 'bk % elem128' predicate\n\t" + comment += "// if given, p2 denotes the 'k % elem128' predicate\n\t" + + self.has_k_overhead = kmod != 0 + self.has_bk_overhead = bkmod != 0 + self.has_nnz_overhead = nnz % elem128 != 0 + # specification for ptrue: https://developer.arm.com/documentation/ddi0596/2021-12/SVE-Instructions/PTRUE--Initialise-predicate-from-named-constraint- # search for 'DecodePredCount' for the explanation of how the pattern in 'ptrue p{d}.{suffix}, #pattern' is decoded: # https://developer.arm.com/documentation/ddi0596/2020-12/Shared-Pseudocode/AArch64-Functions?lang=en#impl-aarch64.DecodePredCount.2 # 'ptrue' doesnt work for initialising overhead predicate when using single precision -> see valid patterns from above # overhead = "\"ptrue p0.{suffix}, #{overhead}{eol}\"\n\t" if bm != 0 else "" # define overhead predicate overhead_m = "\"mov {gen_reg}{overhead_counter}, #{overhead_m}{eol}\"\n\t\"whilelo p0.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if bmmod != 0 else "" - overhead_k = "" # "\"mov {gen_reg}{overhead_counter}, #{overhead_k}{eol}\"\n\t\"whilelo p1.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if bkmod != 0 else "" + overhead_bk = "\"mov 
{gen_reg}{overhead_counter}, #{overhead_bk}{eol}\"\n\t\"whilelo p1.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if self.has_bk_overhead else "" + overhead_k = "\"mov {gen_reg}{overhead_counter}, #{overhead_k}{eol}\"\n\t\"whilelo p2.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if self.has_k_overhead else "" + overhead_nnz = "\"mov {gen_reg}{overhead_counter}, #{overhead_nnz}{eol}\"\n\t\"whilelo p3.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if self.has_nnz_overhead else "" all_true = "\"ptrue p7.{suffix}, #31{eol}\"" # define all true predicate - init_registers = (comment + overhead_m + overhead_k + all_true).format(suffix=p_suffix, - gen_reg=gen_reg, - overhead_counter=overhead_counter, - v_size=v_size, - overhead_m=bmmod, - overhead_k=bkmod, - eol=eol) + init_registers = (comment + overhead_m + overhead_bk + overhead_k + overhead_nnz + all_true).format(suffix=p_suffix, + gen_reg=gen_reg, + overhead_counter=overhead_counter, + v_size=v_size, + overhead_m=bmmod, + overhead_bk=bkmod, + overhead_k=kmod, + overhead_nnz=nnz % elem128, + eol=eol) # since .format() doesn't allow partial formatting, we need to re-include the # placeholders that are replaced at the end of generating a kernel @@ -340,9 +353,6 @@ def make_microkernel(self, cur11 = -1000 Vm = max(self.ceil_div(bm, v_size), 1) - elem128 = 16 // self.get_precision().size() - vk = -(bk // -elem128) - multiple = self.precision.size() # for ld1rw (single prec): immediate offset is multiple of 4 in range of 0 to 252 # for ld1rd (double prec): immediate offset is multiple of 8 in range of 0 to 504 @@ -350,12 +360,26 @@ def make_microkernel(self, if not self.inline_broadcast: max_offs = (2 ** 6 - 1) * multiple divider = 1 + elem128 = 1 + vk = bk + preg = 'p7/z' + preg_last = 'p7/z' else: max_offs = 127 divider = 16 + elem128 = 16 // self.get_precision().size() + vk = -(bk // -elem128) + + #if isinstance(B, DenseCursor): + # preg = 'p1/z' if self.has_bk_overhead 
else 'p7/z' + # preg_last = 'p2/z' if self.has_k_overhead else preg + #else: + # preg = 'p7/z' + # preg_last = 'p7/z' + preg = 'p7/z' + preg_last = 'p7/z' for Vmi in range(Vm): # set to all v_size predicates to true, we want to replicate a B element into a whole vector - p_zeroing = self.pred_n_trues(v_size, v_size, "z") for bki in range(bk): # inside this k-block bki_reg = bki // elem128 for bni in range(bn): # inside this n-block @@ -363,6 +387,8 @@ def make_microkernel(self, if B.has_nonzero_cell(B_ptr, to_B_block, to_cell): B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_cell) if B_regs[bki_reg, bni] not in bs: + p_zeroing = Register_ARM(AsmType.p64x8, preg_last) if bki_reg + 1 == vk else Register_ARM(AsmType.p64x8, preg) + # max_offs is the maximum allowed immediate offset when using ld1rd/ld1rw to broadcast a scalar value if B_cell_addr.disp > max_offs or B_cell_addr.disp % divider != 0: moved = B_cell_addr.disp - cur11 From 096f190bc6ec6986e245cacce66524a0ae1184f6 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Mon, 21 Oct 2024 19:36:17 +0200 Subject: [PATCH 56/64] Fix ARM_SVE blocksizes --- pspamm/codegen/architectures/arm_sve/blocksize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pspamm/codegen/architectures/arm_sve/blocksize.py b/pspamm/codegen/architectures/arm_sve/blocksize.py index b80bf0a..57d426f 100644 --- a/pspamm/codegen/architectures/arm_sve/blocksize.py +++ b/pspamm/codegen/architectures/arm_sve/blocksize.py @@ -46,7 +46,7 @@ def getBlocksize(cls, m, n, bk, v_size, prec): elem128 = 16 // prec.size() for i in range(1, m + 1, 1): - next_multiple = -(i // -v_size) + next_multiple = -(i // -v_size) * v_size for j in range(1, n + 1): if cls.ARM_condition(next_multiple, j, bk, v_size, elem128) and cls.tileable(m, i): if i * j >= maxval: @@ -86,7 +86,7 @@ def getBlocksize(cls, m, n, bk, v_size, prec): elem128 = 16 // prec.size() for i in range(1, m + 1, 1): - next_multiple = -(i // -v_size) + next_multiple = -(i 
// -v_size) * v_size for j in range(1, n + 1): for k in range(1, 200): if cls.ARM_condition(next_multiple, j, k, v_size, elem128) and cls.tileable(m, i): From e17d13844e3ffe5ecbfa71dc20c6062f6deef0c8 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Mon, 21 Oct 2024 21:27:17 +0200 Subject: [PATCH 57/64] Change load order for the B matrix --- pspamm/codegen/architectures/arm/generator.py | 4 ++-- pspamm/codegen/architectures/arm_sve/generator.py | 6 +++--- pspamm/codegen/architectures/hsw/generator.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pspamm/codegen/architectures/arm/generator.py b/pspamm/codegen/architectures/arm/generator.py index 3cc1416..654fac7 100644 --- a/pspamm/codegen/architectures/arm/generator.py +++ b/pspamm/codegen/architectures/arm/generator.py @@ -219,8 +219,8 @@ def make_microkernel(self, bs = [] cur11 = -1000 for Vmi in range(bm//v_size): - for bki in range(bk): # inside this k-block - for bni in range(bn): # inside this n-block + for bni in range(bn): # inside this n-block + for bki in range(bk): # inside this k-block to_cell = Coords(down=bki, right=bni) bki_reg = bki // elem128 if B.has_nonzero_cell(B_ptr, to_B_block, to_cell): diff --git a/pspamm/codegen/architectures/arm_sve/generator.py b/pspamm/codegen/architectures/arm_sve/generator.py index 6fed922..305897b 100644 --- a/pspamm/codegen/architectures/arm_sve/generator.py +++ b/pspamm/codegen/architectures/arm_sve/generator.py @@ -380,9 +380,9 @@ def make_microkernel(self, preg_last = 'p7/z' for Vmi in range(Vm): # set to all v_size predicates to true, we want to replicate a B element into a whole vector - for bki in range(bk): # inside this k-block - bki_reg = bki // elem128 - for bni in range(bn): # inside this n-block + for bni in range(bn): # inside this n-block + for bki in range(bk): # inside this k-block + bki_reg = bki // elem128 to_cell = Coords(down=bki, right=bni) if B.has_nonzero_cell(B_ptr, to_B_block, to_cell): B_cell_addr, B_comment = 
B.look(B_ptr, to_B_block, to_cell) diff --git a/pspamm/codegen/architectures/hsw/generator.py b/pspamm/codegen/architectures/hsw/generator.py index b0c0a2e..5d3598d 100644 --- a/pspamm/codegen/architectures/hsw/generator.py +++ b/pspamm/codegen/architectures/hsw/generator.py @@ -282,8 +282,8 @@ def make_microkernel(self, bs = [] bsv = [] for Vmi in range(bm//v_size): - for bki in range(bk): # inside this k-block - for bni in range(bn): # inside this n-block + for bni in range(bn): # inside this n-block + for bki in range(bk): # inside this k-block to_cell = Coords(down=bki, right=bni) if B.has_nonzero_cell(B_ptr, to_B_block, to_cell): B_addr, B_comment = B.look(B_ptr, to_B_block, to_cell) From aa95d58a74472ade49bd278dfb312121ee2ef954 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 26 Oct 2024 00:22:45 +0200 Subject: [PATCH 58/64] Set MaxK to default --- pspamm/codegen/architectures/arm/blocksize.py | 2 +- pspamm/codegen/architectures/arm_sve/blocksize.py | 2 +- pspamm/codegen/architectures/hsw/blocksize.py | 2 +- pspamm/codegen/architectures/knl/blocksize.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pspamm/codegen/architectures/arm/blocksize.py b/pspamm/codegen/architectures/arm/blocksize.py index 880cced..c8e3740 100644 --- a/pspamm/codegen/architectures/arm/blocksize.py +++ b/pspamm/codegen/architectures/arm/blocksize.py @@ -122,4 +122,4 @@ def ARM_condition(cls, bm, bn, bk, v_size, elem128): vk = -(bk // -elem128) return (bn+bk) * vm + bn*vk <= 32 -Default = Max +Default = MaxK diff --git a/pspamm/codegen/architectures/arm_sve/blocksize.py b/pspamm/codegen/architectures/arm_sve/blocksize.py index 57d426f..8b8c264 100644 --- a/pspamm/codegen/architectures/arm_sve/blocksize.py +++ b/pspamm/codegen/architectures/arm_sve/blocksize.py @@ -114,4 +114,4 @@ def ARM_condition(cls, bm, bn, bk, v_size, elem128): def tileable(cls, m, bm): return m % bm == 0 -Default = Max +Default = MaxK diff --git 
a/pspamm/codegen/architectures/hsw/blocksize.py b/pspamm/codegen/architectures/hsw/blocksize.py index b2611db..0a38028 100644 --- a/pspamm/codegen/architectures/hsw/blocksize.py +++ b/pspamm/codegen/architectures/hsw/blocksize.py @@ -84,7 +84,7 @@ def getBlocksize(cls, m, n, bk, v_size, prec): # can be replaced by cls.HSW_condition_extended here # (but that seemed to be slower in the end) if cls.HSW_condition(i, j, bk, v_size): - if i*j*k > maxval and (cls.HSW_condition(i, j, k, v_size) or j > 1): + if i*j*k >= maxval and (cls.HSW_condition(i, j, k, v_size) or j > 1): maxval = i*j*k bm = i bn = j diff --git a/pspamm/codegen/architectures/knl/blocksize.py b/pspamm/codegen/architectures/knl/blocksize.py index 5302413..e51165f 100644 --- a/pspamm/codegen/architectures/knl/blocksize.py +++ b/pspamm/codegen/architectures/knl/blocksize.py @@ -103,7 +103,7 @@ def getBlocksize(cls, m, n, bk, v_size, prec): for j in range(1, n+1): for k in range(1, 200): if cls.KNL_condition(bm, j, k, v_size): - if j*k > maxval: + if j*k >= maxval: maxval = j*k bn = j bk = k From 6193c093c1ad0986a59a1d2750775fac4b91f617 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sun, 27 Oct 2024 03:03:07 +0100 Subject: [PATCH 59/64] Bugfixes --- pspamm/codegen/architectures/arm_sve/generator.py | 9 ++++++--- pspamm/matmul.py | 1 - 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pspamm/codegen/architectures/arm_sve/generator.py b/pspamm/codegen/architectures/arm_sve/generator.py index 305897b..9ba194d 100644 --- a/pspamm/codegen/architectures/arm_sve/generator.py +++ b/pspamm/codegen/architectures/arm_sve/generator.py @@ -273,12 +273,15 @@ def move_register_block(self, addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) addr.disp += self.precision.size() * load_offset + offset = addr.disp - prev_disp + # count how many elements we have processed between last step and this step - cont_counter = ((addr.disp - prev_disp) // mul_vl) + cont_counter = (offset // mul_vl) 
larger_max_offset = cont_counter > max_mem_ins_mult + non_dividing_offset = offset % mul_vl != 0 - if larger_max_offset or (prev_overhead and addr.disp > 0): - offset_comment = "disp > {}".format(max_offset) if larger_max_offset else "previous mem. instr. used p0" + if larger_max_offset or (prev_overhead and addr.disp > 0) or non_dividing_offset: + offset_comment = f"disp > {max_offset}" if larger_max_offset else ("disp % VL != 0" if non_dividing_offset else "previous mem. instr. used p0") asm.add(add(addr.disp, additional_regs[0], offset_comment, addr.base)) prev_disp = addr.disp addr.base = additional_regs[0] diff --git a/pspamm/matmul.py b/pspamm/matmul.py index f4252f2..000c14a 100644 --- a/pspamm/matmul.py +++ b/pspamm/matmul.py @@ -325,7 +325,6 @@ def kernelK(asm, Bki, A_ptr, B_ptr): store_block.add(fma(regs[ir, x + ic], self.alpha_reg[1], A_regs_cut[ir, ic], "C = C + alpha * AB", None, pred=pred_m)) store_block.add(self.generator.move_register_block(self.C, C_ptr, Coords(), A_regs_cut, self.v_size, self.additional_regs, None, True, self.prefetching, self.ldc * x)) asm.add(store_block) - else: asm.add(self.generator.move_register_block(self.C, C_ptr, Coords(), regs, self.v_size, self.additional_regs, None, True, self.prefetching)) From 5fcf5fde60f056ae9bb1bdf9faf8854dfc269efa Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sun, 27 Oct 2024 03:03:15 +0100 Subject: [PATCH 60/64] Remove old gitignores --- tests/arm/.gitignore | 2 -- tests/arm_sve/.gitignore | 2 -- 2 files changed, 4 deletions(-) delete mode 100644 tests/arm/.gitignore delete mode 100644 tests/arm_sve/.gitignore diff --git a/tests/arm/.gitignore b/tests/arm/.gitignore deleted file mode 100644 index d6b7ef3..0000000 --- a/tests/arm/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore diff --git a/tests/arm_sve/.gitignore b/tests/arm_sve/.gitignore deleted file mode 100644 index d6b7ef3..0000000 --- a/tests/arm_sve/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore From 
13bd62243fbf4085792bb852f2732006d1dab462 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Wed, 30 Oct 2024 21:58:20 +0100 Subject: [PATCH 61/64] Fix k predicates --- .../architectures/arm_sve/generator.py | 48 +++++++++++-------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/pspamm/codegen/architectures/arm_sve/generator.py b/pspamm/codegen/architectures/arm_sve/generator.py index 9ba194d..d5007cf 100644 --- a/pspamm/codegen/architectures/arm_sve/generator.py +++ b/pspamm/codegen/architectures/arm_sve/generator.py @@ -36,6 +36,7 @@ class Generator(AbstractGenerator): prefetch_count = 0 is_sparse = False v_len = 4 # vector register length: v_len * 128 bit + predicates = {} def get_v_size(self): return (16 // self.precision.size()) * self.v_len @@ -61,12 +62,12 @@ def pred_n_trues(self, num_trues: int, v_size: int, suffix: str = None) -> Regis assert (suffix == "m" or suffix == "z" or suffix is None) # we only use p7 or p0 as predicates (1 == p0, 8 == p7) - num_trues = 8 if num_trues >= v_size else 1 + index = 7 if num_trues >= v_size else self.predicates[num_trues] if suffix is None: - s = "p{}".format(num_trues - 1) + s = f"p{index}" else: - s = "p{}/{}".format(num_trues - 1, suffix) + s = f"p{index}/{suffix}" return Register_ARM(AsmType.p64x8, s) # is called at most one time in matmul.py @@ -121,7 +122,7 @@ def make_reg_blocks(self, bm: int, bn: int, bk: int, v_size: int, nnz: int, m: i mask_regs = [p(0), p(7)] - self.init_registers(bm, k, bk, v_size, nnz) + self.init_registers(m, bm, k, bk, v_size, nnz) return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, mask_regs @@ -161,6 +162,7 @@ def init_mask(self, return asm def init_registers(self, + m: int, bm: int, k: int, bk: int, @@ -172,6 +174,7 @@ def init_registers(self, elem128 = 16 // self.get_precision().size() bkmod = bk % elem128 if self.inline_broadcast else 0 kmod = (k % bk) % elem128 if self.inline_broadcast else 0 + mmod = (m % bm) % v_size eol 
= "\\n\\t" # define the "end of line" sequence for easy assembly # determine the predicate suffix @@ -189,6 +192,7 @@ def init_registers(self, comment += "// if given, p0 denotes the 'bm % v_size' predicate\n\t" comment += "// if given, p1 denotes the 'bk % elem128' predicate\n\t" comment += "// if given, p2 denotes the 'k % elem128' predicate\n\t" + comment += "// if given, p4 denotes the 'k % v_size' predicate\n\t" self.has_k_overhead = kmod != 0 self.has_bk_overhead = bkmod != 0 @@ -199,21 +203,29 @@ def init_registers(self, # https://developer.arm.com/documentation/ddi0596/2020-12/Shared-Pseudocode/AArch64-Functions?lang=en#impl-aarch64.DecodePredCount.2 # 'ptrue' doesnt work for initialising overhead predicate when using single precision -> see valid patterns from above # overhead = "\"ptrue p0.{suffix}, #{overhead}{eol}\"\n\t" if bm != 0 else "" # define overhead predicate - overhead_m = "\"mov {gen_reg}{overhead_counter}, #{overhead_m}{eol}\"\n\t\"whilelo p0.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if bmmod != 0 else "" + overhead_bm = "\"mov {gen_reg}{overhead_counter}, #{overhead_bm}{eol}\"\n\t\"whilelo p0.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if bmmod != 0 else "" overhead_bk = "\"mov {gen_reg}{overhead_counter}, #{overhead_bk}{eol}\"\n\t\"whilelo p1.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if self.has_bk_overhead else "" overhead_k = "\"mov {gen_reg}{overhead_counter}, #{overhead_k}{eol}\"\n\t\"whilelo p2.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if self.has_k_overhead else "" overhead_nnz = "\"mov {gen_reg}{overhead_counter}, #{overhead_nnz}{eol}\"\n\t\"whilelo p3.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if self.has_nnz_overhead else "" + overhead_m = "\"mov {gen_reg}{overhead_counter}, #{overhead_m}{eol}\"\n\t\"whilelo p4.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if mmod != 0 else "" all_true = "\"ptrue p7.{suffix}, 
#31{eol}\"" # define all true predicate - init_registers = (comment + overhead_m + overhead_bk + overhead_k + overhead_nnz + all_true).format(suffix=p_suffix, + init_registers = (comment + overhead_bm + overhead_bk + overhead_k + overhead_nnz + overhead_m + all_true).format(suffix=p_suffix, gen_reg=gen_reg, overhead_counter=overhead_counter, v_size=v_size, - overhead_m=bmmod, + overhead_bm=bmmod, overhead_bk=bkmod, overhead_k=kmod, + overhead_m=mmod, overhead_nnz=nnz % elem128, eol=eol) + self.predicates[v_size] = 7 + if bmmod != 0: self.predicates[bmmod] = 0 + if bkmod != 0: self.predicates[bkmod] = 1 + if kmod != 0: self.predicates[kmod] = 2 + if mmod != 0: self.predicates[mmod] = 4 + # since .format() doesn't allow partial formatting, we need to re-include the # placeholders that are replaced at the end of generating a kernel self.template = self.get_template().format(init_registers=init_registers, @@ -261,12 +273,14 @@ def move_register_block(self, # this gives us the base register of 'cursor' irrespective of the dummy offset we use prev_base = cursor.look(cursor_ptr, block_offset, Coords(down=0, right=0))[0].base + process_size = min(v_size, cursor.br) + for ic in range(cols): for ir in range(rows): if (mask is None) or (mask[ir, ic]): - processed = ir * v_size - p = self.pred_n_trues(b_row - processed, v_size) if not is_B else self.pred_n_trues(v_size, v_size) - p_zeroing = self.pred_n_trues(b_row - processed, v_size, "z") if not is_B else self.pred_n_trues(v_size, v_size, "z") + processed = ir * process_size + p = self.pred_n_trues(min(b_row - processed, process_size), v_size) if not is_B else self.pred_n_trues(process_size, v_size) + p_zeroing = self.pred_n_trues(min(b_row - processed, process_size), v_size, "z") if not is_B else self.pred_n_trues(process_size, v_size, "z") cell_offset = Coords(down=ir * v_size, right=ic) # addr = base "pointer" + relative offset in bytes @@ -365,22 +379,14 @@ def make_microkernel(self, divider = 1 elem128 = 1 vk = bk - 
preg = 'p7/z' - preg_last = 'p7/z' else: max_offs = 127 divider = 16 elem128 = 16 // self.get_precision().size() vk = -(bk // -elem128) - #if isinstance(B, DenseCursor): - # preg = 'p1/z' if self.has_bk_overhead else 'p7/z' - # preg_last = 'p2/z' if self.has_k_overhead else preg - #else: - # preg = 'p7/z' - # preg_last = 'p7/z' - preg = 'p7/z' - preg_last = 'p7/z' + preg = self.pred_n_trues(elem128, elem128, 'z') + preg_last = preg if bk % elem128 == 0 else self.pred_n_trues(bk % elem128, elem128, 'z') for Vmi in range(Vm): # set to all v_size predicates to true, we want to replicate a B element into a whole vector for bni in range(bn): # inside this n-block @@ -390,7 +396,7 @@ def make_microkernel(self, if B.has_nonzero_cell(B_ptr, to_B_block, to_cell): B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_cell) if B_regs[bki_reg, bni] not in bs: - p_zeroing = Register_ARM(AsmType.p64x8, preg_last) if bki_reg + 1 == vk else Register_ARM(AsmType.p64x8, preg) + p_zeroing = preg_last if bki_reg + 1 == vk else preg # max_offs is the maximum allowed immediate offset when using ld1rd/ld1rw to broadcast a scalar value if B_cell_addr.disp > max_offs or B_cell_addr.disp % divider != 0: From 9c430fcafcb08f2cb95579ed8b62bdd890900f87 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Wed, 30 Oct 2024 21:58:25 +0100 Subject: [PATCH 62/64] Fix vm overhead --- pspamm/matmul.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pspamm/matmul.py b/pspamm/matmul.py index 000c14a..76d2ed6 100644 --- a/pspamm/matmul.py +++ b/pspamm/matmul.py @@ -183,11 +183,17 @@ def __init__(self, bpattern = Matrix.load(mtx_filename) if self.masks: self.generator.set_sparse() + else: + assert self.k <= ldb if lda == 0: apattern = Matrix.load(mtx_filename) if self.masks: self.generator.set_sparse() + else: + assert self.m <= lda + + assert self.m <= ldc self.nnz = 0 self.flop = 0 @@ -286,7 +292,7 @@ def kernelK(asm, Bki, A_ptr, B_ptr): if keep: 
asm.add(self.generator.make_microkernel(self.A, self.B, A_ptr, B_ptr, self.A_regs, self.B_regs, regs, self.v_size, self.additional_regs, to_A, to_B)) - + if unroll: for Bki in range(Bk): kernelK(asm, Bki, A_ptr, B_ptr) @@ -376,13 +382,14 @@ def make(self): loop(self.loop_regs[0], 0, Bm, 1).body(*loopBody) ) - vm_overhead = (self.m % self.bm) // self.v_size + m_overhead = self.m % self.bm + vm_overhead = -(m_overhead // -self.v_size) if vm_overhead > 0: self.m = self.m % self.bm self.bm = self.m % self.bm - self.A_regs = self.A_regs[0:self.bm // self.v_size, 0:self.bk] - self.C_regs = self.C_regs[0:self.bm // self.v_size, 0:self.bn] + self.A_regs = self.A_regs[0:vm_overhead, 0:self.bk] + self.C_regs = self.C_regs[0:vm_overhead, 0:self.bn] self.A.r = self.m asm.add(self.make_nk_unroll(self.unroll)) From 885c9d4358ff9334b58c84490fd96ab6961cacc7 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Wed, 30 Oct 2024 23:01:05 +0100 Subject: [PATCH 63/64] Fix AVX512 masking --- pspamm/codegen/architectures/arm/generator.py | 2 +- .../architectures/arm_sve/generator.py | 1 + pspamm/codegen/architectures/hsw/generator.py | 2 +- pspamm/codegen/architectures/knl/generator.py | 41 ++++++++++++------- .../architectures/knl/inlineprinter.py | 27 ++++++------ pspamm/matmul.py | 2 +- 6 files changed, 43 insertions(+), 32 deletions(-) diff --git a/pspamm/codegen/architectures/arm/generator.py b/pspamm/codegen/architectures/arm/generator.py index 654fac7..b80e16f 100644 --- a/pspamm/codegen/architectures/arm/generator.py +++ b/pspamm/codegen/architectures/arm/generator.py @@ -43,7 +43,7 @@ def use_broadcast(self): def has_masks(self): return False - def init_mask(self, bm, v_size, tempreg, maskregs): + def init_mask(self, m, bm, v_size, tempreg, maskregs): return block("") def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int): diff --git a/pspamm/codegen/architectures/arm_sve/generator.py b/pspamm/codegen/architectures/arm_sve/generator.py index 
d5007cf..e7155cb 100644 --- a/pspamm/codegen/architectures/arm_sve/generator.py +++ b/pspamm/codegen/architectures/arm_sve/generator.py @@ -152,6 +152,7 @@ def make_b_pointers(self, return asm def init_mask(self, + m: int, bm: int, v_size: int, tempreg, diff --git a/pspamm/codegen/architectures/hsw/generator.py b/pspamm/codegen/architectures/hsw/generator.py index 5d3598d..5ad7c28 100644 --- a/pspamm/codegen/architectures/hsw/generator.py +++ b/pspamm/codegen/architectures/hsw/generator.py @@ -46,7 +46,7 @@ def use_broadcast(self): def has_masks(self): return False - def init_mask(self, bm, v_size, tempreg, maskregs): + def init_mask(self, m, bm, v_size, tempreg, maskregs): return block("") def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int): diff --git a/pspamm/codegen/architectures/knl/generator.py b/pspamm/codegen/architectures/knl/generator.py index e340e4f..01b7fcd 100644 --- a/pspamm/codegen/architectures/knl/generator.py +++ b/pspamm/codegen/architectures/knl/generator.py @@ -33,6 +33,7 @@ class Generator(AbstractGenerator): }}}}; """ v_len = 4 + predicates = {0:mask(0)} def get_v_size(self): return (16 // self.precision.size()) * self.v_len @@ -48,8 +49,8 @@ def has_masks(self): def pred_n_trues(self, count, v_size, mode): # a bit hacky at the moment (won't work for all masks) - if count < v_size and count > 0: - return Predicate(mask(0), mode=='z') + if count < v_size: + return Predicate(self.predicates[count], mode=='z') else: return None @@ -77,7 +78,7 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: additional_regs = [r(8)] - mask_regs = [mask(0)] + mask_regs = [mask(1), mask(2)] reg_count = 0 @@ -95,17 +96,30 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: loop_regs = [r(12), r(13), r(14)] + # FIXME: a bit hacky to have the mask setup here + rest = bm % v_size + rest2 = (m % bm) % v_size + self.predicates[rest] = mask(1) + 
self.predicates[rest2] = mask(2) + self.predicates[0] = mask(0) + return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, mask_regs - def init_mask(self, bm, v_size, tempreg, maskregs): + def init_mask(self, m, bm, v_size, tempreg, maskregs): rest = bm % v_size - if rest == 0: + rest2 = (m % bm) % v_size + if rest == 0 and rest2 == 0: return block("") else: - asm = block("Set mask register") - restval = (1 << rest) - 1 - asm.add(mov(restval, tempreg, False)) - asm.add(mov(tempreg, maskregs[0], False)) + asm = block("Set mask registers") + if rest > 0: + restval = (1 << rest) - 1 + asm.add(mov(restval, tempreg, False)) + asm.add(mov(tempreg, maskregs[0], False)) + if rest2 > 0: + restval2 = (1 << rest2) - 1 + asm.add(mov(restval2, tempreg, False)) + asm.add(mov(tempreg, maskregs[1], False)) return asm def bcst_alpha_beta(self, @@ -114,9 +128,6 @@ def bcst_alpha_beta(self, ) -> Block: asm = block("Broadcast alpha and beta using inline broadcasting") - -# asm.add(bcst(alpha_reg[0], alpha_reg[1])) -# asm.add(bcst(beta_reg[0], beta_reg[1])) return asm @@ -201,6 +212,8 @@ def move_register_block(self, b_row, _, _, _ = cursor.get_block(cursor_ptr, block_offset) + process_size = min(v_size, cursor.br) + for ic in range(cols): for ir in range(rows): if (mask is None) or (mask[ir,ic]): @@ -208,8 +221,8 @@ def move_register_block(self, addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) addr.disp += self.precision.size() * load_offset - processed = ir * v_size - p = self.pred_n_trues(b_row - processed, v_size, 'z') + processed = ir * process_size + p = self.pred_n_trues(min(process_size, b_row - processed), v_size, 'm') if store: asm.add(mov(registers[ir,ic], addr, True, comment, pred=p)) if prefetching == 'BL2viaC': diff --git a/pspamm/codegen/architectures/knl/inlineprinter.py b/pspamm/codegen/architectures/knl/inlineprinter.py index 9b04759..8f5e73d 100644 --- a/pspamm/codegen/architectures/knl/inlineprinter.py +++ 
b/pspamm/codegen/architectures/knl/inlineprinter.py @@ -65,9 +65,9 @@ def maskformat(self, pred): if pred is None: return '' elif pred.zero: - return f'{{{{{pred.register.ugly}}}}}{{{{z}}}}' + return f'%{{{pred.register.ugly}%}}%{{z%}}' else: - return f'{{{{{pred.register.ugly}}}}}' + return f'%{{{pred.register.ugly}%}}' def visitFma(self, stmt: FmaStmt): mask = self.maskformat(stmt.pred) @@ -77,13 +77,13 @@ def visitFma(self, stmt: FmaStmt): regsize = stmt.add_dest.size() // 16 extent = regsize * self.broadcast_multiplier if stmt.bcast is not None: - s = f"vfmadd231{self.alupsuffix} {b}%{{1to{extent}%}} {mask}, {m}, {a}" + s = f"vfmadd231{self.alupsuffix} {b}%{{1to{extent}%}}, {m}, {a} {mask}" else: if stmt.mult_src.typeinfo == AsmType.i64: # in this case, m is a Register that points to alpha; manually format to be a memory address - s = f"vfmadd231{self.alupsuffix} 0({m})%{{1to{extent}%}} {mask}, {b}, {a}" + s = f"vfmadd231{self.alupsuffix} 0({m})%{{1to{extent}%}}, {b}, {a} {mask}" else: - s = f"vfmadd231{self.alupsuffix} {b} {mask}, {m}, {a}" + s = f"vfmadd231{self.alupsuffix} {b}, {m}, {a} {mask}" self.addLine(s, stmt.comment) def visitMul(self, stmt: MulStmt): @@ -95,9 +95,9 @@ def visitMul(self, stmt: MulStmt): extent = regsize * self.broadcast_multiplier if stmt.mult_src.typeinfo == AsmType.i64: # in this case, m is a Register that points to alpha/beta; manually format to be a memory address - s = f"vmul{self.alupsuffix} 0({m})%{{1to{extent}%}} {mask}, {b}, {a}" + s = f"vmul{self.alupsuffix} 0({m})%{{1to{extent}%}}, {b}, {a} {mask}" else: - s = f"vmul{self.alupsuffix} {b} {mask}, {m}, {a}" + s = f"vmul{self.alupsuffix} {b}, {m}, {a} {mask}" self.addLine(s, stmt.comment) def visitBcst(self, stmt: BcstStmt): @@ -111,7 +111,7 @@ def visitBcst(self, stmt: BcstStmt): instruction = 'vmovddup' else: instruction = f"vbroadcasts{self.psuffix}" - s = f"{instruction} {b} {mask}, {a}" + s = f"{instruction} {b}, {a} {mask}" self.addLine(s, stmt.comment) def 
visitAdd(self, stmt: AddStmt): @@ -129,7 +129,7 @@ def visitLabel(self, stmt: LabelStmt): def visitCmp(self, stmt: CmpStmt): mask = self.maskformat(stmt.pred) - s = f"cmp {stmt.lhs.ugly} {mask}, {stmt.rhs.ugly}" + s = f"cmp {stmt.lhs.ugly}, {stmt.rhs.ugly} {mask}" self.addLine(s, stmt.comment) def visitJump(self, stmt: JumpStmt): @@ -153,19 +153,16 @@ def visitMov(self, stmt: MovStmt): s = f"movq {src_str}, {stmt.dest.ugly}" elif stmt.typ == AsmType.f64x8 and stmt.aligned: if isinstance(stmt.src, Constant) and stmt.src.value == 0: - s = f"vpxord {stmt.dest.ugly} {mask}, {stmt.dest.ugly}, {stmt.dest.ugly}" + s = f"vpxord {stmt.dest.ugly}, {stmt.dest.ugly}, {stmt.dest.ugly} {mask}" else: - if isinstance(stmt.src, MemoryAddress): - s = f"vmovupd {src_str}, {stmt.dest.ugly} {mask}" - else: - s = f"vmovupd {src_str} {mask}, {stmt.dest.ugly}" + s = f"vmovupd {src_str}, {stmt.dest.ugly} {mask}" else: raise NotImplementedError() self.addLine(s, stmt.comment) def visitLea(self, stmt: LeaStmt): mask = self.maskformat(stmt.pred) - s = f"leaq {stmt.offset}({stmt.src.ugly}) {mask}, {stmt.dest.ugly}" + s = f"leaq {stmt.offset}({stmt.src.ugly}), {stmt.dest.ugly} {mask}" self.addLine(s, stmt.comment) def visitPrefetch(self, stmt: PrefetchStmt): diff --git a/pspamm/matmul.py b/pspamm/matmul.py index 76d2ed6..cc72f6f 100644 --- a/pspamm/matmul.py +++ b/pspamm/matmul.py @@ -378,7 +378,7 @@ def make(self): asm = block("unrolled_{}x{}x{}".format(self.m,self.n,self.k), self.generator.bcst_alpha_beta(self.alpha_reg, self.beta_reg), self.generator.make_scaling_offsets(self.additional_regs, self.nnz), - self.generator.init_mask(self.bm, self.v_size, self.loop_regs[0], self.mask_regs), + self.generator.init_mask(self.m, self.bm, self.v_size, self.loop_regs[0], self.mask_regs), loop(self.loop_regs[0], 0, Bm, 1).body(*loopBody) ) From a0ce34a17521334ac1a28850c92938e032281543 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Wed, 30 Oct 2024 23:57:41 +0100 Subject: [PATCH 64/64] Hotfix 
overlarge bm --- pspamm/codegen/architectures/arm_sve/generator.py | 2 ++ pspamm/codegen/architectures/knl/generator.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/pspamm/codegen/architectures/arm_sve/generator.py b/pspamm/codegen/architectures/arm_sve/generator.py index e7155cb..80a744e 100644 --- a/pspamm/codegen/architectures/arm_sve/generator.py +++ b/pspamm/codegen/architectures/arm_sve/generator.py @@ -280,6 +280,8 @@ def move_register_block(self, for ir in range(rows): if (mask is None) or (mask[ir, ic]): processed = ir * process_size + if processed >= b_row: + continue p = self.pred_n_trues(min(b_row - processed, process_size), v_size) if not is_B else self.pred_n_trues(process_size, v_size) p_zeroing = self.pred_n_trues(min(b_row - processed, process_size), v_size, "z") if not is_B else self.pred_n_trues(process_size, v_size, "z") cell_offset = Coords(down=ir * v_size, right=ic) diff --git a/pspamm/codegen/architectures/knl/generator.py b/pspamm/codegen/architectures/knl/generator.py index 01b7fcd..493bbdf 100644 --- a/pspamm/codegen/architectures/knl/generator.py +++ b/pspamm/codegen/architectures/knl/generator.py @@ -222,6 +222,8 @@ def move_register_block(self, addr.disp += self.precision.size() * load_offset processed = ir * process_size + if processed >= b_row: + continue p = self.pred_n_trues(min(process_size, b_row - processed), v_size, 'm') if store: asm.add(mov(registers[ir,ic], addr, True, comment, pred=p))