From 8acaf5265a636a384ec3a63cba34baa5b8c8c10c Mon Sep 17 00:00:00 2001 From: kstokop Date: Fri, 15 Nov 2024 12:26:12 +0000 Subject: [PATCH] Add Lit tests for ScalarizeFunction pass Current tests were only debug info. Created Lit test with proper checks. Created also opaque pointers versions, but pass not fully supports opaque pointers. The "Requires: opaque-ptr-fix" was added. --- IGC/Compiler/tests/ScalarizeFunction/basic.ll | 140 ----- .../tests/ScalarizeFunction/fneg_optnone.ll | 35 -- ...arize-binary-instruction-typed-pointers.ll | 258 ++++++++ .../scalarize-binary-instruction.ll | 260 ++++++++ ...alarize-cast-instruction-typed-pointers.ll | 195 ++++++ .../scalarize-cast-instruction.ll | 196 ++++++ ...alarize-comp-instruction-typed-pointers.ll | 217 +++++++ .../scalarize-comp-instruction.ll | 218 +++++++ ...etelementptr-instruction-typed-pointers.ll | 190 ++++++ .../scalarize-getelementptr-instruction.ll | 82 +++ ...calarize-phi-instruction-typed-pointers.ll | 587 +++++++++++++++++ .../scalarize-phi-instruction.ll | 589 ++++++++++++++++++ ...arize-select-instruction-typed-pointers.ll | 221 +++++++ .../scalarize-select-instruction.ll | 223 +++++++ ...larize-unary-instruction-typed-pointers.ll | 187 ++++++ .../scalarize-unary-instruction.ll | 194 ++++++ ...rize-vector-instructions-typed-pointers.ll | 145 +++++ .../scalarize-vector-instructions.ll | 146 +++++ .../selective-typed-pointers.ll | 352 +++++++++++ .../tests/ScalarizeFunction/selective.ll | 190 ++++-- 20 files changed, 4398 insertions(+), 227 deletions(-) delete mode 100644 IGC/Compiler/tests/ScalarizeFunction/basic.ll delete mode 100644 IGC/Compiler/tests/ScalarizeFunction/fneg_optnone.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-binary-instruction-typed-pointers.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-binary-instruction.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-cast-instruction-typed-pointers.ll create mode 100644 
IGC/Compiler/tests/ScalarizeFunction/scalarize-cast-instruction.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-comp-instruction-typed-pointers.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-comp-instruction.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-getelementptr-instruction-typed-pointers.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-getelementptr-instruction.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-phi-instruction-typed-pointers.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-phi-instruction.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-select-instruction-typed-pointers.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-select-instruction.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-unary-instruction-typed-pointers.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-unary-instruction.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-vector-instructions-typed-pointers.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-vector-instructions.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/selective-typed-pointers.ll diff --git a/IGC/Compiler/tests/ScalarizeFunction/basic.ll b/IGC/Compiler/tests/ScalarizeFunction/basic.ll deleted file mode 100644 index baf8bab18fc5..000000000000 --- a/IGC/Compiler/tests/ScalarizeFunction/basic.ll +++ /dev/null @@ -1,140 +0,0 @@ -;=========================== begin_copyright_notice ============================ -; -; Copyright (C) 2022 Intel Corporation -; -; SPDX-License-Identifier: MIT -; -;============================ end_copyright_notice ============================= -; -; RUN: igc_opt --igc-scalarize -S < %s | FileCheck %s -; ------------------------------------------------ -; ScalarizeFunction -; ------------------------------------------------ - -define spir_kernel 
void @test_unary(<2 x float> %src1) { -; CHECK-LABEL: @test_unary( -; CHECK: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1:%.*]], i32 0 -; CHECK: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 -; CHECK: [[TMP1:%.*]] = alloca <2 x float>, align 4 -; CHECK: [[TMP2:%.*]] = fneg float [[SRC1_SCALAR]] -; CHECK: [[TMP3:%.*]] = fneg float [[SRC1_SCALAR1]] -; CHECK: [[ASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0 -; CHECK: [[ASSEMBLED_VECT2:%.*]] = insertelement <2 x float> [[ASSEMBLED_VECT]], float [[TMP3]], i32 1 -; CHECK: store <2 x float> [[ASSEMBLED_VECT2]], <2 x float>* [[TMP1]], align 8 -; CHECK: ret void -; - %1 = alloca <2 x float>, align 4 - %2 = fneg <2 x float> %src1 - store <2 x float> %2, <2 x float>* %1, align 8 - ret void -} - -define spir_kernel void @test_binary(<2 x i32> %src1, <2 x i32> %src2) { -; CHECK-LABEL: @test_binary( -; CHECK: [[SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2:%.*]], i32 0 -; CHECK: [[SCALAR3:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 -; CHECK: [[SCALAR:%.*]] = extractelement <2 x i32> [[SRC1:%.*]], i32 0 -; CHECK: [[SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 -; CHECK: [[TMP1:%.*]] = alloca <2 x i32>, align 4 -; CHECK: [[TMP2:%.*]] = add i32 [[SCALAR]], [[SCALAR2]] -; CHECK: [[TMP3:%.*]] = add i32 [[SCALAR1]], [[SCALAR3]] -; CHECK: [[ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 -; CHECK: [[ASSEMBLED_VECT4:%.*]] = insertelement <2 x i32> [[ASSEMBLED_VECT]], i32 [[TMP3]], i32 1 -; CHECK: store <2 x i32> [[ASSEMBLED_VECT4]], <2 x i32>* [[TMP1]], align 8 -; CHECK: ret void -; - %1 = alloca <2 x i32>, align 4 - %2 = add <2 x i32> %src1, %src2 - store <2 x i32> %2, <2 x i32>* %1, align 8 - ret void -} - -define spir_kernel void @test_cast(<2 x i32> %src1) { -; CHECK-LABEL: @test_cast( -; CHECK: [[SCALAR:%.*]] = extractelement <2 x i32> [[SRC1:%.*]], i32 0 -; CHECK: [[SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 -; 
CHECK: [[TMP1:%.*]] = alloca <2 x i64>, align 4 -; CHECK: [[TMP2:%.*]] = alloca <4 x i16>, align 4 -; CHECK: [[TMP3:%.*]] = sext i32 [[SCALAR]] to i64 -; CHECK: [[TMP4:%.*]] = sext i32 [[SCALAR1]] to i64 -; CHECK: [[ASSEMBLED_VECT:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i32 0 -; CHECK: [[ASSEMBLED_VECT2:%.*]] = insertelement <2 x i64> [[ASSEMBLED_VECT]], i64 [[TMP4]], i32 1 -; CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[SRC1]] to <4 x i16> -; CHECK: store <2 x i64> [[ASSEMBLED_VECT2]], <2 x i64>* [[TMP1]], align 16 -; CHECK: store <4 x i16> [[TMP5]], <4 x i16>* [[TMP2]], align 8 -; CHECK: ret void -; - %1 = alloca <2 x i64>, align 4 - %2 = alloca <4 x i16>, align 4 - %3 = sext <2 x i32> %src1 to <2 x i64> - %4 = bitcast <2 x i32> %src1 to <4 x i16> - store <2 x i64> %3, <2 x i64>* %1, align 16 - store <4 x i16> %4, <4 x i16>* %2, align 8 - ret void -} - -define spir_kernel void @test_cmp(<2 x i32> %src1, <2 x i32> %src2) { -; CHECK-LABEL: @test_cmp( -; CHECK: [[SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2:%.*]], i32 0 -; CHECK: [[SCALAR3:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 -; CHECK: [[SCALAR:%.*]] = extractelement <2 x i32> [[SRC1:%.*]], i32 0 -; CHECK: [[SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 -; CHECK: [[TMP1:%.*]] = alloca <2 x i1>, align 4 -; CHECK: [[TMP2:%.*]] = icmp eq i32 [[SCALAR]], [[SCALAR2]] -; CHECK: [[TMP3:%.*]] = icmp eq i32 [[SCALAR1]], [[SCALAR3]] -; CHECK: [[ASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP2]], i32 0 -; CHECK: [[ASSEMBLED_VECT4:%.*]] = insertelement <2 x i1> [[ASSEMBLED_VECT]], i1 [[TMP3]], i32 1 -; CHECK: store <2 x i1> [[ASSEMBLED_VECT4]], <2 x i1>* [[TMP1]], align 1 -; CHECK: ret void -; - %1 = alloca <2 x i1>, align 4 - %2 = icmp eq <2 x i32> %src1, %src2 - store <2 x i1> %2, <2 x i1>* %1, align 1 - ret void -} - -define spir_kernel void @test_select(<2 x i32> %src1, <4 x i16> %src2, i1 %cond, <4 x i1> %vcond) { -; CHECK-LABEL: @test_select( -; CHECK: [[SCALAR6:%.*]] = 
extractelement <4 x i1> [[VCOND:%.*]], i32 0 -; CHECK: [[SCALAR7:%.*]] = extractelement <4 x i1> [[VCOND]], i32 1 -; CHECK: [[SCALAR8:%.*]] = extractelement <4 x i1> [[VCOND]], i32 2 -; CHECK: [[SCALAR9:%.*]] = extractelement <4 x i1> [[VCOND]], i32 3 -; CHECK: [[SCALAR2:%.*]] = extractelement <4 x i16> [[SRC2:%.*]], i32 0 -; CHECK: [[SCALAR3:%.*]] = extractelement <4 x i16> [[SRC2]], i32 1 -; CHECK: [[SCALAR4:%.*]] = extractelement <4 x i16> [[SRC2]], i32 2 -; CHECK: [[SCALAR5:%.*]] = extractelement <4 x i16> [[SRC2]], i32 3 -; CHECK: [[SCALAR:%.*]] = extractelement <2 x i32> [[SRC1:%.*]], i32 0 -; CHECK: [[SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 -; CHECK: [[TMP1:%.*]] = alloca <2 x i32>, align 4 -; CHECK: [[TMP2:%.*]] = alloca <4 x i16>, align 4 -; CHECK: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[SCALAR]], i32 42 -; CHECK: [[TMP4:%.*]] = select i1 [[COND]], i32 [[SCALAR1]], i32 13 -; CHECK: [[ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP3]], i32 0 -; CHECK: [[ASSEMBLED_VECT10:%.*]] = insertelement <2 x i32> [[ASSEMBLED_VECT]], i32 [[TMP4]], i32 1 -; CHECK: [[TMP5:%.*]] = select i1 [[SCALAR6]], i16 [[SCALAR2]], i16 1 -; CHECK: [[TMP6:%.*]] = select i1 [[SCALAR7]], i16 [[SCALAR3]], i16 2 -; CHECK: [[TMP7:%.*]] = select i1 [[SCALAR8]], i16 [[SCALAR4]], i16 3 -; CHECK: [[TMP8:%.*]] = select i1 [[SCALAR9]], i16 [[SCALAR5]], i16 4 -; CHECK: [[ASSEMBLED_VECT11:%.*]] = insertelement <4 x i16> undef, i16 [[TMP5]], i32 0 -; CHECK: [[ASSEMBLED_VECT12:%.*]] = insertelement <4 x i16> [[ASSEMBLED_VECT11]], i16 [[TMP6]], i32 1 -; CHECK: [[ASSEMBLED_VECT13:%.*]] = insertelement <4 x i16> [[ASSEMBLED_VECT12]], i16 [[TMP7]], i32 2 -; CHECK: [[ASSEMBLED_VECT14:%.*]] = insertelement <4 x i16> [[ASSEMBLED_VECT13]], i16 [[TMP8]], i32 3 -; CHECK: [[ASSEMBLED_VECT15:%.*]] = insertelement <4 x i16> undef, i16 [[SCALAR2]], i32 0 -; CHECK: [[ASSEMBLED_VECT16:%.*]] = insertelement <4 x i16> [[ASSEMBLED_VECT15]], i16 [[SCALAR3]], i32 1 -; CHECK: 
[[ASSEMBLED_VECT17:%.*]] = insertelement <4 x i16> [[ASSEMBLED_VECT16]], i16 [[SCALAR4]], i32 2 -; CHECK: [[ASSEMBLED_VECT18:%.*]] = insertelement <4 x i16> [[ASSEMBLED_VECT17]], i16 [[SCALAR5]], i32 3 -; CHECK: store <2 x i32> [[ASSEMBLED_VECT10]], <2 x i32>* [[TMP1]], align 8 -; CHECK: store <4 x i16> [[ASSEMBLED_VECT14]], <4 x i16>* [[TMP2]], align 8 -; CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[ASSEMBLED_VECT18]] to i64 -; CHECK: ret void -; - %1 = alloca <2 x i32>, align 4 - %2 = alloca <4 x i16>, align 4 - %3 = select i1 %cond, <2 x i32> %src1, <2 x i32> - %4 = select <4 x i1> %vcond, <4 x i16> %src2, <4 x i16> - %5 = select i1 %cond, <4 x i16> %src2, <4 x i16> %src2 - store <2 x i32> %3, <2 x i32>* %1, align 8 - store <4 x i16> %4, <4 x i16>* %2, align 8 - %6 = bitcast <4 x i16> %5 to i64 - ret void -} diff --git a/IGC/Compiler/tests/ScalarizeFunction/fneg_optnone.ll b/IGC/Compiler/tests/ScalarizeFunction/fneg_optnone.ll deleted file mode 100644 index 3d1ee280b3c9..000000000000 --- a/IGC/Compiler/tests/ScalarizeFunction/fneg_optnone.ll +++ /dev/null @@ -1,35 +0,0 @@ -;=========================== begin_copyright_notice ============================ -; -; Copyright (C) 2024 Intel Corporation -; -; SPDX-License-Identifier: MIT -; -;============================ end_copyright_notice ============================= -; -; REQUIRES: llvm-14-plus -; RUN: igc_opt %s -S -o - --igc-scalarize | FileCheck %s - -; Function Attrs: noinline optnone -define void @test_fneg_optnone(<4 x float> %src, <3 x float> addrspace(1)* %out) #0 { - -; CHECK-LABEL: @test_fneg_optnone( -; -; CHECK: [[EE0:%.*]] = extractelement <4 x float> %src, i32 0 -; CHECK: [[EE1:%.*]] = extractelement <4 x float> %src, i32 1 -; CHECK: [[EE2:%.*]] = extractelement <4 x float> %src, i32 2 -; CHECK: [[EE3:%.*]] = extractelement <4 x float> %src, i32 3 -; CHECK: [[IE0:%.*]] = insertelement <3 x float> undef, float [[EE0]], i32 0 -; CHECK: [[IE1:%.*]] = insertelement <3 x float> [[IE0]], float [[EE1]], i32 1 -; 
CHECK: [[IE2:%.*]] = insertelement <3 x float> [[IE1]], float [[EE2]], i32 2 -; CHECK: [[FNEG:%.*]] = fneg <3 x float> [[IE2]] -; CHECK: store <3 x float> [[FNEG]], <3 x float> addrspace(1)* %out, align 4 - -; CHECK-NOT: fneg <3 x float> undef - - %1 = shufflevector <4 x float> %src, <4 x float> undef, <3 x i32> - %2 = fneg <3 x float> %1 - store <3 x float> %2, <3 x float> addrspace(1)* %out, align 4 - ret void -} - -attributes #0 = { noinline optnone } diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-binary-instruction-typed-pointers.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-binary-instruction-typed-pointers.ll new file mode 100644 index 000000000000..93fd85af02ef --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-binary-instruction-typed-pointers.ll @@ -0,0 +1,258 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize -S < %s | FileCheck %s +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define spir_kernel void @basic(<2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define spir_kernel void @basic( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca 
<2 x i32>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x i32> [[DOTASSEMBLED_VECT3]], <2 x i32>* [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x i32> + %2 = add <2 x i32> %src1, %src2 + store <2 x i32> %2, <2 x i32>* %1 + ret void +} + +define spir_kernel void @should_work_with_different_instruction_type(<2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define spir_kernel void @should_work_with_different_instruction_type( +; CHECK-SAME: <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x float>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = fadd float [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = fadd float [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x float> [[DOTASSEMBLED_VECT]], float [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x float> [[DOTASSEMBLED_VECT3]], <2 x float>* [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x float> + %2 = fadd <2 x float> %src1, %src2 + store <2 x float> %2, <2 x float>* %1 + ret void +} + +define spir_kernel void @should_work_with_exact_flag(<2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: @should_work_with_exact_flag( +; CHECK-NEXT: 
[[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2:%.*]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1:%.*]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x i32>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = udiv exact i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = udiv exact i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x i32> [[DOTASSEMBLED_VECT3]], <2 x i32>* [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x i32> + %2 = udiv exact <2 x i32> %src1, %src2 + store <2 x i32> %2, <2 x i32>* %1 + ret void +} + +; triangulating with @should_work_with_different_instruction_type +define spir_kernel void @should_work_with_fast_math_flags(<2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define spir_kernel void @should_work_with_fast_math_flags( +; CHECK-SAME: <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x float>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast float [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = fadd fast float [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x float> [[DOTASSEMBLED_VECT]], 
float [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x float> [[DOTASSEMBLED_VECT3]], <2 x float>* [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x float> + %2 = fadd fast <2 x float> %src1, %src2 + store <2 x float> %2, <2 x float>* %1 + ret void +} + +define spir_kernel void @should_work_with_different_value_type(<2 x i64> %src1, <2 x i64> %src2) { +; CHECK-LABEL: define spir_kernel void @should_work_with_different_value_type( +; CHECK-SAME: <2 x i64> [[SRC1:%.*]], <2 x i64> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i64> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i64> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i64> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i64> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x i64>, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i64> [[DOTASSEMBLED_VECT]], i64 [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x i64> [[DOTASSEMBLED_VECT3]], <2 x i64>* [[TMP1]], align 16 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x i64> + %2 = add <2 x i64> %src1, %src2 + store <2 x i64> %2, <2 x i64>* %1 + ret void +} + +define spir_kernel void @should_work_with_larger_vector_size(<16 x i32> %src1, <16 x i32> %src2) { +; CHECK-LABEL: define spir_kernel void @should_work_with_larger_vector_size( +; CHECK-SAME: <16 x i32> [[SRC1:%.*]], <16 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <16 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR16:%.*]] = extractelement <16 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR17:%.*]] = extractelement <16 x i32> [[SRC2]], i32 2 +; CHECK-NEXT: [[SRC2_SCALAR18:%.*]] = extractelement <16 x i32> 
[[SRC2]], i32 3 +; CHECK-NEXT: [[SRC2_SCALAR19:%.*]] = extractelement <16 x i32> [[SRC2]], i32 4 +; CHECK-NEXT: [[SRC2_SCALAR20:%.*]] = extractelement <16 x i32> [[SRC2]], i32 5 +; CHECK-NEXT: [[SRC2_SCALAR21:%.*]] = extractelement <16 x i32> [[SRC2]], i32 6 +; CHECK-NEXT: [[SRC2_SCALAR22:%.*]] = extractelement <16 x i32> [[SRC2]], i32 7 +; CHECK-NEXT: [[SRC2_SCALAR23:%.*]] = extractelement <16 x i32> [[SRC2]], i32 8 +; CHECK-NEXT: [[SRC2_SCALAR24:%.*]] = extractelement <16 x i32> [[SRC2]], i32 9 +; CHECK-NEXT: [[SRC2_SCALAR25:%.*]] = extractelement <16 x i32> [[SRC2]], i32 10 +; CHECK-NEXT: [[SRC2_SCALAR26:%.*]] = extractelement <16 x i32> [[SRC2]], i32 11 +; CHECK-NEXT: [[SRC2_SCALAR27:%.*]] = extractelement <16 x i32> [[SRC2]], i32 12 +; CHECK-NEXT: [[SRC2_SCALAR28:%.*]] = extractelement <16 x i32> [[SRC2]], i32 13 +; CHECK-NEXT: [[SRC2_SCALAR29:%.*]] = extractelement <16 x i32> [[SRC2]], i32 14 +; CHECK-NEXT: [[SRC2_SCALAR30:%.*]] = extractelement <16 x i32> [[SRC2]], i32 15 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <16 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <16 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR2:%.*]] = extractelement <16 x i32> [[SRC1]], i32 2 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <16 x i32> [[SRC1]], i32 3 +; CHECK-NEXT: [[SRC1_SCALAR4:%.*]] = extractelement <16 x i32> [[SRC1]], i32 4 +; CHECK-NEXT: [[SRC1_SCALAR5:%.*]] = extractelement <16 x i32> [[SRC1]], i32 5 +; CHECK-NEXT: [[SRC1_SCALAR6:%.*]] = extractelement <16 x i32> [[SRC1]], i32 6 +; CHECK-NEXT: [[SRC1_SCALAR7:%.*]] = extractelement <16 x i32> [[SRC1]], i32 7 +; CHECK-NEXT: [[SRC1_SCALAR8:%.*]] = extractelement <16 x i32> [[SRC1]], i32 8 +; CHECK-NEXT: [[SRC1_SCALAR9:%.*]] = extractelement <16 x i32> [[SRC1]], i32 9 +; CHECK-NEXT: [[SRC1_SCALAR10:%.*]] = extractelement <16 x i32> [[SRC1]], i32 10 +; CHECK-NEXT: [[SRC1_SCALAR11:%.*]] = extractelement <16 x i32> [[SRC1]], i32 11 +; CHECK-NEXT: [[SRC1_SCALAR12:%.*]] = 
extractelement <16 x i32> [[SRC1]], i32 12 +; CHECK-NEXT: [[SRC1_SCALAR13:%.*]] = extractelement <16 x i32> [[SRC1]], i32 13 +; CHECK-NEXT: [[SRC1_SCALAR14:%.*]] = extractelement <16 x i32> [[SRC1]], i32 14 +; CHECK-NEXT: [[SRC1_SCALAR15:%.*]] = extractelement <16 x i32> [[SRC1]], i32 15 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <16 x i32>, align 64 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR16]] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SRC1_SCALAR2]], [[SRC2_SCALAR17]] +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SRC1_SCALAR3]], [[SRC2_SCALAR18]] +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[SRC1_SCALAR4]], [[SRC2_SCALAR19]] +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[SRC1_SCALAR5]], [[SRC2_SCALAR20]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SRC1_SCALAR6]], [[SRC2_SCALAR21]] +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[SRC1_SCALAR7]], [[SRC2_SCALAR22]] +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[SRC1_SCALAR8]], [[SRC2_SCALAR23]] +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[SRC1_SCALAR9]], [[SRC2_SCALAR24]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[SRC1_SCALAR10]], [[SRC2_SCALAR25]] +; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[SRC1_SCALAR11]], [[SRC2_SCALAR26]] +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[SRC1_SCALAR12]], [[SRC2_SCALAR27]] +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[SRC1_SCALAR13]], [[SRC2_SCALAR28]] +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[SRC1_SCALAR14]], [[SRC2_SCALAR29]] +; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[SRC1_SCALAR15]], [[SRC2_SCALAR30]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <16 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT31:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: [[DOTASSEMBLED_VECT32:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT31]], i32 [[TMP4]], i32 2 +; CHECK-NEXT: [[DOTASSEMBLED_VECT33:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT32]], i32 [[TMP5]], i32 3 +; CHECK-NEXT: 
[[DOTASSEMBLED_VECT34:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT33]], i32 [[TMP6]], i32 4 +; CHECK-NEXT: [[DOTASSEMBLED_VECT35:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT34]], i32 [[TMP7]], i32 5 +; CHECK-NEXT: [[DOTASSEMBLED_VECT36:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT35]], i32 [[TMP8]], i32 6 +; CHECK-NEXT: [[DOTASSEMBLED_VECT37:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT36]], i32 [[TMP9]], i32 7 +; CHECK-NEXT: [[DOTASSEMBLED_VECT38:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT37]], i32 [[TMP10]], i32 8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT39:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT38]], i32 [[TMP11]], i32 9 +; CHECK-NEXT: [[DOTASSEMBLED_VECT40:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT39]], i32 [[TMP12]], i32 10 +; CHECK-NEXT: [[DOTASSEMBLED_VECT41:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT40]], i32 [[TMP13]], i32 11 +; CHECK-NEXT: [[DOTASSEMBLED_VECT42:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT41]], i32 [[TMP14]], i32 12 +; CHECK-NEXT: [[DOTASSEMBLED_VECT43:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT42]], i32 [[TMP15]], i32 13 +; CHECK-NEXT: [[DOTASSEMBLED_VECT44:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT43]], i32 [[TMP16]], i32 14 +; CHECK-NEXT: [[DOTASSEMBLED_VECT45:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT44]], i32 [[TMP17]], i32 15 +; CHECK-NEXT: store <16 x i32> [[DOTASSEMBLED_VECT45]], <16 x i32>* [[TMP1]], align 64 +; CHECK-NEXT: ret void +; + %1 = alloca <16 x i32> + %2 = add <16 x i32> %src1, %src2 + store <16 x i32> %2, <16 x i32>* %1 + ret void +} + +define spir_kernel void @should_work_with_bit_wise_instruction(<2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define spir_kernel void @should_work_with_bit_wise_instruction( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> 
[[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x i32>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = shl i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x i32> [[DOTASSEMBLED_VECT3]], <2 x i32>* [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x i32> + %2 = shl <2 x i32> %src1, %src2 + store <2 x i32> %2, <2 x i32>* %1 + ret void +} + +define spir_kernel void @should_work_with_constant_value(<2 x i32> %src1) { +; CHECK-LABEL: define spir_kernel void @should_work_with_constant_value( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x i32>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SRC1_SCALAR]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SRC1_SCALAR1]], 4 +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x i32> [[DOTASSEMBLED_VECT2]], <2 x i32>* [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x i32> + %2 = add <2 x i32> %src1, + store <2 x i32> %2, <2 x i32>* %1 + ret void +} + +define spir_kernel void @should_work_with_nuw_nsw(<2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define spir_kernel void @should_work_with_nuw_nsw( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = 
extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x i32>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x i32> [[DOTASSEMBLED_VECT3]], <2 x i32>* [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x i32> + %2 = add nuw nsw <2 x i32> %src1, %src2 + store <2 x i32> %2, <2 x i32>* %1 + ret void +} diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-binary-instruction.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-binary-instruction.ll new file mode 100644 index 000000000000..eeaed5137272 --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-binary-instruction.ll @@ -0,0 +1,260 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize --opaque-pointers -S < %s | FileCheck %s +; REQUIRES: llvm-14-plus +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define spir_kernel void @basic(<2 x i32> %src1, <2 x i32> %src2) { 
+; CHECK-LABEL: define spir_kernel void @basic( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x i32>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x i32> [[DOTASSEMBLED_VECT3]], ptr [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x i32> + %2 = add <2 x i32> %src1, %src2 + store <2 x i32> %2, ptr %1 + ret void +} + +define spir_kernel void @should_work_with_different_instruction_type(<2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define spir_kernel void @should_work_with_different_instruction_type( +; CHECK-SAME: <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x float>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = fadd float [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = fadd float [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x 
float> [[DOTASSEMBLED_VECT]], float [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x float> [[DOTASSEMBLED_VECT3]], ptr [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x float> + %2 = fadd <2 x float> %src1, %src2 + store <2 x float> %2, ptr %1 + ret void +} + +define spir_kernel void @should_work_with_exact_flag(<2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define spir_kernel void @should_work_with_exact_flag( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x i32>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = udiv exact i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = udiv exact i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x i32> [[DOTASSEMBLED_VECT3]], ptr [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x i32> + %2 = udiv exact <2 x i32> %src1, %src2 + store <2 x i32> %2, ptr %1 + ret void +} + +; triangulating with @should_work_with_different_instruction_type +define spir_kernel void @should_work_with_fast_math_flags(<2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define spir_kernel void @should_work_with_fast_math_flags( +; CHECK-SAME: <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; 
CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x float>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast float [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = fadd fast float [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x float> [[DOTASSEMBLED_VECT]], float [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x float> [[DOTASSEMBLED_VECT3]], ptr [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x float> + %2 = fadd fast <2 x float> %src1, %src2 + store <2 x float> %2, ptr %1 + ret void +} + +define spir_kernel void @should_work_with_different_value_type(<2 x i64> %src1, <2 x i64> %src2) { +; CHECK-LABEL: define spir_kernel void @should_work_with_different_value_type( +; CHECK-SAME: <2 x i64> [[SRC1:%.*]], <2 x i64> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i64> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i64> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i64> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i64> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x i64>, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i64> [[DOTASSEMBLED_VECT]], i64 [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x i64> [[DOTASSEMBLED_VECT3]], ptr [[TMP1]], align 16 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x i64> + %2 = add <2 x i64> %src1, %src2 + store <2 x i64> %2, ptr %1 + ret void +} + +define spir_kernel void @should_work_with_larger_vector_size(<16 x i32> %src1, <16 x i32> 
%src2) { +; CHECK-LABEL: define spir_kernel void @should_work_with_larger_vector_size( +; CHECK-SAME: <16 x i32> [[SRC1:%.*]], <16 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <16 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR16:%.*]] = extractelement <16 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR17:%.*]] = extractelement <16 x i32> [[SRC2]], i32 2 +; CHECK-NEXT: [[SRC2_SCALAR18:%.*]] = extractelement <16 x i32> [[SRC2]], i32 3 +; CHECK-NEXT: [[SRC2_SCALAR19:%.*]] = extractelement <16 x i32> [[SRC2]], i32 4 +; CHECK-NEXT: [[SRC2_SCALAR20:%.*]] = extractelement <16 x i32> [[SRC2]], i32 5 +; CHECK-NEXT: [[SRC2_SCALAR21:%.*]] = extractelement <16 x i32> [[SRC2]], i32 6 +; CHECK-NEXT: [[SRC2_SCALAR22:%.*]] = extractelement <16 x i32> [[SRC2]], i32 7 +; CHECK-NEXT: [[SRC2_SCALAR23:%.*]] = extractelement <16 x i32> [[SRC2]], i32 8 +; CHECK-NEXT: [[SRC2_SCALAR24:%.*]] = extractelement <16 x i32> [[SRC2]], i32 9 +; CHECK-NEXT: [[SRC2_SCALAR25:%.*]] = extractelement <16 x i32> [[SRC2]], i32 10 +; CHECK-NEXT: [[SRC2_SCALAR26:%.*]] = extractelement <16 x i32> [[SRC2]], i32 11 +; CHECK-NEXT: [[SRC2_SCALAR27:%.*]] = extractelement <16 x i32> [[SRC2]], i32 12 +; CHECK-NEXT: [[SRC2_SCALAR28:%.*]] = extractelement <16 x i32> [[SRC2]], i32 13 +; CHECK-NEXT: [[SRC2_SCALAR29:%.*]] = extractelement <16 x i32> [[SRC2]], i32 14 +; CHECK-NEXT: [[SRC2_SCALAR30:%.*]] = extractelement <16 x i32> [[SRC2]], i32 15 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <16 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <16 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR2:%.*]] = extractelement <16 x i32> [[SRC1]], i32 2 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <16 x i32> [[SRC1]], i32 3 +; CHECK-NEXT: [[SRC1_SCALAR4:%.*]] = extractelement <16 x i32> [[SRC1]], i32 4 +; CHECK-NEXT: [[SRC1_SCALAR5:%.*]] = extractelement <16 x i32> [[SRC1]], i32 5 +; CHECK-NEXT: [[SRC1_SCALAR6:%.*]] = extractelement <16 x i32> 
[[SRC1]], i32 6 +; CHECK-NEXT: [[SRC1_SCALAR7:%.*]] = extractelement <16 x i32> [[SRC1]], i32 7 +; CHECK-NEXT: [[SRC1_SCALAR8:%.*]] = extractelement <16 x i32> [[SRC1]], i32 8 +; CHECK-NEXT: [[SRC1_SCALAR9:%.*]] = extractelement <16 x i32> [[SRC1]], i32 9 +; CHECK-NEXT: [[SRC1_SCALAR10:%.*]] = extractelement <16 x i32> [[SRC1]], i32 10 +; CHECK-NEXT: [[SRC1_SCALAR11:%.*]] = extractelement <16 x i32> [[SRC1]], i32 11 +; CHECK-NEXT: [[SRC1_SCALAR12:%.*]] = extractelement <16 x i32> [[SRC1]], i32 12 +; CHECK-NEXT: [[SRC1_SCALAR13:%.*]] = extractelement <16 x i32> [[SRC1]], i32 13 +; CHECK-NEXT: [[SRC1_SCALAR14:%.*]] = extractelement <16 x i32> [[SRC1]], i32 14 +; CHECK-NEXT: [[SRC1_SCALAR15:%.*]] = extractelement <16 x i32> [[SRC1]], i32 15 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <16 x i32>, align 64 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR16]] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SRC1_SCALAR2]], [[SRC2_SCALAR17]] +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SRC1_SCALAR3]], [[SRC2_SCALAR18]] +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[SRC1_SCALAR4]], [[SRC2_SCALAR19]] +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[SRC1_SCALAR5]], [[SRC2_SCALAR20]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SRC1_SCALAR6]], [[SRC2_SCALAR21]] +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[SRC1_SCALAR7]], [[SRC2_SCALAR22]] +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[SRC1_SCALAR8]], [[SRC2_SCALAR23]] +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[SRC1_SCALAR9]], [[SRC2_SCALAR24]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[SRC1_SCALAR10]], [[SRC2_SCALAR25]] +; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[SRC1_SCALAR11]], [[SRC2_SCALAR26]] +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[SRC1_SCALAR12]], [[SRC2_SCALAR27]] +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[SRC1_SCALAR13]], [[SRC2_SCALAR28]] +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[SRC1_SCALAR14]], [[SRC2_SCALAR29]] +; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[SRC1_SCALAR15]], 
[[SRC2_SCALAR30]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <16 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT31:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: [[DOTASSEMBLED_VECT32:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT31]], i32 [[TMP4]], i32 2 +; CHECK-NEXT: [[DOTASSEMBLED_VECT33:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT32]], i32 [[TMP5]], i32 3 +; CHECK-NEXT: [[DOTASSEMBLED_VECT34:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT33]], i32 [[TMP6]], i32 4 +; CHECK-NEXT: [[DOTASSEMBLED_VECT35:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT34]], i32 [[TMP7]], i32 5 +; CHECK-NEXT: [[DOTASSEMBLED_VECT36:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT35]], i32 [[TMP8]], i32 6 +; CHECK-NEXT: [[DOTASSEMBLED_VECT37:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT36]], i32 [[TMP9]], i32 7 +; CHECK-NEXT: [[DOTASSEMBLED_VECT38:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT37]], i32 [[TMP10]], i32 8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT39:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT38]], i32 [[TMP11]], i32 9 +; CHECK-NEXT: [[DOTASSEMBLED_VECT40:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT39]], i32 [[TMP12]], i32 10 +; CHECK-NEXT: [[DOTASSEMBLED_VECT41:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT40]], i32 [[TMP13]], i32 11 +; CHECK-NEXT: [[DOTASSEMBLED_VECT42:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT41]], i32 [[TMP14]], i32 12 +; CHECK-NEXT: [[DOTASSEMBLED_VECT43:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT42]], i32 [[TMP15]], i32 13 +; CHECK-NEXT: [[DOTASSEMBLED_VECT44:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT43]], i32 [[TMP16]], i32 14 +; CHECK-NEXT: [[DOTASSEMBLED_VECT45:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT44]], i32 [[TMP17]], i32 15 +; CHECK-NEXT: store <16 x i32> [[DOTASSEMBLED_VECT45]], ptr [[TMP1]], align 64 +; CHECK-NEXT: ret void +; + %1 = alloca <16 x i32> + %2 = add <16 x 
i32> %src1, %src2 + store <16 x i32> %2, ptr %1 + ret void +} + +define spir_kernel void @should_work_with_bit_wise_instruction(<2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define spir_kernel void @should_work_with_bit_wise_instruction( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x i32>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = shl i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x i32> [[DOTASSEMBLED_VECT3]], ptr [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x i32> + %2 = shl <2 x i32> %src1, %src2 + store <2 x i32> %2, ptr %1 + ret void +} + +define spir_kernel void @should_work_with_constant_value(<2 x i32> %src1) { +; CHECK-LABEL: define spir_kernel void @should_work_with_constant_value( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x i32>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SRC1_SCALAR]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SRC1_SCALAR1]], 4 +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x i32> 
[[DOTASSEMBLED_VECT2]], ptr [[TMP1]], align 8 +; CHECK-NEXT:    ret void +; +  %1 = alloca <2 x i32> +  %2 = add <2 x i32> %src1, <i32 2, i32 4> +  store <2 x i32> %2, ptr %1 +  ret void +} + +define spir_kernel void @should_work_with_nuw_nsw(<2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define spir_kernel void @should_work_with_nuw_nsw( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT:    [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT:    [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT:    [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT:    [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT:    [[TMP1:%.*]] = alloca <2 x i32>, align 8 +; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT:    [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT:    [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP3]], i32 1 +; CHECK-NEXT:    store <2 x i32> [[DOTASSEMBLED_VECT3]], ptr [[TMP1]], align 8 +; CHECK-NEXT:    ret void +; +  %1 = alloca <2 x i32> +  %2 = add nuw nsw <2 x i32> %src1, %src2 +  store <2 x i32> %2, ptr %1 +  ret void +} diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-cast-instruction-typed-pointers.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-cast-instruction-typed-pointers.ll new file mode 100644 index 000000000000..c2ad0702afd7 --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-cast-instruction-typed-pointers.ll @@ -0,0 +1,195 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================
end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize -S < %s | FileCheck %s +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define <2 x i8> @basic(<2 x i32> %src1) { +; CHECK-LABEL: define <2 x i8> @basic( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[SRC1_SCALAR]] to i8 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SRC1_SCALAR1]] to i8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i8> undef, i8 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x i8> [[DOTASSEMBLED_VECT]], i8 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i8> [[DOTASSEMBLED_VECT2]] +; + %1 = trunc <2 x i32> %src1 to <2 x i8> + ret <2 x i8> %1 +} + +define <2 x float> @should_work_with_different_instruction_type(<2 x double> %src1) { +; CHECK-LABEL: define <2 x float> @should_work_with_different_instruction_type( +; CHECK-SAME: <2 x double> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x double> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x double> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = fptrunc double [[SRC1_SCALAR]] to float +; CHECK-NEXT: [[TMP2:%.*]] = fptrunc double [[SRC1_SCALAR1]] to float +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x float> [[DOTASSEMBLED_VECT]], float [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x float> [[DOTASSEMBLED_VECT2]] +; + %1 = fptrunc <2 x double> %src1 to <2 x float> + ret <2 x float> %1 +} + +define <2 x i8> 
@should_work_with_different_value_type(<2 x i64> %src1) { +; CHECK-LABEL: define <2 x i8> @should_work_with_different_value_type( +; CHECK-SAME: <2 x i64> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i64> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i64> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[SRC1_SCALAR]] to i8 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[SRC1_SCALAR1]] to i8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i8> undef, i8 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x i8> [[DOTASSEMBLED_VECT]], i8 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i8> [[DOTASSEMBLED_VECT2]] +; + %1 = trunc <2 x i64> %src1 to <2 x i8> + ret <2 x i8> %1 +} + +define <2 x i16> @should_work_with_different_cast_type(<2 x i32> %src1) { +; CHECK-LABEL: define <2 x i16> @should_work_with_different_cast_type( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[SRC1_SCALAR]] to i16 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SRC1_SCALAR1]] to i16 +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i16> undef, i16 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x i16> [[DOTASSEMBLED_VECT]], i16 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i16> [[DOTASSEMBLED_VECT2]] +; + %1 = trunc <2 x i32> %src1 to <2 x i16> + ret <2 x i16> %1 +} + +define <2 x float> @should_work_with_type_cast_type_2(<2 x i32> %src1) { +; CHECK-LABEL: define <2 x float> @should_work_with_type_cast_type_2( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[SRC1_SCALAR]] to float +; CHECK-NEXT: 
[[TMP2:%.*]] = bitcast i32 [[SRC1_SCALAR1]] to float +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x float> [[DOTASSEMBLED_VECT]], float [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x float> [[DOTASSEMBLED_VECT2]] +; + %1 = bitcast <2 x i32> %src1 to <2 x float> + ret <2 x float> %1 +} + +define <2 x i32> @should_work_with_type_extension(<2 x i16> %src1) { +; CHECK-LABEL: define <2 x i32> @should_work_with_type_extension( +; CHECK-SAME: <2 x i16> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i16> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i16> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[SRC1_SCALAR]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[SRC1_SCALAR1]] to i32 +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[DOTASSEMBLED_VECT2]] +; + %1 = zext <2 x i16> %src1 to <2 x i32> + ret <2 x i32> %1 +} + +define <16 x i8> @should_work_with_larger_vector_size(<16 x i32> %src1) { +; CHECK-LABEL: define <16 x i8> @should_work_with_larger_vector_size( +; CHECK-SAME: <16 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <16 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <16 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR2:%.*]] = extractelement <16 x i32> [[SRC1]], i32 2 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <16 x i32> [[SRC1]], i32 3 +; CHECK-NEXT: [[SRC1_SCALAR4:%.*]] = extractelement <16 x i32> [[SRC1]], i32 4 +; CHECK-NEXT: [[SRC1_SCALAR5:%.*]] = extractelement <16 x i32> [[SRC1]], i32 5 +; CHECK-NEXT: [[SRC1_SCALAR6:%.*]] = extractelement <16 x i32> [[SRC1]], i32 6 +; CHECK-NEXT: [[SRC1_SCALAR7:%.*]] = extractelement <16 x i32> 
[[SRC1]], i32 7 +; CHECK-NEXT: [[SRC1_SCALAR8:%.*]] = extractelement <16 x i32> [[SRC1]], i32 8 +; CHECK-NEXT: [[SRC1_SCALAR9:%.*]] = extractelement <16 x i32> [[SRC1]], i32 9 +; CHECK-NEXT: [[SRC1_SCALAR10:%.*]] = extractelement <16 x i32> [[SRC1]], i32 10 +; CHECK-NEXT: [[SRC1_SCALAR11:%.*]] = extractelement <16 x i32> [[SRC1]], i32 11 +; CHECK-NEXT: [[SRC1_SCALAR12:%.*]] = extractelement <16 x i32> [[SRC1]], i32 12 +; CHECK-NEXT: [[SRC1_SCALAR13:%.*]] = extractelement <16 x i32> [[SRC1]], i32 13 +; CHECK-NEXT: [[SRC1_SCALAR14:%.*]] = extractelement <16 x i32> [[SRC1]], i32 14 +; CHECK-NEXT: [[SRC1_SCALAR15:%.*]] = extractelement <16 x i32> [[SRC1]], i32 15 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[SRC1_SCALAR]] to i8 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SRC1_SCALAR1]] to i8 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[SRC1_SCALAR2]] to i8 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[SRC1_SCALAR3]] to i8 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[SRC1_SCALAR4]] to i8 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[SRC1_SCALAR5]] to i8 +; CHECK-NEXT: [[TMP7:%.*]] = trunc i32 [[SRC1_SCALAR6]] to i8 +; CHECK-NEXT: [[TMP8:%.*]] = trunc i32 [[SRC1_SCALAR7]] to i8 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[SRC1_SCALAR8]] to i8 +; CHECK-NEXT: [[TMP10:%.*]] = trunc i32 [[SRC1_SCALAR9]] to i8 +; CHECK-NEXT: [[TMP11:%.*]] = trunc i32 [[SRC1_SCALAR10]] to i8 +; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[SRC1_SCALAR11]] to i8 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[SRC1_SCALAR12]] to i8 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i32 [[SRC1_SCALAR13]] to i8 +; CHECK-NEXT: [[TMP15:%.*]] = trunc i32 [[SRC1_SCALAR14]] to i8 +; CHECK-NEXT: [[TMP16:%.*]] = trunc i32 [[SRC1_SCALAR15]] to i8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <16 x i8> undef, i8 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT16:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT]], i8 [[TMP2]], i32 1 +; CHECK-NEXT: [[DOTASSEMBLED_VECT17:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT16]], i8 [[TMP3]], 
i32 2 +; CHECK-NEXT: [[DOTASSEMBLED_VECT18:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT17]], i8 [[TMP4]], i32 3 +; CHECK-NEXT: [[DOTASSEMBLED_VECT19:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT18]], i8 [[TMP5]], i32 4 +; CHECK-NEXT: [[DOTASSEMBLED_VECT20:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT19]], i8 [[TMP6]], i32 5 +; CHECK-NEXT: [[DOTASSEMBLED_VECT21:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT20]], i8 [[TMP7]], i32 6 +; CHECK-NEXT: [[DOTASSEMBLED_VECT22:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT21]], i8 [[TMP8]], i32 7 +; CHECK-NEXT: [[DOTASSEMBLED_VECT23:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT22]], i8 [[TMP9]], i32 8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT24:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT23]], i8 [[TMP10]], i32 9 +; CHECK-NEXT: [[DOTASSEMBLED_VECT25:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT24]], i8 [[TMP11]], i32 10 +; CHECK-NEXT: [[DOTASSEMBLED_VECT26:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT25]], i8 [[TMP12]], i32 11 +; CHECK-NEXT: [[DOTASSEMBLED_VECT27:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT26]], i8 [[TMP13]], i32 12 +; CHECK-NEXT: [[DOTASSEMBLED_VECT28:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT27]], i8 [[TMP14]], i32 13 +; CHECK-NEXT: [[DOTASSEMBLED_VECT29:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT28]], i8 [[TMP15]], i32 14 +; CHECK-NEXT: [[DOTASSEMBLED_VECT30:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT29]], i8 [[TMP16]], i32 15 +; CHECK-NEXT: ret <16 x i8> [[DOTASSEMBLED_VECT30]] +; + %1 = trunc <16 x i32> %src1 to <16 x i8> + ret <16 x i8> %1 +} + +define <2 x float*> @should_work_with_different_instruction_type_2(<2 x i64> %src1) { +; CHECK-LABEL: define <2 x float*> @should_work_with_different_instruction_type_2( +; CHECK-SAME: <2 x i64> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i64> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i64> [[SRC1]], i32 1 +; CHECK-NEXT: 
[[TMP1:%.*]] = inttoptr i64 [[SRC1_SCALAR]] to float* +; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[SRC1_SCALAR1]] to float* +; CHECK-NEXT:    [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x float*> undef, float* [[TMP1]], i32 0 +; CHECK-NEXT:    [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x float*> [[DOTASSEMBLED_VECT]], float* [[TMP2]], i32 1 +; CHECK-NEXT:    ret <2 x float*> [[DOTASSEMBLED_VECT2]] +; +  %1 = inttoptr <2 x i64> %src1 to <2 x float*> +  ret <2 x float*> %1 +} + +define <2 x i8> @should_not_scalarize_constants() { +; CHECK-LABEL: define <2 x i8> @should_not_scalarize_constants() { +; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i32> <i32 1, i32 2> to <2 x i8> +; CHECK-NEXT:    ret <2 x i8> [[TMP1]] +; +  %1 = trunc <2 x i32> <i32 1, i32 2> to <2 x i8> +  ret <2 x i8> %1 +} + +define i8 @should_not_scalarize_scalar() { +; CHECK-LABEL: define i8 @should_not_scalarize_scalar() { +; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 4 to i8 +; CHECK-NEXT:    ret i8 [[TMP1]] +; +  %1 = trunc i32 4 to i8 +  ret i8 %1 +} diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-cast-instruction.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-cast-instruction.ll new file mode 100644 index 000000000000..e8369bb48df1 --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-cast-instruction.ll @@ -0,0 +1,196 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize --opaque-pointers -S < %s | FileCheck %s +; REQUIRES: llvm-14-plus +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define <2 x i8>
@basic(<2 x i32> %src1) { +; CHECK-LABEL: define <2 x i8> @basic( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[SRC1_SCALAR]] to i8 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SRC1_SCALAR1]] to i8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i8> undef, i8 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x i8> [[DOTASSEMBLED_VECT]], i8 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i8> [[DOTASSEMBLED_VECT2]] +; + %1 = trunc <2 x i32> %src1 to <2 x i8> + ret <2 x i8> %1 +} + +define <2 x float> @should_work_with_different_instruction_type(<2 x double> %src1) { +; CHECK-LABEL: define <2 x float> @should_work_with_different_instruction_type( +; CHECK-SAME: <2 x double> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x double> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x double> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = fptrunc double [[SRC1_SCALAR]] to float +; CHECK-NEXT: [[TMP2:%.*]] = fptrunc double [[SRC1_SCALAR1]] to float +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x float> [[DOTASSEMBLED_VECT]], float [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x float> [[DOTASSEMBLED_VECT2]] +; + %1 = fptrunc <2 x double> %src1 to <2 x float> + ret <2 x float> %1 +} + +define <2 x i8> @should_work_with_different_value_type(<2 x i64> %src1) { +; CHECK-LABEL: define <2 x i8> @should_work_with_different_value_type( +; CHECK-SAME: <2 x i64> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i64> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i64> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[SRC1_SCALAR]] to i8 +; CHECK-NEXT: 
[[TMP2:%.*]] = trunc i64 [[SRC1_SCALAR1]] to i8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i8> undef, i8 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x i8> [[DOTASSEMBLED_VECT]], i8 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i8> [[DOTASSEMBLED_VECT2]] +; + %1 = trunc <2 x i64> %src1 to <2 x i8> + ret <2 x i8> %1 +} + +define <2 x i16> @should_work_with_different_cast_type(<2 x i32> %src1) { +; CHECK-LABEL: define <2 x i16> @should_work_with_different_cast_type( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[SRC1_SCALAR]] to i16 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SRC1_SCALAR1]] to i16 +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i16> undef, i16 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x i16> [[DOTASSEMBLED_VECT]], i16 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i16> [[DOTASSEMBLED_VECT2]] +; + %1 = trunc <2 x i32> %src1 to <2 x i16> + ret <2 x i16> %1 +} + +define <2 x float> @should_work_with_type_cast_type_2(<2 x i32> %src1) { +; CHECK-LABEL: define <2 x float> @should_work_with_type_cast_type_2( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[SRC1_SCALAR]] to float +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[SRC1_SCALAR1]] to float +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x float> [[DOTASSEMBLED_VECT]], float [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x float> [[DOTASSEMBLED_VECT2]] +; + %1 = bitcast <2 x i32> %src1 to <2 x float> + ret <2 x float> %1 +} + +define <2 x 
i32> @should_work_with_type_extension(<2 x i16> %src1) { +; CHECK-LABEL: define <2 x i32> @should_work_with_type_extension( +; CHECK-SAME: <2 x i16> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i16> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i16> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[SRC1_SCALAR]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[SRC1_SCALAR1]] to i32 +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[DOTASSEMBLED_VECT2]] +; + %1 = zext <2 x i16> %src1 to <2 x i32> + ret <2 x i32> %1 +} + +define <16 x i8> @should_work_with_larger_vector_size(<16 x i32> %src1) { +; CHECK-LABEL: define <16 x i8> @should_work_with_larger_vector_size( +; CHECK-SAME: <16 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <16 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <16 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR2:%.*]] = extractelement <16 x i32> [[SRC1]], i32 2 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <16 x i32> [[SRC1]], i32 3 +; CHECK-NEXT: [[SRC1_SCALAR4:%.*]] = extractelement <16 x i32> [[SRC1]], i32 4 +; CHECK-NEXT: [[SRC1_SCALAR5:%.*]] = extractelement <16 x i32> [[SRC1]], i32 5 +; CHECK-NEXT: [[SRC1_SCALAR6:%.*]] = extractelement <16 x i32> [[SRC1]], i32 6 +; CHECK-NEXT: [[SRC1_SCALAR7:%.*]] = extractelement <16 x i32> [[SRC1]], i32 7 +; CHECK-NEXT: [[SRC1_SCALAR8:%.*]] = extractelement <16 x i32> [[SRC1]], i32 8 +; CHECK-NEXT: [[SRC1_SCALAR9:%.*]] = extractelement <16 x i32> [[SRC1]], i32 9 +; CHECK-NEXT: [[SRC1_SCALAR10:%.*]] = extractelement <16 x i32> [[SRC1]], i32 10 +; CHECK-NEXT: [[SRC1_SCALAR11:%.*]] = extractelement <16 x i32> [[SRC1]], i32 11 +; CHECK-NEXT: [[SRC1_SCALAR12:%.*]] = extractelement <16 x i32> [[SRC1]], 
i32 12 +; CHECK-NEXT: [[SRC1_SCALAR13:%.*]] = extractelement <16 x i32> [[SRC1]], i32 13 +; CHECK-NEXT: [[SRC1_SCALAR14:%.*]] = extractelement <16 x i32> [[SRC1]], i32 14 +; CHECK-NEXT: [[SRC1_SCALAR15:%.*]] = extractelement <16 x i32> [[SRC1]], i32 15 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[SRC1_SCALAR]] to i8 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SRC1_SCALAR1]] to i8 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[SRC1_SCALAR2]] to i8 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[SRC1_SCALAR3]] to i8 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[SRC1_SCALAR4]] to i8 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[SRC1_SCALAR5]] to i8 +; CHECK-NEXT: [[TMP7:%.*]] = trunc i32 [[SRC1_SCALAR6]] to i8 +; CHECK-NEXT: [[TMP8:%.*]] = trunc i32 [[SRC1_SCALAR7]] to i8 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[SRC1_SCALAR8]] to i8 +; CHECK-NEXT: [[TMP10:%.*]] = trunc i32 [[SRC1_SCALAR9]] to i8 +; CHECK-NEXT: [[TMP11:%.*]] = trunc i32 [[SRC1_SCALAR10]] to i8 +; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[SRC1_SCALAR11]] to i8 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[SRC1_SCALAR12]] to i8 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i32 [[SRC1_SCALAR13]] to i8 +; CHECK-NEXT: [[TMP15:%.*]] = trunc i32 [[SRC1_SCALAR14]] to i8 +; CHECK-NEXT: [[TMP16:%.*]] = trunc i32 [[SRC1_SCALAR15]] to i8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <16 x i8> undef, i8 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT16:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT]], i8 [[TMP2]], i32 1 +; CHECK-NEXT: [[DOTASSEMBLED_VECT17:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT16]], i8 [[TMP3]], i32 2 +; CHECK-NEXT: [[DOTASSEMBLED_VECT18:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT17]], i8 [[TMP4]], i32 3 +; CHECK-NEXT: [[DOTASSEMBLED_VECT19:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT18]], i8 [[TMP5]], i32 4 +; CHECK-NEXT: [[DOTASSEMBLED_VECT20:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT19]], i8 [[TMP6]], i32 5 +; CHECK-NEXT: [[DOTASSEMBLED_VECT21:%.*]] = insertelement <16 x i8> 
[[DOTASSEMBLED_VECT20]], i8 [[TMP7]], i32 6 +; CHECK-NEXT: [[DOTASSEMBLED_VECT22:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT21]], i8 [[TMP8]], i32 7 +; CHECK-NEXT: [[DOTASSEMBLED_VECT23:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT22]], i8 [[TMP9]], i32 8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT24:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT23]], i8 [[TMP10]], i32 9 +; CHECK-NEXT: [[DOTASSEMBLED_VECT25:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT24]], i8 [[TMP11]], i32 10 +; CHECK-NEXT: [[DOTASSEMBLED_VECT26:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT25]], i8 [[TMP12]], i32 11 +; CHECK-NEXT: [[DOTASSEMBLED_VECT27:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT26]], i8 [[TMP13]], i32 12 +; CHECK-NEXT: [[DOTASSEMBLED_VECT28:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT27]], i8 [[TMP14]], i32 13 +; CHECK-NEXT: [[DOTASSEMBLED_VECT29:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT28]], i8 [[TMP15]], i32 14 +; CHECK-NEXT: [[DOTASSEMBLED_VECT30:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT29]], i8 [[TMP16]], i32 15 +; CHECK-NEXT: ret <16 x i8> [[DOTASSEMBLED_VECT30]] +; + %1 = trunc <16 x i32> %src1 to <16 x i8> + ret <16 x i8> %1 +} + +define <2 x ptr> @should_work_with_different_instruction_type_2(<2 x i64> %src1) { +; CHECK-LABEL: define <2 x ptr> @should_work_with_different_instruction_type_2( +; CHECK-SAME: <2 x i64> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i64> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i64> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[SRC1_SCALAR]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[SRC1_SCALAR1]] to ptr +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x ptr> undef, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x ptr> [[DOTASSEMBLED_VECT]], ptr [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x ptr> [[DOTASSEMBLED_VECT2]] +; + %1 = inttoptr <2 x i64> %src1 to <2 x ptr> + 
ret <2 x ptr> %1 +} + +define <2 x i8> @should_not_scalarize_constants() { +; CHECK-LABEL: define <2 x i8> @should_not_scalarize_constants() { +; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> to <2 x i8> +; CHECK-NEXT: ret <2 x i8> [[TMP1]] +; + %1 = trunc <2 x i32> to <2 x i8> + ret <2 x i8> %1 +} + +define i8 @should_not_scalarize_scalar() { +; CHECK-LABEL: define i8 @should_not_scalarize_scalar() { +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 4 to i8 +; CHECK-NEXT: ret i8 [[TMP1]] +; + %1 = trunc i32 4 to i8 + ret i8 %1 +} diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-comp-instruction-typed-pointers.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-comp-instruction-typed-pointers.ll new file mode 100644 index 000000000000..97a4d37693bc --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-comp-instruction-typed-pointers.ll @@ -0,0 +1,217 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize -S < %s | FileCheck %s +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define <2 x i1> @basic(<2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define <2 x i1> @basic( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], 
i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i1> [[DOTASSEMBLED_VECT3]] +; + %1 = icmp eq <2 x i32> %src1, %src2 + ret <2 x i1> %1 +} + +define <2 x i1> @should_work_with_different_instruction_type(<2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define <2 x i1> @should_work_with_different_instruction_type( +; CHECK-SAME: <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = fcmp ueq float [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP2:%.*]] = fcmp ueq float [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i1> [[DOTASSEMBLED_VECT3]] +; + %1 = fcmp ueq <2 x float> %src1, %src2 + ret <2 x i1> %1 +} + +; triangulating with @should_work_with_different_instruction_type +; update checks if fast will be preserved +define <2 x i1> @should_work_with_fast_math_flags(<2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define <2 x i1> @should_work_with_fast_math_flags( +; CHECK-SAME: <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x 
float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = fcmp ueq float [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP2:%.*]] = fcmp ueq float [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i1> [[DOTASSEMBLED_VECT3]] +; + %1 = fcmp fast ueq <2 x float> %src1, %src2 + ret <2 x i1> %1 +} + +define <2 x i1> @should_work_with_different_value_type(<2 x i64> %src1, <2 x i64> %src2) { +; CHECK-LABEL: define <2 x i1> @should_work_with_different_value_type( +; CHECK-SAME: <2 x i64> [[SRC1:%.*]], <2 x i64> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i64> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i64> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i64> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i64> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i1> [[DOTASSEMBLED_VECT3]] +; + %1 = icmp eq <2 x i64> %src1, %src2 + ret <2 x i1> %1 +} + +define <16 x i1> @should_work_with_larger_vector_size(<16 x i32> %src1, <16 x i32> %src2) { +; CHECK-LABEL: define <16 x i1> @should_work_with_larger_vector_size( +; CHECK-SAME: <16 x i32> [[SRC1:%.*]], <16 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <16 x i32> [[SRC2]], i32 0 +; 
CHECK-NEXT: [[SRC2_SCALAR16:%.*]] = extractelement <16 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR17:%.*]] = extractelement <16 x i32> [[SRC2]], i32 2 +; CHECK-NEXT: [[SRC2_SCALAR18:%.*]] = extractelement <16 x i32> [[SRC2]], i32 3 +; CHECK-NEXT: [[SRC2_SCALAR19:%.*]] = extractelement <16 x i32> [[SRC2]], i32 4 +; CHECK-NEXT: [[SRC2_SCALAR20:%.*]] = extractelement <16 x i32> [[SRC2]], i32 5 +; CHECK-NEXT: [[SRC2_SCALAR21:%.*]] = extractelement <16 x i32> [[SRC2]], i32 6 +; CHECK-NEXT: [[SRC2_SCALAR22:%.*]] = extractelement <16 x i32> [[SRC2]], i32 7 +; CHECK-NEXT: [[SRC2_SCALAR23:%.*]] = extractelement <16 x i32> [[SRC2]], i32 8 +; CHECK-NEXT: [[SRC2_SCALAR24:%.*]] = extractelement <16 x i32> [[SRC2]], i32 9 +; CHECK-NEXT: [[SRC2_SCALAR25:%.*]] = extractelement <16 x i32> [[SRC2]], i32 10 +; CHECK-NEXT: [[SRC2_SCALAR26:%.*]] = extractelement <16 x i32> [[SRC2]], i32 11 +; CHECK-NEXT: [[SRC2_SCALAR27:%.*]] = extractelement <16 x i32> [[SRC2]], i32 12 +; CHECK-NEXT: [[SRC2_SCALAR28:%.*]] = extractelement <16 x i32> [[SRC2]], i32 13 +; CHECK-NEXT: [[SRC2_SCALAR29:%.*]] = extractelement <16 x i32> [[SRC2]], i32 14 +; CHECK-NEXT: [[SRC2_SCALAR30:%.*]] = extractelement <16 x i32> [[SRC2]], i32 15 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <16 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <16 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR2:%.*]] = extractelement <16 x i32> [[SRC1]], i32 2 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <16 x i32> [[SRC1]], i32 3 +; CHECK-NEXT: [[SRC1_SCALAR4:%.*]] = extractelement <16 x i32> [[SRC1]], i32 4 +; CHECK-NEXT: [[SRC1_SCALAR5:%.*]] = extractelement <16 x i32> [[SRC1]], i32 5 +; CHECK-NEXT: [[SRC1_SCALAR6:%.*]] = extractelement <16 x i32> [[SRC1]], i32 6 +; CHECK-NEXT: [[SRC1_SCALAR7:%.*]] = extractelement <16 x i32> [[SRC1]], i32 7 +; CHECK-NEXT: [[SRC1_SCALAR8:%.*]] = extractelement <16 x i32> [[SRC1]], i32 8 +; CHECK-NEXT: [[SRC1_SCALAR9:%.*]] = extractelement <16 x 
i32> [[SRC1]], i32 9 +; CHECK-NEXT: [[SRC1_SCALAR10:%.*]] = extractelement <16 x i32> [[SRC1]], i32 10 +; CHECK-NEXT: [[SRC1_SCALAR11:%.*]] = extractelement <16 x i32> [[SRC1]], i32 11 +; CHECK-NEXT: [[SRC1_SCALAR12:%.*]] = extractelement <16 x i32> [[SRC1]], i32 12 +; CHECK-NEXT: [[SRC1_SCALAR13:%.*]] = extractelement <16 x i32> [[SRC1]], i32 13 +; CHECK-NEXT: [[SRC1_SCALAR14:%.*]] = extractelement <16 x i32> [[SRC1]], i32 14 +; CHECK-NEXT: [[SRC1_SCALAR15:%.*]] = extractelement <16 x i32> [[SRC1]], i32 15 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR16]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[SRC1_SCALAR2]], [[SRC2_SCALAR17]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[SRC1_SCALAR3]], [[SRC2_SCALAR18]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[SRC1_SCALAR4]], [[SRC2_SCALAR19]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[SRC1_SCALAR5]], [[SRC2_SCALAR20]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[SRC1_SCALAR6]], [[SRC2_SCALAR21]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[SRC1_SCALAR7]], [[SRC2_SCALAR22]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[SRC1_SCALAR8]], [[SRC2_SCALAR23]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[SRC1_SCALAR9]], [[SRC2_SCALAR24]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[SRC1_SCALAR10]], [[SRC2_SCALAR25]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[SRC1_SCALAR11]], [[SRC2_SCALAR26]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[SRC1_SCALAR12]], [[SRC2_SCALAR27]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[SRC1_SCALAR13]], [[SRC2_SCALAR28]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[SRC1_SCALAR14]], [[SRC2_SCALAR29]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[SRC1_SCALAR15]], [[SRC2_SCALAR30]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <16 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT31:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT: 
[[DOTASSEMBLED_VECT32:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT31]], i1 [[TMP3]], i32 2 +; CHECK-NEXT: [[DOTASSEMBLED_VECT33:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT32]], i1 [[TMP4]], i32 3 +; CHECK-NEXT: [[DOTASSEMBLED_VECT34:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT33]], i1 [[TMP5]], i32 4 +; CHECK-NEXT: [[DOTASSEMBLED_VECT35:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT34]], i1 [[TMP6]], i32 5 +; CHECK-NEXT: [[DOTASSEMBLED_VECT36:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT35]], i1 [[TMP7]], i32 6 +; CHECK-NEXT: [[DOTASSEMBLED_VECT37:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT36]], i1 [[TMP8]], i32 7 +; CHECK-NEXT: [[DOTASSEMBLED_VECT38:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT37]], i1 [[TMP9]], i32 8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT39:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT38]], i1 [[TMP10]], i32 9 +; CHECK-NEXT: [[DOTASSEMBLED_VECT40:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT39]], i1 [[TMP11]], i32 10 +; CHECK-NEXT: [[DOTASSEMBLED_VECT41:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT40]], i1 [[TMP12]], i32 11 +; CHECK-NEXT: [[DOTASSEMBLED_VECT42:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT41]], i1 [[TMP13]], i32 12 +; CHECK-NEXT: [[DOTASSEMBLED_VECT43:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT42]], i1 [[TMP14]], i32 13 +; CHECK-NEXT: [[DOTASSEMBLED_VECT44:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT43]], i1 [[TMP15]], i32 14 +; CHECK-NEXT: [[DOTASSEMBLED_VECT45:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT44]], i1 [[TMP16]], i32 15 +; CHECK-NEXT: ret <16 x i1> [[DOTASSEMBLED_VECT45]] +; + %1 = icmp eq <16 x i32> %src1, %src2 + ret <16 x i1> %1 +} + +define <2 x i1> @should_work_with_constant_value(<2 x i32> %src1) { +; CHECK-LABEL: define <2 x i1> @should_work_with_constant_value( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = 
extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[SRC1_SCALAR]], 4 +; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[SRC1_SCALAR1]], 8 +; CHECK-NEXT:    [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT:    [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT:    ret <2 x i1> [[DOTASSEMBLED_VECT2]] +; +  %1 = icmp eq <2 x i32> %src1, <i32 4, i32 8> +  ret <2 x i1> %1 +} + +define <2 x i1> @should_work_with_different_comparison_type(<2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define <2 x i1> @should_work_with_different_comparison_type( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT:    [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT:    [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT:    [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT:    [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT:    [[TMP1:%.*]] = icmp uge i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT:    [[TMP2:%.*]] = icmp uge i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT:    [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT:    [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT:    ret <2 x i1> [[DOTASSEMBLED_VECT3]] +; +  %1 = icmp uge <2 x i32> %src1, %src2 +  ret <2 x i1> %1 +} + +; triangulating with @should_work_with_different_instruction_type +define <2 x i1> @should_work_with_different_comparison_type_2(<2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define <2 x i1> @should_work_with_different_comparison_type_2( +; CHECK-SAME: <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT:    [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT:    [[SRC2_SCALAR2:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT:
[[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = fcmp false float [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP2:%.*]] = fcmp false float [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i1> [[DOTASSEMBLED_VECT3]] +; + %1 = fcmp false <2 x float> %src1, %src2 + ret <2 x i1> %1 +} + +define <2 x i1> @should_not_scalarize_two_constants() { +; CHECK-LABEL: define <2 x i1> @should_not_scalarize_two_constants() { +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i32> , +; CHECK-NEXT: ret <2 x i1> [[TMP1]] +; + %1 = icmp eq <2 x i32> , + ret <2 x i1> %1 +} diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-comp-instruction.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-comp-instruction.ll new file mode 100644 index 000000000000..ed86ff2f3611 --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-comp-instruction.ll @@ -0,0 +1,218 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize --opaque-pointers -S < %s | FileCheck %s +; REQUIRES: llvm-14-plus +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define <2 x i1> @basic(<2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define <2 x i1> @basic( +; 
CHECK-SAME: <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i1> [[DOTASSEMBLED_VECT3]] +; + %1 = icmp eq <2 x i32> %src1, %src2 + ret <2 x i1> %1 +} + +define <2 x i1> @should_work_with_different_instruction_type(<2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define <2 x i1> @should_work_with_different_instruction_type( +; CHECK-SAME: <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = fcmp ueq float [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP2:%.*]] = fcmp ueq float [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i1> [[DOTASSEMBLED_VECT3]] +; + %1 = fcmp ueq <2 x float> %src1, %src2 + ret <2 x i1> %1 +} + +; triangulating with @should_work_with_different_instruction_type +; update checks if fast will be 
preserved +define <2 x i1> @should_work_with_fast_math_flags(<2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define <2 x i1> @should_work_with_fast_math_flags( +; CHECK-SAME: <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = fcmp ueq float [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP2:%.*]] = fcmp ueq float [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i1> [[DOTASSEMBLED_VECT3]] +; + %1 = fcmp fast ueq <2 x float> %src1, %src2 + ret <2 x i1> %1 +} + +define <2 x i1> @should_work_with_different_value_type(<2 x i64> %src1, <2 x i64> %src2) { +; CHECK-LABEL: define <2 x i1> @should_work_with_different_value_type( +; CHECK-SAME: <2 x i64> [[SRC1:%.*]], <2 x i64> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i64> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i64> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i64> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i64> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i1> [[DOTASSEMBLED_VECT3]] +; + %1 = 
icmp eq <2 x i64> %src1, %src2 + ret <2 x i1> %1 +} + +define <16 x i1> @should_work_with_larger_vector_size(<16 x i32> %src1, <16 x i32> %src2) { +; CHECK-LABEL: define <16 x i1> @should_work_with_larger_vector_size( +; CHECK-SAME: <16 x i32> [[SRC1:%.*]], <16 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <16 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR16:%.*]] = extractelement <16 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR17:%.*]] = extractelement <16 x i32> [[SRC2]], i32 2 +; CHECK-NEXT: [[SRC2_SCALAR18:%.*]] = extractelement <16 x i32> [[SRC2]], i32 3 +; CHECK-NEXT: [[SRC2_SCALAR19:%.*]] = extractelement <16 x i32> [[SRC2]], i32 4 +; CHECK-NEXT: [[SRC2_SCALAR20:%.*]] = extractelement <16 x i32> [[SRC2]], i32 5 +; CHECK-NEXT: [[SRC2_SCALAR21:%.*]] = extractelement <16 x i32> [[SRC2]], i32 6 +; CHECK-NEXT: [[SRC2_SCALAR22:%.*]] = extractelement <16 x i32> [[SRC2]], i32 7 +; CHECK-NEXT: [[SRC2_SCALAR23:%.*]] = extractelement <16 x i32> [[SRC2]], i32 8 +; CHECK-NEXT: [[SRC2_SCALAR24:%.*]] = extractelement <16 x i32> [[SRC2]], i32 9 +; CHECK-NEXT: [[SRC2_SCALAR25:%.*]] = extractelement <16 x i32> [[SRC2]], i32 10 +; CHECK-NEXT: [[SRC2_SCALAR26:%.*]] = extractelement <16 x i32> [[SRC2]], i32 11 +; CHECK-NEXT: [[SRC2_SCALAR27:%.*]] = extractelement <16 x i32> [[SRC2]], i32 12 +; CHECK-NEXT: [[SRC2_SCALAR28:%.*]] = extractelement <16 x i32> [[SRC2]], i32 13 +; CHECK-NEXT: [[SRC2_SCALAR29:%.*]] = extractelement <16 x i32> [[SRC2]], i32 14 +; CHECK-NEXT: [[SRC2_SCALAR30:%.*]] = extractelement <16 x i32> [[SRC2]], i32 15 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <16 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <16 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR2:%.*]] = extractelement <16 x i32> [[SRC1]], i32 2 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <16 x i32> [[SRC1]], i32 3 +; CHECK-NEXT: [[SRC1_SCALAR4:%.*]] = extractelement <16 x i32> [[SRC1]], i32 4 +; CHECK-NEXT: 
[[SRC1_SCALAR5:%.*]] = extractelement <16 x i32> [[SRC1]], i32 5 +; CHECK-NEXT: [[SRC1_SCALAR6:%.*]] = extractelement <16 x i32> [[SRC1]], i32 6 +; CHECK-NEXT: [[SRC1_SCALAR7:%.*]] = extractelement <16 x i32> [[SRC1]], i32 7 +; CHECK-NEXT: [[SRC1_SCALAR8:%.*]] = extractelement <16 x i32> [[SRC1]], i32 8 +; CHECK-NEXT: [[SRC1_SCALAR9:%.*]] = extractelement <16 x i32> [[SRC1]], i32 9 +; CHECK-NEXT: [[SRC1_SCALAR10:%.*]] = extractelement <16 x i32> [[SRC1]], i32 10 +; CHECK-NEXT: [[SRC1_SCALAR11:%.*]] = extractelement <16 x i32> [[SRC1]], i32 11 +; CHECK-NEXT: [[SRC1_SCALAR12:%.*]] = extractelement <16 x i32> [[SRC1]], i32 12 +; CHECK-NEXT: [[SRC1_SCALAR13:%.*]] = extractelement <16 x i32> [[SRC1]], i32 13 +; CHECK-NEXT: [[SRC1_SCALAR14:%.*]] = extractelement <16 x i32> [[SRC1]], i32 14 +; CHECK-NEXT: [[SRC1_SCALAR15:%.*]] = extractelement <16 x i32> [[SRC1]], i32 15 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR16]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[SRC1_SCALAR2]], [[SRC2_SCALAR17]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[SRC1_SCALAR3]], [[SRC2_SCALAR18]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[SRC1_SCALAR4]], [[SRC2_SCALAR19]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[SRC1_SCALAR5]], [[SRC2_SCALAR20]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[SRC1_SCALAR6]], [[SRC2_SCALAR21]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[SRC1_SCALAR7]], [[SRC2_SCALAR22]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[SRC1_SCALAR8]], [[SRC2_SCALAR23]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[SRC1_SCALAR9]], [[SRC2_SCALAR24]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[SRC1_SCALAR10]], [[SRC2_SCALAR25]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[SRC1_SCALAR11]], [[SRC2_SCALAR26]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[SRC1_SCALAR12]], [[SRC2_SCALAR27]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[SRC1_SCALAR13]], [[SRC2_SCALAR28]] +; CHECK-NEXT: 
[[TMP15:%.*]] = icmp eq i32 [[SRC1_SCALAR14]], [[SRC2_SCALAR29]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[SRC1_SCALAR15]], [[SRC2_SCALAR30]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <16 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT31:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT: [[DOTASSEMBLED_VECT32:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT31]], i1 [[TMP3]], i32 2 +; CHECK-NEXT: [[DOTASSEMBLED_VECT33:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT32]], i1 [[TMP4]], i32 3 +; CHECK-NEXT: [[DOTASSEMBLED_VECT34:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT33]], i1 [[TMP5]], i32 4 +; CHECK-NEXT: [[DOTASSEMBLED_VECT35:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT34]], i1 [[TMP6]], i32 5 +; CHECK-NEXT: [[DOTASSEMBLED_VECT36:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT35]], i1 [[TMP7]], i32 6 +; CHECK-NEXT: [[DOTASSEMBLED_VECT37:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT36]], i1 [[TMP8]], i32 7 +; CHECK-NEXT: [[DOTASSEMBLED_VECT38:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT37]], i1 [[TMP9]], i32 8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT39:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT38]], i1 [[TMP10]], i32 9 +; CHECK-NEXT: [[DOTASSEMBLED_VECT40:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT39]], i1 [[TMP11]], i32 10 +; CHECK-NEXT: [[DOTASSEMBLED_VECT41:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT40]], i1 [[TMP12]], i32 11 +; CHECK-NEXT: [[DOTASSEMBLED_VECT42:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT41]], i1 [[TMP13]], i32 12 +; CHECK-NEXT: [[DOTASSEMBLED_VECT43:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT42]], i1 [[TMP14]], i32 13 +; CHECK-NEXT: [[DOTASSEMBLED_VECT44:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT43]], i1 [[TMP15]], i32 14 +; CHECK-NEXT: [[DOTASSEMBLED_VECT45:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT44]], i1 [[TMP16]], i32 15 +; CHECK-NEXT: ret <16 x i1> [[DOTASSEMBLED_VECT45]] +; 
+  %1 = icmp eq <16 x i32> %src1, %src2 +  ret <16 x i1> %1 +} + +define <2 x i1> @should_work_with_constant_value(<2 x i32> %src1) { +; CHECK-LABEL: define <2 x i1> @should_work_with_constant_value( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT:    [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT:    [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[SRC1_SCALAR]], 4 +; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[SRC1_SCALAR1]], 8 +; CHECK-NEXT:    [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT:    [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT:    ret <2 x i1> [[DOTASSEMBLED_VECT2]] +; +  %1 = icmp eq <2 x i32> %src1, <i32 4, i32 8> +  ret <2 x i1> %1 +} + +define <2 x i1> @should_work_with_different_comparison_type(<2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define <2 x i1> @should_work_with_different_comparison_type( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT:    [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT:    [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT:    [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT:    [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT:    [[TMP1:%.*]] = icmp uge i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT:    [[TMP2:%.*]] = icmp uge i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT:    [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT:    [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT:    ret <2 x i1> [[DOTASSEMBLED_VECT3]] +; +  %1 = icmp uge <2 x i32> %src1, %src2 +  ret <2 x i1> %1 +} + +; triangulating with @should_work_with_different_instruction_type +define <2 x i1> @should_work_with_different_comparison_type_2(<2 x float> %src1,
 <2 x float> %src2) { +; CHECK-LABEL: define <2 x i1> @should_work_with_different_comparison_type_2( +; CHECK-SAME: <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT:    [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT:    [[SRC2_SCALAR2:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT:    [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT:    [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT:    [[TMP1:%.*]] = fcmp false float [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT:    [[TMP2:%.*]] = fcmp false float [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT:    [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT:    [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT:    ret <2 x i1> [[DOTASSEMBLED_VECT3]] +; +  %1 = fcmp false <2 x float> %src1, %src2 +  ret <2 x i1> %1 +} + +define <2 x i1> @should_not_scalarize_two_constants() { +; CHECK-LABEL: define <2 x i1> @should_not_scalarize_two_constants() { +; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <2 x i32> , +; CHECK-NEXT:    ret <2 x i1> [[TMP1]] +; +  %1 = icmp eq <2 x i32> , +  ret <2 x i1> %1 +} diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-getelementptr-instruction-typed-pointers.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-getelementptr-instruction-typed-pointers.ll new file mode 100644 index 000000000000..c57b0533194a --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-getelementptr-instruction-typed-pointers.ll @@ -0,0 +1,190 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize -S
< %s | FileCheck %s +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define double @basic(<2 x double*> %pointers) { +; CHECK-LABEL: define double @basic( +; CHECK-SAME: <2 x double*> [[POINTERS:%.*]]) { +; CHECK-NEXT: [[POINTERS_SCALAR:%.*]] = extractelement <2 x double*> [[POINTERS]], i32 0 +; CHECK-NEXT: [[POINTERS_SCALAR1:%.*]] = extractelement <2 x double*> [[POINTERS]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE2:%.*]] = getelementptr double, double* [[POINTERS_SCALAR]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT:%.*]] = insertelement <2 x double*> undef, double* [[POINTER_TO_DOUBLE2]], i32 0 +; CHECK-NEXT: [[POINTER_TO_DOUBLE3:%.*]] = getelementptr double, double* [[POINTERS_SCALAR1]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT4:%.*]] = insertelement <2 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT]], double* [[POINTER_TO_DOUBLE3]], i32 1 +; CHECK-NEXT: [[VAL0:%.*]] = load double, double* [[POINTER_TO_DOUBLE2]], align 8 +; CHECK-NEXT: [[VAL1:%.*]] = load double, double* [[POINTER_TO_DOUBLE3]], align 8 +; CHECK-NEXT: [[RETURN:%.*]] = fadd double [[VAL0]], [[VAL1]] +; CHECK-NEXT: ret double [[RETURN]] +; + %pointer_to_double = getelementptr double, <2 x double*> %pointers, i32 1 + + %ptr0 = extractelement <2 x double*> %pointer_to_double, i32 0 + %ptr1 = extractelement <2 x double*> %pointer_to_double, i32 1 + %val0 = load double, double* %ptr0 + %val1 = load double, double* %ptr1 + %return = fadd double %val0, %val1 + ret double %return +} + +define double @should_work_with_vector_of_indices(<2 x double*> %pointers) { +; CHECK-LABEL: define double @should_work_with_vector_of_indices( +; CHECK-SAME: <2 x double*> [[POINTERS:%.*]]) { +; CHECK-NEXT: [[POINTERS_SCALAR:%.*]] = extractelement <2 x double*> [[POINTERS]], i32 0 +; CHECK-NEXT: 
[[POINTERS_SCALAR1:%.*]] = extractelement <2 x double*> [[POINTERS]], i32 1 +; CHECK-NEXT:    [[POINTERS_TO_DOUBLE2:%.*]] = getelementptr double, double* [[POINTERS_SCALAR]], i32 0 +; CHECK-NEXT:    [[POINTERS_TO_DOUBLE_ASSEMBLED_VECT:%.*]] = insertelement <2 x double*> undef, double* [[POINTERS_TO_DOUBLE2]], i32 0 +; CHECK-NEXT:    [[POINTERS_TO_DOUBLE3:%.*]] = getelementptr double, double* [[POINTERS_SCALAR1]], i32 1 +; CHECK-NEXT:    [[POINTERS_TO_DOUBLE_ASSEMBLED_VECT4:%.*]] = insertelement <2 x double*> [[POINTERS_TO_DOUBLE_ASSEMBLED_VECT]], double* [[POINTERS_TO_DOUBLE3]], i32 1 +; CHECK-NEXT:    [[VAL0:%.*]] = load double, double* [[POINTERS_TO_DOUBLE2]], align 8 +; CHECK-NEXT:    [[VAL1:%.*]] = load double, double* [[POINTERS_TO_DOUBLE3]], align 8 +; CHECK-NEXT:    [[RETURN:%.*]] = fadd double [[VAL0]], [[VAL1]] +; CHECK-NEXT:    ret double [[RETURN]] +; +  %pointers_to_double = getelementptr double, <2 x double*> %pointers, <2 x i32> <i32 0, i32 1> + +  %ptr0 = extractelement <2 x double*> %pointers_to_double, i32 0 +  %ptr1 = extractelement <2 x double*> %pointers_to_double, i32 1 +  %val0 = load double, double* %ptr0 +  %val1 = load double, double* %ptr1 +  %return = fadd double %val0, %val1 +  ret double %return +} + +define i64 @should_work_with_different_value_type(<2 x i64*> %pointers) { +; CHECK-LABEL: define i64 @should_work_with_different_value_type( +; CHECK-SAME: <2 x i64*> [[POINTERS:%.*]]) { +; CHECK-NEXT:    [[POINTERS_SCALAR:%.*]] = extractelement <2 x i64*> [[POINTERS]], i32 0 +; CHECK-NEXT:    [[POINTERS_SCALAR1:%.*]] = extractelement <2 x i64*> [[POINTERS]], i32 1 +; CHECK-NEXT:    [[POINTER_TO_I642:%.*]] = getelementptr i64, i64* [[POINTERS_SCALAR]], i32 1 +; CHECK-NEXT:    [[POINTER_TO_I64_ASSEMBLED_VECT:%.*]] = insertelement <2 x i64*> undef, i64* [[POINTER_TO_I642]], i32 0 +; CHECK-NEXT:    [[POINTER_TO_I643:%.*]] = getelementptr i64, i64* [[POINTERS_SCALAR1]], i32 1 +; CHECK-NEXT:    [[POINTER_TO_I64_ASSEMBLED_VECT4:%.*]] = insertelement <2 x i64*> [[POINTER_TO_I64_ASSEMBLED_VECT]], i64*
[[POINTER_TO_I643]], i32 1 +; CHECK-NEXT: [[VAL0:%.*]] = load i64, i64* [[POINTER_TO_I642]], align 4 +; CHECK-NEXT: [[VAL1:%.*]] = load i64, i64* [[POINTER_TO_I643]], align 4 +; CHECK-NEXT: [[RETURN:%.*]] = add i64 [[VAL0]], [[VAL1]] +; CHECK-NEXT: ret i64 [[RETURN]] +; + %pointer_to_i64 = getelementptr i64, <2 x i64*> %pointers, i32 1 + + %ptr0 = extractelement <2 x i64*> %pointer_to_i64, i32 0 + %ptr1 = extractelement <2 x i64*> %pointer_to_i64, i32 1 + %val0 = load i64, i64* %ptr0 + %val1 = load i64, i64* %ptr1 + %return = add i64 %val0, %val1 + ret i64 %return +} + +define double @should_work_with_larger_vector_size(<16 x double*> %pointers) { +; CHECK-LABEL: define double @should_work_with_larger_vector_size( +; CHECK-SAME: <16 x double*> [[POINTERS:%.*]]) { +; CHECK-NEXT: [[POINTERS_SCALAR:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 0 +; CHECK-NEXT: [[POINTERS_SCALAR1:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 1 +; CHECK-NEXT: [[POINTERS_SCALAR2:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 2 +; CHECK-NEXT: [[POINTERS_SCALAR3:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 3 +; CHECK-NEXT: [[POINTERS_SCALAR4:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 4 +; CHECK-NEXT: [[POINTERS_SCALAR5:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 5 +; CHECK-NEXT: [[POINTERS_SCALAR6:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 6 +; CHECK-NEXT: [[POINTERS_SCALAR7:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 7 +; CHECK-NEXT: [[POINTERS_SCALAR8:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 8 +; CHECK-NEXT: [[POINTERS_SCALAR9:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 9 +; CHECK-NEXT: [[POINTERS_SCALAR10:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 10 +; CHECK-NEXT: [[POINTERS_SCALAR11:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 11 +; CHECK-NEXT: [[POINTERS_SCALAR12:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 12 +; CHECK-NEXT: 
[[POINTERS_SCALAR13:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 13 +; CHECK-NEXT: [[POINTERS_SCALAR14:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 14 +; CHECK-NEXT: [[POINTERS_SCALAR15:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 15 +; CHECK-NEXT: [[POINTER_TO_DOUBLE16:%.*]] = getelementptr double, double* [[POINTERS_SCALAR]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT:%.*]] = insertelement <16 x double*> undef, double* [[POINTER_TO_DOUBLE16]], i32 0 +; CHECK-NEXT: [[POINTER_TO_DOUBLE17:%.*]] = getelementptr double, double* [[POINTERS_SCALAR1]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT18:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT]], double* [[POINTER_TO_DOUBLE17]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE19:%.*]] = getelementptr double, double* [[POINTERS_SCALAR2]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT20:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT18]], double* [[POINTER_TO_DOUBLE19]], i32 2 +; CHECK-NEXT: [[POINTER_TO_DOUBLE21:%.*]] = getelementptr double, double* [[POINTERS_SCALAR3]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT22:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT20]], double* [[POINTER_TO_DOUBLE21]], i32 3 +; CHECK-NEXT: [[POINTER_TO_DOUBLE23:%.*]] = getelementptr double, double* [[POINTERS_SCALAR4]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT24:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT22]], double* [[POINTER_TO_DOUBLE23]], i32 4 +; CHECK-NEXT: [[POINTER_TO_DOUBLE25:%.*]] = getelementptr double, double* [[POINTERS_SCALAR5]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT26:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT24]], double* [[POINTER_TO_DOUBLE25]], i32 5 +; CHECK-NEXT: [[POINTER_TO_DOUBLE27:%.*]] = getelementptr double, double* [[POINTERS_SCALAR6]], i32 1 +; CHECK-NEXT: 
[[POINTER_TO_DOUBLE_ASSEMBLED_VECT28:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT26]], double* [[POINTER_TO_DOUBLE27]], i32 6 +; CHECK-NEXT: [[POINTER_TO_DOUBLE29:%.*]] = getelementptr double, double* [[POINTERS_SCALAR7]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT30:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT28]], double* [[POINTER_TO_DOUBLE29]], i32 7 +; CHECK-NEXT: [[POINTER_TO_DOUBLE31:%.*]] = getelementptr double, double* [[POINTERS_SCALAR8]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT32:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT30]], double* [[POINTER_TO_DOUBLE31]], i32 8 +; CHECK-NEXT: [[POINTER_TO_DOUBLE33:%.*]] = getelementptr double, double* [[POINTERS_SCALAR9]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT34:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT32]], double* [[POINTER_TO_DOUBLE33]], i32 9 +; CHECK-NEXT: [[POINTER_TO_DOUBLE35:%.*]] = getelementptr double, double* [[POINTERS_SCALAR10]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT36:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT34]], double* [[POINTER_TO_DOUBLE35]], i32 10 +; CHECK-NEXT: [[POINTER_TO_DOUBLE37:%.*]] = getelementptr double, double* [[POINTERS_SCALAR11]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT38:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT36]], double* [[POINTER_TO_DOUBLE37]], i32 11 +; CHECK-NEXT: [[POINTER_TO_DOUBLE39:%.*]] = getelementptr double, double* [[POINTERS_SCALAR12]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT40:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT38]], double* [[POINTER_TO_DOUBLE39]], i32 12 +; CHECK-NEXT: [[POINTER_TO_DOUBLE41:%.*]] = getelementptr double, double* [[POINTERS_SCALAR13]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT42:%.*]] = insertelement <16 x double*> 
[[POINTER_TO_DOUBLE_ASSEMBLED_VECT40]], double* [[POINTER_TO_DOUBLE41]], i32 13 +; CHECK-NEXT: [[POINTER_TO_DOUBLE43:%.*]] = getelementptr double, double* [[POINTERS_SCALAR14]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT44:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT42]], double* [[POINTER_TO_DOUBLE43]], i32 14 +; CHECK-NEXT: [[POINTER_TO_DOUBLE45:%.*]] = getelementptr double, double* [[POINTERS_SCALAR15]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT46:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT44]], double* [[POINTER_TO_DOUBLE45]], i32 15 +; CHECK-NEXT: [[VAL0:%.*]] = load double, double* [[POINTER_TO_DOUBLE16]], align 8 +; CHECK-NEXT: [[VAL1:%.*]] = load double, double* [[POINTER_TO_DOUBLE17]], align 8 +; CHECK-NEXT: [[RETURN:%.*]] = fadd double [[VAL0]], [[VAL1]] +; CHECK-NEXT: ret double [[RETURN]] +; + %pointer_to_double = getelementptr double, <16 x double*> %pointers, i32 1 + + %ptr0 = extractelement <16 x double*> %pointer_to_double, i32 0 + %ptr1 = extractelement <16 x double*> %pointer_to_double, i32 1 + %val0 = load double, double* %ptr0 + %val1 = load double, double* %ptr1 + %return = fadd double %val0, %val1 + ret double %return +} + +%some_type = type {i64, i32} + +; checks early out for non: = getelementptr , , +define i64 @should_not_scalarize_with_more_then_one_index(%some_type* %pointer) { +; CHECK-LABEL: @should_not_scalarize_with_more_then_one_index( +; CHECK-NEXT: [[POINTER_TO_INT:%.*]] = getelementptr [[SOME_TYPE:%.*]], %some_type* [[POINTER:%.*]], i32 0, i32 0 +; CHECK-NEXT: [[VAL0:%.*]] = load i64, i64* [[POINTER_TO_INT]], align 4 +; CHECK-NEXT: [[RETURN:%.*]] = add i64 [[VAL0]], [[VAL0]] +; CHECK-NEXT: ret i64 [[RETURN]] +; + %pointer_to_int = getelementptr %some_type, %some_type* %pointer, i32 0, i32 0 + + %val0 = load i64, i64* %pointer_to_int + %return = add i64 %val0, %val0 + ret i64 %return +} + +; checks early out for non: = getelementptr , , +define i64 
@should_scalarize_only_vectors(%some_type* %pointer) { +; CHECK-LABEL: @should_scalarize_only_vectors( +; CHECK-NEXT: [[POINTER_SOME_TYPE:%.*]] = getelementptr [[SOME_TYPE:%.*]], %some_type* [[POINTER:%.*]], i32 1 +; CHECK-NEXT: [[VAL:%.*]] = load [[SOME_TYPE]], %some_type* [[POINTER_SOME_TYPE]], align 4 +; CHECK-NEXT: [[VAL0:%.*]] = extractvalue [[SOME_TYPE]] [[VAL]], 0 +; CHECK-NEXT: [[VAL1:%.*]] = extractvalue [[SOME_TYPE]] [[VAL]], 0 +; CHECK-NEXT: [[RETURN:%.*]] = add i64 [[VAL0]], [[VAL1]] +; CHECK-NEXT: ret i64 [[RETURN]] +; + %pointer_some_type = getelementptr %some_type, %some_type* %pointer, i32 1 + + %val = load %some_type, %some_type* %pointer_some_type + + %val0 = extractvalue %some_type %val, 0 + %val1 = extractvalue %some_type %val, 0 + %return = add i64 %val0, %val1 + ret i64 %return +} \ No newline at end of file diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-getelementptr-instruction.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-getelementptr-instruction.ll new file mode 100644 index 000000000000..7d064bcc58ce --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-getelementptr-instruction.ll @@ -0,0 +1,82 @@ +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize --opaque-pointers -S < %s | FileCheck %s +; REQUIRES: llvm-14-plus, opaque-ptr-fix +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations + +; add checks when the pass will support opaque pointers (and remove "opaque-ptr-fix") +; ------------------------------------------------ + +define double @basic(<2 x ptr> %pointers) { + %pointer_to_double = getelementptr double, <2 x ptr> %pointers, i32 1 + + 
%ptr0 = extractelement <2 x ptr> %pointer_to_double, i32 0 + %ptr1 = extractelement <2 x ptr> %pointer_to_double, i32 1 + %val0 = load double, ptr %ptr0 + %val1 = load double, ptr %ptr1 + %return = fadd double %val0, %val1 + ret double %return +} + +define double @should_work_with_vector_of_indices(<2 x ptr> %pointers) { + %pointers_to_double = getelementptr double, <2 x ptr> %pointers, <2 x i32> <i32 0, i32 1> + + %ptr0 = extractelement <2 x ptr> %pointers_to_double, i32 0 + %ptr1 = extractelement <2 x ptr> %pointers_to_double, i32 1 + %val0 = load double, ptr %ptr0 + %val1 = load double, ptr %ptr1 + %return = fadd double %val0, %val1 + ret double %return +} + +define i64 @should_work_with_different_value_type(<2 x ptr> %pointers) { + %pointer_to_i64 = getelementptr i64, <2 x ptr> %pointers, i32 1 + + %ptr0 = extractelement <2 x ptr> %pointer_to_i64, i32 0 + %ptr1 = extractelement <2 x ptr> %pointer_to_i64, i32 1 + %val0 = load i64, ptr %ptr0 + %val1 = load i64, ptr %ptr1 + %return = add i64 %val0, %val1 + ret i64 %return +} + +define double @should_work_with_larger_vector_size(<16 x ptr> %pointers) { + %pointer_to_double = getelementptr double, <16 x ptr> %pointers, i32 1 + + %ptr0 = extractelement <16 x ptr> %pointer_to_double, i32 0 + %ptr1 = extractelement <16 x ptr> %pointer_to_double, i32 1 + %val0 = load double, ptr %ptr0 + %val1 = load double, ptr %ptr1 + %return = fadd double %val0, %val1 + ret double %return +} + +%some_type = type {i64, i32} + +define i64 @should_not_scalarize_with_more_then_index(ptr %pointer) { + %pointer_to_int = getelementptr %some_type, ptr %pointer, i32 0, i32 0 + + %val0 = load i64, ptr %pointer_to_int + %return = add i64 %val0, %val0 + ret i64 %return +} + +define i64 @should_scalarize_only_vectors(ptr %pointer) { + %pointer_some_type = getelementptr %some_type, ptr %pointer, i32 1 + + %val = load %some_type, ptr %pointer_some_type + + %val0 = extractvalue %some_type %val, 0 + %val1 = extractvalue %some_type %val, 0 + %return = add i64 %val0, 
%val1 + ret i64 %return +} diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-phi-instruction-typed-pointers.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-phi-instruction-typed-pointers.ll new file mode 100644 index 000000000000..70b9a72ced7d --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-phi-instruction-typed-pointers.ll @@ -0,0 +1,587 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize -S < %s | FileCheck %s +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define <2 x i32> @basic(i1 %switch, <2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define <2 x i32> @basic( +; CHECK-SAME: i1 [[SWITCH:%.*]], <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR4:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT1:%.*]] = phi i32 [ [[SRC1_SCALAR]], %[[FIRST]] ], [ [[SRC2_SCALAR]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT2:%.*]] = phi i32 [ [[SRC1_SCALAR3]], %[[FIRST]] ], [ [[SRC2_SCALAR4]], %[[SECOND]] ] +; 
CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[RESULT1]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT5:%.*]] = insertelement <2 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT2]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[RESULT_ASSEMBLED_VECT5]] +; +entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <2 x i32> [ %src1, %first], [ %src2, %second] + ret <2 x i32> %result +} + +define <2 x float> @should_work_with_different_value_type(i1 %switch, <2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define <2 x float> @should_work_with_different_value_type( +; CHECK-SAME: i1 [[SWITCH:%.*]], <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR4:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT1:%.*]] = phi float [ [[SRC1_SCALAR]], %[[FIRST]] ], [ [[SRC2_SCALAR]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT2:%.*]] = phi float [ [[SRC1_SCALAR3]], %[[FIRST]] ], [ [[SRC2_SCALAR4]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[RESULT1]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT5:%.*]] = insertelement <2 x float> [[RESULT_ASSEMBLED_VECT]], float [[RESULT2]], i32 1 +; CHECK-NEXT: ret <2 x float> [[RESULT_ASSEMBLED_VECT5]] +; +entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <2 x float> [ %src1, %first], [ %src2, 
%second] + ret <2 x float> %result +} + +; triangulating with @should_work_with_different_value_type +; update checks if fast will be preserved +define <2 x float> @should_work_with_fast_math_flags(i1 %switch, <2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define <2 x float> @should_work_with_fast_math_flags( +; CHECK-SAME: i1 [[SWITCH:%.*]], <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR4:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT1:%.*]] = phi float [ [[SRC1_SCALAR]], %[[FIRST]] ], [ [[SRC2_SCALAR]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT2:%.*]] = phi float [ [[SRC1_SCALAR3]], %[[FIRST]] ], [ [[SRC2_SCALAR4]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[RESULT1]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT5:%.*]] = insertelement <2 x float> [[RESULT_ASSEMBLED_VECT]], float [[RESULT2]], i32 1 +; CHECK-NEXT: ret <2 x float> [[RESULT_ASSEMBLED_VECT5]] +; + entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi fast <2 x float> [ %src1, %first], [ %src2, %second] + ret <2 x float> %result +} + +define <16 x i32> @should_work_with_larger_vector_size(i1 %switch, <16 x i32> %src1, <16 x i32> %src2) { +; CHECK-LABEL: define <16 x i32> @should_work_with_larger_vector_size( +; CHECK-SAME: i1 [[SWITCH:%.*]], <16 x i32> [[SRC1:%.*]], <16 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: 
[[SRC2_SCALAR:%.*]] = extractelement <16 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR32:%.*]] = extractelement <16 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR33:%.*]] = extractelement <16 x i32> [[SRC2]], i32 2 +; CHECK-NEXT: [[SRC2_SCALAR34:%.*]] = extractelement <16 x i32> [[SRC2]], i32 3 +; CHECK-NEXT: [[SRC2_SCALAR35:%.*]] = extractelement <16 x i32> [[SRC2]], i32 4 +; CHECK-NEXT: [[SRC2_SCALAR36:%.*]] = extractelement <16 x i32> [[SRC2]], i32 5 +; CHECK-NEXT: [[SRC2_SCALAR37:%.*]] = extractelement <16 x i32> [[SRC2]], i32 6 +; CHECK-NEXT: [[SRC2_SCALAR38:%.*]] = extractelement <16 x i32> [[SRC2]], i32 7 +; CHECK-NEXT: [[SRC2_SCALAR39:%.*]] = extractelement <16 x i32> [[SRC2]], i32 8 +; CHECK-NEXT: [[SRC2_SCALAR40:%.*]] = extractelement <16 x i32> [[SRC2]], i32 9 +; CHECK-NEXT: [[SRC2_SCALAR41:%.*]] = extractelement <16 x i32> [[SRC2]], i32 10 +; CHECK-NEXT: [[SRC2_SCALAR42:%.*]] = extractelement <16 x i32> [[SRC2]], i32 11 +; CHECK-NEXT: [[SRC2_SCALAR43:%.*]] = extractelement <16 x i32> [[SRC2]], i32 12 +; CHECK-NEXT: [[SRC2_SCALAR44:%.*]] = extractelement <16 x i32> [[SRC2]], i32 13 +; CHECK-NEXT: [[SRC2_SCALAR45:%.*]] = extractelement <16 x i32> [[SRC2]], i32 14 +; CHECK-NEXT: [[SRC2_SCALAR46:%.*]] = extractelement <16 x i32> [[SRC2]], i32 15 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <16 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR17:%.*]] = extractelement <16 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR18:%.*]] = extractelement <16 x i32> [[SRC1]], i32 2 +; CHECK-NEXT: [[SRC1_SCALAR19:%.*]] = extractelement <16 x i32> [[SRC1]], i32 3 +; CHECK-NEXT: [[SRC1_SCALAR20:%.*]] = extractelement <16 x i32> [[SRC1]], i32 4 +; CHECK-NEXT: [[SRC1_SCALAR21:%.*]] = extractelement <16 x i32> [[SRC1]], i32 5 +; CHECK-NEXT: [[SRC1_SCALAR22:%.*]] = extractelement <16 x i32> [[SRC1]], i32 6 +; CHECK-NEXT: [[SRC1_SCALAR23:%.*]] = extractelement <16 x i32> [[SRC1]], i32 7 +; CHECK-NEXT: [[SRC1_SCALAR24:%.*]] = extractelement <16 x i32> 
[[SRC1]], i32 8 +; CHECK-NEXT: [[SRC1_SCALAR25:%.*]] = extractelement <16 x i32> [[SRC1]], i32 9 +; CHECK-NEXT: [[SRC1_SCALAR26:%.*]] = extractelement <16 x i32> [[SRC1]], i32 10 +; CHECK-NEXT: [[SRC1_SCALAR27:%.*]] = extractelement <16 x i32> [[SRC1]], i32 11 +; CHECK-NEXT: [[SRC1_SCALAR28:%.*]] = extractelement <16 x i32> [[SRC1]], i32 12 +; CHECK-NEXT: [[SRC1_SCALAR29:%.*]] = extractelement <16 x i32> [[SRC1]], i32 13 +; CHECK-NEXT: [[SRC1_SCALAR30:%.*]] = extractelement <16 x i32> [[SRC1]], i32 14 +; CHECK-NEXT: [[SRC1_SCALAR31:%.*]] = extractelement <16 x i32> [[SRC1]], i32 15 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT1:%.*]] = phi i32 [ [[SRC1_SCALAR]], %[[FIRST]] ], [ [[SRC2_SCALAR]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT2:%.*]] = phi i32 [ [[SRC1_SCALAR17]], %[[FIRST]] ], [ [[SRC2_SCALAR32]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT3:%.*]] = phi i32 [ [[SRC1_SCALAR18]], %[[FIRST]] ], [ [[SRC2_SCALAR33]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT4:%.*]] = phi i32 [ [[SRC1_SCALAR19]], %[[FIRST]] ], [ [[SRC2_SCALAR34]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT5:%.*]] = phi i32 [ [[SRC1_SCALAR20]], %[[FIRST]] ], [ [[SRC2_SCALAR35]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT6:%.*]] = phi i32 [ [[SRC1_SCALAR21]], %[[FIRST]] ], [ [[SRC2_SCALAR36]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT7:%.*]] = phi i32 [ [[SRC1_SCALAR22]], %[[FIRST]] ], [ [[SRC2_SCALAR37]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT8:%.*]] = phi i32 [ [[SRC1_SCALAR23]], %[[FIRST]] ], [ [[SRC2_SCALAR38]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT9:%.*]] = phi i32 [ [[SRC1_SCALAR24]], %[[FIRST]] ], [ [[SRC2_SCALAR39]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT10:%.*]] = phi i32 [ [[SRC1_SCALAR25]], %[[FIRST]] ], [ [[SRC2_SCALAR40]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT11:%.*]] = phi i32 [ [[SRC1_SCALAR26]], %[[FIRST]] ], [ 
[[SRC2_SCALAR41]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT12:%.*]] = phi i32 [ [[SRC1_SCALAR27]], %[[FIRST]] ], [ [[SRC2_SCALAR42]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT13:%.*]] = phi i32 [ [[SRC1_SCALAR28]], %[[FIRST]] ], [ [[SRC2_SCALAR43]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT14:%.*]] = phi i32 [ [[SRC1_SCALAR29]], %[[FIRST]] ], [ [[SRC2_SCALAR44]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT15:%.*]] = phi i32 [ [[SRC1_SCALAR30]], %[[FIRST]] ], [ [[SRC2_SCALAR45]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT16:%.*]] = phi i32 [ [[SRC1_SCALAR31]], %[[FIRST]] ], [ [[SRC2_SCALAR46]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <16 x i32> undef, i32 [[RESULT1]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT47:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT2]], i32 1 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT48:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT47]], i32 [[RESULT3]], i32 2 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT49:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT48]], i32 [[RESULT4]], i32 3 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT50:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT49]], i32 [[RESULT5]], i32 4 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT51:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT50]], i32 [[RESULT6]], i32 5 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT52:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT51]], i32 [[RESULT7]], i32 6 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT53:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT52]], i32 [[RESULT8]], i32 7 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT54:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT53]], i32 [[RESULT9]], i32 8 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT55:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT54]], i32 [[RESULT10]], i32 9 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT56:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT55]], i32 [[RESULT11]], i32 10 +; CHECK-NEXT: 
[[RESULT_ASSEMBLED_VECT57:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT56]], i32 [[RESULT12]], i32 11 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT58:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT57]], i32 [[RESULT13]], i32 12 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT59:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT58]], i32 [[RESULT14]], i32 13 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT60:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT59]], i32 [[RESULT15]], i32 14 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT61:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT60]], i32 [[RESULT16]], i32 15 +; CHECK-NEXT: ret <16 x i32> [[RESULT_ASSEMBLED_VECT61]] +; +entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <16 x i32> [ %src1, %first], [ %src2, %second] + ret <16 x i32> %result +} + +define <2 x i32> @should_work_with_constant_value(i1 %switch, <2 x i32> %src1) { +; CHECK-LABEL: define <2 x i32> @should_work_with_constant_value( +; CHECK-SAME: i1 [[SWITCH:%.*]], <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT1:%.*]] = phi i32 [ [[SRC1_SCALAR]], %[[FIRST]] ], [ 2, %[[SECOND]] ] +; CHECK-NEXT: [[RESULT2:%.*]] = phi i32 [ [[SRC1_SCALAR3]], %[[FIRST]] ], [ 4, %[[SECOND]] ] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[RESULT1]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT4:%.*]] = insertelement <2 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT2]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[RESULT_ASSEMBLED_VECT4]] +; +entry: + br i1 %switch, label 
%first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <2 x i32> [ %src1, %first], [ <i32 2, i32 4>, %second] + ret <2 x i32> %result +} + + +define <8 x i32> @should_not_scalarize(i1 %switch, <8 x i32> %src1) { +; CHECK-LABEL: define <8 x i32> @should_not_scalarize( +; CHECK-SAME: i1 [[SWITCH:%.*]], <8 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = call <8 x i32> @llvm.genx.GenISA.vmeSendIME2(<8 x i32> [[SRC1]], i32 1, i32 2, i32 3, i32 0) +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT:%.*]] = phi <8 x i32> [ [[SRC1]], %[[FIRST]] ], [ [[SHOULD_NOT_SCALARIZE_ME]], %[[SECOND]] ] +; CHECK-NEXT: ret <8 x i32> [[RESULT]] +; +entry: + %should_not_scalarize_me = call <8 x i32> @llvm.genx.GenISA.vmeSendIME2(<8 x i32> %src1, i32 1, i32 2, i32 3, i32 0) + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <8 x i32> [ %src1, %first], [ %should_not_scalarize_me, %second] + ret <8 x i32> %result +} + +define <4 x i32> @should_not_scalarize_2(i1 %switch, <4 x i32> %src1) { +; CHECK-LABEL: define <4 x i32> @should_not_scalarize_2( +; CHECK-SAME: i1 [[SWITCH:%.*]], <4 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = call <4 x i32> @llvm.genx.GenISA.vmeSendFBR2(<4 x i32> [[SRC1]], i32 1, i32 2, i32 3) +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT:%.*]] = phi <4 x i32> [ [[SRC1]], %[[FIRST]] ], [ [[SHOULD_NOT_SCALARIZE_ME]], %[[SECOND]] ] +; CHECK-NEXT: ret <4 x i32> [[RESULT]] +; +entry: + %should_not_scalarize_me = 
call <4 x i32> @llvm.genx.GenISA.vmeSendFBR2(<4 x i32> %src1, i32 1, i32 2, i32 3) + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <4 x i32> [ %src1, %first], [ %should_not_scalarize_me, %second] + ret <4 x i32> %result +} + +define <4 x i32> @should_not_scalarize_3(i1 %switch, <4 x i32> %src1) { +; CHECK-LABEL: define <4 x i32> @should_not_scalarize_3( +; CHECK-SAME: i1 [[SWITCH:%.*]], <4 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = call <4 x i32> @llvm.genx.GenISA.vmeSendSIC2(<4 x i32> [[SRC1]], i32 1, i32 2, i32 3) +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT:%.*]] = phi <4 x i32> [ [[SRC1]], %[[FIRST]] ], [ [[SHOULD_NOT_SCALARIZE_ME]], %[[SECOND]] ] +; CHECK-NEXT: ret <4 x i32> [[RESULT]] +; +entry: + %should_not_scalarize_me = call <4 x i32> @llvm.genx.GenISA.vmeSendSIC2(<4 x i32> %src1, i32 1, i32 2, i32 3) + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <4 x i32> [ %src1, %first], [ %should_not_scalarize_me, %second] + ret <4 x i32> %result +} + +define <8 x float> @should_not_scalarize_4(i1 %switch, <8 x float> %src1, <8 x float> %src2) { +; CHECK-LABEL: define <8 x float> @should_not_scalarize_4( +; CHECK-SAME: i1 [[SWITCH:%.*]], <8 x float> [[SRC1:%.*]], <8 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, <8 x i16>* @vector.8x.i16, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* @vector.8x.i32, align 32 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: 
+; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi <8 x float> [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: [[RETURN:%.*]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> [[SHOULD_NOT_SCALARIZE_ME]], <8 x i16> [[TMP0]], <8 x i32> [[TMP1]], i32 11, i32 11, i32 8, i32 8, i1 false) +; CHECK-NEXT: ret <8 x float> [[RETURN]] +; +entry: + %0 = load <8 x i16>, <8 x i16>* @vector.8x.i16 + %1 = load <8 x i32>, <8 x i32>* @vector.8x.i32 + + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %should_not_scalarize_me = phi <8 x float> [ %src1, %first], [ %src2, %second] + %return = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %should_not_scalarize_me, <8 x i16> %0, <8 x i32> %1, i32 11, i32 11, i32 8, i32 8, i1 false) + ret <8 x float> %return +} + +define i32 @should_not_scalarize_5(i1 %switch, <8 x i32> %src1, <8 x i32> %src2) { +; CHECK-LABEL: define i32 @should_not_scalarize_5( +; CHECK-SAME: i1 [[SWITCH:%.*]], <8 x i32> [[SRC1:%.*]], <8 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* @vector.8x.i32, align 32 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi <8 x i32> [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: [[RETURN:%.*]] = call i32 @llvm.genx.GenISA.dpas.v8i32(<8 x i32> [[SHOULD_NOT_SCALARIZE_ME]], <8 x i32> [[TMP0]], <8 x i32> [[TMP0]], i32 7, i32 7, i32 8, i32 1, i1 false) +; CHECK-NEXT: ret i32 [[RETURN]] +; +entry: + %0 = load <8 x i32>, <8 x i32>* @vector.8x.i32 + + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %should_not_scalarize_me = phi <8 x i32> [ %src1, 
%first], [ %src2, %second] + %return = call i32 @llvm.genx.GenISA.dpas.v8i32(<8 x i32> %should_not_scalarize_me, <8 x i32> %0, <8 x i32> %0, i32 7, i32 7, i32 8, i32 1, i1 false) + ret i32 %return +} + +define spir_kernel void @should_not_scalarize_6(i1 %switch, <2 x i32> addrspace(1)* %src1, <2 x i32> addrspace(1)* %src2) { +; CHECK-LABEL: define spir_kernel void @should_not_scalarize_6( +; CHECK-SAME: i1 [[SWITCH:%.*]], <2 x i32> addrspace(1)* [[SRC1:%.*]], <2 x i32> addrspace(1)* [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi <2 x i32> addrspace(1)* [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: call void @llvm.genx.GenISA.simdBlockWrite.v2i32.p1v2i32(<2 x i32> addrspace(1)* [[SHOULD_NOT_SCALARIZE_ME]], <2 x i32> ) +; CHECK-NEXT: ret void +; +entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %should_not_scalarize_me = phi <2 x i32> addrspace(1)* [ %src1, %first], [ %src2, %second] + call void @llvm.genx.GenISA.simdBlockWrite.v2i32.p1v2i32(<2 x i32> addrspace(1)* %should_not_scalarize_me, <2 x i32> ) + ret void +} + +define spir_kernel void @should_not_scalarize_7(i1 %switch, <2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define spir_kernel void @should_not_scalarize_7( +; CHECK-SAME: i1 [[SWITCH:%.*]], <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = alloca i8, align 1, addrspace(2490368) +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi <2 x 
float> [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: call void @llvm.genx.GenISA.simdBlockWriteBindless.p2490368i8.v2f32.i32(i8 addrspace(2490368)* [[TMP0]], <2 x float> [[SHOULD_NOT_SCALARIZE_ME]], i32 0) +; CHECK-NEXT: ret void +; +entry: + %0 = alloca i8, addrspace(2490368) + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %should_not_scalarize_me = phi <2 x float> [ %src1, %first], [ %src2, %second] + call void @llvm.genx.GenISA.simdBlockWriteBindless.p2490368i8.v2f32.i32(i8 addrspace(2490368)* %0, <2 x float> %should_not_scalarize_me, i32 0) + ret void +} + +define spir_kernel void @should_not_scalarize_8(i1 %switch, <16 x i16> %src1, <16 x i16> %src2) { +; CHECK-LABEL: define spir_kernel void @should_not_scalarize_8( +; CHECK-SAME: i1 [[SWITCH:%.*]], <16 x i16> [[SRC1:%.*]], <16 x i16> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi <16 x i16> [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: call void @llvm.genx.GenISA.simdMediaBlockWrite.v16i16(i32 1, i32 1, i32 1, i32 1, <16 x i16> [[SHOULD_NOT_SCALARIZE_ME]]) +; CHECK-NEXT: ret void +; +entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %should_not_scalarize_me = phi <16 x i16> [ %src1, %first], [ %src2, %second] + call void @llvm.genx.GenISA.simdMediaBlockWrite.v16i16(i32 1, i32 1, i32 1, i32 1, <16 x i16> %should_not_scalarize_me) + ret void +} + +define spir_kernel void @should_not_scalarize_9(i1 %switch, <8 x i32> %src1, <8 x i32> %src2) { +; CHECK-LABEL: define spir_kernel void @should_not_scalarize_9( +; CHECK-SAME: i1 [[SWITCH:%.*]], <8 x i32> [[SRC1:%.*]], <8 x i32> [[SRC2:%.*]]) { +; 
CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi <8 x i32> [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: call void @llvm.genx.GenISA.LSC2DBlockWrite.p0i32(i64 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i1 false, i1 true, i32 11, <8 x i32> [[SHOULD_NOT_SCALARIZE_ME]]) +; CHECK-NEXT: ret void +; +entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %should_not_scalarize_me = phi <8 x i32> [ %src1, %first], [ %src2, %second] + call void @llvm.genx.GenISA.LSC2DBlockWrite.p0i32(i64 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i1 false, i1 true, i32 11, <8 x i32> %should_not_scalarize_me) + ret void +} + +define spir_kernel void @should_not_scalarize_10(i1 %switch, <16 x i16> %src1, <16 x i16> %src2) { +; CHECK-LABEL: define spir_kernel void @should_not_scalarize_10( +; CHECK-SAME: i1 [[SWITCH:%.*]], <16 x i16> [[SRC1:%.*]], <16 x i16> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi <16 x i16> [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: call void @llvm.genx.GenISA.LSC2DBlockWriteAddrPayload.p0i32.v16i16(i32* [[TMP0]], i32 0, i32 0, i32 16, i32 16, i32 16, i32 1, i1 false, i1 false, i32 0, <16 x i16> [[SHOULD_NOT_SCALARIZE_ME]]) +; CHECK-NEXT: ret void +; +entry: + %0 = alloca i32 + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + 
%should_not_scalarize_me = phi <16 x i16> [ %src1, %first], [ %src2, %second] + call void @llvm.genx.GenISA.LSC2DBlockWriteAddrPayload.p0i32.v16i16(i32* %0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 1, i1 false, i1 false, i32 0, <16 x i16> %should_not_scalarize_me) + ret void +} + +define spir_kernel void @should_work_with_nested_phi(i1 %switch) { +; CHECK-LABEL: @should_work_with_nested_phi( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[VECTINT:%.*]] = add <8 x i32> , zeroinitializer +; CHECK-NEXT: [[VECTINT_SCALAR:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 0 +; CHECK-NEXT: [[VECTINT_SCALAR9:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 1 +; CHECK-NEXT: [[VECTINT_SCALAR10:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 2 +; CHECK-NEXT: [[VECTINT_SCALAR11:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 3 +; CHECK-NEXT: [[VECTINT_SCALAR12:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 4 +; CHECK-NEXT: [[VECTINT_SCALAR13:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 5 +; CHECK-NEXT: [[VECTINT_SCALAR14:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 6 +; CHECK-NEXT: [[VECTINT_SCALAR15:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 7 +; CHECK-NEXT: br i1 [[SWITCH:%.*]], label [[FIRST:%.*]], label [[SECOND:%.*]] +; CHECK: proxy: +; CHECK-NEXT: br i1 [[SWITCH]], label [[FIRST]], label [[SECOND]] +; CHECK: first: +; CHECK-NEXT: [[RESULT11:%.*]] = phi i32 [ 0, [[PROXY:%.*]] ], [ [[VECTINT_SCALAR]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RESULT12:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR9]], [[ENTRY]] ] +; CHECK-NEXT: [[RESULT13:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR10]], [[ENTRY]] ] +; CHECK-NEXT: [[RESULT14:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR11]], [[ENTRY]] ] +; CHECK-NEXT: [[RESULT15:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR12]], [[ENTRY]] ] +; CHECK-NEXT: [[RESULT16:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR13]], [[ENTRY]] ] +; CHECK-NEXT: [[RESULT17:%.*]] = phi i32 [ 0, [[PROXY]] ], [ 
[[VECTINT_SCALAR14]], [[ENTRY]] ] +; CHECK-NEXT: [[RESULT18:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR15]], [[ENTRY]] ] +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: second: +; CHECK-NEXT: [[RESULT224:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR]], [[ENTRY]] ] +; CHECK-NEXT: [[RESULT225:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR9]], [[ENTRY]] ] +; CHECK-NEXT: [[RESULT226:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR10]], [[ENTRY]] ] +; CHECK-NEXT: [[RESULT227:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR11]], [[ENTRY]] ] +; CHECK-NEXT: [[RESULT228:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR12]], [[ENTRY]] ] +; CHECK-NEXT: [[RESULT229:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR13]], [[ENTRY]] ] +; CHECK-NEXT: [[RESULT230:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR14]], [[ENTRY]] ] +; CHECK-NEXT: [[RESULT231:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR15]], [[ENTRY]] ] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[RESULT316:%.*]] = phi i32 [ [[RESULT11]], [[FIRST]] ], [ [[RESULT224]], [[SECOND]] ] +; CHECK-NEXT: [[RESULT317:%.*]] = phi i32 [ [[RESULT12]], [[FIRST]] ], [ [[RESULT225]], [[SECOND]] ] +; CHECK-NEXT: [[RESULT318:%.*]] = phi i32 [ [[RESULT13]], [[FIRST]] ], [ [[RESULT226]], [[SECOND]] ] +; CHECK-NEXT: [[RESULT319:%.*]] = phi i32 [ [[RESULT14]], [[FIRST]] ], [ [[RESULT227]], [[SECOND]] ] +; CHECK-NEXT: [[RESULT320:%.*]] = phi i32 [ [[RESULT15]], [[FIRST]] ], [ [[RESULT228]], [[SECOND]] ] +; CHECK-NEXT: [[RESULT321:%.*]] = phi i32 [ [[RESULT16]], [[FIRST]] ], [ [[RESULT229]], [[SECOND]] ] +; CHECK-NEXT: [[RESULT322:%.*]] = phi i32 [ [[RESULT17]], [[FIRST]] ], [ [[RESULT230]], [[SECOND]] ] +; CHECK-NEXT: [[RESULT323:%.*]] = phi i32 [ [[RESULT18]], [[FIRST]] ], [ [[RESULT231]], [[SECOND]] ] +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT:%.*]] = insertelement <8 x i32> undef, i32 [[RESULT316]], i32 0 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT32:%.*]] = insertelement <8 x 
i32> [[RESULT3_ASSEMBLED_VECT]], i32 [[RESULT317]], i32 1 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT33:%.*]] = insertelement <8 x i32> [[RESULT3_ASSEMBLED_VECT32]], i32 [[RESULT318]], i32 2 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT34:%.*]] = insertelement <8 x i32> [[RESULT3_ASSEMBLED_VECT33]], i32 [[RESULT319]], i32 3 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT35:%.*]] = insertelement <8 x i32> [[RESULT3_ASSEMBLED_VECT34]], i32 [[RESULT320]], i32 4 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT36:%.*]] = insertelement <8 x i32> [[RESULT3_ASSEMBLED_VECT35]], i32 [[RESULT321]], i32 5 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT37:%.*]] = insertelement <8 x i32> [[RESULT3_ASSEMBLED_VECT36]], i32 [[RESULT322]], i32 6 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT38:%.*]] = insertelement <8 x i32> [[RESULT3_ASSEMBLED_VECT37]], i32 [[RESULT323]], i32 7 +; CHECK-NEXT: [[NEWDATA:%.*]] = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> [[RESULT3_ASSEMBLED_VECT38]]) +; CHECK-NEXT: ret void +; +entry: + %vectint = add <8 x i32> , zeroinitializer + br i1 %switch, label %first, label %second +proxy: + br i1 %switch, label %first, label %second +first: + %result1 = phi <8 x i32> [ zeroinitializer, %proxy ], [ %vectint, %entry] + br label %exit +second: + %result2 = phi <8 x i32> [ zeroinitializer, %proxy ], [ %vectint, %entry] + br label %exit +exit: + %result3 = phi <8 x i32> [ %result1, %first], [ %result2, %second] + %newdata = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> %result3) + ret void +} + +@vector.8x.float = global <8 x float> +@vector.8x.i16 = global <8 x i16> +@vector.8x.i32 = global <8 x i32> + +declare <8 x i32> @llvm.genx.GenISA.vmeSendIME2(<8 x i32>, i32, i32, i32, i32) +declare <4 x i32> @llvm.genx.GenISA.vmeSendFBR2(<4 x i32>, i32, i32, i32) +declare <4 x i32> @llvm.genx.GenISA.vmeSendSIC2(<4 x i32>, i32, i32, i32) +declare <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1) +declare i32 
@llvm.genx.GenISA.dpas.v8i32(<8 x i32>, <8 x i32>, <8 x i32>, i32, i32, i32, i32, i1) +declare void @llvm.genx.GenISA.simdBlockWrite.v2i32.p1v2i32(<2 x i32> addrspace(1)*, <2 x i32>) +declare void @llvm.genx.GenISA.simdBlockWriteBindless.p2490368i8.v2f32.i32(i8 addrspace(2490368)*, <2 x float>, i32) +declare void @llvm.genx.GenISA.simdMediaBlockWrite.v16i16(i32, i32, i32, i32, <16 x i16>) +declare void @llvm.genx.GenISA.LSC2DBlockWrite.p0i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32, <8 x i32>) +declare void @llvm.genx.GenISA.LSC2DBlockWriteAddrPayload.p0i32.v16i16(i32*, i32, i32, i32, i32, i32, i32, i1, i1, i32, <16 x i16>) +declare spir_func <8 x i32> @do_math_v8i32_v8i32(<8 x i32>) #1 diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-phi-instruction.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-phi-instruction.ll new file mode 100644 index 000000000000..64c7b15f9aff --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-phi-instruction.ll @@ -0,0 +1,589 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize --opaque-pointers -S < %s | FileCheck %s +; REQUIRES: llvm-14-plus +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define <2 x i32> @basic(i1 %switch, <2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define <2 x i32> @basic( +; CHECK-SAME: i1 [[SWITCH:%.*]], <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x 
i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR4:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT1:%.*]] = phi i32 [ [[SRC1_SCALAR]], %[[FIRST]] ], [ [[SRC2_SCALAR]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT2:%.*]] = phi i32 [ [[SRC1_SCALAR3]], %[[FIRST]] ], [ [[SRC2_SCALAR4]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[RESULT1]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT5:%.*]] = insertelement <2 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT2]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[RESULT_ASSEMBLED_VECT5]] +; +entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <2 x i32> [ %src1, %first], [ %src2, %second] + ret <2 x i32> %result +} + +define <2 x float> @should_work_with_different_value_type(i1 %switch, <2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define <2 x float> @should_work_with_different_value_type( +; CHECK-SAME: i1 [[SWITCH:%.*]], <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR4:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: 
+; CHECK-NEXT: [[RESULT1:%.*]] = phi float [ [[SRC1_SCALAR]], %[[FIRST]] ], [ [[SRC2_SCALAR]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT2:%.*]] = phi float [ [[SRC1_SCALAR3]], %[[FIRST]] ], [ [[SRC2_SCALAR4]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[RESULT1]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT5:%.*]] = insertelement <2 x float> [[RESULT_ASSEMBLED_VECT]], float [[RESULT2]], i32 1 +; CHECK-NEXT: ret <2 x float> [[RESULT_ASSEMBLED_VECT5]] +; +entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <2 x float> [ %src1, %first], [ %src2, %second] + ret <2 x float> %result +} + +; triangulating with @should_work_with_different_value_type +; update checks if fast will be preserved +define <2 x float> @should_work_with_fast_math_flags(i1 %switch, <2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define <2 x float> @should_work_with_fast_math_flags( +; CHECK-SAME: i1 [[SWITCH:%.*]], <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR4:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT1:%.*]] = phi float [ [[SRC1_SCALAR]], %[[FIRST]] ], [ [[SRC2_SCALAR]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT2:%.*]] = phi float [ [[SRC1_SCALAR3]], %[[FIRST]] ], [ [[SRC2_SCALAR4]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[RESULT1]], i32 0 +; CHECK-NEXT: 
[[RESULT_ASSEMBLED_VECT5:%.*]] = insertelement <2 x float> [[RESULT_ASSEMBLED_VECT]], float [[RESULT2]], i32 1 +; CHECK-NEXT: ret <2 x float> [[RESULT_ASSEMBLED_VECT5]] +; + entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi fast <2 x float> [ %src1, %first], [ %src2, %second] + ret <2 x float> %result +} + +define <16 x i32> @should_work_with_larger_vector_size(i1 %switch, <16 x i32> %src1, <16 x i32> %src2) { +; CHECK-LABEL: define <16 x i32> @should_work_with_larger_vector_size( +; CHECK-SAME: i1 [[SWITCH:%.*]], <16 x i32> [[SRC1:%.*]], <16 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <16 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR32:%.*]] = extractelement <16 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR33:%.*]] = extractelement <16 x i32> [[SRC2]], i32 2 +; CHECK-NEXT: [[SRC2_SCALAR34:%.*]] = extractelement <16 x i32> [[SRC2]], i32 3 +; CHECK-NEXT: [[SRC2_SCALAR35:%.*]] = extractelement <16 x i32> [[SRC2]], i32 4 +; CHECK-NEXT: [[SRC2_SCALAR36:%.*]] = extractelement <16 x i32> [[SRC2]], i32 5 +; CHECK-NEXT: [[SRC2_SCALAR37:%.*]] = extractelement <16 x i32> [[SRC2]], i32 6 +; CHECK-NEXT: [[SRC2_SCALAR38:%.*]] = extractelement <16 x i32> [[SRC2]], i32 7 +; CHECK-NEXT: [[SRC2_SCALAR39:%.*]] = extractelement <16 x i32> [[SRC2]], i32 8 +; CHECK-NEXT: [[SRC2_SCALAR40:%.*]] = extractelement <16 x i32> [[SRC2]], i32 9 +; CHECK-NEXT: [[SRC2_SCALAR41:%.*]] = extractelement <16 x i32> [[SRC2]], i32 10 +; CHECK-NEXT: [[SRC2_SCALAR42:%.*]] = extractelement <16 x i32> [[SRC2]], i32 11 +; CHECK-NEXT: [[SRC2_SCALAR43:%.*]] = extractelement <16 x i32> [[SRC2]], i32 12 +; CHECK-NEXT: [[SRC2_SCALAR44:%.*]] = extractelement <16 x i32> [[SRC2]], i32 13 +; CHECK-NEXT: [[SRC2_SCALAR45:%.*]] = extractelement <16 x i32> [[SRC2]], i32 14 +; CHECK-NEXT: [[SRC2_SCALAR46:%.*]] = extractelement <16 x i32> [[SRC2]], i32 15 +; CHECK-NEXT: 
[[SRC1_SCALAR:%.*]] = extractelement <16 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR17:%.*]] = extractelement <16 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR18:%.*]] = extractelement <16 x i32> [[SRC1]], i32 2 +; CHECK-NEXT: [[SRC1_SCALAR19:%.*]] = extractelement <16 x i32> [[SRC1]], i32 3 +; CHECK-NEXT: [[SRC1_SCALAR20:%.*]] = extractelement <16 x i32> [[SRC1]], i32 4 +; CHECK-NEXT: [[SRC1_SCALAR21:%.*]] = extractelement <16 x i32> [[SRC1]], i32 5 +; CHECK-NEXT: [[SRC1_SCALAR22:%.*]] = extractelement <16 x i32> [[SRC1]], i32 6 +; CHECK-NEXT: [[SRC1_SCALAR23:%.*]] = extractelement <16 x i32> [[SRC1]], i32 7 +; CHECK-NEXT: [[SRC1_SCALAR24:%.*]] = extractelement <16 x i32> [[SRC1]], i32 8 +; CHECK-NEXT: [[SRC1_SCALAR25:%.*]] = extractelement <16 x i32> [[SRC1]], i32 9 +; CHECK-NEXT: [[SRC1_SCALAR26:%.*]] = extractelement <16 x i32> [[SRC1]], i32 10 +; CHECK-NEXT: [[SRC1_SCALAR27:%.*]] = extractelement <16 x i32> [[SRC1]], i32 11 +; CHECK-NEXT: [[SRC1_SCALAR28:%.*]] = extractelement <16 x i32> [[SRC1]], i32 12 +; CHECK-NEXT: [[SRC1_SCALAR29:%.*]] = extractelement <16 x i32> [[SRC1]], i32 13 +; CHECK-NEXT: [[SRC1_SCALAR30:%.*]] = extractelement <16 x i32> [[SRC1]], i32 14 +; CHECK-NEXT: [[SRC1_SCALAR31:%.*]] = extractelement <16 x i32> [[SRC1]], i32 15 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT1:%.*]] = phi i32 [ [[SRC1_SCALAR]], %[[FIRST]] ], [ [[SRC2_SCALAR]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT2:%.*]] = phi i32 [ [[SRC1_SCALAR17]], %[[FIRST]] ], [ [[SRC2_SCALAR32]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT3:%.*]] = phi i32 [ [[SRC1_SCALAR18]], %[[FIRST]] ], [ [[SRC2_SCALAR33]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT4:%.*]] = phi i32 [ [[SRC1_SCALAR19]], %[[FIRST]] ], [ [[SRC2_SCALAR34]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT5:%.*]] = phi i32 [ [[SRC1_SCALAR20]], 
%[[FIRST]] ], [ [[SRC2_SCALAR35]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT6:%.*]] = phi i32 [ [[SRC1_SCALAR21]], %[[FIRST]] ], [ [[SRC2_SCALAR36]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT7:%.*]] = phi i32 [ [[SRC1_SCALAR22]], %[[FIRST]] ], [ [[SRC2_SCALAR37]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT8:%.*]] = phi i32 [ [[SRC1_SCALAR23]], %[[FIRST]] ], [ [[SRC2_SCALAR38]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT9:%.*]] = phi i32 [ [[SRC1_SCALAR24]], %[[FIRST]] ], [ [[SRC2_SCALAR39]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT10:%.*]] = phi i32 [ [[SRC1_SCALAR25]], %[[FIRST]] ], [ [[SRC2_SCALAR40]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT11:%.*]] = phi i32 [ [[SRC1_SCALAR26]], %[[FIRST]] ], [ [[SRC2_SCALAR41]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT12:%.*]] = phi i32 [ [[SRC1_SCALAR27]], %[[FIRST]] ], [ [[SRC2_SCALAR42]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT13:%.*]] = phi i32 [ [[SRC1_SCALAR28]], %[[FIRST]] ], [ [[SRC2_SCALAR43]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT14:%.*]] = phi i32 [ [[SRC1_SCALAR29]], %[[FIRST]] ], [ [[SRC2_SCALAR44]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT15:%.*]] = phi i32 [ [[SRC1_SCALAR30]], %[[FIRST]] ], [ [[SRC2_SCALAR45]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT16:%.*]] = phi i32 [ [[SRC1_SCALAR31]], %[[FIRST]] ], [ [[SRC2_SCALAR46]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <16 x i32> undef, i32 [[RESULT1]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT47:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT2]], i32 1 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT48:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT47]], i32 [[RESULT3]], i32 2 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT49:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT48]], i32 [[RESULT4]], i32 3 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT50:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT49]], i32 [[RESULT5]], i32 4 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT51:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT50]], i32 
[[RESULT6]], i32 5 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT52:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT51]], i32 [[RESULT7]], i32 6 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT53:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT52]], i32 [[RESULT8]], i32 7 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT54:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT53]], i32 [[RESULT9]], i32 8 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT55:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT54]], i32 [[RESULT10]], i32 9 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT56:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT55]], i32 [[RESULT11]], i32 10 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT57:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT56]], i32 [[RESULT12]], i32 11 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT58:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT57]], i32 [[RESULT13]], i32 12 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT59:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT58]], i32 [[RESULT14]], i32 13 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT60:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT59]], i32 [[RESULT15]], i32 14 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT61:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT60]], i32 [[RESULT16]], i32 15 +; CHECK-NEXT: ret <16 x i32> [[RESULT_ASSEMBLED_VECT61]] +; +entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <16 x i32> [ %src1, %first], [ %src2, %second] + ret <16 x i32> %result +} + +define <2 x i32> @should_work_with_constant_value(i1 %switch, <2 x i32> %src1) { +; CHECK-LABEL: define <2 x i32> @should_work_with_constant_value( +; CHECK-SAME: i1 [[SWITCH:%.*]], <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: br i1 [[SWITCH]], label 
%[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT1:%.*]] = phi i32 [ [[SRC1_SCALAR]], %[[FIRST]] ], [ 2, %[[SECOND]] ] +; CHECK-NEXT: [[RESULT2:%.*]] = phi i32 [ [[SRC1_SCALAR3]], %[[FIRST]] ], [ 4, %[[SECOND]] ] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[RESULT1]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT4:%.*]] = insertelement <2 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT2]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[RESULT_ASSEMBLED_VECT4]] +; +entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <2 x i32> [ %src1, %first], [ <i32 2, i32 4>, %second] + ret <2 x i32> %result +} + + +define <8 x i32> @should_not_scalarize(i1 %switch, <8 x i32> %src1) { +; CHECK-LABEL: define <8 x i32> @should_not_scalarize( +; CHECK-SAME: i1 [[SWITCH:%.*]], <8 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = call <8 x i32> @llvm.genx.GenISA.vmeSendIME2(<8 x i32> [[SRC1]], i32 1, i32 2, i32 3, i32 0) +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT:%.*]] = phi <8 x i32> [ [[SRC1]], %[[FIRST]] ], [ [[SHOULD_NOT_SCALARIZE_ME]], %[[SECOND]] ] +; CHECK-NEXT: ret <8 x i32> [[RESULT]] +; +entry: + %should_not_scalarize_me = call <8 x i32> @llvm.genx.GenISA.vmeSendIME2(<8 x i32> %src1, i32 1, i32 2, i32 3, i32 0) + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <8 x i32> [ %src1, %first], [ %should_not_scalarize_me, %second] + ret <8 x i32> %result +} + +define <4 x i32> @should_not_scalarize_2(i1 %switch, <4 x i32> %src1) { +; CHECK-LABEL:
define <4 x i32> @should_not_scalarize_2( +; CHECK-SAME: i1 [[SWITCH:%.*]], <4 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = call <4 x i32> @llvm.genx.GenISA.vmeSendFBR2(<4 x i32> [[SRC1]], i32 1, i32 2, i32 3) +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT:%.*]] = phi <4 x i32> [ [[SRC1]], %[[FIRST]] ], [ [[SHOULD_NOT_SCALARIZE_ME]], %[[SECOND]] ] +; CHECK-NEXT: ret <4 x i32> [[RESULT]] +; +entry: + %should_not_scalarize_me = call <4 x i32> @llvm.genx.GenISA.vmeSendFBR2(<4 x i32> %src1, i32 1, i32 2, i32 3) + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <4 x i32> [ %src1, %first], [ %should_not_scalarize_me, %second] + ret <4 x i32> %result +} + +define <4 x i32> @should_not_scalarize_3(i1 %switch, <4 x i32> %src1) { +; CHECK-LABEL: define <4 x i32> @should_not_scalarize_3( +; CHECK-SAME: i1 [[SWITCH:%.*]], <4 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = call <4 x i32> @llvm.genx.GenISA.vmeSendSIC2(<4 x i32> [[SRC1]], i32 1, i32 2, i32 3) +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT:%.*]] = phi <4 x i32> [ [[SRC1]], %[[FIRST]] ], [ [[SHOULD_NOT_SCALARIZE_ME]], %[[SECOND]] ] +; CHECK-NEXT: ret <4 x i32> [[RESULT]] +; +entry: + %should_not_scalarize_me = call <4 x i32> @llvm.genx.GenISA.vmeSendSIC2(<4 x i32> %src1, i32 1, i32 2, i32 3) + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <4 x i32> [ %src1, %first], [ %should_not_scalarize_me, %second] + 
ret <4 x i32> %result +} + +define <8 x float> @should_not_scalarize_4(i1 %switch, <8 x float> %src1, <8 x float> %src2) { +; CHECK-LABEL: define <8 x float> @should_not_scalarize_4( +; CHECK-SAME: i1 [[SWITCH:%.*]], <8 x float> [[SRC1:%.*]], <8 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr @vector.8x.i16, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @vector.8x.i32, align 32 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi <8 x float> [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: [[RETURN:%.*]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> [[SHOULD_NOT_SCALARIZE_ME]], <8 x i16> [[TMP0]], <8 x i32> [[TMP1]], i32 11, i32 11, i32 8, i32 8, i1 false) +; CHECK-NEXT: ret <8 x float> [[RETURN]] +; +entry: + %0 = load <8 x i16>, ptr @vector.8x.i16 + %1 = load <8 x i32>, ptr @vector.8x.i32 + + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %should_not_scalarize_me = phi <8 x float> [ %src1, %first], [ %src2, %second] + %return = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %should_not_scalarize_me, <8 x i16> %0, <8 x i32> %1, i32 11, i32 11, i32 8, i32 8, i1 false) + ret <8 x float> %return +} + +define i32 @should_not_scalarize_5(i1 %switch, <8 x i32> %src1, <8 x i32> %src2) { +; CHECK-LABEL: define i32 @should_not_scalarize_5( +; CHECK-SAME: i1 [[SWITCH:%.*]], <8 x i32> [[SRC1:%.*]], <8 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @vector.8x.i32, align 32 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label 
%[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi <8 x i32> [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: [[RETURN:%.*]] = call i32 @llvm.genx.GenISA.dpas.v8i32(<8 x i32> [[SHOULD_NOT_SCALARIZE_ME]], <8 x i32> [[TMP0]], <8 x i32> [[TMP0]], i32 7, i32 7, i32 8, i32 1, i1 false) +; CHECK-NEXT: ret i32 [[RETURN]] +; +entry: + %0 = load <8 x i32>, ptr @vector.8x.i32 + + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %should_not_scalarize_me = phi <8 x i32> [ %src1, %first], [ %src2, %second] + %return = call i32 @llvm.genx.GenISA.dpas.v8i32(<8 x i32> %should_not_scalarize_me, <8 x i32> %0, <8 x i32> %0, i32 7, i32 7, i32 8, i32 1, i1 false) + ret i32 %return +} + +define spir_kernel void @should_not_scalarize_6(i1 %switch, ptr addrspace(1) %src1, ptr addrspace(1) %src2) { +; CHECK-LABEL: define spir_kernel void @should_not_scalarize_6( +; CHECK-SAME: i1 [[SWITCH:%.*]], ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi ptr addrspace(1) [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: call void @llvm.genx.GenISA.simdBlockWrite.v2i32.p1v2i32(ptr addrspace(1) [[SHOULD_NOT_SCALARIZE_ME]], <2 x i32> <i32 1, i32 2>) +; CHECK-NEXT: ret void +; +entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %should_not_scalarize_me = phi ptr addrspace(1) [ %src1, %first], [ %src2, %second] + call void @llvm.genx.GenISA.simdBlockWrite.v2i32.p1v2i32(ptr addrspace(1) %should_not_scalarize_me, <2 x i32> <i32 1, i32 2>) + ret void +} + +define spir_kernel void
@should_not_scalarize_7(i1 %switch, <2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define spir_kernel void @should_not_scalarize_7( +; CHECK-SAME: i1 [[SWITCH:%.*]], <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = alloca i8, align 1, addrspace(2490368) +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi <2 x float> [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: call void @llvm.genx.GenISA.simdBlockWriteBindless.p2490368i8.v2f32.i32(ptr addrspace(2490368) [[TMP0]], <2 x float> [[SHOULD_NOT_SCALARIZE_ME]], i32 0) +; CHECK-NEXT: ret void +; +entry: + %0 = alloca i8, addrspace(2490368) + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %should_not_scalarize_me = phi <2 x float> [ %src1, %first], [ %src2, %second] + call void @llvm.genx.GenISA.simdBlockWriteBindless.p2490368i8.v2f32.i32(ptr addrspace(2490368) %0, <2 x float> %should_not_scalarize_me, i32 0) + ret void +} + +define spir_kernel void @should_not_scalarize_8(i1 %switch, <16 x i16> %src1, <16 x i16> %src2) { +; CHECK-LABEL: define spir_kernel void @should_not_scalarize_8( +; CHECK-SAME: i1 [[SWITCH:%.*]], <16 x i16> [[SRC1:%.*]], <16 x i16> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi <16 x i16> [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: call void @llvm.genx.GenISA.simdMediaBlockWrite.v16i16(i32 1, i32 1, i32 1, i32 1, <16 x i16> [[SHOULD_NOT_SCALARIZE_ME]]) +; CHECK-NEXT: ret 
void +; +entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %should_not_scalarize_me = phi <16 x i16> [ %src1, %first], [ %src2, %second] + call void @llvm.genx.GenISA.simdMediaBlockWrite.v16i16(i32 1, i32 1, i32 1, i32 1, <16 x i16> %should_not_scalarize_me) + ret void +} + +define spir_kernel void @should_not_scalarize_9(i1 %switch, <8 x i32> %src1, <8 x i32> %src2) { +; CHECK-LABEL: define spir_kernel void @should_not_scalarize_9( +; CHECK-SAME: i1 [[SWITCH:%.*]], <8 x i32> [[SRC1:%.*]], <8 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi <8 x i32> [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: call void @llvm.genx.GenISA.LSC2DBlockWrite.p0i32(i64 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i1 false, i1 true, i32 11, <8 x i32> [[SHOULD_NOT_SCALARIZE_ME]]) +; CHECK-NEXT: ret void +; +entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %should_not_scalarize_me = phi <8 x i32> [ %src1, %first], [ %src2, %second] + call void @llvm.genx.GenISA.LSC2DBlockWrite.p0i32(i64 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i1 false, i1 true, i32 11, <8 x i32> %should_not_scalarize_me) + ret void +} + +define spir_kernel void @should_not_scalarize_10(i1 %switch, <16 x i16> %src1, <16 x i16> %src2) { +; CHECK-LABEL: define spir_kernel void @should_not_scalarize_10( +; CHECK-SAME: i1 [[SWITCH:%.*]], <16 x i16> [[SRC1:%.*]], <16 x i16> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: 
br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi <16 x i16> [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: call void @llvm.genx.GenISA.LSC2DBlockWriteAddrPayload.p0i32.v16i16(ptr [[TMP0]], i32 0, i32 0, i32 16, i32 16, i32 16, i32 1, i1 false, i1 false, i32 0, <16 x i16> [[SHOULD_NOT_SCALARIZE_ME]]) +; CHECK-NEXT: ret void +; +entry: + %0 = alloca i32 + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %should_not_scalarize_me = phi <16 x i16> [ %src1, %first], [ %src2, %second] + call void @llvm.genx.GenISA.LSC2DBlockWriteAddrPayload.p0i32.v16i16(ptr %0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 1, i1 false, i1 false, i32 0, <16 x i16> %should_not_scalarize_me) + ret void +} + +define spir_kernel void @should_work_with_nested_phi(i1 %switch) { +; CHECK-LABEL: define spir_kernel void @should_work_with_nested_phi( +; CHECK-SAME: i1 [[SWITCH:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[VECTINT:%.*]] = add <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, zeroinitializer +; CHECK-NEXT: [[VECTINT_SCALAR:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 0 +; CHECK-NEXT: [[VECTINT_SCALAR9:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 1 +; CHECK-NEXT: [[VECTINT_SCALAR10:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 2 +; CHECK-NEXT: [[VECTINT_SCALAR11:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 3 +; CHECK-NEXT: [[VECTINT_SCALAR12:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 4 +; CHECK-NEXT: [[VECTINT_SCALAR13:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 5 +; CHECK-NEXT: [[VECTINT_SCALAR14:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 6 +; CHECK-NEXT: [[VECTINT_SCALAR15:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 7 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[PROXY:.*]]: +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST]], label %[[SECOND]] +; CHECK:
[[FIRST]]: +; CHECK-NEXT: [[RESULT11:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT12:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR9]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT13:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR10]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT14:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR11]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT15:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR12]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT16:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR13]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT17:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR14]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT18:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR15]], %[[ENTRY]] ] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: [[RESULT224:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT225:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR9]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT226:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR10]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT227:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR11]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT228:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR12]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT229:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR13]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT230:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR14]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT231:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR15]], %[[ENTRY]] ] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT316:%.*]] = phi i32 [ [[RESULT11]], %[[FIRST]] ], [ [[RESULT224]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT317:%.*]] = phi i32 [ [[RESULT12]], %[[FIRST]] ], [ [[RESULT225]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT318:%.*]] = phi i32 [ [[RESULT13]], %[[FIRST]] ], [ [[RESULT226]], 
%[[SECOND]] ] +; CHECK-NEXT: [[RESULT319:%.*]] = phi i32 [ [[RESULT14]], %[[FIRST]] ], [ [[RESULT227]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT320:%.*]] = phi i32 [ [[RESULT15]], %[[FIRST]] ], [ [[RESULT228]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT321:%.*]] = phi i32 [ [[RESULT16]], %[[FIRST]] ], [ [[RESULT229]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT322:%.*]] = phi i32 [ [[RESULT17]], %[[FIRST]] ], [ [[RESULT230]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT323:%.*]] = phi i32 [ [[RESULT18]], %[[FIRST]] ], [ [[RESULT231]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT:%.*]] = insertelement <8 x i32> undef, i32 [[RESULT316]], i32 0 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT32:%.*]] = insertelement <8 x i32> [[RESULT3_ASSEMBLED_VECT]], i32 [[RESULT317]], i32 1 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT33:%.*]] = insertelement <8 x i32> [[RESULT3_ASSEMBLED_VECT32]], i32 [[RESULT318]], i32 2 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT34:%.*]] = insertelement <8 x i32> [[RESULT3_ASSEMBLED_VECT33]], i32 [[RESULT319]], i32 3 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT35:%.*]] = insertelement <8 x i32> [[RESULT3_ASSEMBLED_VECT34]], i32 [[RESULT320]], i32 4 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT36:%.*]] = insertelement <8 x i32> [[RESULT3_ASSEMBLED_VECT35]], i32 [[RESULT321]], i32 5 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT37:%.*]] = insertelement <8 x i32> [[RESULT3_ASSEMBLED_VECT36]], i32 [[RESULT322]], i32 6 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT38:%.*]] = insertelement <8 x i32> [[RESULT3_ASSEMBLED_VECT37]], i32 [[RESULT323]], i32 7 +; CHECK-NEXT: [[NEWDATA:%.*]] = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> [[RESULT3_ASSEMBLED_VECT38]]) +; CHECK-NEXT: ret void +; +entry: + %vectint = add <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, zeroinitializer + br i1 %switch, label %first, label %second +proxy: + br i1 %switch, label %first, label %second +first: + %result1 = phi <8 x i32> [ zeroinitializer, %proxy ], [ %vectint, %entry] + br label %exit +second: + %result2 = phi <8 x i32> [ zeroinitializer, %proxy ], [ 
%vectint, %entry] + br label %exit +exit: + %result3 = phi <8 x i32> [ %result1, %first], [ %result2, %second] + %newdata = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> %result3) + ret void +} + +@vector.8x.float = global <8 x float> zeroinitializer +@vector.8x.i16 = global <8 x i16> zeroinitializer +@vector.8x.i32 = global <8 x i32> zeroinitializer + +declare <8 x i32> @llvm.genx.GenISA.vmeSendIME2(<8 x i32>, i32, i32, i32, i32) +declare <4 x i32> @llvm.genx.GenISA.vmeSendFBR2(<4 x i32>, i32, i32, i32) +declare <4 x i32> @llvm.genx.GenISA.vmeSendSIC2(<4 x i32>, i32, i32, i32) +declare <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1) +declare i32 @llvm.genx.GenISA.dpas.v8i32(<8 x i32>, <8 x i32>, <8 x i32>, i32, i32, i32, i32, i1) +declare void @llvm.genx.GenISA.simdBlockWrite.v2i32.p1v2i32(ptr addrspace(1), <2 x i32>) +declare void @llvm.genx.GenISA.simdBlockWriteBindless.p2490368i8.v2f32.i32(ptr addrspace(2490368), <2 x float>, i32) +declare void @llvm.genx.GenISA.simdMediaBlockWrite.v16i16(i32, i32, i32, i32, <16 x i16>) +declare void @llvm.genx.GenISA.LSC2DBlockWrite.p0i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32, <8 x i32>) +declare void @llvm.genx.GenISA.LSC2DBlockWriteAddrPayload.p0i32.v16i16(ptr, i32, i32, i32, i32, i32, i32, i1, i1, i32, <16 x i16>) +declare spir_func <8 x i32> @do_math_v8i32_v8i32(<8 x i32>) #1 diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-select-instruction-typed-pointers.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-select-instruction-typed-pointers.ll new file mode 100644 index 000000000000..29016636148b --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-select-instruction-typed-pointers.ll @@ -0,0 +1,221 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +;
SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize -S < %s | FileCheck %s +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define <2 x i32> @basic(<2 x i1> %switch, <2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define <2 x i32> @basic( +; CHECK-SAME: <2 x i1> [[SWITCH:%.*]], <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SWITCH_SCALAR:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 0 +; CHECK-NEXT: [[SWITCH_SCALAR3:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[RESULT4:%.*]] = select i1 [[SWITCH_SCALAR]], i32 [[SRC1_SCALAR]], i32 [[SRC2_SCALAR]] +; CHECK-NEXT: [[RESULT5:%.*]] = select i1 [[SWITCH_SCALAR3]], i32 [[SRC1_SCALAR1]], i32 [[SRC2_SCALAR2]] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[RESULT4]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT6:%.*]] = insertelement <2 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT5]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[RESULT_ASSEMBLED_VECT6]] +; + %result = select <2 x i1> %switch, <2 x i32> %src1, <2 x i32> %src2 + ret <2 x i32> %result +} + +define <2 x float> @should_work_with_different_value_type(<2 x i1> %switch, <2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define <2 x float> @should_work_with_different_value_type( +; CHECK-SAME: <2 x i1> [[SWITCH:%.*]], <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: 
[[SWITCH_SCALAR:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 0 +; CHECK-NEXT: [[SWITCH_SCALAR3:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[RESULT4:%.*]] = select i1 [[SWITCH_SCALAR]], float [[SRC1_SCALAR]], float [[SRC2_SCALAR]] +; CHECK-NEXT: [[RESULT5:%.*]] = select i1 [[SWITCH_SCALAR3]], float [[SRC1_SCALAR1]], float [[SRC2_SCALAR2]] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[RESULT4]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT6:%.*]] = insertelement <2 x float> [[RESULT_ASSEMBLED_VECT]], float [[RESULT5]], i32 1 +; CHECK-NEXT: ret <2 x float> [[RESULT_ASSEMBLED_VECT6]] +; + %result = select <2 x i1> %switch, <2 x float> %src1, <2 x float> %src2 + ret <2 x float> %result +} + +; triangulating with @should_work_with_different_value_type +; update checks if fast will be preserved +define <2 x float> @should_work_with_fast_math_flags(<2 x i1> %switch, <2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define <2 x float> @should_work_with_fast_math_flags( +; CHECK-SAME: <2 x i1> [[SWITCH:%.*]], <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SWITCH_SCALAR:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 0 +; CHECK-NEXT: [[SWITCH_SCALAR3:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[RESULT4:%.*]] = select i1 
[[SWITCH_SCALAR]], float [[SRC1_SCALAR]], float [[SRC2_SCALAR]] +; CHECK-NEXT: [[RESULT5:%.*]] = select i1 [[SWITCH_SCALAR3]], float [[SRC1_SCALAR1]], float [[SRC2_SCALAR2]] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[RESULT4]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT6:%.*]] = insertelement <2 x float> [[RESULT_ASSEMBLED_VECT]], float [[RESULT5]], i32 1 +; CHECK-NEXT: ret <2 x float> [[RESULT_ASSEMBLED_VECT6]] +; + %result = select fast <2 x i1> %switch, <2 x float> %src1, <2 x float> %src2 + ret <2 x float> %result +} + +define <16 x i32> @should_work_with_larger_vector_size(<16 x i1> %switch, <16 x i32> %src1, <16 x i32> %src2) { +; CHECK-LABEL: define <16 x i32> @should_work_with_larger_vector_size( +; CHECK-SAME: <16 x i1> [[SWITCH:%.*]], <16 x i32> [[SRC1:%.*]], <16 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SWITCH_SCALAR:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 0 +; CHECK-NEXT: [[SWITCH_SCALAR31:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 1 +; CHECK-NEXT: [[SWITCH_SCALAR32:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 2 +; CHECK-NEXT: [[SWITCH_SCALAR33:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 3 +; CHECK-NEXT: [[SWITCH_SCALAR34:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 4 +; CHECK-NEXT: [[SWITCH_SCALAR35:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 5 +; CHECK-NEXT: [[SWITCH_SCALAR36:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 6 +; CHECK-NEXT: [[SWITCH_SCALAR37:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 7 +; CHECK-NEXT: [[SWITCH_SCALAR38:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 8 +; CHECK-NEXT: [[SWITCH_SCALAR39:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 9 +; CHECK-NEXT: [[SWITCH_SCALAR40:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 10 +; CHECK-NEXT: [[SWITCH_SCALAR41:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 11 +; CHECK-NEXT: [[SWITCH_SCALAR42:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 12 +; CHECK-NEXT: [[SWITCH_SCALAR43:%.*]] = 
extractelement <16 x i1> [[SWITCH]], i32 13 +; CHECK-NEXT: [[SWITCH_SCALAR44:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 14 +; CHECK-NEXT: [[SWITCH_SCALAR45:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 15 +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <16 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR16:%.*]] = extractelement <16 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR17:%.*]] = extractelement <16 x i32> [[SRC2]], i32 2 +; CHECK-NEXT: [[SRC2_SCALAR18:%.*]] = extractelement <16 x i32> [[SRC2]], i32 3 +; CHECK-NEXT: [[SRC2_SCALAR19:%.*]] = extractelement <16 x i32> [[SRC2]], i32 4 +; CHECK-NEXT: [[SRC2_SCALAR20:%.*]] = extractelement <16 x i32> [[SRC2]], i32 5 +; CHECK-NEXT: [[SRC2_SCALAR21:%.*]] = extractelement <16 x i32> [[SRC2]], i32 6 +; CHECK-NEXT: [[SRC2_SCALAR22:%.*]] = extractelement <16 x i32> [[SRC2]], i32 7 +; CHECK-NEXT: [[SRC2_SCALAR23:%.*]] = extractelement <16 x i32> [[SRC2]], i32 8 +; CHECK-NEXT: [[SRC2_SCALAR24:%.*]] = extractelement <16 x i32> [[SRC2]], i32 9 +; CHECK-NEXT: [[SRC2_SCALAR25:%.*]] = extractelement <16 x i32> [[SRC2]], i32 10 +; CHECK-NEXT: [[SRC2_SCALAR26:%.*]] = extractelement <16 x i32> [[SRC2]], i32 11 +; CHECK-NEXT: [[SRC2_SCALAR27:%.*]] = extractelement <16 x i32> [[SRC2]], i32 12 +; CHECK-NEXT: [[SRC2_SCALAR28:%.*]] = extractelement <16 x i32> [[SRC2]], i32 13 +; CHECK-NEXT: [[SRC2_SCALAR29:%.*]] = extractelement <16 x i32> [[SRC2]], i32 14 +; CHECK-NEXT: [[SRC2_SCALAR30:%.*]] = extractelement <16 x i32> [[SRC2]], i32 15 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <16 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <16 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR2:%.*]] = extractelement <16 x i32> [[SRC1]], i32 2 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <16 x i32> [[SRC1]], i32 3 +; CHECK-NEXT: [[SRC1_SCALAR4:%.*]] = extractelement <16 x i32> [[SRC1]], i32 4 +; CHECK-NEXT: [[SRC1_SCALAR5:%.*]] = extractelement <16 x i32> [[SRC1]], i32 5 +; 
CHECK-NEXT: [[SRC1_SCALAR6:%.*]] = extractelement <16 x i32> [[SRC1]], i32 6 +; CHECK-NEXT: [[SRC1_SCALAR7:%.*]] = extractelement <16 x i32> [[SRC1]], i32 7 +; CHECK-NEXT: [[SRC1_SCALAR8:%.*]] = extractelement <16 x i32> [[SRC1]], i32 8 +; CHECK-NEXT: [[SRC1_SCALAR9:%.*]] = extractelement <16 x i32> [[SRC1]], i32 9 +; CHECK-NEXT: [[SRC1_SCALAR10:%.*]] = extractelement <16 x i32> [[SRC1]], i32 10 +; CHECK-NEXT: [[SRC1_SCALAR11:%.*]] = extractelement <16 x i32> [[SRC1]], i32 11 +; CHECK-NEXT: [[SRC1_SCALAR12:%.*]] = extractelement <16 x i32> [[SRC1]], i32 12 +; CHECK-NEXT: [[SRC1_SCALAR13:%.*]] = extractelement <16 x i32> [[SRC1]], i32 13 +; CHECK-NEXT: [[SRC1_SCALAR14:%.*]] = extractelement <16 x i32> [[SRC1]], i32 14 +; CHECK-NEXT: [[SRC1_SCALAR15:%.*]] = extractelement <16 x i32> [[SRC1]], i32 15 +; CHECK-NEXT: [[RESULT46:%.*]] = select i1 [[SWITCH_SCALAR]], i32 [[SRC1_SCALAR]], i32 [[SRC2_SCALAR]] +; CHECK-NEXT: [[RESULT47:%.*]] = select i1 [[SWITCH_SCALAR31]], i32 [[SRC1_SCALAR1]], i32 [[SRC2_SCALAR16]] +; CHECK-NEXT: [[RESULT48:%.*]] = select i1 [[SWITCH_SCALAR32]], i32 [[SRC1_SCALAR2]], i32 [[SRC2_SCALAR17]] +; CHECK-NEXT: [[RESULT49:%.*]] = select i1 [[SWITCH_SCALAR33]], i32 [[SRC1_SCALAR3]], i32 [[SRC2_SCALAR18]] +; CHECK-NEXT: [[RESULT50:%.*]] = select i1 [[SWITCH_SCALAR34]], i32 [[SRC1_SCALAR4]], i32 [[SRC2_SCALAR19]] +; CHECK-NEXT: [[RESULT51:%.*]] = select i1 [[SWITCH_SCALAR35]], i32 [[SRC1_SCALAR5]], i32 [[SRC2_SCALAR20]] +; CHECK-NEXT: [[RESULT52:%.*]] = select i1 [[SWITCH_SCALAR36]], i32 [[SRC1_SCALAR6]], i32 [[SRC2_SCALAR21]] +; CHECK-NEXT: [[RESULT53:%.*]] = select i1 [[SWITCH_SCALAR37]], i32 [[SRC1_SCALAR7]], i32 [[SRC2_SCALAR22]] +; CHECK-NEXT: [[RESULT54:%.*]] = select i1 [[SWITCH_SCALAR38]], i32 [[SRC1_SCALAR8]], i32 [[SRC2_SCALAR23]] +; CHECK-NEXT: [[RESULT55:%.*]] = select i1 [[SWITCH_SCALAR39]], i32 [[SRC1_SCALAR9]], i32 [[SRC2_SCALAR24]] +; CHECK-NEXT: [[RESULT56:%.*]] = select i1 [[SWITCH_SCALAR40]], i32 [[SRC1_SCALAR10]], i32 
[[SRC2_SCALAR25]] +; CHECK-NEXT: [[RESULT57:%.*]] = select i1 [[SWITCH_SCALAR41]], i32 [[SRC1_SCALAR11]], i32 [[SRC2_SCALAR26]] +; CHECK-NEXT: [[RESULT58:%.*]] = select i1 [[SWITCH_SCALAR42]], i32 [[SRC1_SCALAR12]], i32 [[SRC2_SCALAR27]] +; CHECK-NEXT: [[RESULT59:%.*]] = select i1 [[SWITCH_SCALAR43]], i32 [[SRC1_SCALAR13]], i32 [[SRC2_SCALAR28]] +; CHECK-NEXT: [[RESULT60:%.*]] = select i1 [[SWITCH_SCALAR44]], i32 [[SRC1_SCALAR14]], i32 [[SRC2_SCALAR29]] +; CHECK-NEXT: [[RESULT61:%.*]] = select i1 [[SWITCH_SCALAR45]], i32 [[SRC1_SCALAR15]], i32 [[SRC2_SCALAR30]] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <16 x i32> undef, i32 [[RESULT46]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT62:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT47]], i32 1 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT63:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT62]], i32 [[RESULT48]], i32 2 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT64:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT63]], i32 [[RESULT49]], i32 3 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT65:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT64]], i32 [[RESULT50]], i32 4 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT66:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT65]], i32 [[RESULT51]], i32 5 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT67:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT66]], i32 [[RESULT52]], i32 6 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT68:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT67]], i32 [[RESULT53]], i32 7 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT69:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT68]], i32 [[RESULT54]], i32 8 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT70:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT69]], i32 [[RESULT55]], i32 9 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT71:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT70]], i32 [[RESULT56]], i32 10 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT72:%.*]] = 
insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT71]], i32 [[RESULT57]], i32 11 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT73:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT72]], i32 [[RESULT58]], i32 12 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT74:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT73]], i32 [[RESULT59]], i32 13 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT75:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT74]], i32 [[RESULT60]], i32 14 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT76:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT75]], i32 [[RESULT61]], i32 15 +; CHECK-NEXT: ret <16 x i32> [[RESULT_ASSEMBLED_VECT76]] +; + %result = select <16 x i1> %switch, <16 x i32> %src1, <16 x i32> %src2 + ret <16 x i32> %result +} + +define <2 x i32> @should_work_with_constant_value(<2 x i1> %switch, <2 x i32> %src1) { +; CHECK-LABEL: define <2 x i32> @should_work_with_constant_value( +; CHECK-SAME: <2 x i1> [[SWITCH:%.*]], <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SWITCH_SCALAR:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 0 +; CHECK-NEXT: [[SWITCH_SCALAR2:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[RESULT3:%.*]] = select i1 [[SWITCH_SCALAR]], i32 [[SRC1_SCALAR]], i32 2 +; CHECK-NEXT: [[RESULT4:%.*]] = select i1 [[SWITCH_SCALAR2]], i32 [[SRC1_SCALAR1]], i32 4 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[RESULT3]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT5:%.*]] = insertelement <2 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT4]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[RESULT_ASSEMBLED_VECT5]] +; + %result = select <2 x i1> %switch, <2 x i32> %src1, <2 x i32> <i32 2, i32 4> + ret <2 x i32> %result +} + +define <2 x i32> @should_work_with_non_vector_condition(i1 %switch, <2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define <2 x i32> 
@should_work_with_non_vector_condition( +; CHECK-SAME: i1 [[SWITCH:%.*]], <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[RESULT3:%.*]] = select i1 [[SWITCH]], i32 [[SRC1_SCALAR]], i32 [[SRC2_SCALAR]] +; CHECK-NEXT: [[RESULT4:%.*]] = select i1 [[SWITCH]], i32 [[SRC1_SCALAR1]], i32 [[SRC2_SCALAR2]] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[RESULT3]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT5:%.*]] = insertelement <2 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT4]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[RESULT_ASSEMBLED_VECT5]] +; + %result = select i1 %switch, <2 x i32> %src1, <2 x i32> %src2 + ret <2 x i32> %result +} + +define <2 x i32> @should_not_select_from_the_same_value(<2 x i1> %switch, <2 x i32> %src1) { +; CHECK-LABEL: define <2 x i32> @should_not_select_from_the_same_value( +; CHECK-SAME: <2 x i1> [[SWITCH:%.*]], <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SWITCH_SCALAR:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 0 +; CHECK-NEXT: [[SWITCH_SCALAR2:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[SRC1_SCALAR]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT3:%.*]] = insertelement <2 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[SRC1_SCALAR1]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[RESULT_ASSEMBLED_VECT3]] +; + %result = select <2 x i1> %switch, <2 x i32> %src1, <2 x i32> %src1 + ret <2 x i32> %result +} + +define i32 @should_not_scalarize_scalar(i1 
%switch, i32 %src1, i32 %src2) { +; CHECK-LABEL: @should_not_scalarize_scalar( +; CHECK-NEXT: [[RESULT:%.*]] = select i1 [[SWITCH:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]] +; CHECK-NEXT: ret i32 [[RESULT]] +; + %result = select i1 %switch, i32 %src1, i32 %src2 + ret i32 %result +} diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-select-instruction.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-select-instruction.ll new file mode 100644 index 000000000000..0e44c872d8ab --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-select-instruction.ll @@ -0,0 +1,223 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize --opaque-pointers -S < %s | FileCheck %s +; REQUIRES: llvm-14-plus +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define <2 x i32> @basic(<2 x i1> %switch, <2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define <2 x i32> @basic( +; CHECK-SAME: <2 x i1> [[SWITCH:%.*]], <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SWITCH_SCALAR:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 0 +; CHECK-NEXT: [[SWITCH_SCALAR3:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: 
[[RESULT4:%.*]] = select i1 [[SWITCH_SCALAR]], i32 [[SRC1_SCALAR]], i32 [[SRC2_SCALAR]] +; CHECK-NEXT: [[RESULT5:%.*]] = select i1 [[SWITCH_SCALAR3]], i32 [[SRC1_SCALAR1]], i32 [[SRC2_SCALAR2]] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[RESULT4]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT6:%.*]] = insertelement <2 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT5]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[RESULT_ASSEMBLED_VECT6]] +; + %result = select <2 x i1> %switch, <2 x i32> %src1, <2 x i32> %src2 + ret <2 x i32> %result +} + +define <2 x float> @should_work_with_different_value_type(<2 x i1> %switch, <2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define <2 x float> @should_work_with_different_value_type( +; CHECK-SAME: <2 x i1> [[SWITCH:%.*]], <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SWITCH_SCALAR:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 0 +; CHECK-NEXT: [[SWITCH_SCALAR3:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[RESULT4:%.*]] = select i1 [[SWITCH_SCALAR]], float [[SRC1_SCALAR]], float [[SRC2_SCALAR]] +; CHECK-NEXT: [[RESULT5:%.*]] = select i1 [[SWITCH_SCALAR3]], float [[SRC1_SCALAR1]], float [[SRC2_SCALAR2]] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[RESULT4]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT6:%.*]] = insertelement <2 x float> [[RESULT_ASSEMBLED_VECT]], float [[RESULT5]], i32 1 +; CHECK-NEXT: ret <2 x float> [[RESULT_ASSEMBLED_VECT6]] +; + %result = select <2 x i1> %switch, <2 x float> %src1, <2 x float> %src2 + ret <2 x float> %result +} + +; triangulating with 
@should_work_with_different_value_type +; update checks if fast will be preserved +define <2 x float> @should_work_with_fast_math_flags(<2 x i1> %switch, <2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define <2 x float> @should_work_with_fast_math_flags( +; CHECK-SAME: <2 x i1> [[SWITCH:%.*]], <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SWITCH_SCALAR:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 0 +; CHECK-NEXT: [[SWITCH_SCALAR3:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[RESULT4:%.*]] = select i1 [[SWITCH_SCALAR]], float [[SRC1_SCALAR]], float [[SRC2_SCALAR]] +; CHECK-NEXT: [[RESULT5:%.*]] = select i1 [[SWITCH_SCALAR3]], float [[SRC1_SCALAR1]], float [[SRC2_SCALAR2]] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[RESULT4]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT6:%.*]] = insertelement <2 x float> [[RESULT_ASSEMBLED_VECT]], float [[RESULT5]], i32 1 +; CHECK-NEXT: ret <2 x float> [[RESULT_ASSEMBLED_VECT6]] +; + %result = select fast <2 x i1> %switch, <2 x float> %src1, <2 x float> %src2 + ret <2 x float> %result +} + +define <16 x i32> @should_work_with_larger_vector_size(<16 x i1> %switch, <16 x i32> %src1, <16 x i32> %src2) { +; CHECK-LABEL: define <16 x i32> @should_work_with_larger_vector_size( +; CHECK-SAME: <16 x i1> [[SWITCH:%.*]], <16 x i32> [[SRC1:%.*]], <16 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SWITCH_SCALAR:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 0 +; CHECK-NEXT: [[SWITCH_SCALAR31:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 1 +; CHECK-NEXT: [[SWITCH_SCALAR32:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 2 +; 
CHECK-NEXT: [[SWITCH_SCALAR33:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 3 +; CHECK-NEXT: [[SWITCH_SCALAR34:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 4 +; CHECK-NEXT: [[SWITCH_SCALAR35:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 5 +; CHECK-NEXT: [[SWITCH_SCALAR36:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 6 +; CHECK-NEXT: [[SWITCH_SCALAR37:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 7 +; CHECK-NEXT: [[SWITCH_SCALAR38:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 8 +; CHECK-NEXT: [[SWITCH_SCALAR39:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 9 +; CHECK-NEXT: [[SWITCH_SCALAR40:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 10 +; CHECK-NEXT: [[SWITCH_SCALAR41:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 11 +; CHECK-NEXT: [[SWITCH_SCALAR42:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 12 +; CHECK-NEXT: [[SWITCH_SCALAR43:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 13 +; CHECK-NEXT: [[SWITCH_SCALAR44:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 14 +; CHECK-NEXT: [[SWITCH_SCALAR45:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 15 +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <16 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR16:%.*]] = extractelement <16 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR17:%.*]] = extractelement <16 x i32> [[SRC2]], i32 2 +; CHECK-NEXT: [[SRC2_SCALAR18:%.*]] = extractelement <16 x i32> [[SRC2]], i32 3 +; CHECK-NEXT: [[SRC2_SCALAR19:%.*]] = extractelement <16 x i32> [[SRC2]], i32 4 +; CHECK-NEXT: [[SRC2_SCALAR20:%.*]] = extractelement <16 x i32> [[SRC2]], i32 5 +; CHECK-NEXT: [[SRC2_SCALAR21:%.*]] = extractelement <16 x i32> [[SRC2]], i32 6 +; CHECK-NEXT: [[SRC2_SCALAR22:%.*]] = extractelement <16 x i32> [[SRC2]], i32 7 +; CHECK-NEXT: [[SRC2_SCALAR23:%.*]] = extractelement <16 x i32> [[SRC2]], i32 8 +; CHECK-NEXT: [[SRC2_SCALAR24:%.*]] = extractelement <16 x i32> [[SRC2]], i32 9 +; CHECK-NEXT: [[SRC2_SCALAR25:%.*]] = extractelement <16 x i32> [[SRC2]], i32 10 +; CHECK-NEXT: 
[[SRC2_SCALAR26:%.*]] = extractelement <16 x i32> [[SRC2]], i32 11 +; CHECK-NEXT: [[SRC2_SCALAR27:%.*]] = extractelement <16 x i32> [[SRC2]], i32 12 +; CHECK-NEXT: [[SRC2_SCALAR28:%.*]] = extractelement <16 x i32> [[SRC2]], i32 13 +; CHECK-NEXT: [[SRC2_SCALAR29:%.*]] = extractelement <16 x i32> [[SRC2]], i32 14 +; CHECK-NEXT: [[SRC2_SCALAR30:%.*]] = extractelement <16 x i32> [[SRC2]], i32 15 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <16 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <16 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR2:%.*]] = extractelement <16 x i32> [[SRC1]], i32 2 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <16 x i32> [[SRC1]], i32 3 +; CHECK-NEXT: [[SRC1_SCALAR4:%.*]] = extractelement <16 x i32> [[SRC1]], i32 4 +; CHECK-NEXT: [[SRC1_SCALAR5:%.*]] = extractelement <16 x i32> [[SRC1]], i32 5 +; CHECK-NEXT: [[SRC1_SCALAR6:%.*]] = extractelement <16 x i32> [[SRC1]], i32 6 +; CHECK-NEXT: [[SRC1_SCALAR7:%.*]] = extractelement <16 x i32> [[SRC1]], i32 7 +; CHECK-NEXT: [[SRC1_SCALAR8:%.*]] = extractelement <16 x i32> [[SRC1]], i32 8 +; CHECK-NEXT: [[SRC1_SCALAR9:%.*]] = extractelement <16 x i32> [[SRC1]], i32 9 +; CHECK-NEXT: [[SRC1_SCALAR10:%.*]] = extractelement <16 x i32> [[SRC1]], i32 10 +; CHECK-NEXT: [[SRC1_SCALAR11:%.*]] = extractelement <16 x i32> [[SRC1]], i32 11 +; CHECK-NEXT: [[SRC1_SCALAR12:%.*]] = extractelement <16 x i32> [[SRC1]], i32 12 +; CHECK-NEXT: [[SRC1_SCALAR13:%.*]] = extractelement <16 x i32> [[SRC1]], i32 13 +; CHECK-NEXT: [[SRC1_SCALAR14:%.*]] = extractelement <16 x i32> [[SRC1]], i32 14 +; CHECK-NEXT: [[SRC1_SCALAR15:%.*]] = extractelement <16 x i32> [[SRC1]], i32 15 +; CHECK-NEXT: [[RESULT46:%.*]] = select i1 [[SWITCH_SCALAR]], i32 [[SRC1_SCALAR]], i32 [[SRC2_SCALAR]] +; CHECK-NEXT: [[RESULT47:%.*]] = select i1 [[SWITCH_SCALAR31]], i32 [[SRC1_SCALAR1]], i32 [[SRC2_SCALAR16]] +; CHECK-NEXT: [[RESULT48:%.*]] = select i1 [[SWITCH_SCALAR32]], i32 [[SRC1_SCALAR2]], i32 
[[SRC2_SCALAR17]] +; CHECK-NEXT: [[RESULT49:%.*]] = select i1 [[SWITCH_SCALAR33]], i32 [[SRC1_SCALAR3]], i32 [[SRC2_SCALAR18]] +; CHECK-NEXT: [[RESULT50:%.*]] = select i1 [[SWITCH_SCALAR34]], i32 [[SRC1_SCALAR4]], i32 [[SRC2_SCALAR19]] +; CHECK-NEXT: [[RESULT51:%.*]] = select i1 [[SWITCH_SCALAR35]], i32 [[SRC1_SCALAR5]], i32 [[SRC2_SCALAR20]] +; CHECK-NEXT: [[RESULT52:%.*]] = select i1 [[SWITCH_SCALAR36]], i32 [[SRC1_SCALAR6]], i32 [[SRC2_SCALAR21]] +; CHECK-NEXT: [[RESULT53:%.*]] = select i1 [[SWITCH_SCALAR37]], i32 [[SRC1_SCALAR7]], i32 [[SRC2_SCALAR22]] +; CHECK-NEXT: [[RESULT54:%.*]] = select i1 [[SWITCH_SCALAR38]], i32 [[SRC1_SCALAR8]], i32 [[SRC2_SCALAR23]] +; CHECK-NEXT: [[RESULT55:%.*]] = select i1 [[SWITCH_SCALAR39]], i32 [[SRC1_SCALAR9]], i32 [[SRC2_SCALAR24]] +; CHECK-NEXT: [[RESULT56:%.*]] = select i1 [[SWITCH_SCALAR40]], i32 [[SRC1_SCALAR10]], i32 [[SRC2_SCALAR25]] +; CHECK-NEXT: [[RESULT57:%.*]] = select i1 [[SWITCH_SCALAR41]], i32 [[SRC1_SCALAR11]], i32 [[SRC2_SCALAR26]] +; CHECK-NEXT: [[RESULT58:%.*]] = select i1 [[SWITCH_SCALAR42]], i32 [[SRC1_SCALAR12]], i32 [[SRC2_SCALAR27]] +; CHECK-NEXT: [[RESULT59:%.*]] = select i1 [[SWITCH_SCALAR43]], i32 [[SRC1_SCALAR13]], i32 [[SRC2_SCALAR28]] +; CHECK-NEXT: [[RESULT60:%.*]] = select i1 [[SWITCH_SCALAR44]], i32 [[SRC1_SCALAR14]], i32 [[SRC2_SCALAR29]] +; CHECK-NEXT: [[RESULT61:%.*]] = select i1 [[SWITCH_SCALAR45]], i32 [[SRC1_SCALAR15]], i32 [[SRC2_SCALAR30]] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <16 x i32> undef, i32 [[RESULT46]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT62:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT47]], i32 1 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT63:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT62]], i32 [[RESULT48]], i32 2 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT64:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT63]], i32 [[RESULT49]], i32 3 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT65:%.*]] = insertelement <16 x i32> 
[[RESULT_ASSEMBLED_VECT64]], i32 [[RESULT50]], i32 4 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT66:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT65]], i32 [[RESULT51]], i32 5 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT67:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT66]], i32 [[RESULT52]], i32 6 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT68:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT67]], i32 [[RESULT53]], i32 7 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT69:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT68]], i32 [[RESULT54]], i32 8 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT70:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT69]], i32 [[RESULT55]], i32 9 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT71:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT70]], i32 [[RESULT56]], i32 10 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT72:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT71]], i32 [[RESULT57]], i32 11 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT73:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT72]], i32 [[RESULT58]], i32 12 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT74:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT73]], i32 [[RESULT59]], i32 13 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT75:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT74]], i32 [[RESULT60]], i32 14 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT76:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT75]], i32 [[RESULT61]], i32 15 +; CHECK-NEXT: ret <16 x i32> [[RESULT_ASSEMBLED_VECT76]] +; + %result = select <16 x i1> %switch, <16 x i32> %src1, <16 x i32> %src2 + ret <16 x i32> %result +} + +define <2 x i32> @should_work_with_constant_value(<2 x i1> %switch, <2 x i32> %src1) { +; CHECK-LABEL: define <2 x i32> @should_work_with_constant_value( +; CHECK-SAME: <2 x i1> [[SWITCH:%.*]], <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SWITCH_SCALAR:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 0 +; CHECK-NEXT: [[SWITCH_SCALAR2:%.*]] = extractelement <2 x i1> [[SWITCH]], 
i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[RESULT3:%.*]] = select i1 [[SWITCH_SCALAR]], i32 [[SRC1_SCALAR]], i32 2 +; CHECK-NEXT: [[RESULT4:%.*]] = select i1 [[SWITCH_SCALAR2]], i32 [[SRC1_SCALAR1]], i32 4 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[RESULT3]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT5:%.*]] = insertelement <2 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT4]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[RESULT_ASSEMBLED_VECT5]] +; + %result = select <2 x i1> %switch, <2 x i32> %src1, <2 x i32> <i32 2, i32 4> + ret <2 x i32> %result +} + +define <2 x i32> @should_work_with_non_vector_condition(i1 %switch, <2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define <2 x i32> @should_work_with_non_vector_condition( +; CHECK-SAME: i1 [[SWITCH:%.*]], <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[RESULT3:%.*]] = select i1 [[SWITCH]], i32 [[SRC1_SCALAR]], i32 [[SRC2_SCALAR]] +; CHECK-NEXT: [[RESULT4:%.*]] = select i1 [[SWITCH]], i32 [[SRC1_SCALAR1]], i32 [[SRC2_SCALAR2]] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[RESULT3]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT5:%.*]] = insertelement <2 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT4]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[RESULT_ASSEMBLED_VECT5]] +; + %result = select i1 %switch, <2 x i32> %src1, <2 x i32> %src2 + ret <2 x i32> %result +} + +define <2 x i32> @should_not_select_from_the_same_value(<2 x i1> %switch, <2 x i32> %src1) { +; CHECK-LABEL: define <2 x i32> 
@should_not_select_from_the_same_value( +; CHECK-SAME: <2 x i1> [[SWITCH:%.*]], <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SWITCH_SCALAR:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 0 +; CHECK-NEXT: [[SWITCH_SCALAR2:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[SRC1_SCALAR]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT3:%.*]] = insertelement <2 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[SRC1_SCALAR1]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[RESULT_ASSEMBLED_VECT3]] +; + %result = select <2 x i1> %switch, <2 x i32> %src1, <2 x i32> %src1 + ret <2 x i32> %result +} + +define i32 @should_not_scalarize_scalar(i1 %switch, i32 %src1, i32 %src2) { +; CHECK-LABEL: define i32 @should_not_scalarize_scalar( +; CHECK-SAME: i1 [[SWITCH:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = select i1 [[SWITCH]], i32 [[SRC1]], i32 [[SRC2]] +; CHECK-NEXT: ret i32 [[RESULT]] +; + %result = select i1 %switch, i32 %src1, i32 %src2 + ret i32 %result +} diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-unary-instruction-typed-pointers.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-unary-instruction-typed-pointers.ll new file mode 100644 index 000000000000..3b33266b812b --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-unary-instruction-typed-pointers.ll @@ -0,0 +1,187 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize -S < %s | FileCheck %s +; 
------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define spir_kernel void @basic(<2 x float> %src1) { +; CHECK-LABEL: define spir_kernel void @basic( +; CHECK-SAME: <2 x float> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x float>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = fneg float [[SRC1_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = fneg float [[SRC1_SCALAR1]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x float> [[DOTASSEMBLED_VECT]], float [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x float> [[DOTASSEMBLED_VECT2]], <2 x float>* [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x float> + %2 = fneg <2 x float> %src1 + store <2 x float> %2, <2 x float>* %1 + ret void +} + +define spir_kernel void @should_work_with_different_type(<2 x double> %src1) { +; CHECK-LABEL: define spir_kernel void @should_work_with_different_type( +; CHECK-SAME: <2 x double> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x double> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x double> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x double>, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = fneg double [[SRC1_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = fneg double [[SRC1_SCALAR1]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x double> [[DOTASSEMBLED_VECT]], double [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x double> [[DOTASSEMBLED_VECT2]], <2 x double>* 
[[TMP1]], align 16 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x double> + %2 = fneg <2 x double> %src1 + store <2 x double> %2, <2 x double>* %1 + ret void +} + +define spir_kernel void @should_work_with_fast_math_flags(<2 x double> %src1) { +; CHECK-LABEL: @should_work_with_fast_math_flags( +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x double> [[SRC1:%.*]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x double> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x double>, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = fneg fast double [[SRC1_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = fneg fast double [[SRC1_SCALAR1]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x double> [[DOTASSEMBLED_VECT]], double [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x double> [[DOTASSEMBLED_VECT2]], <2 x double>* [[TMP1]], align 16 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x double> + %2 = fneg fast <2 x double> %src1 + store <2 x double> %2, <2 x double>* %1 + ret void +} + +define spir_kernel void @should_work_with_larger_vector_size(<16 x float> %src1) { +; CHECK-LABEL: define spir_kernel void @should_work_with_larger_vector_size( +; CHECK-SAME: <16 x float> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <16 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <16 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR2:%.*]] = extractelement <16 x float> [[SRC1]], i32 2 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <16 x float> [[SRC1]], i32 3 +; CHECK-NEXT: [[SRC1_SCALAR4:%.*]] = extractelement <16 x float> [[SRC1]], i32 4 +; CHECK-NEXT: [[SRC1_SCALAR5:%.*]] = extractelement <16 x float> [[SRC1]], i32 5 +; CHECK-NEXT: [[SRC1_SCALAR6:%.*]] = extractelement <16 x float> [[SRC1]], i32 6 +; CHECK-NEXT: [[SRC1_SCALAR7:%.*]] = extractelement <16 x float> [[SRC1]], i32 7 +; CHECK-NEXT: 
[[SRC1_SCALAR8:%.*]] = extractelement <16 x float> [[SRC1]], i32 8 +; CHECK-NEXT: [[SRC1_SCALAR9:%.*]] = extractelement <16 x float> [[SRC1]], i32 9 +; CHECK-NEXT: [[SRC1_SCALAR10:%.*]] = extractelement <16 x float> [[SRC1]], i32 10 +; CHECK-NEXT: [[SRC1_SCALAR11:%.*]] = extractelement <16 x float> [[SRC1]], i32 11 +; CHECK-NEXT: [[SRC1_SCALAR12:%.*]] = extractelement <16 x float> [[SRC1]], i32 12 +; CHECK-NEXT: [[SRC1_SCALAR13:%.*]] = extractelement <16 x float> [[SRC1]], i32 13 +; CHECK-NEXT: [[SRC1_SCALAR14:%.*]] = extractelement <16 x float> [[SRC1]], i32 14 +; CHECK-NEXT: [[SRC1_SCALAR15:%.*]] = extractelement <16 x float> [[SRC1]], i32 15 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <16 x float>, align 64 +; CHECK-NEXT: [[TMP2:%.*]] = fneg float [[SRC1_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = fneg float [[SRC1_SCALAR1]] +; CHECK-NEXT: [[TMP4:%.*]] = fneg float [[SRC1_SCALAR2]] +; CHECK-NEXT: [[TMP5:%.*]] = fneg float [[SRC1_SCALAR3]] +; CHECK-NEXT: [[TMP6:%.*]] = fneg float [[SRC1_SCALAR4]] +; CHECK-NEXT: [[TMP7:%.*]] = fneg float [[SRC1_SCALAR5]] +; CHECK-NEXT: [[TMP8:%.*]] = fneg float [[SRC1_SCALAR6]] +; CHECK-NEXT: [[TMP9:%.*]] = fneg float [[SRC1_SCALAR7]] +; CHECK-NEXT: [[TMP10:%.*]] = fneg float [[SRC1_SCALAR8]] +; CHECK-NEXT: [[TMP11:%.*]] = fneg float [[SRC1_SCALAR9]] +; CHECK-NEXT: [[TMP12:%.*]] = fneg float [[SRC1_SCALAR10]] +; CHECK-NEXT: [[TMP13:%.*]] = fneg float [[SRC1_SCALAR11]] +; CHECK-NEXT: [[TMP14:%.*]] = fneg float [[SRC1_SCALAR12]] +; CHECK-NEXT: [[TMP15:%.*]] = fneg float [[SRC1_SCALAR13]] +; CHECK-NEXT: [[TMP16:%.*]] = fneg float [[SRC1_SCALAR14]] +; CHECK-NEXT: [[TMP17:%.*]] = fneg float [[SRC1_SCALAR15]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT16:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT]], float [[TMP3]], i32 1 +; CHECK-NEXT: [[DOTASSEMBLED_VECT17:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT16]], float [[TMP4]], i32 2 +; 
CHECK-NEXT: [[DOTASSEMBLED_VECT18:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT17]], float [[TMP5]], i32 3 +; CHECK-NEXT: [[DOTASSEMBLED_VECT19:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT18]], float [[TMP6]], i32 4 +; CHECK-NEXT: [[DOTASSEMBLED_VECT20:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT19]], float [[TMP7]], i32 5 +; CHECK-NEXT: [[DOTASSEMBLED_VECT21:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT20]], float [[TMP8]], i32 6 +; CHECK-NEXT: [[DOTASSEMBLED_VECT22:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT21]], float [[TMP9]], i32 7 +; CHECK-NEXT: [[DOTASSEMBLED_VECT23:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT22]], float [[TMP10]], i32 8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT24:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT23]], float [[TMP11]], i32 9 +; CHECK-NEXT: [[DOTASSEMBLED_VECT25:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT24]], float [[TMP12]], i32 10 +; CHECK-NEXT: [[DOTASSEMBLED_VECT26:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT25]], float [[TMP13]], i32 11 +; CHECK-NEXT: [[DOTASSEMBLED_VECT27:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT26]], float [[TMP14]], i32 12 +; CHECK-NEXT: [[DOTASSEMBLED_VECT28:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT27]], float [[TMP15]], i32 13 +; CHECK-NEXT: [[DOTASSEMBLED_VECT29:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT28]], float [[TMP16]], i32 14 +; CHECK-NEXT: [[DOTASSEMBLED_VECT30:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT29]], float [[TMP17]], i32 15 +; CHECK-NEXT: store <16 x float> [[DOTASSEMBLED_VECT30]], <16 x float>* [[TMP1]], align 64 +; CHECK-NEXT: ret void +; + %1 = alloca <16 x float> + %2 = fneg <16 x float> %src1 + store <16 x float> %2, <16 x float>* %1 + ret void +} + +define void @should_not_scalarize_optnone(<2 x float> %src1) #0 { +; CHECK-LABEL: @should_not_scalarize_optnone( +; CHECK: fneg <2 x float> %src1 + %1 = alloca <2 x float> + %2 = fneg <2 x float> %src1 + 
store <2 x float> %2, <2 x float>* %1 + ret void +} + +define <2 x float> @should_not_scalarize_const(<2 x float> %src1) { +; CHECK-LABEL: @should_not_scalarize_const( +; CHECK-NEXT: [[TMP1:%.*]] = fneg <2 x float> +; CHECK-NEXT: ret <2 x float> [[TMP1]] +; + %1 = fneg <2 x float> <float 1.000000e+00, float 2.000000e+00> + ret <2 x float> %1 +} + +define spir_kernel void @should_not_scalarize_scalar(float %src1) { +; CHECK-LABEL: @should_not_scalarize_scalar( +; CHECK-NEXT: [[TMP1:%.*]] = alloca float, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fneg float [[SRC1:%.*]] +; CHECK-NEXT: store float [[TMP2]], float* [[TMP1]], align 4 +; CHECK-NEXT: ret void +; + %1 = alloca float + %2 = fneg float %src1 + store float %2, float* %1 + ret void +} + +define void @test_fneg_optnone(<4 x float> %src, <3 x float> addrspace(1)* %out) #0 { + +; CHECK-LABEL: @test_fneg_optnone( +; +; CHECK: [[EE0:%.*]] = extractelement <4 x float> %src, i32 0 +; CHECK: [[EE1:%.*]] = extractelement <4 x float> %src, i32 1 +; CHECK: [[EE2:%.*]] = extractelement <4 x float> %src, i32 2 +; CHECK: [[EE3:%.*]] = extractelement <4 x float> %src, i32 3 +; CHECK: [[IE0:%.*]] = insertelement <3 x float> undef, float [[EE0]], i32 0 +; CHECK: [[IE1:%.*]] = insertelement <3 x float> [[IE0]], float [[EE1]], i32 1 +; CHECK: [[IE2:%.*]] = insertelement <3 x float> [[IE1]], float [[EE2]], i32 2 +; CHECK: [[FNEG:%.*]] = fneg <3 x float> [[IE2]] +; CHECK: store <3 x float> [[FNEG]], <3 x float> addrspace(1)* %out, align 4 + +; CHECK-NOT: fneg <3 x float> undef + + %1 = shufflevector <4 x float> %src, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> + %2 = fneg <3 x float> %1 + store <3 x float> %2, <3 x float> addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { noinline optnone } \ No newline at end of file diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-unary-instruction.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-unary-instruction.ll new file mode 100644 index 000000000000..c8b0720820e6 --- /dev/null +++ 
b/IGC/Compiler/tests/ScalarizeFunction/scalarize-unary-instruction.ll @@ -0,0 +1,194 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize --opaque-pointers -S < %s | FileCheck %s +; REQUIRES: llvm-14-plus +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define spir_kernel void @basic(<2 x float> %src1) { +; CHECK-LABEL: define spir_kernel void @basic( +; CHECK-SAME: <2 x float> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x float>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = fneg float [[SRC1_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = fneg float [[SRC1_SCALAR1]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x float> [[DOTASSEMBLED_VECT]], float [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x float> [[DOTASSEMBLED_VECT2]], ptr [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x float> + %2 = fneg <2 x float> %src1 + store <2 x float> %2, ptr %1 + ret void +} + +define spir_kernel void @should_work_with_different_type(<2 x double> %src1) { +; CHECK-LABEL: define spir_kernel void @should_work_with_different_type( +; CHECK-SAME: <2 x double> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x double> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = 
extractelement <2 x double> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x double>, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = fneg double [[SRC1_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = fneg double [[SRC1_SCALAR1]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x double> [[DOTASSEMBLED_VECT]], double [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x double> [[DOTASSEMBLED_VECT2]], ptr [[TMP1]], align 16 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x double> + %2 = fneg <2 x double> %src1 + store <2 x double> %2, ptr %1 + ret void +} + +define spir_kernel void @should_work_with_fast_math_flags(<2 x double> %src1) { +; CHECK-LABEL: define spir_kernel void @should_work_with_fast_math_flags( +; CHECK-SAME: <2 x double> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x double> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x double> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x double>, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = fneg fast double [[SRC1_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = fneg fast double [[SRC1_SCALAR1]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x double> [[DOTASSEMBLED_VECT]], double [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x double> [[DOTASSEMBLED_VECT2]], ptr [[TMP1]], align 16 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x double> + %2 = fneg fast <2 x double> %src1 + store <2 x double> %2, ptr %1 + ret void +} + +define spir_kernel void @should_work_with_larger_vector_size(<16 x float> %src1) { +; CHECK-LABEL: define spir_kernel void @should_work_with_larger_vector_size( +; CHECK-SAME: <16 x float> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <16 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <16 x float> 
[[SRC1]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR2:%.*]] = extractelement <16 x float> [[SRC1]], i32 2 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <16 x float> [[SRC1]], i32 3 +; CHECK-NEXT: [[SRC1_SCALAR4:%.*]] = extractelement <16 x float> [[SRC1]], i32 4 +; CHECK-NEXT: [[SRC1_SCALAR5:%.*]] = extractelement <16 x float> [[SRC1]], i32 5 +; CHECK-NEXT: [[SRC1_SCALAR6:%.*]] = extractelement <16 x float> [[SRC1]], i32 6 +; CHECK-NEXT: [[SRC1_SCALAR7:%.*]] = extractelement <16 x float> [[SRC1]], i32 7 +; CHECK-NEXT: [[SRC1_SCALAR8:%.*]] = extractelement <16 x float> [[SRC1]], i32 8 +; CHECK-NEXT: [[SRC1_SCALAR9:%.*]] = extractelement <16 x float> [[SRC1]], i32 9 +; CHECK-NEXT: [[SRC1_SCALAR10:%.*]] = extractelement <16 x float> [[SRC1]], i32 10 +; CHECK-NEXT: [[SRC1_SCALAR11:%.*]] = extractelement <16 x float> [[SRC1]], i32 11 +; CHECK-NEXT: [[SRC1_SCALAR12:%.*]] = extractelement <16 x float> [[SRC1]], i32 12 +; CHECK-NEXT: [[SRC1_SCALAR13:%.*]] = extractelement <16 x float> [[SRC1]], i32 13 +; CHECK-NEXT: [[SRC1_SCALAR14:%.*]] = extractelement <16 x float> [[SRC1]], i32 14 +; CHECK-NEXT: [[SRC1_SCALAR15:%.*]] = extractelement <16 x float> [[SRC1]], i32 15 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <16 x float>, align 64 +; CHECK-NEXT: [[TMP2:%.*]] = fneg float [[SRC1_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = fneg float [[SRC1_SCALAR1]] +; CHECK-NEXT: [[TMP4:%.*]] = fneg float [[SRC1_SCALAR2]] +; CHECK-NEXT: [[TMP5:%.*]] = fneg float [[SRC1_SCALAR3]] +; CHECK-NEXT: [[TMP6:%.*]] = fneg float [[SRC1_SCALAR4]] +; CHECK-NEXT: [[TMP7:%.*]] = fneg float [[SRC1_SCALAR5]] +; CHECK-NEXT: [[TMP8:%.*]] = fneg float [[SRC1_SCALAR6]] +; CHECK-NEXT: [[TMP9:%.*]] = fneg float [[SRC1_SCALAR7]] +; CHECK-NEXT: [[TMP10:%.*]] = fneg float [[SRC1_SCALAR8]] +; CHECK-NEXT: [[TMP11:%.*]] = fneg float [[SRC1_SCALAR9]] +; CHECK-NEXT: [[TMP12:%.*]] = fneg float [[SRC1_SCALAR10]] +; CHECK-NEXT: [[TMP13:%.*]] = fneg float [[SRC1_SCALAR11]] +; CHECK-NEXT: [[TMP14:%.*]] = fneg float [[SRC1_SCALAR12]] +; 
CHECK-NEXT: [[TMP15:%.*]] = fneg float [[SRC1_SCALAR13]] +; CHECK-NEXT: [[TMP16:%.*]] = fneg float [[SRC1_SCALAR14]] +; CHECK-NEXT: [[TMP17:%.*]] = fneg float [[SRC1_SCALAR15]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT16:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT]], float [[TMP3]], i32 1 +; CHECK-NEXT: [[DOTASSEMBLED_VECT17:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT16]], float [[TMP4]], i32 2 +; CHECK-NEXT: [[DOTASSEMBLED_VECT18:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT17]], float [[TMP5]], i32 3 +; CHECK-NEXT: [[DOTASSEMBLED_VECT19:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT18]], float [[TMP6]], i32 4 +; CHECK-NEXT: [[DOTASSEMBLED_VECT20:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT19]], float [[TMP7]], i32 5 +; CHECK-NEXT: [[DOTASSEMBLED_VECT21:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT20]], float [[TMP8]], i32 6 +; CHECK-NEXT: [[DOTASSEMBLED_VECT22:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT21]], float [[TMP9]], i32 7 +; CHECK-NEXT: [[DOTASSEMBLED_VECT23:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT22]], float [[TMP10]], i32 8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT24:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT23]], float [[TMP11]], i32 9 +; CHECK-NEXT: [[DOTASSEMBLED_VECT25:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT24]], float [[TMP12]], i32 10 +; CHECK-NEXT: [[DOTASSEMBLED_VECT26:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT25]], float [[TMP13]], i32 11 +; CHECK-NEXT: [[DOTASSEMBLED_VECT27:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT26]], float [[TMP14]], i32 12 +; CHECK-NEXT: [[DOTASSEMBLED_VECT28:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT27]], float [[TMP15]], i32 13 +; CHECK-NEXT: [[DOTASSEMBLED_VECT29:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT28]], float [[TMP16]], i32 14 +; CHECK-NEXT: [[DOTASSEMBLED_VECT30:%.*]] = 
insertelement <16 x float> [[DOTASSEMBLED_VECT29]], float [[TMP17]], i32 15 +; CHECK-NEXT: store <16 x float> [[DOTASSEMBLED_VECT30]], ptr [[TMP1]], align 64 +; CHECK-NEXT: ret void +; + %1 = alloca <16 x float> + %2 = fneg <16 x float> %src1 + store <16 x float> %2, ptr %1 + ret void +} + +define void @should_not_scalarize_optnone(<2 x float> %src1) #0 { +; CHECK-LABEL: define void @should_not_scalarize_optnone( +; CHECK-SAME: <2 x float> [[SRC1:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x float>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = fneg <2 x float> [[SRC1]] +; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x float> + %2 = fneg <2 x float> %src1 + store <2 x float> %2, ptr %1 + ret void +} + +define <2 x float> @should_not_scalarize_const(<2 x float> %src1) { +; CHECK-LABEL: define <2 x float> @should_not_scalarize_const( +; CHECK-SAME: <2 x float> [[SRC1:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = fneg <2 x float> +; CHECK-NEXT: ret <2 x float> [[TMP1]] +; + %1 = fneg <2 x float> <float 1.000000e+00, float 2.000000e+00> + ret <2 x float> %1 +} + +define spir_kernel void @should_not_scalarize_scalar(float %src1) { +; CHECK-LABEL: define spir_kernel void @should_not_scalarize_scalar( +; CHECK-SAME: float [[SRC1:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = alloca float, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fneg float [[SRC1]] +; CHECK-NEXT: store float [[TMP2]], ptr [[TMP1]], align 4 +; CHECK-NEXT: ret void +; + %1 = alloca float + %2 = fneg float %src1 + store float %2, ptr %1 + ret void +} + +define void @test_fneg_optnone(<4 x float> %src, ptr addrspace(1) %out) #0 { +; CHECK-LABEL: define void @test_fneg_optnone( +; CHECK-SAME: <4 x float> [[SRC:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SRC_SCALAR:%.*]] = extractelement <4 x float> [[SRC]], i32 0 +; CHECK-NEXT: [[SRC_SCALAR1:%.*]] = extractelement <4 x float> [[SRC]], i32 1 +; CHECK-NEXT: [[SRC_SCALAR2:%.*]] = extractelement <4 x float> [[SRC]], i32 2 +; 
CHECK-NEXT: [[SRC_SCALAR3:%.*]] = extractelement <4 x float> [[SRC]], i32 3 +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <3 x float> undef, float [[SRC_SCALAR]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT4:%.*]] = insertelement <3 x float> [[DOTASSEMBLED_VECT]], float [[SRC_SCALAR1]], i32 1 +; CHECK-NEXT: [[DOTASSEMBLED_VECT5:%.*]] = insertelement <3 x float> [[DOTASSEMBLED_VECT4]], float [[SRC_SCALAR2]], i32 2 +; CHECK-NEXT: [[TMP1:%.*]] = fneg <3 x float> [[DOTASSEMBLED_VECT5]] +; CHECK-NEXT: store <3 x float> [[TMP1]], ptr addrspace(1) [[OUT]], align 4 +; CHECK-NEXT: ret void +; + %1 = shufflevector <4 x float> %src, <4 x float> undef, <3 x i32> + %2 = fneg <3 x float> %1 + store <3 x float> %2, ptr addrspace(1) %out, align 4 + ret void +} + +attributes #0 = { noinline optnone } diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-vector-instructions-typed-pointers.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-vector-instructions-typed-pointers.ll new file mode 100644 index 000000000000..af285857e35f --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-vector-instructions-typed-pointers.ll @@ -0,0 +1,145 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize -S < %s | FileCheck %s +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define i32 @basic(i32 %src) { +; CHECK-LABEL: define i32 @basic( +; CHECK-SAME: i32 [[SRC:%.*]]) { +; CHECK-NEXT: ret i32 [[SRC]] +; + %vector = insertelement <2 x i32> undef, i32 
%src, i32 0 + %result = extractelement <2 x i32> %vector, i32 0 + ret i32 %result +} + +define float @should_work_with_different_value_type(float %src) { +; CHECK-LABEL: define float @should_work_with_different_value_type( +; CHECK-SAME: float [[SRC:%.*]]) { +; CHECK-NEXT: ret float [[SRC]] +; + %vector = insertelement <2 x float> undef, float %src, i32 0 + %result = extractelement <2 x float> %vector, i32 0 + ret float %result +} + +define i32 @should_return_undef(i32 %src) { +; CHECK-LABEL: define i32 @should_return_undef( +; CHECK-SAME: i32 [[SRC:%.*]]) { +; CHECK-NEXT: ret i32 undef +; + %vector = insertelement <2 x i32> undef, i32 %src, i32 0 + %result = extractelement <2 x i32> %vector, i32 1 + ret i32 %result +} + + +define i32 @should_work_with_larger_vector_size() { +; CHECK-LABEL: define i32 @should_work_with_larger_vector_size() { +; CHECK-NEXT: [[RESULT:%.*]] = add i32 0, 8 +; CHECK-NEXT: ret i32 [[RESULT]] +; + %1 = insertelement <16 x i32> undef, i32 0, i32 0 + %2 = insertelement <16 x i32> %1, i32 1, i32 1 + %3 = insertelement <16 x i32> %2, i32 2, i32 2 + %4 = insertelement <16 x i32> %3, i32 3, i32 3 + %5 = insertelement <16 x i32> %4, i32 4, i32 4 + %6 = insertelement <16 x i32> %5, i32 5, i32 5 + %7 = insertelement <16 x i32> %6, i32 6, i32 6 + %8 = insertelement <16 x i32> %7, i32 7, i32 7 + %9 = insertelement <16 x i32> %8, i32 8, i32 8 + %10 = insertelement <16 x i32> %9, i32 9, i32 9 + %11 = insertelement <16 x i32> %10, i32 10, i32 10 + %12 = insertelement <16 x i32> %11, i32 11, i32 11 + %13 = insertelement <16 x i32> %12, i32 12, i32 12 + %14 = insertelement <16 x i32> %13, i32 13, i32 13 + %15 = insertelement <16 x i32> %14, i32 14, i32 14 + %vector = insertelement <16 x i32> %15, i32 15, i32 15 + %first = extractelement <16 x i32> %vector, i32 0 + %second = extractelement <16 x i32> %vector, i32 8 + %result = add i32 %first, %second + ret i32 %result +} + +define i32 @should_work_with_shuffle_instruction() { +; CHECK-LABEL: define i32 
@should_work_with_shuffle_instruction() { +; CHECK-NEXT: [[RETURN:%.*]] = add i32 1, 7 +; CHECK-NEXT: ret i32 [[RETURN]] +; + %1 = insertelement <4 x i32> undef, i32 0, i32 0 + %2 = insertelement <4 x i32> %1, i32 1, i32 1 + %3 = insertelement <4 x i32> %2, i32 2, i32 2 + %first_vector = insertelement <4 x i32> %3, i32 3, i32 3 + + %4 = insertelement <4 x i32> undef, i32 4, i32 0 + %5 = insertelement <4 x i32> %4, i32 5, i32 1 + %6 = insertelement <4 x i32> %5, i32 6, i32 2 + %second_vector = insertelement <4 x i32> %6, i32 7, i32 3 + + %result = shufflevector <4 x i32> %first_vector, <4 x i32> %second_vector, <4 x i32> + + %first = extractelement <4 x i32> %result, i32 0 + %second = extractelement <4 x i32> %result, i32 3 + %return = add i32 %first, %second + ret i32 %return +} + + +define i32 @should_not_scalarize_with_index_as_variable(i32 %data, i32 %index) { +; CHECK-LABEL: define i32 @should_not_scalarize_with_index_as_variable( +; CHECK-SAME: i32 [[DATA:%.*]], i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[VECTOR:%.*]] = insertelement <2 x i32> undef, i32 [[DATA]], i32 [[INDEX]] +; CHECK-NEXT: [[RESULT:%.*]] = extractelement <2 x i32> [[VECTOR]], i32 [[INDEX]] +; CHECK-NEXT: ret i32 [[RESULT]] +; + %vector = insertelement <2 x i32> undef, i32 %data, i32 %index + %result = extractelement <2 x i32> %vector, i32 %index + ret i32 %result +} + +define i32 @should_work_with_shuffle_undef() { +; CHECK-LABEL: define i32 @should_work_with_shuffle_undef() { +; CHECK-NEXT: [[RETURN:%.*]] = add i32 undef, undef +; CHECK-NEXT: ret i32 [[RETURN]] +; + %result = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> + + %first = extractelement <4 x i32> %result, i32 0 + %second = extractelement <4 x i32> %result, i32 3 + %return = add i32 %first, %second + ret i32 %return +} + +define i32 @should_work_with_shuffle_undef_2() { +; CHECK-LABEL: define i32 @should_work_with_shuffle_undef_2() { +; CHECK-NEXT: [[RETURN:%.*]] = add i32 1, undef +; CHECK-NEXT: ret i32 [[RETURN]] +; + %1 
= insertelement <4 x i32> undef, i32 0, i32 0 + %2 = insertelement <4 x i32> %1, i32 1, i32 1 + %3 = insertelement <4 x i32> %2, i32 2, i32 2 + %first_vector = insertelement <4 x i32> %3, i32 3, i32 3 + + %4 = insertelement <4 x i32> undef, i32 4, i32 0 + %5 = insertelement <4 x i32> %4, i32 5, i32 1 + %6 = insertelement <4 x i32> %5, i32 6, i32 2 + %second_vector = insertelement <4 x i32> %6, i32 7, i32 3 + + %result = shufflevector <4 x i32> %first_vector, <4 x i32> %second_vector, <4 x i32> + + %first = extractelement <4 x i32> %result, i32 0 + %second = extractelement <4 x i32> %result, i32 3 + %return = add i32 %first, %second + ret i32 %return +} diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-vector-instructions.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-vector-instructions.ll new file mode 100644 index 000000000000..e1672779d0b6 --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-vector-instructions.ll @@ -0,0 +1,146 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize --opaque-pointers -S < %s | FileCheck %s +; REQUIRES: llvm-14-plus +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should remove redundant vector operations +; ------------------------------------------------ + +define i32 @basic(i32 %src) { +; CHECK-LABEL: define i32 @basic( +; CHECK-SAME: i32 [[SRC:%.*]]) { +; CHECK-NEXT: ret i32 [[SRC]] +; + %vector = insertelement <2 x i32> undef, i32 %src, i32 0 + %result = extractelement <2 x i32> %vector, i32 0 + ret i32 %result +} + +define float @should_work_with_different_value_type(float %src) { +; 
CHECK-LABEL: define float @should_work_with_different_value_type( +; CHECK-SAME: float [[SRC:%.*]]) { +; CHECK-NEXT: ret float [[SRC]] +; + %vector = insertelement <2 x float> undef, float %src, i32 0 + %result = extractelement <2 x float> %vector, i32 0 + ret float %result +} + +define i32 @should_return_undef(i32 %src) { +; CHECK-LABEL: define i32 @should_return_undef( +; CHECK-SAME: i32 [[SRC:%.*]]) { +; CHECK-NEXT: ret i32 undef +; + %vector = insertelement <2 x i32> undef, i32 %src, i32 0 + %result = extractelement <2 x i32> %vector, i32 1 + ret i32 %result +} + + +define i32 @should_work_with_larger_vector_size() { +; CHECK-LABEL: define i32 @should_work_with_larger_vector_size() { +; CHECK-NEXT: [[RESULT:%.*]] = add i32 0, 8 +; CHECK-NEXT: ret i32 [[RESULT]] +; + %1 = insertelement <16 x i32> undef, i32 0, i32 0 + %2 = insertelement <16 x i32> %1, i32 1, i32 1 + %3 = insertelement <16 x i32> %2, i32 2, i32 2 + %4 = insertelement <16 x i32> %3, i32 3, i32 3 + %5 = insertelement <16 x i32> %4, i32 4, i32 4 + %6 = insertelement <16 x i32> %5, i32 5, i32 5 + %7 = insertelement <16 x i32> %6, i32 6, i32 6 + %8 = insertelement <16 x i32> %7, i32 7, i32 7 + %9 = insertelement <16 x i32> %8, i32 8, i32 8 + %10 = insertelement <16 x i32> %9, i32 9, i32 9 + %11 = insertelement <16 x i32> %10, i32 10, i32 10 + %12 = insertelement <16 x i32> %11, i32 11, i32 11 + %13 = insertelement <16 x i32> %12, i32 12, i32 12 + %14 = insertelement <16 x i32> %13, i32 13, i32 13 + %15 = insertelement <16 x i32> %14, i32 14, i32 14 + %vector = insertelement <16 x i32> %15, i32 15, i32 15 + %first = extractelement <16 x i32> %vector, i32 0 + %second = extractelement <16 x i32> %vector, i32 8 + %result = add i32 %first, %second + ret i32 %result +} + +define i32 @should_work_with_shuffle_instruction() { +; CHECK-LABEL: define i32 @should_work_with_shuffle_instruction() { +; CHECK-NEXT: [[RETURN:%.*]] = add i32 1, 7 +; CHECK-NEXT: ret i32 [[RETURN]] +; + %1 = insertelement <4 x i32> 
undef, i32 0, i32 0 + %2 = insertelement <4 x i32> %1, i32 1, i32 1 + %3 = insertelement <4 x i32> %2, i32 2, i32 2 + %first_vector = insertelement <4 x i32> %3, i32 3, i32 3 + + %4 = insertelement <4 x i32> undef, i32 4, i32 0 + %5 = insertelement <4 x i32> %4, i32 5, i32 1 + %6 = insertelement <4 x i32> %5, i32 6, i32 2 + %second_vector = insertelement <4 x i32> %6, i32 7, i32 3 + + %result = shufflevector <4 x i32> %first_vector, <4 x i32> %second_vector, <4 x i32> + + %first = extractelement <4 x i32> %result, i32 0 + %second = extractelement <4 x i32> %result, i32 3 + %return = add i32 %first, %second + ret i32 %return +} + + +define i32 @should_not_scalarize_with_index_as_variable(i32 %data, i32 %index) { +; CHECK-LABEL: define i32 @should_not_scalarize_with_index_as_variable( +; CHECK-SAME: i32 [[DATA:%.*]], i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[VECTOR:%.*]] = insertelement <2 x i32> undef, i32 [[DATA]], i32 [[INDEX]] +; CHECK-NEXT: [[RESULT:%.*]] = extractelement <2 x i32> [[VECTOR]], i32 [[INDEX]] +; CHECK-NEXT: ret i32 [[RESULT]] +; + %vector = insertelement <2 x i32> undef, i32 %data, i32 %index + %result = extractelement <2 x i32> %vector, i32 %index + ret i32 %result +} + +define i32 @should_work_with_shuffle_undef() { +; CHECK-LABEL: define i32 @should_work_with_shuffle_undef() { +; CHECK-NEXT: [[RETURN:%.*]] = add i32 undef, undef +; CHECK-NEXT: ret i32 [[RETURN]] +; + %result = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> + + %first = extractelement <4 x i32> %result, i32 0 + %second = extractelement <4 x i32> %result, i32 3 + %return = add i32 %first, %second + ret i32 %return +} + +define i32 @should_work_with_shuffle_undef_2() { +; CHECK-LABEL: define i32 @should_work_with_shuffle_undef_2() { +; CHECK-NEXT: [[RETURN:%.*]] = add i32 1, undef +; CHECK-NEXT: ret i32 [[RETURN]] +; + %1 = insertelement <4 x i32> undef, i32 0, i32 0 + %2 = insertelement <4 x i32> %1, i32 1, i32 1 + %3 = insertelement <4 x i32> %2, i32 2, i32 2 + 
%first_vector = insertelement <4 x i32> %3, i32 3, i32 3 + + %4 = insertelement <4 x i32> undef, i32 4, i32 0 + %5 = insertelement <4 x i32> %4, i32 5, i32 1 + %6 = insertelement <4 x i32> %5, i32 6, i32 2 + %second_vector = insertelement <4 x i32> %6, i32 7, i32 3 + + %result = shufflevector <4 x i32> %first_vector, <4 x i32> %second_vector, <4 x i32> + + %first = extractelement <4 x i32> %result, i32 0 + %second = extractelement <4 x i32> %result, i32 3 + %return = add i32 %first, %second + ret i32 %return +} diff --git a/IGC/Compiler/tests/ScalarizeFunction/selective-typed-pointers.ll b/IGC/Compiler/tests/ScalarizeFunction/selective-typed-pointers.ll new file mode 100644 index 000000000000..3a63e9914073 --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/selective-typed-pointers.ll @@ -0,0 +1,352 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; REQUIRES: regkeys +; RUN: igc_opt --igc-scalarize -S --regkey=EnableSelectiveScalarizer=1 < %s | FileCheck %s +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; This test checks if selective scalarization leaves vectorial instructions un-scalarized. 
+; ------------------------------------------------ + +define spir_kernel void @test_selective_1(i64 %addr) #0 { +; CHECK-LABEL: @test_selective_1( +; CHECK-NEXT: [[VECTINT:%.*]] = add <8 x i32> , zeroinitializer +; CHECK-NEXT: [[VECTFLOAT:%.*]] = bitcast <8 x i32> [[VECTINT]] to <8 x float> +; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast <8 x float> [[VECTFLOAT]] to <8 x i32> +; CHECK-NEXT: call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECTCAST]]) +; CHECK-NEXT: ret void +; + +; define a vector and do some bitcasts +; nothing should get scalarized here + + %vectint = add <8 x i32> , zeroinitializer + %vectfloat = bitcast <8 x i32> %vectint to <8 x float> + %vectcast = bitcast <8 x float> %vectfloat to <8 x i32> + call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 %addr, i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> %vectcast) + + ret void +} + +define spir_kernel void @test_selective_2(i64 %addr) #0 { +; CHECK-LABEL: @test_selective_2( +; CHECK-NEXT: [[VECTINT:%.*]] = add <8 x i32> , zeroinitializer +; CHECK-NEXT: [[VECTFLOAT:%.*]] = bitcast <8 x i32> [[VECTINT]] to <8 x float> +; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast <8 x float> [[VECTFLOAT]] to <8 x i32> +; CHECK-NEXT: call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECTCAST]]) +; CHECK-NEXT: [[ANOTHERCAST:%.*]] = bitcast <8 x float> [[VECTFLOAT]] to <8 x i32> +; CHECK-NEXT: [[ANOTHERCAST_SCALAR:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 0 +; CHECK-NEXT: [[ANOTHERCAST_SCALAR1:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 1 +; CHECK-NEXT: [[ANOTHERCAST_SCALAR2:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 2 +; CHECK-NEXT: [[ANOTHERCAST_SCALAR3:%.*]] = 
extractelement <8 x i32> [[ANOTHERCAST]], i32 3 +; CHECK-NEXT: [[ANOTHERCAST_SCALAR4:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 4 +; CHECK-NEXT: [[ANOTHERCAST_SCALAR5:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 5 +; CHECK-NEXT: [[ANOTHERCAST_SCALAR6:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 6 +; CHECK-NEXT: [[ANOTHERCAST_SCALAR7:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 7 +; CHECK-NEXT: [[V3:%.*]] = add i32 [[ANOTHERCAST_SCALAR3]], [[ANOTHERCAST_SCALAR5]] +; CHECK-NEXT: ret void +; +; same as before, but %vectfloat is used in another branch of the code + %vectint = add <8 x i32> , zeroinitializer + %vectfloat = bitcast <8 x i32> %vectint to <8 x float> + %vectcast = bitcast <8 x float> %vectfloat to <8 x i32> + call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 %addr, i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> %vectcast) +; so scalarization should happen here + %anothercast = bitcast <8 x float> %vectfloat to <8 x i32> + %v1 = extractelement <8 x i32> %anothercast, i32 3 + %v2 = extractelement <8 x i32> %anothercast, i32 5 + %v3 = add i32 %v1, %v2 + ret void +} + +define spir_kernel void @test_selective_3() { +; CHECK-LABEL: @test_selective_3( +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[DATA:%.*]] = phi <8 x i32> [ zeroinitializer, [[TMP0]] ], [ [[NEWDATA:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[NEWDATA]] = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> [[DATA]]) +; CHECK-NEXT: [[NEWOFFSET]] = add i32 [[OFFSET]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[NEWOFFSET]], 10 +; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[LOOP]] +; CHECK: end: +; CHECK-NEXT: ret void +; +; no scalarization happens here because the vectors %data and %newdata are used as whole + br label %loop + +loop: + %offset = phi i32 [ 0, %0 ], [ %newoffset, %loop 
] + + %data = phi <8 x i32> [ zeroinitializer, %0 ], [ %newdata, %loop ] + %newdata = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> %data) + + %newoffset = add i32 %offset, 1 + %1 = icmp eq i32 %newoffset, 10 + br i1 %1, label %end, label %loop + +end: + ret void +} + +define spir_kernel void @test_selective_4(i64 %addr) #0 { +; CHECK-LABEL: @test_selective_4( +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[FLOAT_VECTOR:%.*]] = phi <8 x float> [ zeroinitializer, [[TMP0]] ], [ [[NEW_FLOAT_VECTOR:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[INT_VECTOR:%.*]] = call <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 [[OFFSET]], i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0) +; CHECK-NEXT: [[NEW_FLOAT_VECTOR]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> [[FLOAT_VECTOR]], <8 x i16> , <8 x i32> [[INT_VECTOR]], i32 11, i32 11, i32 8, i32 8, i1 false) +; CHECK-NEXT: [[NEWOFFSET]] = add i32 [[OFFSET]], 16 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[NEWOFFSET]], 256 +; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[LOOP]] +; CHECK: end: +; CHECK-NEXT: ret void +; +; same here: no scalarization + br label %loop + +loop: + %offset = phi i32 [ 0, %0 ], [ %newoffset, %loop ] + %float_vector = phi <8 x float> [ zeroinitializer, %0 ], [ %new_float_vector, %loop ] + %int_vector = call <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64 %addr, i32 1023, i32 511, i32 1023, i32 %offset, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0) + %new_float_vector = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %float_vector, <8 x i16> , <8 x i32> %int_vector, i32 11, i32 11, i32 8, i32 8, i1 false) + %newoffset = add i32 %offset, 16 + %1 = icmp eq i32 %newoffset, 256 + br i1 %1, label %end, label %loop + +end: + ret void 
+} + + +define spir_kernel void @test_selective_5() { +; CHECK-LABEL: @test_selective_5( +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[DATA1:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[DATA3:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR10:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[DATA4:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR11:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[DATA5:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR12:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[DATA6:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, [[LOOP]] ] +; CHECK-NEXT: [[DATA7:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, [[LOOP]] ] +; CHECK-NEXT: [[DATA8:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, [[LOOP]] ] +; CHECK-NEXT: [[DATA9:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, [[LOOP]] ] +; CHECK-NEXT: [[DATA2_ASSEMBLED_VECT:%.*]] = insertelement <4 x i32> undef, i32 [[DATA1]], i32 0 +; CHECK-NEXT: [[DATA2_ASSEMBLED_VECT13:%.*]] = insertelement <4 x i32> [[DATA2_ASSEMBLED_VECT]], i32 [[DATA3]], i32 1 +; CHECK-NEXT: [[DATA2_ASSEMBLED_VECT14:%.*]] = insertelement <4 x i32> [[DATA2_ASSEMBLED_VECT13]], i32 [[DATA4]], i32 2 +; CHECK-NEXT: [[DATA2_ASSEMBLED_VECT15:%.*]] = insertelement <4 x i32> [[DATA2_ASSEMBLED_VECT14]], i32 [[DATA5]], i32 3 +; CHECK-NEXT: [[NEWDATA:%.*]] = call <4 x i32> @do_math_v4i32_v4i32(<4 x i32> [[DATA2_ASSEMBLED_VECT15]]) +; CHECK-NEXT: [[NEWDATA_SCALAR]] = extractelement <4 x i32> [[NEWDATA]], i32 0 +; CHECK-NEXT: [[NEWDATA_SCALAR10]] = extractelement <4 x i32> [[NEWDATA]], i32 1 +; CHECK-NEXT: [[NEWDATA_SCALAR11]] = extractelement <4 x i32> [[NEWDATA]], i32 2 +; CHECK-NEXT: [[NEWDATA_SCALAR12]] = extractelement <4 x i32> [[NEWDATA]], i32 3 +; CHECK-NEXT: [[NEWOFFSET]] = add i32 [[OFFSET]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[NEWOFFSET]], 10 +; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[LOOP]] +; CHECK: 
end: +; CHECK-NEXT: ret void +; +; here shufflevectors break vectorial nature of the arguments +; scalarization should be done + br label %loop + +loop: + %offset = phi i32 [ 0, %0 ], [ %newoffset, %loop ] + + %data = phi <8 x i32> [ zeroinitializer, %0 ], [ %newdata2, %loop ] + %data2 = shufflevector <8 x i32> %data, <8 x i32> undef, <4 x i32> + %newdata = call <4 x i32> @do_math_v4i32_v4i32(<4 x i32> %data2) + %newdata2 = shufflevector <4 x i32> %newdata, <4 x i32> undef, <8 x i32> + + %newoffset = add i32 %offset, 1 + %1 = icmp eq i32 %newoffset, 10 + br i1 %1, label %end, label %loop + +end: + ret void +} + +define spir_kernel void @test_selective_6() { +; CHECK-LABEL: @test_selective_6( +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT1:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT25:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT2:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT26:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT3:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT27:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT4:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT28:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT5:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT29:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT6:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT30:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT7:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT31:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT8:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT32:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTINT9:%.*]] = bitcast float [[VECTFLOAT1]] to i32 +; CHECK-NEXT: [[VECTINT10:%.*]] = bitcast float [[VECTFLOAT2]] to i32 +; CHECK-NEXT: [[VECTINT11:%.*]] = bitcast float [[VECTFLOAT3]] to i32 +; CHECK-NEXT: [[VECTINT12:%.*]] = 
bitcast float [[VECTFLOAT4]] to i32 +; CHECK-NEXT: [[VECTINT13:%.*]] = bitcast float [[VECTFLOAT5]] to i32 +; CHECK-NEXT: [[VECTINT14:%.*]] = bitcast float [[VECTFLOAT6]] to i32 +; CHECK-NEXT: [[VECTINT15:%.*]] = bitcast float [[VECTFLOAT7]] to i32 +; CHECK-NEXT: [[VECTINT16:%.*]] = bitcast float [[VECTFLOAT8]] to i32 +; CHECK-NEXT: [[VECTADD17:%.*]] = add i32 [[VECTINT9]], 1 +; CHECK-NEXT: [[VECTADD18:%.*]] = add i32 [[VECTINT10]], 2 +; CHECK-NEXT: [[VECTADD19:%.*]] = add i32 [[VECTINT11]], 3 +; CHECK-NEXT: [[VECTADD20:%.*]] = add i32 [[VECTINT12]], 4 +; CHECK-NEXT: [[VECTADD21:%.*]] = add i32 [[VECTINT13]], 5 +; CHECK-NEXT: [[VECTADD22:%.*]] = add i32 [[VECTINT14]], 6 +; CHECK-NEXT: [[VECTADD23:%.*]] = add i32 [[VECTINT15]], 7 +; CHECK-NEXT: [[VECTADD24:%.*]] = add i32 [[VECTINT16]], 8 +; CHECK-NEXT: [[VECTFLOAT_NEXT25]] = bitcast i32 [[VECTADD17]] to float +; CHECK-NEXT: [[VECTFLOAT_NEXT26]] = bitcast i32 [[VECTADD18]] to float +; CHECK-NEXT: [[VECTFLOAT_NEXT27]] = bitcast i32 [[VECTADD19]] to float +; CHECK-NEXT: [[VECTFLOAT_NEXT28]] = bitcast i32 [[VECTADD20]] to float +; CHECK-NEXT: [[VECTFLOAT_NEXT29]] = bitcast i32 [[VECTADD21]] to float +; CHECK-NEXT: [[VECTFLOAT_NEXT30]] = bitcast i32 [[VECTADD22]] to float +; CHECK-NEXT: [[VECTFLOAT_NEXT31]] = bitcast i32 [[VECTADD23]] to float +; CHECK-NEXT: [[VECTFLOAT_NEXT32]] = bitcast i32 [[VECTADD24]] to float +; CHECK-NEXT: [[NEWOFFSET]] = add i32 [[OFFSET]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[NEWOFFSET]], 10 +; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[LOOP]] +; CHECK: end: +; CHECK-NEXT: ret void +; +; scalarization should not be prevented due to elementwise bitcasts +; such bitcasts can be part of a chain of vector instructions, but +; should not be at the end of it + br label %loop + +loop: + %offset = phi i32 [ 0, %0 ], [ %newoffset, %loop ] + %vectfloat = phi <8 x float> [ zeroinitializer, %0 ], [ %vectfloat.next, %loop ] + + %vectint = bitcast <8 x float> %vectfloat to <8 x i32> + 
%vectadd = add <8 x i32> %vectint, + %vectfloat.next = bitcast <8 x i32> %vectadd to <8 x float> + + %newoffset = add i32 %offset, 1 + %1 = icmp eq i32 %newoffset, 10 + br i1 %1, label %end, label %loop +end: + ret void +} + +define spir_kernel void @test_selective_7() { +; CHECK-LABEL: @test_selective_7( +; CHECK-NEXT: [[VECTINT:%.*]] = add <4 x i16> , zeroinitializer +; CHECK-NEXT: [[VECTFLOAT:%.*]] = bitcast <4 x i16> [[VECTINT]] to <4 x half> +; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast <4 x half> [[VECTFLOAT]] to i64 +; CHECK-NEXT: ret void +; +; non-elementwise bitcasts (result type is scalar) should prevent scalarization, +; thus no scalarization should happen here + %vectint = add <4 x i16> , zeroinitializer + %vectfloat = bitcast <4 x i16> %vectint to <4 x half> + %vectcast = bitcast <4 x half> %vectfloat to i64 + + ret void +} + +define spir_kernel void @test_selective_8() { +; CHECK-LABEL: @test_selective_8( +; CHECK-NEXT: [[VECTINT:%.*]] = add <4 x i16> , zeroinitializer +; CHECK-NEXT: [[VECTFLOAT:%.*]] = bitcast <4 x i16> [[VECTINT]] to <4 x half> +; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast <4 x half> [[VECTFLOAT]] to <2 x i32> +; CHECK-NEXT: ret void +; +; non-elementwise bitcasts (result is different sized vector) should prevent scalarization, +; thus no scalarization should happen here + %vectint = add <4 x i16> , zeroinitializer + %vectfloat = bitcast <4 x i16> %vectint to <4 x half> + %vectcast = bitcast <4 x half> %vectfloat to <2 x i32> + + ret void +} + +define <32 x i1> @test_selective_9(i64 %addr) #0 { +; CHECK-LABEL: @test_selective_9( +; CHECK-NEXT: [[INT:%.*]] = add i32 1, 0 +; CHECK-NEXT: [[FLOAT:%.*]] = bitcast i32 [[INT]] to float +; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast float [[FLOAT]] to <32 x i1> +; CHECK-NEXT: ret <32 x i1> [[VECTCAST]] +; + %int = add i32 1, zeroinitializer + %float = bitcast i32 %int to float + %vectcast = bitcast float %float to <32 x i1> + ret <32 x i1> %vectcast +} + +define <2 x i32> 
@should_not_scalarize_if_the_index_is_not_a_constant(i32 %src) { +; CHECK-LABEL: @should_not_scalarize_if_the_index_is_not_a_constant( +; CHECK-NEXT: [[VECTOR_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[SRC:%.*]], i32 0 +; CHECK-NEXT: [[VECTOR_ASSEMBLED_VECT1:%.*]] = insertelement <2 x i32> [[VECTOR_ASSEMBLED_VECT]], i32 undef, i32 1 +; CHECK-NEXT: [[RESULT:%.*]] = insertelement <2 x i32> [[VECTOR_ASSEMBLED_VECT1]], i32 3, i32 [[SRC]] +; CHECK-NEXT: ret <2 x i32> [[RESULT]] +; + %vector = insertelement <2 x i32> undef, i32 %src, i32 0 + %result = insertelement <2 x i32> %vector, i32 3, i32 %src + ret <2 x i32> %result +} + +define i32 @should_not_scalarize_if_the_index_is_not_a_constant_2(i32 %src) { +; CHECK-LABEL: @should_not_scalarize_if_the_index_is_not_a_constant_2( +; CHECK-NEXT: [[VECTOR_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[SRC:%.*]], i32 0 +; CHECK-NEXT: [[VECTOR_ASSEMBLED_VECT1:%.*]] = insertelement <2 x i32> [[VECTOR_ASSEMBLED_VECT]], i32 undef, i32 1 +; CHECK-NEXT: [[RESULT:%.*]] = extractelement <2 x i32> [[VECTOR_ASSEMBLED_VECT1]], i32 [[SRC]] +; CHECK-NEXT: ret i32 [[RESULT]] +; + %vector = insertelement <2 x i32> undef, i32 %src, i32 0 + %result = extractelement <2 x i32> %vector, i32 %src + ret i32 %result +} + +define spir_kernel void @should_not_scalarize_nested_phi(i1 %switch) { +; CHECK-LABEL: @should_not_scalarize_nested_phi( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[VECTINT:%.*]] = add <8 x i32> , zeroinitializer +; CHECK-NEXT: br i1 [[SWITCH:%.*]], label [[FIRST:%.*]], label [[SECOND:%.*]] +; CHECK: proxy: +; CHECK-NEXT: br i1 [[SWITCH]], label [[FIRST]], label [[SECOND]] +; CHECK: first: +; CHECK-NEXT: [[RESULT1:%.*]] = phi <8 x i32> [ zeroinitializer, [[PROXY:%.*]] ], [ [[VECTINT]], [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: second: +; CHECK-NEXT: [[RESULT2:%.*]] = phi <8 x i32> [ zeroinitializer, [[PROXY]] ], [ [[VECTINT]], [[ENTRY]] ] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: 
+; CHECK-NEXT: [[RESULT3:%.*]] = phi <8 x i32> [ [[RESULT1]], [[FIRST]] ], [ [[RESULT2]], [[SECOND]] ] +; CHECK-NEXT: [[NEWDATA:%.*]] = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> [[RESULT3]]) +; CHECK-NEXT: ret void +; +entry: + %vectint = add <8 x i32> , zeroinitializer + br i1 %switch, label %first, label %second +proxy: + br i1 %switch, label %first, label %second +first: + %result1 = phi <8 x i32> [ zeroinitializer, %proxy ], [ %vectint, %entry] + br label %exit +second: + %result2 = phi <8 x i32> [ zeroinitializer, %proxy ], [ %vectint, %entry] + br label %exit +exit: + %result3 = phi <8 x i32> [ %result1, %first], [ %result2, %second] + %newdata = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> %result3) + ret void +} + +declare spir_func void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32, <8 x i32>) #1 +declare spir_func <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32) #1 +declare spir_func <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1) #1 +declare spir_func <4 x i32> @do_math_v4i32_v4i32(<4 x i32>) #1 +declare spir_func <8 x i32> @do_math_v8i32_v8i32(<8 x i32>) #1 + +attributes #0 = { convergent nounwind } +attributes #1 = { nounwind } diff --git a/IGC/Compiler/tests/ScalarizeFunction/selective.ll b/IGC/Compiler/tests/ScalarizeFunction/selective.ll index 7c39694a61ce..9312bcae45ed 100644 --- a/IGC/Compiler/tests/ScalarizeFunction/selective.ll +++ b/IGC/Compiler/tests/ScalarizeFunction/selective.ll @@ -1,14 +1,15 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ;=========================== begin_copyright_notice ============================ ; -; Copyright (C) 2022 Intel Corporation +; Copyright (C) 2024 Intel Corporation ; ; 
SPDX-License-Identifier: MIT ; ;============================ end_copyright_notice ============================= ; ; REQUIRES: regkeys -; RUN: igc_opt --igc-scalarize -S --regkey=EnableSelectiveScalarizer=1 < %s | FileCheck %s +; RUN: igc_opt --igc-scalarize --opaque-pointers -S --regkey=EnableSelectiveScalarizer=1 < %s | FileCheck %s +; REQUIRES: llvm-14-plus ; ------------------------------------------------ ; ScalarizeFunction ; ------------------------------------------------ @@ -16,11 +17,12 @@ ; ------------------------------------------------ define spir_kernel void @test_selective_1(i64 %addr) #0 { -; CHECK-LABEL: @test_selective_1( +; CHECK-LABEL: define spir_kernel void @test_selective_1( +; CHECK-SAME: i64 [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[VECTINT:%.*]] = add <8 x i32> , zeroinitializer ; CHECK-NEXT: [[VECTFLOAT:%.*]] = bitcast <8 x i32> [[VECTINT]] to <8 x float> ; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast <8 x float> [[VECTFLOAT]] to <8 x i32> -; CHECK-NEXT: call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECTCAST]]) +; CHECK-NEXT: call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 [[ADDR]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECTCAST]]) ; CHECK-NEXT: ret void ; @@ -36,11 +38,12 @@ define spir_kernel void @test_selective_1(i64 %addr) #0 { } define spir_kernel void @test_selective_2(i64 %addr) #0 { -; CHECK-LABEL: @test_selective_2( +; CHECK-LABEL: define spir_kernel void @test_selective_2( +; CHECK-SAME: i64 [[ADDR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[VECTINT:%.*]] = add <8 x i32> , zeroinitializer ; CHECK-NEXT: [[VECTFLOAT:%.*]] = bitcast <8 x i32> [[VECTINT]] to <8 x float> ; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast <8 x float> [[VECTFLOAT]] to <8 x i32> -; CHECK-NEXT: call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 
[[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECTCAST]]) +; CHECK-NEXT: call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 [[ADDR]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECTCAST]]) ; CHECK-NEXT: [[ANOTHERCAST:%.*]] = bitcast <8 x float> [[VECTFLOAT]] to <8 x i32> ; CHECK-NEXT: [[ANOTHERCAST_SCALAR:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 0 ; CHECK-NEXT: [[ANOTHERCAST_SCALAR1:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 1 @@ -67,18 +70,19 @@ define spir_kernel void @test_selective_2(i64 %addr) #0 { } define spir_kernel void @test_selective_3() { -; CHECK-LABEL: @test_selective_3( -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[DATA:%.*]] = phi <8 x i32> [ zeroinitializer, [[TMP0]] ], [ [[NEWDATA:%.*]], [[LOOP]] ] +; CHECK-LABEL: define spir_kernel void @test_selective_3() { +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[DATA:%.*]] = phi <8 x i32> [ zeroinitializer, [[TMP0]] ], [ [[NEWDATA:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[NEWDATA]] = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> [[DATA]]) ; CHECK-NEXT: [[NEWOFFSET]] = add i32 [[OFFSET]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[NEWOFFSET]], 10 -; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[LOOP]] -; CHECK: end: +; CHECK-NEXT: br i1 [[TMP1]], label %[[END:.*]], label %[[LOOP]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; + ; no scalarization happens here because the vectors %data and %newdata are used as whole br label %loop @@ -97,19 +101,21 @@ end: } define spir_kernel void @test_selective_4(i64 %addr) #0 { -; CHECK-LABEL: @test_selective_4( -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: 
loop: -; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[FLOAT_VECTOR:%.*]] = phi <8 x float> [ zeroinitializer, [[TMP0]] ], [ [[NEW_FLOAT_VECTOR:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[INT_VECTOR:%.*]] = call <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 [[OFFSET]], i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0) +; CHECK-LABEL: define spir_kernel void @test_selective_4( +; CHECK-SAME: i64 [[ADDR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[FLOAT_VECTOR:%.*]] = phi <8 x float> [ zeroinitializer, [[TMP0]] ], [ [[NEW_FLOAT_VECTOR:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[INT_VECTOR:%.*]] = call <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64 [[ADDR]], i32 1023, i32 511, i32 1023, i32 [[OFFSET]], i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0) ; CHECK-NEXT: [[NEW_FLOAT_VECTOR]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> [[FLOAT_VECTOR]], <8 x i16> , <8 x i32> [[INT_VECTOR]], i32 11, i32 11, i32 8, i32 8, i1 false) ; CHECK-NEXT: [[NEWOFFSET]] = add i32 [[OFFSET]], 16 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[NEWOFFSET]], 256 -; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[LOOP]] -; CHECK: end: +; CHECK-NEXT: br i1 [[TMP1]], label %[[END:.*]], label %[[LOOP]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; + ; same here: no scalarization br label %loop @@ -128,18 +134,18 @@ end: define spir_kernel void @test_selective_5() { -; CHECK-LABEL: @test_selective_5( -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[DATA1:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[DATA3:%.*]] = phi i32 [ 0, 
[[TMP0]] ], [ [[NEWDATA_SCALAR10:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[DATA4:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR11:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[DATA5:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR12:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[DATA6:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, [[LOOP]] ] -; CHECK-NEXT: [[DATA7:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, [[LOOP]] ] -; CHECK-NEXT: [[DATA8:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, [[LOOP]] ] -; CHECK-NEXT: [[DATA9:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, [[LOOP]] ] +; CHECK-LABEL: define spir_kernel void @test_selective_5() { +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[DATA1:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[DATA3:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR10:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[DATA4:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR11:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[DATA5:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR12:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[DATA6:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, %[[LOOP]] ] +; CHECK-NEXT: [[DATA7:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, %[[LOOP]] ] +; CHECK-NEXT: [[DATA8:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, %[[LOOP]] ] +; CHECK-NEXT: [[DATA9:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, %[[LOOP]] ] ; CHECK-NEXT: [[DATA2_ASSEMBLED_VECT:%.*]] = insertelement <4 x i32> undef, i32 [[DATA1]], i32 0 ; CHECK-NEXT: [[DATA2_ASSEMBLED_VECT13:%.*]] = insertelement <4 x i32> [[DATA2_ASSEMBLED_VECT]], i32 [[DATA3]], i32 1 ; CHECK-NEXT: [[DATA2_ASSEMBLED_VECT14:%.*]] = insertelement <4 x i32> [[DATA2_ASSEMBLED_VECT13]], i32 [[DATA4]], i32 2 @@ -151,10 +157,11 @@ define spir_kernel void @test_selective_5() { ; CHECK-NEXT: [[NEWDATA_SCALAR12]] = extractelement <4 x i32> [[NEWDATA]], i32 3 ; CHECK-NEXT: [[NEWOFFSET]] = add i32 [[OFFSET]], 1 ; CHECK-NEXT: 
[[TMP1:%.*]] = icmp eq i32 [[NEWOFFSET]], 10 -; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[LOOP]] -; CHECK: end: +; CHECK-NEXT: br i1 [[TMP1]], label %[[END:.*]], label %[[LOOP]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; + ; here shufflevectors break vectorial nature of the arguments ; scalarization should be done br label %loop @@ -176,18 +183,18 @@ end: } define spir_kernel void @test_selective_6() { -; CHECK-LABEL: @test_selective_6( -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[VECTFLOAT1:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT25:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[VECTFLOAT2:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT26:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[VECTFLOAT3:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT27:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[VECTFLOAT4:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT28:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[VECTFLOAT5:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT29:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[VECTFLOAT6:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT30:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[VECTFLOAT7:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT31:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[VECTFLOAT8:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT32:%.*]], [[LOOP]] ] +; CHECK-LABEL: define spir_kernel void @test_selective_6() { +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT1:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT25:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT2:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT26:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT3:%.*]] = phi float [ 
0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT27:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT4:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT28:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT5:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT29:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT6:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT30:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT7:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT31:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT8:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT32:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[VECTINT9:%.*]] = bitcast float [[VECTFLOAT1]] to i32 ; CHECK-NEXT: [[VECTINT10:%.*]] = bitcast float [[VECTFLOAT2]] to i32 ; CHECK-NEXT: [[VECTINT11:%.*]] = bitcast float [[VECTFLOAT3]] to i32 @@ -214,10 +221,11 @@ define spir_kernel void @test_selective_6() { ; CHECK-NEXT: [[VECTFLOAT_NEXT32]] = bitcast i32 [[VECTADD24]] to float ; CHECK-NEXT: [[NEWOFFSET]] = add i32 [[OFFSET]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[NEWOFFSET]], 10 -; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[LOOP]] -; CHECK: end: +; CHECK-NEXT: br i1 [[TMP1]], label %[[END:.*]], label %[[LOOP]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; + ; scalarization should not be prevented due to elementwise bitcasts ; such bitcasts can be part of a chain of vector instructions, but ; should not be at the end of it @@ -239,12 +247,13 @@ end: } define spir_kernel void @test_selective_7() { -; CHECK-LABEL: @test_selective_7( +; CHECK-LABEL: define spir_kernel void @test_selective_7() { ; CHECK-NEXT: [[VECTINT:%.*]] = add <4 x i16> , zeroinitializer ; CHECK-NEXT: [[VECTFLOAT:%.*]] = bitcast <4 x i16> [[VECTINT]] to <4 x half> ; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast <4 x half> [[VECTFLOAT]] to i64 ; CHECK-NEXT: ret void ; + ; non-elementwise bitcasts (result type is scalar) should prevent scalarization, ; thus no scalarization should happen here 
%vectint = add <4 x i16> , zeroinitializer @@ -255,12 +264,13 @@ define spir_kernel void @test_selective_7() { } define spir_kernel void @test_selective_8() { -; CHECK-LABEL: @test_selective_8( +; CHECK-LABEL: define spir_kernel void @test_selective_8() { ; CHECK-NEXT: [[VECTINT:%.*]] = add <4 x i16> , zeroinitializer ; CHECK-NEXT: [[VECTFLOAT:%.*]] = bitcast <4 x i16> [[VECTINT]] to <4 x half> ; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast <4 x half> [[VECTFLOAT]] to <2 x i32> ; CHECK-NEXT: ret void ; + ; non-elementwise bitcasts (result is different sized vector) should prevent scalarization, ; thus no scalarization should happen here %vectint = add <4 x i16> , zeroinitializer @@ -270,6 +280,82 @@ define spir_kernel void @test_selective_8() { ret void } +define <32 x i1> @test_selective_9(i64 %addr) #0 { +; CHECK-LABEL: define <32 x i1> @test_selective_9( +; CHECK-SAME: i64 [[ADDR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[INT:%.*]] = add i32 1, 0 +; CHECK-NEXT: [[FLOAT:%.*]] = bitcast i32 [[INT]] to float +; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast float [[FLOAT]] to <32 x i1> +; CHECK-NEXT: ret <32 x i1> [[VECTCAST]] +; + %int = add i32 1, zeroinitializer + %float = bitcast i32 %int to float + %vectcast = bitcast float %float to <32 x i1> + ret <32 x i1> %vectcast +} + +define <2 x i32> @should_not_scalarize_if_the_index_is_not_a_constant(i32 %src) { +; CHECK-LABEL: define <2 x i32> @should_not_scalarize_if_the_index_is_not_a_constant( +; CHECK-SAME: i32 [[SRC:%.*]]) { +; CHECK-NEXT: [[VECTOR_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[SRC]], i32 0 +; CHECK-NEXT: [[VECTOR_ASSEMBLED_VECT1:%.*]] = insertelement <2 x i32> [[VECTOR_ASSEMBLED_VECT]], i32 undef, i32 1 +; CHECK-NEXT: [[RESULT:%.*]] = insertelement <2 x i32> [[VECTOR_ASSEMBLED_VECT1]], i32 3, i32 [[SRC]] +; CHECK-NEXT: ret <2 x i32> [[RESULT]] +; + %vector = insertelement <2 x i32> undef, i32 %src, i32 0 + %result = insertelement <2 x i32> %vector, i32 3, i32 %src + ret <2 x i32> %result +} + +define 
i32 @should_not_scalarize_if_the_index_is_not_a_constant_2(i32 %src) { +; CHECK-LABEL: define i32 @should_not_scalarize_if_the_index_is_not_a_constant_2( +; CHECK-SAME: i32 [[SRC:%.*]]) { +; CHECK-NEXT: [[VECTOR_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[SRC]], i32 0 +; CHECK-NEXT: [[VECTOR_ASSEMBLED_VECT1:%.*]] = insertelement <2 x i32> [[VECTOR_ASSEMBLED_VECT]], i32 undef, i32 1 +; CHECK-NEXT: [[RESULT:%.*]] = extractelement <2 x i32> [[VECTOR_ASSEMBLED_VECT1]], i32 [[SRC]] +; CHECK-NEXT: ret i32 [[RESULT]] +; + %vector = insertelement <2 x i32> undef, i32 %src, i32 0 + %result = extractelement <2 x i32> %vector, i32 %src + ret i32 %result +} + +define spir_kernel void @should_not_scalarize_nested_phi(i1 %switch) { +; CHECK-LABEL: define spir_kernel void @should_not_scalarize_nested_phi( +; CHECK-SAME: i1 [[SWITCH:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[VECTINT:%.*]] = add <8 x i32> , zeroinitializer +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[PROXY:.*]]: +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST]], label %[[SECOND]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: [[RESULT1:%.*]] = phi <8 x i32> [ zeroinitializer, %[[PROXY]] ], [ [[VECTINT]], %[[ENTRY]] ] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: [[RESULT2:%.*]] = phi <8 x i32> [ zeroinitializer, %[[PROXY]] ], [ [[VECTINT]], %[[ENTRY]] ] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT3:%.*]] = phi <8 x i32> [ [[RESULT1]], %[[FIRST]] ], [ [[RESULT2]], %[[SECOND]] ] +; CHECK-NEXT: [[NEWDATA:%.*]] = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> [[RESULT3]]) +; CHECK-NEXT: ret void +; +entry: + %vectint = add <8 x i32> , zeroinitializer + br i1 %switch, label %first, label %second +proxy: + br i1 %switch, label %first, label %second +first: + %result1 = phi <8 x i32> [ zeroinitializer, %proxy ], [ %vectint, %entry] + br label %exit +second: + %result2 = phi <8 x i32> [ zeroinitializer, 
%proxy ], [ %vectint, %entry] + br label %exit +exit: + %result3 = phi <8 x i32> [ %result1, %first], [ %result2, %second] + %newdata = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> %result3) + ret void +} + declare spir_func void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32, <8 x i32>) #1 declare spir_func <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32) #1 declare spir_func <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1) #1