From 8acaf5265a636a384ec3a63cba34baa5b8c8c10c Mon Sep 17 00:00:00 2001 From: kstokop Date: Fri, 15 Nov 2024 12:26:12 +0000 Subject: [PATCH] Add Lit tests for ScalarizeFunction pass Current tests were only debug info. Created Lit test with proper checks. Created also opaque pointers versions, but pass not fully supports opaque pointers. The "Requires: opaque-ptr-fix" was added. --- IGC/Compiler/tests/ScalarizeFunction/basic.ll | 140 ----- .../tests/ScalarizeFunction/fneg_optnone.ll | 35 -- ...arize-binary-instruction-typed-pointers.ll | 258 ++++++++ .../scalarize-binary-instruction.ll | 260 ++++++++ ...alarize-cast-instruction-typed-pointers.ll | 195 ++++++ .../scalarize-cast-instruction.ll | 196 ++++++ ...alarize-comp-instruction-typed-pointers.ll | 217 +++++++ .../scalarize-comp-instruction.ll | 218 +++++++ ...etelementptr-instruction-typed-pointers.ll | 190 ++++++ .../scalarize-getelementptr-instruction.ll | 82 +++ ...calarize-phi-instruction-typed-pointers.ll | 587 +++++++++++++++++ .../scalarize-phi-instruction.ll | 589 ++++++++++++++++++ ...arize-select-instruction-typed-pointers.ll | 221 +++++++ .../scalarize-select-instruction.ll | 223 +++++++ ...larize-unary-instruction-typed-pointers.ll | 187 ++++++ .../scalarize-unary-instruction.ll | 194 ++++++ ...rize-vector-instructions-typed-pointers.ll | 145 +++++ .../scalarize-vector-instructions.ll | 146 +++++ .../selective-typed-pointers.ll | 352 +++++++++++ .../tests/ScalarizeFunction/selective.ll | 190 ++++-- 20 files changed, 4398 insertions(+), 227 deletions(-) delete mode 100644 IGC/Compiler/tests/ScalarizeFunction/basic.ll delete mode 100644 IGC/Compiler/tests/ScalarizeFunction/fneg_optnone.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-binary-instruction-typed-pointers.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-binary-instruction.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-cast-instruction-typed-pointers.ll create mode 100644 
IGC/Compiler/tests/ScalarizeFunction/scalarize-cast-instruction.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-comp-instruction-typed-pointers.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-comp-instruction.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-getelementptr-instruction-typed-pointers.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-getelementptr-instruction.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-phi-instruction-typed-pointers.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-phi-instruction.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-select-instruction-typed-pointers.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-select-instruction.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-unary-instruction-typed-pointers.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-unary-instruction.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-vector-instructions-typed-pointers.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/scalarize-vector-instructions.ll create mode 100644 IGC/Compiler/tests/ScalarizeFunction/selective-typed-pointers.ll diff --git a/IGC/Compiler/tests/ScalarizeFunction/basic.ll b/IGC/Compiler/tests/ScalarizeFunction/basic.ll deleted file mode 100644 index baf8bab18fc5..000000000000 --- a/IGC/Compiler/tests/ScalarizeFunction/basic.ll +++ /dev/null @@ -1,140 +0,0 @@ -;=========================== begin_copyright_notice ============================ -; -; Copyright (C) 2022 Intel Corporation -; -; SPDX-License-Identifier: MIT -; -;============================ end_copyright_notice ============================= -; -; RUN: igc_opt --igc-scalarize -S < %s | FileCheck %s -; ------------------------------------------------ -; ScalarizeFunction -; ------------------------------------------------ - -define spir_kernel 
void @test_unary(<2 x float> %src1) { -; CHECK-LABEL: @test_unary( -; CHECK: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1:%.*]], i32 0 -; CHECK: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 -; CHECK: [[TMP1:%.*]] = alloca <2 x float>, align 4 -; CHECK: [[TMP2:%.*]] = fneg float [[SRC1_SCALAR]] -; CHECK: [[TMP3:%.*]] = fneg float [[SRC1_SCALAR1]] -; CHECK: [[ASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0 -; CHECK: [[ASSEMBLED_VECT2:%.*]] = insertelement <2 x float> [[ASSEMBLED_VECT]], float [[TMP3]], i32 1 -; CHECK: store <2 x float> [[ASSEMBLED_VECT2]], <2 x float>* [[TMP1]], align 8 -; CHECK: ret void -; - %1 = alloca <2 x float>, align 4 - %2 = fneg <2 x float> %src1 - store <2 x float> %2, <2 x float>* %1, align 8 - ret void -} - -define spir_kernel void @test_binary(<2 x i32> %src1, <2 x i32> %src2) { -; CHECK-LABEL: @test_binary( -; CHECK: [[SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2:%.*]], i32 0 -; CHECK: [[SCALAR3:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 -; CHECK: [[SCALAR:%.*]] = extractelement <2 x i32> [[SRC1:%.*]], i32 0 -; CHECK: [[SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 -; CHECK: [[TMP1:%.*]] = alloca <2 x i32>, align 4 -; CHECK: [[TMP2:%.*]] = add i32 [[SCALAR]], [[SCALAR2]] -; CHECK: [[TMP3:%.*]] = add i32 [[SCALAR1]], [[SCALAR3]] -; CHECK: [[ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 -; CHECK: [[ASSEMBLED_VECT4:%.*]] = insertelement <2 x i32> [[ASSEMBLED_VECT]], i32 [[TMP3]], i32 1 -; CHECK: store <2 x i32> [[ASSEMBLED_VECT4]], <2 x i32>* [[TMP1]], align 8 -; CHECK: ret void -; - %1 = alloca <2 x i32>, align 4 - %2 = add <2 x i32> %src1, %src2 - store <2 x i32> %2, <2 x i32>* %1, align 8 - ret void -} - -define spir_kernel void @test_cast(<2 x i32> %src1) { -; CHECK-LABEL: @test_cast( -; CHECK: [[SCALAR:%.*]] = extractelement <2 x i32> [[SRC1:%.*]], i32 0 -; CHECK: [[SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 -; 
CHECK: [[TMP1:%.*]] = alloca <2 x i64>, align 4 -; CHECK: [[TMP2:%.*]] = alloca <4 x i16>, align 4 -; CHECK: [[TMP3:%.*]] = sext i32 [[SCALAR]] to i64 -; CHECK: [[TMP4:%.*]] = sext i32 [[SCALAR1]] to i64 -; CHECK: [[ASSEMBLED_VECT:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i32 0 -; CHECK: [[ASSEMBLED_VECT2:%.*]] = insertelement <2 x i64> [[ASSEMBLED_VECT]], i64 [[TMP4]], i32 1 -; CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[SRC1]] to <4 x i16> -; CHECK: store <2 x i64> [[ASSEMBLED_VECT2]], <2 x i64>* [[TMP1]], align 16 -; CHECK: store <4 x i16> [[TMP5]], <4 x i16>* [[TMP2]], align 8 -; CHECK: ret void -; - %1 = alloca <2 x i64>, align 4 - %2 = alloca <4 x i16>, align 4 - %3 = sext <2 x i32> %src1 to <2 x i64> - %4 = bitcast <2 x i32> %src1 to <4 x i16> - store <2 x i64> %3, <2 x i64>* %1, align 16 - store <4 x i16> %4, <4 x i16>* %2, align 8 - ret void -} - -define spir_kernel void @test_cmp(<2 x i32> %src1, <2 x i32> %src2) { -; CHECK-LABEL: @test_cmp( -; CHECK: [[SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2:%.*]], i32 0 -; CHECK: [[SCALAR3:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 -; CHECK: [[SCALAR:%.*]] = extractelement <2 x i32> [[SRC1:%.*]], i32 0 -; CHECK: [[SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 -; CHECK: [[TMP1:%.*]] = alloca <2 x i1>, align 4 -; CHECK: [[TMP2:%.*]] = icmp eq i32 [[SCALAR]], [[SCALAR2]] -; CHECK: [[TMP3:%.*]] = icmp eq i32 [[SCALAR1]], [[SCALAR3]] -; CHECK: [[ASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP2]], i32 0 -; CHECK: [[ASSEMBLED_VECT4:%.*]] = insertelement <2 x i1> [[ASSEMBLED_VECT]], i1 [[TMP3]], i32 1 -; CHECK: store <2 x i1> [[ASSEMBLED_VECT4]], <2 x i1>* [[TMP1]], align 1 -; CHECK: ret void -; - %1 = alloca <2 x i1>, align 4 - %2 = icmp eq <2 x i32> %src1, %src2 - store <2 x i1> %2, <2 x i1>* %1, align 1 - ret void -} - -define spir_kernel void @test_select(<2 x i32> %src1, <4 x i16> %src2, i1 %cond, <4 x i1> %vcond) { -; CHECK-LABEL: @test_select( -; CHECK: [[SCALAR6:%.*]] = 
extractelement <4 x i1> [[VCOND:%.*]], i32 0 -; CHECK: [[SCALAR7:%.*]] = extractelement <4 x i1> [[VCOND]], i32 1 -; CHECK: [[SCALAR8:%.*]] = extractelement <4 x i1> [[VCOND]], i32 2 -; CHECK: [[SCALAR9:%.*]] = extractelement <4 x i1> [[VCOND]], i32 3 -; CHECK: [[SCALAR2:%.*]] = extractelement <4 x i16> [[SRC2:%.*]], i32 0 -; CHECK: [[SCALAR3:%.*]] = extractelement <4 x i16> [[SRC2]], i32 1 -; CHECK: [[SCALAR4:%.*]] = extractelement <4 x i16> [[SRC2]], i32 2 -; CHECK: [[SCALAR5:%.*]] = extractelement <4 x i16> [[SRC2]], i32 3 -; CHECK: [[SCALAR:%.*]] = extractelement <2 x i32> [[SRC1:%.*]], i32 0 -; CHECK: [[SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 -; CHECK: [[TMP1:%.*]] = alloca <2 x i32>, align 4 -; CHECK: [[TMP2:%.*]] = alloca <4 x i16>, align 4 -; CHECK: [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[SCALAR]], i32 42 -; CHECK: [[TMP4:%.*]] = select i1 [[COND]], i32 [[SCALAR1]], i32 13 -; CHECK: [[ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP3]], i32 0 -; CHECK: [[ASSEMBLED_VECT10:%.*]] = insertelement <2 x i32> [[ASSEMBLED_VECT]], i32 [[TMP4]], i32 1 -; CHECK: [[TMP5:%.*]] = select i1 [[SCALAR6]], i16 [[SCALAR2]], i16 1 -; CHECK: [[TMP6:%.*]] = select i1 [[SCALAR7]], i16 [[SCALAR3]], i16 2 -; CHECK: [[TMP7:%.*]] = select i1 [[SCALAR8]], i16 [[SCALAR4]], i16 3 -; CHECK: [[TMP8:%.*]] = select i1 [[SCALAR9]], i16 [[SCALAR5]], i16 4 -; CHECK: [[ASSEMBLED_VECT11:%.*]] = insertelement <4 x i16> undef, i16 [[TMP5]], i32 0 -; CHECK: [[ASSEMBLED_VECT12:%.*]] = insertelement <4 x i16> [[ASSEMBLED_VECT11]], i16 [[TMP6]], i32 1 -; CHECK: [[ASSEMBLED_VECT13:%.*]] = insertelement <4 x i16> [[ASSEMBLED_VECT12]], i16 [[TMP7]], i32 2 -; CHECK: [[ASSEMBLED_VECT14:%.*]] = insertelement <4 x i16> [[ASSEMBLED_VECT13]], i16 [[TMP8]], i32 3 -; CHECK: [[ASSEMBLED_VECT15:%.*]] = insertelement <4 x i16> undef, i16 [[SCALAR2]], i32 0 -; CHECK: [[ASSEMBLED_VECT16:%.*]] = insertelement <4 x i16> [[ASSEMBLED_VECT15]], i16 [[SCALAR3]], i32 1 -; CHECK: 
[[ASSEMBLED_VECT17:%.*]] = insertelement <4 x i16> [[ASSEMBLED_VECT16]], i16 [[SCALAR4]], i32 2 -; CHECK: [[ASSEMBLED_VECT18:%.*]] = insertelement <4 x i16> [[ASSEMBLED_VECT17]], i16 [[SCALAR5]], i32 3 -; CHECK: store <2 x i32> [[ASSEMBLED_VECT10]], <2 x i32>* [[TMP1]], align 8 -; CHECK: store <4 x i16> [[ASSEMBLED_VECT14]], <4 x i16>* [[TMP2]], align 8 -; CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[ASSEMBLED_VECT18]] to i64 -; CHECK: ret void -; - %1 = alloca <2 x i32>, align 4 - %2 = alloca <4 x i16>, align 4 - %3 = select i1 %cond, <2 x i32> %src1, <2 x i32> - %4 = select <4 x i1> %vcond, <4 x i16> %src2, <4 x i16> - %5 = select i1 %cond, <4 x i16> %src2, <4 x i16> %src2 - store <2 x i32> %3, <2 x i32>* %1, align 8 - store <4 x i16> %4, <4 x i16>* %2, align 8 - %6 = bitcast <4 x i16> %5 to i64 - ret void -} diff --git a/IGC/Compiler/tests/ScalarizeFunction/fneg_optnone.ll b/IGC/Compiler/tests/ScalarizeFunction/fneg_optnone.ll deleted file mode 100644 index 3d1ee280b3c9..000000000000 --- a/IGC/Compiler/tests/ScalarizeFunction/fneg_optnone.ll +++ /dev/null @@ -1,35 +0,0 @@ -;=========================== begin_copyright_notice ============================ -; -; Copyright (C) 2024 Intel Corporation -; -; SPDX-License-Identifier: MIT -; -;============================ end_copyright_notice ============================= -; -; REQUIRES: llvm-14-plus -; RUN: igc_opt %s -S -o - --igc-scalarize | FileCheck %s - -; Function Attrs: noinline optnone -define void @test_fneg_optnone(<4 x float> %src, <3 x float> addrspace(1)* %out) #0 { - -; CHECK-LABEL: @test_fneg_optnone( -; -; CHECK: [[EE0:%.*]] = extractelement <4 x float> %src, i32 0 -; CHECK: [[EE1:%.*]] = extractelement <4 x float> %src, i32 1 -; CHECK: [[EE2:%.*]] = extractelement <4 x float> %src, i32 2 -; CHECK: [[EE3:%.*]] = extractelement <4 x float> %src, i32 3 -; CHECK: [[IE0:%.*]] = insertelement <3 x float> undef, float [[EE0]], i32 0 -; CHECK: [[IE1:%.*]] = insertelement <3 x float> [[IE0]], float [[EE1]], i32 1 -; 
CHECK: [[IE2:%.*]] = insertelement <3 x float> [[IE1]], float [[EE2]], i32 2 -; CHECK: [[FNEG:%.*]] = fneg <3 x float> [[IE2]] -; CHECK: store <3 x float> [[FNEG]], <3 x float> addrspace(1)* %out, align 4 - -; CHECK-NOT: fneg <3 x float> undef - - %1 = shufflevector <4 x float> %src, <4 x float> undef, <3 x i32> - %2 = fneg <3 x float> %1 - store <3 x float> %2, <3 x float> addrspace(1)* %out, align 4 - ret void -} - -attributes #0 = { noinline optnone } diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-binary-instruction-typed-pointers.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-binary-instruction-typed-pointers.ll new file mode 100644 index 000000000000..93fd85af02ef --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-binary-instruction-typed-pointers.ll @@ -0,0 +1,258 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize -S < %s | FileCheck %s +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define spir_kernel void @basic(<2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define spir_kernel void @basic( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca 
<2 x i32>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x i32> [[DOTASSEMBLED_VECT3]], <2 x i32>* [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x i32> + %2 = add <2 x i32> %src1, %src2 + store <2 x i32> %2, <2 x i32>* %1 + ret void +} + +define spir_kernel void @should_work_with_different_instruction_type(<2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define spir_kernel void @should_work_with_different_instruction_type( +; CHECK-SAME: <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x float>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = fadd float [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = fadd float [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x float> [[DOTASSEMBLED_VECT]], float [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x float> [[DOTASSEMBLED_VECT3]], <2 x float>* [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x float> + %2 = fadd <2 x float> %src1, %src2 + store <2 x float> %2, <2 x float>* %1 + ret void +} + +define spir_kernel void @should_work_with_exact_flag(<2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: @should_work_with_exact_flag( +; CHECK-NEXT: 
[[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2:%.*]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1:%.*]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x i32>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = udiv exact i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = udiv exact i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x i32> [[DOTASSEMBLED_VECT3]], <2 x i32>* [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x i32> + %2 = udiv exact <2 x i32> %src1, %src2 + store <2 x i32> %2, <2 x i32>* %1 + ret void +} + +; triangulating with @should_work_with_different_instruction_type +define spir_kernel void @should_work_with_fast_math_flags(<2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define spir_kernel void @should_work_with_fast_math_flags( +; CHECK-SAME: <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x float>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast float [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = fadd fast float [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x float> [[DOTASSEMBLED_VECT]], 
float [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x float> [[DOTASSEMBLED_VECT3]], <2 x float>* [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x float> + %2 = fadd fast <2 x float> %src1, %src2 + store <2 x float> %2, <2 x float>* %1 + ret void +} + +define spir_kernel void @should_work_with_different_value_type(<2 x i64> %src1, <2 x i64> %src2) { +; CHECK-LABEL: define spir_kernel void @should_work_with_different_value_type( +; CHECK-SAME: <2 x i64> [[SRC1:%.*]], <2 x i64> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i64> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i64> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i64> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i64> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x i64>, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i64> [[DOTASSEMBLED_VECT]], i64 [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x i64> [[DOTASSEMBLED_VECT3]], <2 x i64>* [[TMP1]], align 16 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x i64> + %2 = add <2 x i64> %src1, %src2 + store <2 x i64> %2, <2 x i64>* %1 + ret void +} + +define spir_kernel void @should_work_with_larger_vector_size(<16 x i32> %src1, <16 x i32> %src2) { +; CHECK-LABEL: define spir_kernel void @should_work_with_larger_vector_size( +; CHECK-SAME: <16 x i32> [[SRC1:%.*]], <16 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <16 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR16:%.*]] = extractelement <16 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR17:%.*]] = extractelement <16 x i32> [[SRC2]], i32 2 +; CHECK-NEXT: [[SRC2_SCALAR18:%.*]] = extractelement <16 x i32> 
[[SRC2]], i32 3 +; CHECK-NEXT: [[SRC2_SCALAR19:%.*]] = extractelement <16 x i32> [[SRC2]], i32 4 +; CHECK-NEXT: [[SRC2_SCALAR20:%.*]] = extractelement <16 x i32> [[SRC2]], i32 5 +; CHECK-NEXT: [[SRC2_SCALAR21:%.*]] = extractelement <16 x i32> [[SRC2]], i32 6 +; CHECK-NEXT: [[SRC2_SCALAR22:%.*]] = extractelement <16 x i32> [[SRC2]], i32 7 +; CHECK-NEXT: [[SRC2_SCALAR23:%.*]] = extractelement <16 x i32> [[SRC2]], i32 8 +; CHECK-NEXT: [[SRC2_SCALAR24:%.*]] = extractelement <16 x i32> [[SRC2]], i32 9 +; CHECK-NEXT: [[SRC2_SCALAR25:%.*]] = extractelement <16 x i32> [[SRC2]], i32 10 +; CHECK-NEXT: [[SRC2_SCALAR26:%.*]] = extractelement <16 x i32> [[SRC2]], i32 11 +; CHECK-NEXT: [[SRC2_SCALAR27:%.*]] = extractelement <16 x i32> [[SRC2]], i32 12 +; CHECK-NEXT: [[SRC2_SCALAR28:%.*]] = extractelement <16 x i32> [[SRC2]], i32 13 +; CHECK-NEXT: [[SRC2_SCALAR29:%.*]] = extractelement <16 x i32> [[SRC2]], i32 14 +; CHECK-NEXT: [[SRC2_SCALAR30:%.*]] = extractelement <16 x i32> [[SRC2]], i32 15 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <16 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <16 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR2:%.*]] = extractelement <16 x i32> [[SRC1]], i32 2 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <16 x i32> [[SRC1]], i32 3 +; CHECK-NEXT: [[SRC1_SCALAR4:%.*]] = extractelement <16 x i32> [[SRC1]], i32 4 +; CHECK-NEXT: [[SRC1_SCALAR5:%.*]] = extractelement <16 x i32> [[SRC1]], i32 5 +; CHECK-NEXT: [[SRC1_SCALAR6:%.*]] = extractelement <16 x i32> [[SRC1]], i32 6 +; CHECK-NEXT: [[SRC1_SCALAR7:%.*]] = extractelement <16 x i32> [[SRC1]], i32 7 +; CHECK-NEXT: [[SRC1_SCALAR8:%.*]] = extractelement <16 x i32> [[SRC1]], i32 8 +; CHECK-NEXT: [[SRC1_SCALAR9:%.*]] = extractelement <16 x i32> [[SRC1]], i32 9 +; CHECK-NEXT: [[SRC1_SCALAR10:%.*]] = extractelement <16 x i32> [[SRC1]], i32 10 +; CHECK-NEXT: [[SRC1_SCALAR11:%.*]] = extractelement <16 x i32> [[SRC1]], i32 11 +; CHECK-NEXT: [[SRC1_SCALAR12:%.*]] = 
extractelement <16 x i32> [[SRC1]], i32 12 +; CHECK-NEXT: [[SRC1_SCALAR13:%.*]] = extractelement <16 x i32> [[SRC1]], i32 13 +; CHECK-NEXT: [[SRC1_SCALAR14:%.*]] = extractelement <16 x i32> [[SRC1]], i32 14 +; CHECK-NEXT: [[SRC1_SCALAR15:%.*]] = extractelement <16 x i32> [[SRC1]], i32 15 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <16 x i32>, align 64 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR16]] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SRC1_SCALAR2]], [[SRC2_SCALAR17]] +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SRC1_SCALAR3]], [[SRC2_SCALAR18]] +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[SRC1_SCALAR4]], [[SRC2_SCALAR19]] +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[SRC1_SCALAR5]], [[SRC2_SCALAR20]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SRC1_SCALAR6]], [[SRC2_SCALAR21]] +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[SRC1_SCALAR7]], [[SRC2_SCALAR22]] +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[SRC1_SCALAR8]], [[SRC2_SCALAR23]] +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[SRC1_SCALAR9]], [[SRC2_SCALAR24]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[SRC1_SCALAR10]], [[SRC2_SCALAR25]] +; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[SRC1_SCALAR11]], [[SRC2_SCALAR26]] +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[SRC1_SCALAR12]], [[SRC2_SCALAR27]] +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[SRC1_SCALAR13]], [[SRC2_SCALAR28]] +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[SRC1_SCALAR14]], [[SRC2_SCALAR29]] +; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[SRC1_SCALAR15]], [[SRC2_SCALAR30]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <16 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT31:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: [[DOTASSEMBLED_VECT32:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT31]], i32 [[TMP4]], i32 2 +; CHECK-NEXT: [[DOTASSEMBLED_VECT33:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT32]], i32 [[TMP5]], i32 3 +; CHECK-NEXT: 
[[DOTASSEMBLED_VECT34:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT33]], i32 [[TMP6]], i32 4 +; CHECK-NEXT: [[DOTASSEMBLED_VECT35:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT34]], i32 [[TMP7]], i32 5 +; CHECK-NEXT: [[DOTASSEMBLED_VECT36:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT35]], i32 [[TMP8]], i32 6 +; CHECK-NEXT: [[DOTASSEMBLED_VECT37:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT36]], i32 [[TMP9]], i32 7 +; CHECK-NEXT: [[DOTASSEMBLED_VECT38:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT37]], i32 [[TMP10]], i32 8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT39:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT38]], i32 [[TMP11]], i32 9 +; CHECK-NEXT: [[DOTASSEMBLED_VECT40:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT39]], i32 [[TMP12]], i32 10 +; CHECK-NEXT: [[DOTASSEMBLED_VECT41:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT40]], i32 [[TMP13]], i32 11 +; CHECK-NEXT: [[DOTASSEMBLED_VECT42:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT41]], i32 [[TMP14]], i32 12 +; CHECK-NEXT: [[DOTASSEMBLED_VECT43:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT42]], i32 [[TMP15]], i32 13 +; CHECK-NEXT: [[DOTASSEMBLED_VECT44:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT43]], i32 [[TMP16]], i32 14 +; CHECK-NEXT: [[DOTASSEMBLED_VECT45:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT44]], i32 [[TMP17]], i32 15 +; CHECK-NEXT: store <16 x i32> [[DOTASSEMBLED_VECT45]], <16 x i32>* [[TMP1]], align 64 +; CHECK-NEXT: ret void +; + %1 = alloca <16 x i32> + %2 = add <16 x i32> %src1, %src2 + store <16 x i32> %2, <16 x i32>* %1 + ret void +} + +define spir_kernel void @should_work_with_bit_wise_instruction(<2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define spir_kernel void @should_work_with_bit_wise_instruction( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> 
[[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x i32>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = shl i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x i32> [[DOTASSEMBLED_VECT3]], <2 x i32>* [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x i32> + %2 = shl <2 x i32> %src1, %src2 + store <2 x i32> %2, <2 x i32>* %1 + ret void +} + +define spir_kernel void @should_work_with_constant_value(<2 x i32> %src1) { +; CHECK-LABEL: define spir_kernel void @should_work_with_constant_value( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x i32>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SRC1_SCALAR]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SRC1_SCALAR1]], 4 +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x i32> [[DOTASSEMBLED_VECT2]], <2 x i32>* [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x i32> + %2 = add <2 x i32> %src1, + store <2 x i32> %2, <2 x i32>* %1 + ret void +} + +define spir_kernel void @should_work_with_nuw_nsw(<2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define spir_kernel void @should_work_with_nuw_nsw( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = 
extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x i32>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x i32> [[DOTASSEMBLED_VECT3]], <2 x i32>* [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x i32> + %2 = add nuw nsw <2 x i32> %src1, %src2 + store <2 x i32> %2, <2 x i32>* %1 + ret void +} diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-binary-instruction.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-binary-instruction.ll new file mode 100644 index 000000000000..eeaed5137272 --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-binary-instruction.ll @@ -0,0 +1,260 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize --opaque-pointers -S < %s | FileCheck %s +; REQUIRES: llvm-14-plus +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define spir_kernel void @basic(<2 x i32> %src1, <2 x i32> %src2) { 
+; CHECK-LABEL: define spir_kernel void @basic( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x i32>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x i32> [[DOTASSEMBLED_VECT3]], ptr [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x i32> + %2 = add <2 x i32> %src1, %src2 + store <2 x i32> %2, ptr %1 + ret void +} + +define spir_kernel void @should_work_with_different_instruction_type(<2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define spir_kernel void @should_work_with_different_instruction_type( +; CHECK-SAME: <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x float>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = fadd float [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = fadd float [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x 
float> [[DOTASSEMBLED_VECT]], float [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x float> [[DOTASSEMBLED_VECT3]], ptr [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x float> + %2 = fadd <2 x float> %src1, %src2 + store <2 x float> %2, ptr %1 + ret void +} + +define spir_kernel void @should_work_with_exact_flag(<2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define spir_kernel void @should_work_with_exact_flag( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x i32>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = udiv exact i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = udiv exact i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x i32> [[DOTASSEMBLED_VECT3]], ptr [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x i32> + %2 = udiv exact <2 x i32> %src1, %src2 + store <2 x i32> %2, ptr %1 + ret void +} + +; triangulating with @should_work_with_different_instruction_type +define spir_kernel void @should_work_with_fast_math_flags(<2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define spir_kernel void @should_work_with_fast_math_flags( +; CHECK-SAME: <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; 
CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x float>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast float [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = fadd fast float [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x float> [[DOTASSEMBLED_VECT]], float [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x float> [[DOTASSEMBLED_VECT3]], ptr [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x float> + %2 = fadd fast <2 x float> %src1, %src2 + store <2 x float> %2, ptr %1 + ret void +} + +define spir_kernel void @should_work_with_different_value_type(<2 x i64> %src1, <2 x i64> %src2) { +; CHECK-LABEL: define spir_kernel void @should_work_with_different_value_type( +; CHECK-SAME: <2 x i64> [[SRC1:%.*]], <2 x i64> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i64> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i64> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i64> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i64> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x i64>, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i64> [[DOTASSEMBLED_VECT]], i64 [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x i64> [[DOTASSEMBLED_VECT3]], ptr [[TMP1]], align 16 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x i64> + %2 = add <2 x i64> %src1, %src2 + store <2 x i64> %2, ptr %1 + ret void +} + +define spir_kernel void @should_work_with_larger_vector_size(<16 x i32> %src1, <16 x i32> 
%src2) { +; CHECK-LABEL: define spir_kernel void @should_work_with_larger_vector_size( +; CHECK-SAME: <16 x i32> [[SRC1:%.*]], <16 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <16 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR16:%.*]] = extractelement <16 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR17:%.*]] = extractelement <16 x i32> [[SRC2]], i32 2 +; CHECK-NEXT: [[SRC2_SCALAR18:%.*]] = extractelement <16 x i32> [[SRC2]], i32 3 +; CHECK-NEXT: [[SRC2_SCALAR19:%.*]] = extractelement <16 x i32> [[SRC2]], i32 4 +; CHECK-NEXT: [[SRC2_SCALAR20:%.*]] = extractelement <16 x i32> [[SRC2]], i32 5 +; CHECK-NEXT: [[SRC2_SCALAR21:%.*]] = extractelement <16 x i32> [[SRC2]], i32 6 +; CHECK-NEXT: [[SRC2_SCALAR22:%.*]] = extractelement <16 x i32> [[SRC2]], i32 7 +; CHECK-NEXT: [[SRC2_SCALAR23:%.*]] = extractelement <16 x i32> [[SRC2]], i32 8 +; CHECK-NEXT: [[SRC2_SCALAR24:%.*]] = extractelement <16 x i32> [[SRC2]], i32 9 +; CHECK-NEXT: [[SRC2_SCALAR25:%.*]] = extractelement <16 x i32> [[SRC2]], i32 10 +; CHECK-NEXT: [[SRC2_SCALAR26:%.*]] = extractelement <16 x i32> [[SRC2]], i32 11 +; CHECK-NEXT: [[SRC2_SCALAR27:%.*]] = extractelement <16 x i32> [[SRC2]], i32 12 +; CHECK-NEXT: [[SRC2_SCALAR28:%.*]] = extractelement <16 x i32> [[SRC2]], i32 13 +; CHECK-NEXT: [[SRC2_SCALAR29:%.*]] = extractelement <16 x i32> [[SRC2]], i32 14 +; CHECK-NEXT: [[SRC2_SCALAR30:%.*]] = extractelement <16 x i32> [[SRC2]], i32 15 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <16 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <16 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR2:%.*]] = extractelement <16 x i32> [[SRC1]], i32 2 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <16 x i32> [[SRC1]], i32 3 +; CHECK-NEXT: [[SRC1_SCALAR4:%.*]] = extractelement <16 x i32> [[SRC1]], i32 4 +; CHECK-NEXT: [[SRC1_SCALAR5:%.*]] = extractelement <16 x i32> [[SRC1]], i32 5 +; CHECK-NEXT: [[SRC1_SCALAR6:%.*]] = extractelement <16 x i32> 
[[SRC1]], i32 6 +; CHECK-NEXT: [[SRC1_SCALAR7:%.*]] = extractelement <16 x i32> [[SRC1]], i32 7 +; CHECK-NEXT: [[SRC1_SCALAR8:%.*]] = extractelement <16 x i32> [[SRC1]], i32 8 +; CHECK-NEXT: [[SRC1_SCALAR9:%.*]] = extractelement <16 x i32> [[SRC1]], i32 9 +; CHECK-NEXT: [[SRC1_SCALAR10:%.*]] = extractelement <16 x i32> [[SRC1]], i32 10 +; CHECK-NEXT: [[SRC1_SCALAR11:%.*]] = extractelement <16 x i32> [[SRC1]], i32 11 +; CHECK-NEXT: [[SRC1_SCALAR12:%.*]] = extractelement <16 x i32> [[SRC1]], i32 12 +; CHECK-NEXT: [[SRC1_SCALAR13:%.*]] = extractelement <16 x i32> [[SRC1]], i32 13 +; CHECK-NEXT: [[SRC1_SCALAR14:%.*]] = extractelement <16 x i32> [[SRC1]], i32 14 +; CHECK-NEXT: [[SRC1_SCALAR15:%.*]] = extractelement <16 x i32> [[SRC1]], i32 15 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <16 x i32>, align 64 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR16]] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[SRC1_SCALAR2]], [[SRC2_SCALAR17]] +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SRC1_SCALAR3]], [[SRC2_SCALAR18]] +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[SRC1_SCALAR4]], [[SRC2_SCALAR19]] +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[SRC1_SCALAR5]], [[SRC2_SCALAR20]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[SRC1_SCALAR6]], [[SRC2_SCALAR21]] +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[SRC1_SCALAR7]], [[SRC2_SCALAR22]] +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[SRC1_SCALAR8]], [[SRC2_SCALAR23]] +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[SRC1_SCALAR9]], [[SRC2_SCALAR24]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[SRC1_SCALAR10]], [[SRC2_SCALAR25]] +; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[SRC1_SCALAR11]], [[SRC2_SCALAR26]] +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[SRC1_SCALAR12]], [[SRC2_SCALAR27]] +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[SRC1_SCALAR13]], [[SRC2_SCALAR28]] +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[SRC1_SCALAR14]], [[SRC2_SCALAR29]] +; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[SRC1_SCALAR15]], 
[[SRC2_SCALAR30]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <16 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT31:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: [[DOTASSEMBLED_VECT32:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT31]], i32 [[TMP4]], i32 2 +; CHECK-NEXT: [[DOTASSEMBLED_VECT33:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT32]], i32 [[TMP5]], i32 3 +; CHECK-NEXT: [[DOTASSEMBLED_VECT34:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT33]], i32 [[TMP6]], i32 4 +; CHECK-NEXT: [[DOTASSEMBLED_VECT35:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT34]], i32 [[TMP7]], i32 5 +; CHECK-NEXT: [[DOTASSEMBLED_VECT36:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT35]], i32 [[TMP8]], i32 6 +; CHECK-NEXT: [[DOTASSEMBLED_VECT37:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT36]], i32 [[TMP9]], i32 7 +; CHECK-NEXT: [[DOTASSEMBLED_VECT38:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT37]], i32 [[TMP10]], i32 8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT39:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT38]], i32 [[TMP11]], i32 9 +; CHECK-NEXT: [[DOTASSEMBLED_VECT40:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT39]], i32 [[TMP12]], i32 10 +; CHECK-NEXT: [[DOTASSEMBLED_VECT41:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT40]], i32 [[TMP13]], i32 11 +; CHECK-NEXT: [[DOTASSEMBLED_VECT42:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT41]], i32 [[TMP14]], i32 12 +; CHECK-NEXT: [[DOTASSEMBLED_VECT43:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT42]], i32 [[TMP15]], i32 13 +; CHECK-NEXT: [[DOTASSEMBLED_VECT44:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT43]], i32 [[TMP16]], i32 14 +; CHECK-NEXT: [[DOTASSEMBLED_VECT45:%.*]] = insertelement <16 x i32> [[DOTASSEMBLED_VECT44]], i32 [[TMP17]], i32 15 +; CHECK-NEXT: store <16 x i32> [[DOTASSEMBLED_VECT45]], ptr [[TMP1]], align 64 +; CHECK-NEXT: ret void +; + %1 = alloca <16 x i32> + %2 = add <16 x 
i32> %src1, %src2 + store <16 x i32> %2, ptr %1 + ret void +} + +define spir_kernel void @should_work_with_bit_wise_instruction(<2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define spir_kernel void @should_work_with_bit_wise_instruction( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x i32>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = shl i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x i32> [[DOTASSEMBLED_VECT3]], ptr [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x i32> + %2 = shl <2 x i32> %src1, %src2 + store <2 x i32> %2, ptr %1 + ret void +} + +define spir_kernel void @should_work_with_constant_value(<2 x i32> %src1) { +; CHECK-LABEL: define spir_kernel void @should_work_with_constant_value( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x i32>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SRC1_SCALAR]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SRC1_SCALAR1]], 4 +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x i32> 
[[DOTASSEMBLED_VECT2]], ptr [[TMP1]], align 8 +; CHECK-NEXT:    ret void +; +  %1 = alloca <2 x i32> +  %2 = add <2 x i32> %src1, <i32 2, i32 4> +  store <2 x i32> %2, ptr %1 +  ret void +} + +define spir_kernel void @should_work_with_nuw_nsw(<2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define spir_kernel void @should_work_with_nuw_nsw( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT:    [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT:    [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT:    [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT:    [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT:    [[TMP1:%.*]] = alloca <2 x i32>, align 8 +; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT:    [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT:    [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP3]], i32 1 +; CHECK-NEXT:    store <2 x i32> [[DOTASSEMBLED_VECT3]], ptr [[TMP1]], align 8 +; CHECK-NEXT:    ret void +; +  %1 = alloca <2 x i32> +  %2 = add nuw nsw <2 x i32> %src1, %src2 +  store <2 x i32> %2, ptr %1 +  ret void +} diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-cast-instruction-typed-pointers.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-cast-instruction-typed-pointers.ll new file mode 100644 index 000000000000..c2ad0702afd7 --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-cast-instruction-typed-pointers.ll @@ -0,0 +1,195 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================
end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize -S < %s | FileCheck %s +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define <2 x i8> @basic(<2 x i32> %src1) { +; CHECK-LABEL: define <2 x i8> @basic( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[SRC1_SCALAR]] to i8 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SRC1_SCALAR1]] to i8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i8> undef, i8 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x i8> [[DOTASSEMBLED_VECT]], i8 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i8> [[DOTASSEMBLED_VECT2]] +; + %1 = trunc <2 x i32> %src1 to <2 x i8> + ret <2 x i8> %1 +} + +define <2 x float> @should_work_with_different_instruction_type(<2 x double> %src1) { +; CHECK-LABEL: define <2 x float> @should_work_with_different_instruction_type( +; CHECK-SAME: <2 x double> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x double> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x double> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = fptrunc double [[SRC1_SCALAR]] to float +; CHECK-NEXT: [[TMP2:%.*]] = fptrunc double [[SRC1_SCALAR1]] to float +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x float> [[DOTASSEMBLED_VECT]], float [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x float> [[DOTASSEMBLED_VECT2]] +; + %1 = fptrunc <2 x double> %src1 to <2 x float> + ret <2 x float> %1 +} + +define <2 x i8> 
@should_work_with_different_value_type(<2 x i64> %src1) { +; CHECK-LABEL: define <2 x i8> @should_work_with_different_value_type( +; CHECK-SAME: <2 x i64> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i64> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i64> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[SRC1_SCALAR]] to i8 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[SRC1_SCALAR1]] to i8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i8> undef, i8 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x i8> [[DOTASSEMBLED_VECT]], i8 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i8> [[DOTASSEMBLED_VECT2]] +; + %1 = trunc <2 x i64> %src1 to <2 x i8> + ret <2 x i8> %1 +} + +define <2 x i16> @should_work_with_different_cast_type(<2 x i32> %src1) { +; CHECK-LABEL: define <2 x i16> @should_work_with_different_cast_type( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[SRC1_SCALAR]] to i16 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SRC1_SCALAR1]] to i16 +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i16> undef, i16 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x i16> [[DOTASSEMBLED_VECT]], i16 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i16> [[DOTASSEMBLED_VECT2]] +; + %1 = trunc <2 x i32> %src1 to <2 x i16> + ret <2 x i16> %1 +} + +define <2 x float> @should_work_with_type_cast_type_2(<2 x i32> %src1) { +; CHECK-LABEL: define <2 x float> @should_work_with_type_cast_type_2( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[SRC1_SCALAR]] to float +; CHECK-NEXT: 
[[TMP2:%.*]] = bitcast i32 [[SRC1_SCALAR1]] to float +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x float> [[DOTASSEMBLED_VECT]], float [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x float> [[DOTASSEMBLED_VECT2]] +; + %1 = bitcast <2 x i32> %src1 to <2 x float> + ret <2 x float> %1 +} + +define <2 x i32> @should_work_with_type_extension(<2 x i16> %src1) { +; CHECK-LABEL: define <2 x i32> @should_work_with_type_extension( +; CHECK-SAME: <2 x i16> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i16> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i16> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[SRC1_SCALAR]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[SRC1_SCALAR1]] to i32 +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[DOTASSEMBLED_VECT2]] +; + %1 = zext <2 x i16> %src1 to <2 x i32> + ret <2 x i32> %1 +} + +define <16 x i8> @should_work_with_larger_vector_size(<16 x i32> %src1) { +; CHECK-LABEL: define <16 x i8> @should_work_with_larger_vector_size( +; CHECK-SAME: <16 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <16 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <16 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR2:%.*]] = extractelement <16 x i32> [[SRC1]], i32 2 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <16 x i32> [[SRC1]], i32 3 +; CHECK-NEXT: [[SRC1_SCALAR4:%.*]] = extractelement <16 x i32> [[SRC1]], i32 4 +; CHECK-NEXT: [[SRC1_SCALAR5:%.*]] = extractelement <16 x i32> [[SRC1]], i32 5 +; CHECK-NEXT: [[SRC1_SCALAR6:%.*]] = extractelement <16 x i32> [[SRC1]], i32 6 +; CHECK-NEXT: [[SRC1_SCALAR7:%.*]] = extractelement <16 x i32> 
[[SRC1]], i32 7 +; CHECK-NEXT: [[SRC1_SCALAR8:%.*]] = extractelement <16 x i32> [[SRC1]], i32 8 +; CHECK-NEXT: [[SRC1_SCALAR9:%.*]] = extractelement <16 x i32> [[SRC1]], i32 9 +; CHECK-NEXT: [[SRC1_SCALAR10:%.*]] = extractelement <16 x i32> [[SRC1]], i32 10 +; CHECK-NEXT: [[SRC1_SCALAR11:%.*]] = extractelement <16 x i32> [[SRC1]], i32 11 +; CHECK-NEXT: [[SRC1_SCALAR12:%.*]] = extractelement <16 x i32> [[SRC1]], i32 12 +; CHECK-NEXT: [[SRC1_SCALAR13:%.*]] = extractelement <16 x i32> [[SRC1]], i32 13 +; CHECK-NEXT: [[SRC1_SCALAR14:%.*]] = extractelement <16 x i32> [[SRC1]], i32 14 +; CHECK-NEXT: [[SRC1_SCALAR15:%.*]] = extractelement <16 x i32> [[SRC1]], i32 15 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[SRC1_SCALAR]] to i8 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SRC1_SCALAR1]] to i8 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[SRC1_SCALAR2]] to i8 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[SRC1_SCALAR3]] to i8 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[SRC1_SCALAR4]] to i8 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[SRC1_SCALAR5]] to i8 +; CHECK-NEXT: [[TMP7:%.*]] = trunc i32 [[SRC1_SCALAR6]] to i8 +; CHECK-NEXT: [[TMP8:%.*]] = trunc i32 [[SRC1_SCALAR7]] to i8 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[SRC1_SCALAR8]] to i8 +; CHECK-NEXT: [[TMP10:%.*]] = trunc i32 [[SRC1_SCALAR9]] to i8 +; CHECK-NEXT: [[TMP11:%.*]] = trunc i32 [[SRC1_SCALAR10]] to i8 +; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[SRC1_SCALAR11]] to i8 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[SRC1_SCALAR12]] to i8 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i32 [[SRC1_SCALAR13]] to i8 +; CHECK-NEXT: [[TMP15:%.*]] = trunc i32 [[SRC1_SCALAR14]] to i8 +; CHECK-NEXT: [[TMP16:%.*]] = trunc i32 [[SRC1_SCALAR15]] to i8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <16 x i8> undef, i8 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT16:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT]], i8 [[TMP2]], i32 1 +; CHECK-NEXT: [[DOTASSEMBLED_VECT17:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT16]], i8 [[TMP3]], 
i32 2 +; CHECK-NEXT: [[DOTASSEMBLED_VECT18:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT17]], i8 [[TMP4]], i32 3 +; CHECK-NEXT: [[DOTASSEMBLED_VECT19:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT18]], i8 [[TMP5]], i32 4 +; CHECK-NEXT: [[DOTASSEMBLED_VECT20:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT19]], i8 [[TMP6]], i32 5 +; CHECK-NEXT: [[DOTASSEMBLED_VECT21:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT20]], i8 [[TMP7]], i32 6 +; CHECK-NEXT: [[DOTASSEMBLED_VECT22:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT21]], i8 [[TMP8]], i32 7 +; CHECK-NEXT: [[DOTASSEMBLED_VECT23:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT22]], i8 [[TMP9]], i32 8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT24:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT23]], i8 [[TMP10]], i32 9 +; CHECK-NEXT: [[DOTASSEMBLED_VECT25:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT24]], i8 [[TMP11]], i32 10 +; CHECK-NEXT: [[DOTASSEMBLED_VECT26:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT25]], i8 [[TMP12]], i32 11 +; CHECK-NEXT: [[DOTASSEMBLED_VECT27:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT26]], i8 [[TMP13]], i32 12 +; CHECK-NEXT: [[DOTASSEMBLED_VECT28:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT27]], i8 [[TMP14]], i32 13 +; CHECK-NEXT: [[DOTASSEMBLED_VECT29:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT28]], i8 [[TMP15]], i32 14 +; CHECK-NEXT: [[DOTASSEMBLED_VECT30:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT29]], i8 [[TMP16]], i32 15 +; CHECK-NEXT: ret <16 x i8> [[DOTASSEMBLED_VECT30]] +; + %1 = trunc <16 x i32> %src1 to <16 x i8> + ret <16 x i8> %1 +} + +define <2 x float*> @should_work_with_different_instruction_type_2(<2 x i64> %src1) { +; CHECK-LABEL: define <2 x float*> @should_work_with_different_instruction_type_2( +; CHECK-SAME: <2 x i64> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i64> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i64> [[SRC1]], i32 1 +; CHECK-NEXT: 
[[TMP1:%.*]] = inttoptr i64 [[SRC1_SCALAR]] to float* +; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[SRC1_SCALAR1]] to float* +; CHECK-NEXT:    [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x float*> undef, float* [[TMP1]], i32 0 +; CHECK-NEXT:    [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x float*> [[DOTASSEMBLED_VECT]], float* [[TMP2]], i32 1 +; CHECK-NEXT:    ret <2 x float*> [[DOTASSEMBLED_VECT2]] +; +  %1 = inttoptr <2 x i64> %src1 to <2 x float*> +  ret <2 x float*> %1 +} + +define <2 x i8> @should_not_scalarize_constants() { +; CHECK-LABEL: define <2 x i8> @should_not_scalarize_constants() { +; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i32> <i32 1, i32 2> to <2 x i8> +; CHECK-NEXT:    ret <2 x i8> [[TMP1]] +; +  %1 = trunc <2 x i32> <i32 1, i32 2> to <2 x i8> +  ret <2 x i8> %1 +} + +define i8 @should_not_scalarize_scalar() { +; CHECK-LABEL: define i8 @should_not_scalarize_scalar() { +; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 4 to i8 +; CHECK-NEXT:    ret i8 [[TMP1]] +; +  %1 = trunc i32 4 to i8 +  ret i8 %1 +} diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-cast-instruction.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-cast-instruction.ll new file mode 100644 index 000000000000..e8369bb48df1 --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-cast-instruction.ll @@ -0,0 +1,196 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize --opaque-pointers -S < %s | FileCheck %s +; REQUIRES: llvm-14-plus +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define <2 x i8>
@basic(<2 x i32> %src1) { +; CHECK-LABEL: define <2 x i8> @basic( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[SRC1_SCALAR]] to i8 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SRC1_SCALAR1]] to i8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i8> undef, i8 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x i8> [[DOTASSEMBLED_VECT]], i8 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i8> [[DOTASSEMBLED_VECT2]] +; + %1 = trunc <2 x i32> %src1 to <2 x i8> + ret <2 x i8> %1 +} + +define <2 x float> @should_work_with_different_instruction_type(<2 x double> %src1) { +; CHECK-LABEL: define <2 x float> @should_work_with_different_instruction_type( +; CHECK-SAME: <2 x double> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x double> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x double> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = fptrunc double [[SRC1_SCALAR]] to float +; CHECK-NEXT: [[TMP2:%.*]] = fptrunc double [[SRC1_SCALAR1]] to float +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x float> [[DOTASSEMBLED_VECT]], float [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x float> [[DOTASSEMBLED_VECT2]] +; + %1 = fptrunc <2 x double> %src1 to <2 x float> + ret <2 x float> %1 +} + +define <2 x i8> @should_work_with_different_value_type(<2 x i64> %src1) { +; CHECK-LABEL: define <2 x i8> @should_work_with_different_value_type( +; CHECK-SAME: <2 x i64> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i64> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i64> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[SRC1_SCALAR]] to i8 +; CHECK-NEXT: 
[[TMP2:%.*]] = trunc i64 [[SRC1_SCALAR1]] to i8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i8> undef, i8 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x i8> [[DOTASSEMBLED_VECT]], i8 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i8> [[DOTASSEMBLED_VECT2]] +; + %1 = trunc <2 x i64> %src1 to <2 x i8> + ret <2 x i8> %1 +} + +define <2 x i16> @should_work_with_different_cast_type(<2 x i32> %src1) { +; CHECK-LABEL: define <2 x i16> @should_work_with_different_cast_type( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[SRC1_SCALAR]] to i16 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SRC1_SCALAR1]] to i16 +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i16> undef, i16 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x i16> [[DOTASSEMBLED_VECT]], i16 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i16> [[DOTASSEMBLED_VECT2]] +; + %1 = trunc <2 x i32> %src1 to <2 x i16> + ret <2 x i16> %1 +} + +define <2 x float> @should_work_with_type_cast_type_2(<2 x i32> %src1) { +; CHECK-LABEL: define <2 x float> @should_work_with_type_cast_type_2( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[SRC1_SCALAR]] to float +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[SRC1_SCALAR1]] to float +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x float> [[DOTASSEMBLED_VECT]], float [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x float> [[DOTASSEMBLED_VECT2]] +; + %1 = bitcast <2 x i32> %src1 to <2 x float> + ret <2 x float> %1 +} + +define <2 x 
i32> @should_work_with_type_extension(<2 x i16> %src1) { +; CHECK-LABEL: define <2 x i32> @should_work_with_type_extension( +; CHECK-SAME: <2 x i16> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i16> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i16> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[SRC1_SCALAR]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[SRC1_SCALAR1]] to i32 +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x i32> [[DOTASSEMBLED_VECT]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[DOTASSEMBLED_VECT2]] +; + %1 = zext <2 x i16> %src1 to <2 x i32> + ret <2 x i32> %1 +} + +define <16 x i8> @should_work_with_larger_vector_size(<16 x i32> %src1) { +; CHECK-LABEL: define <16 x i8> @should_work_with_larger_vector_size( +; CHECK-SAME: <16 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <16 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <16 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR2:%.*]] = extractelement <16 x i32> [[SRC1]], i32 2 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <16 x i32> [[SRC1]], i32 3 +; CHECK-NEXT: [[SRC1_SCALAR4:%.*]] = extractelement <16 x i32> [[SRC1]], i32 4 +; CHECK-NEXT: [[SRC1_SCALAR5:%.*]] = extractelement <16 x i32> [[SRC1]], i32 5 +; CHECK-NEXT: [[SRC1_SCALAR6:%.*]] = extractelement <16 x i32> [[SRC1]], i32 6 +; CHECK-NEXT: [[SRC1_SCALAR7:%.*]] = extractelement <16 x i32> [[SRC1]], i32 7 +; CHECK-NEXT: [[SRC1_SCALAR8:%.*]] = extractelement <16 x i32> [[SRC1]], i32 8 +; CHECK-NEXT: [[SRC1_SCALAR9:%.*]] = extractelement <16 x i32> [[SRC1]], i32 9 +; CHECK-NEXT: [[SRC1_SCALAR10:%.*]] = extractelement <16 x i32> [[SRC1]], i32 10 +; CHECK-NEXT: [[SRC1_SCALAR11:%.*]] = extractelement <16 x i32> [[SRC1]], i32 11 +; CHECK-NEXT: [[SRC1_SCALAR12:%.*]] = extractelement <16 x i32> [[SRC1]], 
i32 12 +; CHECK-NEXT: [[SRC1_SCALAR13:%.*]] = extractelement <16 x i32> [[SRC1]], i32 13 +; CHECK-NEXT: [[SRC1_SCALAR14:%.*]] = extractelement <16 x i32> [[SRC1]], i32 14 +; CHECK-NEXT: [[SRC1_SCALAR15:%.*]] = extractelement <16 x i32> [[SRC1]], i32 15 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[SRC1_SCALAR]] to i8 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SRC1_SCALAR1]] to i8 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[SRC1_SCALAR2]] to i8 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[SRC1_SCALAR3]] to i8 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[SRC1_SCALAR4]] to i8 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[SRC1_SCALAR5]] to i8 +; CHECK-NEXT: [[TMP7:%.*]] = trunc i32 [[SRC1_SCALAR6]] to i8 +; CHECK-NEXT: [[TMP8:%.*]] = trunc i32 [[SRC1_SCALAR7]] to i8 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[SRC1_SCALAR8]] to i8 +; CHECK-NEXT: [[TMP10:%.*]] = trunc i32 [[SRC1_SCALAR9]] to i8 +; CHECK-NEXT: [[TMP11:%.*]] = trunc i32 [[SRC1_SCALAR10]] to i8 +; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[SRC1_SCALAR11]] to i8 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[SRC1_SCALAR12]] to i8 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i32 [[SRC1_SCALAR13]] to i8 +; CHECK-NEXT: [[TMP15:%.*]] = trunc i32 [[SRC1_SCALAR14]] to i8 +; CHECK-NEXT: [[TMP16:%.*]] = trunc i32 [[SRC1_SCALAR15]] to i8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <16 x i8> undef, i8 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT16:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT]], i8 [[TMP2]], i32 1 +; CHECK-NEXT: [[DOTASSEMBLED_VECT17:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT16]], i8 [[TMP3]], i32 2 +; CHECK-NEXT: [[DOTASSEMBLED_VECT18:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT17]], i8 [[TMP4]], i32 3 +; CHECK-NEXT: [[DOTASSEMBLED_VECT19:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT18]], i8 [[TMP5]], i32 4 +; CHECK-NEXT: [[DOTASSEMBLED_VECT20:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT19]], i8 [[TMP6]], i32 5 +; CHECK-NEXT: [[DOTASSEMBLED_VECT21:%.*]] = insertelement <16 x i8> 
[[DOTASSEMBLED_VECT20]], i8 [[TMP7]], i32 6 +; CHECK-NEXT: [[DOTASSEMBLED_VECT22:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT21]], i8 [[TMP8]], i32 7 +; CHECK-NEXT: [[DOTASSEMBLED_VECT23:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT22]], i8 [[TMP9]], i32 8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT24:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT23]], i8 [[TMP10]], i32 9 +; CHECK-NEXT: [[DOTASSEMBLED_VECT25:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT24]], i8 [[TMP11]], i32 10 +; CHECK-NEXT: [[DOTASSEMBLED_VECT26:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT25]], i8 [[TMP12]], i32 11 +; CHECK-NEXT: [[DOTASSEMBLED_VECT27:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT26]], i8 [[TMP13]], i32 12 +; CHECK-NEXT: [[DOTASSEMBLED_VECT28:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT27]], i8 [[TMP14]], i32 13 +; CHECK-NEXT: [[DOTASSEMBLED_VECT29:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT28]], i8 [[TMP15]], i32 14 +; CHECK-NEXT: [[DOTASSEMBLED_VECT30:%.*]] = insertelement <16 x i8> [[DOTASSEMBLED_VECT29]], i8 [[TMP16]], i32 15 +; CHECK-NEXT: ret <16 x i8> [[DOTASSEMBLED_VECT30]] +; + %1 = trunc <16 x i32> %src1 to <16 x i8> + ret <16 x i8> %1 +} + +define <2 x ptr> @should_work_with_different_instruction_type_2(<2 x i64> %src1) { +; CHECK-LABEL: define <2 x ptr> @should_work_with_different_instruction_type_2( +; CHECK-SAME: <2 x i64> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i64> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i64> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[SRC1_SCALAR]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[SRC1_SCALAR1]] to ptr +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x ptr> undef, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x ptr> [[DOTASSEMBLED_VECT]], ptr [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x ptr> [[DOTASSEMBLED_VECT2]] +; + %1 = inttoptr <2 x i64> %src1 to <2 x ptr> + 
ret <2 x ptr> %1 +} + +define <2 x i8> @should_not_scalarize_constants() { +; CHECK-LABEL: define <2 x i8> @should_not_scalarize_constants() { +; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> to <2 x i8> +; CHECK-NEXT: ret <2 x i8> [[TMP1]] +; + %1 = trunc <2 x i32> to <2 x i8> + ret <2 x i8> %1 +} + +define i8 @should_not_scalarize_scalar() { +; CHECK-LABEL: define i8 @should_not_scalarize_scalar() { +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 4 to i8 +; CHECK-NEXT: ret i8 [[TMP1]] +; + %1 = trunc i32 4 to i8 + ret i8 %1 +} diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-comp-instruction-typed-pointers.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-comp-instruction-typed-pointers.ll new file mode 100644 index 000000000000..97a4d37693bc --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-comp-instruction-typed-pointers.ll @@ -0,0 +1,217 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize -S < %s | FileCheck %s +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define <2 x i1> @basic(<2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define <2 x i1> @basic( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], 
i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i1> [[DOTASSEMBLED_VECT3]] +; + %1 = icmp eq <2 x i32> %src1, %src2 + ret <2 x i1> %1 +} + +define <2 x i1> @should_work_with_different_instruction_type(<2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define <2 x i1> @should_work_with_different_instruction_type( +; CHECK-SAME: <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = fcmp ueq float [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP2:%.*]] = fcmp ueq float [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i1> [[DOTASSEMBLED_VECT3]] +; + %1 = fcmp ueq <2 x float> %src1, %src2 + ret <2 x i1> %1 +} + +; triangulating with @should_work_with_different_instruction_type +; update checks if fast will be preserved +define <2 x i1> @should_work_with_fast_math_flags(<2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define <2 x i1> @should_work_with_fast_math_flags( +; CHECK-SAME: <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x 
float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = fcmp ueq float [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP2:%.*]] = fcmp ueq float [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i1> [[DOTASSEMBLED_VECT3]] +; + %1 = fcmp fast ueq <2 x float> %src1, %src2 + ret <2 x i1> %1 +} + +define <2 x i1> @should_work_with_different_value_type(<2 x i64> %src1, <2 x i64> %src2) { +; CHECK-LABEL: define <2 x i1> @should_work_with_different_value_type( +; CHECK-SAME: <2 x i64> [[SRC1:%.*]], <2 x i64> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i64> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i64> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i64> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i64> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i1> [[DOTASSEMBLED_VECT3]] +; + %1 = icmp eq <2 x i64> %src1, %src2 + ret <2 x i1> %1 +} + +define <16 x i1> @should_work_with_larger_vector_size(<16 x i32> %src1, <16 x i32> %src2) { +; CHECK-LABEL: define <16 x i1> @should_work_with_larger_vector_size( +; CHECK-SAME: <16 x i32> [[SRC1:%.*]], <16 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <16 x i32> [[SRC2]], i32 0 +; 
CHECK-NEXT: [[SRC2_SCALAR16:%.*]] = extractelement <16 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR17:%.*]] = extractelement <16 x i32> [[SRC2]], i32 2 +; CHECK-NEXT: [[SRC2_SCALAR18:%.*]] = extractelement <16 x i32> [[SRC2]], i32 3 +; CHECK-NEXT: [[SRC2_SCALAR19:%.*]] = extractelement <16 x i32> [[SRC2]], i32 4 +; CHECK-NEXT: [[SRC2_SCALAR20:%.*]] = extractelement <16 x i32> [[SRC2]], i32 5 +; CHECK-NEXT: [[SRC2_SCALAR21:%.*]] = extractelement <16 x i32> [[SRC2]], i32 6 +; CHECK-NEXT: [[SRC2_SCALAR22:%.*]] = extractelement <16 x i32> [[SRC2]], i32 7 +; CHECK-NEXT: [[SRC2_SCALAR23:%.*]] = extractelement <16 x i32> [[SRC2]], i32 8 +; CHECK-NEXT: [[SRC2_SCALAR24:%.*]] = extractelement <16 x i32> [[SRC2]], i32 9 +; CHECK-NEXT: [[SRC2_SCALAR25:%.*]] = extractelement <16 x i32> [[SRC2]], i32 10 +; CHECK-NEXT: [[SRC2_SCALAR26:%.*]] = extractelement <16 x i32> [[SRC2]], i32 11 +; CHECK-NEXT: [[SRC2_SCALAR27:%.*]] = extractelement <16 x i32> [[SRC2]], i32 12 +; CHECK-NEXT: [[SRC2_SCALAR28:%.*]] = extractelement <16 x i32> [[SRC2]], i32 13 +; CHECK-NEXT: [[SRC2_SCALAR29:%.*]] = extractelement <16 x i32> [[SRC2]], i32 14 +; CHECK-NEXT: [[SRC2_SCALAR30:%.*]] = extractelement <16 x i32> [[SRC2]], i32 15 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <16 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <16 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR2:%.*]] = extractelement <16 x i32> [[SRC1]], i32 2 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <16 x i32> [[SRC1]], i32 3 +; CHECK-NEXT: [[SRC1_SCALAR4:%.*]] = extractelement <16 x i32> [[SRC1]], i32 4 +; CHECK-NEXT: [[SRC1_SCALAR5:%.*]] = extractelement <16 x i32> [[SRC1]], i32 5 +; CHECK-NEXT: [[SRC1_SCALAR6:%.*]] = extractelement <16 x i32> [[SRC1]], i32 6 +; CHECK-NEXT: [[SRC1_SCALAR7:%.*]] = extractelement <16 x i32> [[SRC1]], i32 7 +; CHECK-NEXT: [[SRC1_SCALAR8:%.*]] = extractelement <16 x i32> [[SRC1]], i32 8 +; CHECK-NEXT: [[SRC1_SCALAR9:%.*]] = extractelement <16 x 
i32> [[SRC1]], i32 9 +; CHECK-NEXT: [[SRC1_SCALAR10:%.*]] = extractelement <16 x i32> [[SRC1]], i32 10 +; CHECK-NEXT: [[SRC1_SCALAR11:%.*]] = extractelement <16 x i32> [[SRC1]], i32 11 +; CHECK-NEXT: [[SRC1_SCALAR12:%.*]] = extractelement <16 x i32> [[SRC1]], i32 12 +; CHECK-NEXT: [[SRC1_SCALAR13:%.*]] = extractelement <16 x i32> [[SRC1]], i32 13 +; CHECK-NEXT: [[SRC1_SCALAR14:%.*]] = extractelement <16 x i32> [[SRC1]], i32 14 +; CHECK-NEXT: [[SRC1_SCALAR15:%.*]] = extractelement <16 x i32> [[SRC1]], i32 15 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR16]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[SRC1_SCALAR2]], [[SRC2_SCALAR17]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[SRC1_SCALAR3]], [[SRC2_SCALAR18]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[SRC1_SCALAR4]], [[SRC2_SCALAR19]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[SRC1_SCALAR5]], [[SRC2_SCALAR20]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[SRC1_SCALAR6]], [[SRC2_SCALAR21]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[SRC1_SCALAR7]], [[SRC2_SCALAR22]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[SRC1_SCALAR8]], [[SRC2_SCALAR23]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[SRC1_SCALAR9]], [[SRC2_SCALAR24]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[SRC1_SCALAR10]], [[SRC2_SCALAR25]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[SRC1_SCALAR11]], [[SRC2_SCALAR26]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[SRC1_SCALAR12]], [[SRC2_SCALAR27]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[SRC1_SCALAR13]], [[SRC2_SCALAR28]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[SRC1_SCALAR14]], [[SRC2_SCALAR29]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[SRC1_SCALAR15]], [[SRC2_SCALAR30]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <16 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT31:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT: 
[[DOTASSEMBLED_VECT32:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT31]], i1 [[TMP3]], i32 2 +; CHECK-NEXT: [[DOTASSEMBLED_VECT33:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT32]], i1 [[TMP4]], i32 3 +; CHECK-NEXT: [[DOTASSEMBLED_VECT34:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT33]], i1 [[TMP5]], i32 4 +; CHECK-NEXT: [[DOTASSEMBLED_VECT35:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT34]], i1 [[TMP6]], i32 5 +; CHECK-NEXT: [[DOTASSEMBLED_VECT36:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT35]], i1 [[TMP7]], i32 6 +; CHECK-NEXT: [[DOTASSEMBLED_VECT37:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT36]], i1 [[TMP8]], i32 7 +; CHECK-NEXT: [[DOTASSEMBLED_VECT38:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT37]], i1 [[TMP9]], i32 8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT39:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT38]], i1 [[TMP10]], i32 9 +; CHECK-NEXT: [[DOTASSEMBLED_VECT40:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT39]], i1 [[TMP11]], i32 10 +; CHECK-NEXT: [[DOTASSEMBLED_VECT41:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT40]], i1 [[TMP12]], i32 11 +; CHECK-NEXT: [[DOTASSEMBLED_VECT42:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT41]], i1 [[TMP13]], i32 12 +; CHECK-NEXT: [[DOTASSEMBLED_VECT43:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT42]], i1 [[TMP14]], i32 13 +; CHECK-NEXT: [[DOTASSEMBLED_VECT44:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT43]], i1 [[TMP15]], i32 14 +; CHECK-NEXT: [[DOTASSEMBLED_VECT45:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT44]], i1 [[TMP16]], i32 15 +; CHECK-NEXT: ret <16 x i1> [[DOTASSEMBLED_VECT45]] +; + %1 = icmp eq <16 x i32> %src1, %src2 + ret <16 x i1> %1 +} + +define <2 x i1> @should_work_with_constant_value(<2 x i32> %src1) { +; CHECK-LABEL: define <2 x i1> @should_work_with_constant_value( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = 
extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[SRC1_SCALAR]], 4 +; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[SRC1_SCALAR1]], 8 +; CHECK-NEXT:    [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT:    [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT:    ret <2 x i1> [[DOTASSEMBLED_VECT2]] +; +  %1 = icmp eq <2 x i32> %src1, <i32 4, i32 8> +  ret <2 x i1> %1 +} + +define <2 x i1> @should_work_with_different_comparison_type(<2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define <2 x i1> @should_work_with_different_comparison_type( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT:    [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT:    [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT:    [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT:    [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT:    [[TMP1:%.*]] = icmp uge i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT:    [[TMP2:%.*]] = icmp uge i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT:    [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT:    [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT:    ret <2 x i1> [[DOTASSEMBLED_VECT3]] +; +  %1 = icmp uge <2 x i32> %src1, %src2 +  ret <2 x i1> %1 +} + +; triangulating with @should_work_with_different_instruction_type +define <2 x i1> @should_work_with_different_comparison_type_2(<2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define <2 x i1> @should_work_with_different_comparison_type_2( +; CHECK-SAME: <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT:    [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT:    [[SRC2_SCALAR2:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT:
[[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = fcmp false float [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP2:%.*]] = fcmp false float [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i1> [[DOTASSEMBLED_VECT3]] +; + %1 = fcmp false <2 x float> %src1, %src2 + ret <2 x i1> %1 +} + +define <2 x i1> @should_not_scalarize_two_constants() { +; CHECK-LABEL: define <2 x i1> @should_not_scalarize_two_constants() { +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i32> , +; CHECK-NEXT: ret <2 x i1> [[TMP1]] +; + %1 = icmp eq <2 x i32> , + ret <2 x i1> %1 +} diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-comp-instruction.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-comp-instruction.ll new file mode 100644 index 000000000000..ed86ff2f3611 --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-comp-instruction.ll @@ -0,0 +1,218 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize --opaque-pointers -S < %s | FileCheck %s +; REQUIRES: llvm-14-plus +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define <2 x i1> @basic(<2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define <2 x i1> @basic( +; 
CHECK-SAME: <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i1> [[DOTASSEMBLED_VECT3]] +; + %1 = icmp eq <2 x i32> %src1, %src2 + ret <2 x i1> %1 +} + +define <2 x i1> @should_work_with_different_instruction_type(<2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define <2 x i1> @should_work_with_different_instruction_type( +; CHECK-SAME: <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = fcmp ueq float [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP2:%.*]] = fcmp ueq float [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i1> [[DOTASSEMBLED_VECT3]] +; + %1 = fcmp ueq <2 x float> %src1, %src2 + ret <2 x i1> %1 +} + +; triangulating with @should_work_with_different_instruction_type +; update checks if fast will be 
preserved +define <2 x i1> @should_work_with_fast_math_flags(<2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define <2 x i1> @should_work_with_fast_math_flags( +; CHECK-SAME: <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = fcmp ueq float [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP2:%.*]] = fcmp ueq float [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i1> [[DOTASSEMBLED_VECT3]] +; + %1 = fcmp fast ueq <2 x float> %src1, %src2 + ret <2 x i1> %1 +} + +define <2 x i1> @should_work_with_different_value_type(<2 x i64> %src1, <2 x i64> %src2) { +; CHECK-LABEL: define <2 x i1> @should_work_with_different_value_type( +; CHECK-SAME: <2 x i64> [[SRC1:%.*]], <2 x i64> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i64> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i64> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i64> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i64> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT: ret <2 x i1> [[DOTASSEMBLED_VECT3]] +; + %1 = 
icmp eq <2 x i64> %src1, %src2 + ret <2 x i1> %1 +} + +define <16 x i1> @should_work_with_larger_vector_size(<16 x i32> %src1, <16 x i32> %src2) { +; CHECK-LABEL: define <16 x i1> @should_work_with_larger_vector_size( +; CHECK-SAME: <16 x i32> [[SRC1:%.*]], <16 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <16 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR16:%.*]] = extractelement <16 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR17:%.*]] = extractelement <16 x i32> [[SRC2]], i32 2 +; CHECK-NEXT: [[SRC2_SCALAR18:%.*]] = extractelement <16 x i32> [[SRC2]], i32 3 +; CHECK-NEXT: [[SRC2_SCALAR19:%.*]] = extractelement <16 x i32> [[SRC2]], i32 4 +; CHECK-NEXT: [[SRC2_SCALAR20:%.*]] = extractelement <16 x i32> [[SRC2]], i32 5 +; CHECK-NEXT: [[SRC2_SCALAR21:%.*]] = extractelement <16 x i32> [[SRC2]], i32 6 +; CHECK-NEXT: [[SRC2_SCALAR22:%.*]] = extractelement <16 x i32> [[SRC2]], i32 7 +; CHECK-NEXT: [[SRC2_SCALAR23:%.*]] = extractelement <16 x i32> [[SRC2]], i32 8 +; CHECK-NEXT: [[SRC2_SCALAR24:%.*]] = extractelement <16 x i32> [[SRC2]], i32 9 +; CHECK-NEXT: [[SRC2_SCALAR25:%.*]] = extractelement <16 x i32> [[SRC2]], i32 10 +; CHECK-NEXT: [[SRC2_SCALAR26:%.*]] = extractelement <16 x i32> [[SRC2]], i32 11 +; CHECK-NEXT: [[SRC2_SCALAR27:%.*]] = extractelement <16 x i32> [[SRC2]], i32 12 +; CHECK-NEXT: [[SRC2_SCALAR28:%.*]] = extractelement <16 x i32> [[SRC2]], i32 13 +; CHECK-NEXT: [[SRC2_SCALAR29:%.*]] = extractelement <16 x i32> [[SRC2]], i32 14 +; CHECK-NEXT: [[SRC2_SCALAR30:%.*]] = extractelement <16 x i32> [[SRC2]], i32 15 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <16 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <16 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR2:%.*]] = extractelement <16 x i32> [[SRC1]], i32 2 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <16 x i32> [[SRC1]], i32 3 +; CHECK-NEXT: [[SRC1_SCALAR4:%.*]] = extractelement <16 x i32> [[SRC1]], i32 4 +; CHECK-NEXT: 
[[SRC1_SCALAR5:%.*]] = extractelement <16 x i32> [[SRC1]], i32 5 +; CHECK-NEXT: [[SRC1_SCALAR6:%.*]] = extractelement <16 x i32> [[SRC1]], i32 6 +; CHECK-NEXT: [[SRC1_SCALAR7:%.*]] = extractelement <16 x i32> [[SRC1]], i32 7 +; CHECK-NEXT: [[SRC1_SCALAR8:%.*]] = extractelement <16 x i32> [[SRC1]], i32 8 +; CHECK-NEXT: [[SRC1_SCALAR9:%.*]] = extractelement <16 x i32> [[SRC1]], i32 9 +; CHECK-NEXT: [[SRC1_SCALAR10:%.*]] = extractelement <16 x i32> [[SRC1]], i32 10 +; CHECK-NEXT: [[SRC1_SCALAR11:%.*]] = extractelement <16 x i32> [[SRC1]], i32 11 +; CHECK-NEXT: [[SRC1_SCALAR12:%.*]] = extractelement <16 x i32> [[SRC1]], i32 12 +; CHECK-NEXT: [[SRC1_SCALAR13:%.*]] = extractelement <16 x i32> [[SRC1]], i32 13 +; CHECK-NEXT: [[SRC1_SCALAR14:%.*]] = extractelement <16 x i32> [[SRC1]], i32 14 +; CHECK-NEXT: [[SRC1_SCALAR15:%.*]] = extractelement <16 x i32> [[SRC1]], i32 15 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR16]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[SRC1_SCALAR2]], [[SRC2_SCALAR17]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[SRC1_SCALAR3]], [[SRC2_SCALAR18]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[SRC1_SCALAR4]], [[SRC2_SCALAR19]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[SRC1_SCALAR5]], [[SRC2_SCALAR20]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[SRC1_SCALAR6]], [[SRC2_SCALAR21]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[SRC1_SCALAR7]], [[SRC2_SCALAR22]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[SRC1_SCALAR8]], [[SRC2_SCALAR23]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[SRC1_SCALAR9]], [[SRC2_SCALAR24]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[SRC1_SCALAR10]], [[SRC2_SCALAR25]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[SRC1_SCALAR11]], [[SRC2_SCALAR26]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[SRC1_SCALAR12]], [[SRC2_SCALAR27]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[SRC1_SCALAR13]], [[SRC2_SCALAR28]] +; CHECK-NEXT: 
[[TMP15:%.*]] = icmp eq i32 [[SRC1_SCALAR14]], [[SRC2_SCALAR29]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[SRC1_SCALAR15]], [[SRC2_SCALAR30]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <16 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT31:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT: [[DOTASSEMBLED_VECT32:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT31]], i1 [[TMP3]], i32 2 +; CHECK-NEXT: [[DOTASSEMBLED_VECT33:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT32]], i1 [[TMP4]], i32 3 +; CHECK-NEXT: [[DOTASSEMBLED_VECT34:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT33]], i1 [[TMP5]], i32 4 +; CHECK-NEXT: [[DOTASSEMBLED_VECT35:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT34]], i1 [[TMP6]], i32 5 +; CHECK-NEXT: [[DOTASSEMBLED_VECT36:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT35]], i1 [[TMP7]], i32 6 +; CHECK-NEXT: [[DOTASSEMBLED_VECT37:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT36]], i1 [[TMP8]], i32 7 +; CHECK-NEXT: [[DOTASSEMBLED_VECT38:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT37]], i1 [[TMP9]], i32 8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT39:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT38]], i1 [[TMP10]], i32 9 +; CHECK-NEXT: [[DOTASSEMBLED_VECT40:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT39]], i1 [[TMP11]], i32 10 +; CHECK-NEXT: [[DOTASSEMBLED_VECT41:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT40]], i1 [[TMP12]], i32 11 +; CHECK-NEXT: [[DOTASSEMBLED_VECT42:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT41]], i1 [[TMP13]], i32 12 +; CHECK-NEXT: [[DOTASSEMBLED_VECT43:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT42]], i1 [[TMP14]], i32 13 +; CHECK-NEXT: [[DOTASSEMBLED_VECT44:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT43]], i1 [[TMP15]], i32 14 +; CHECK-NEXT: [[DOTASSEMBLED_VECT45:%.*]] = insertelement <16 x i1> [[DOTASSEMBLED_VECT44]], i1 [[TMP16]], i32 15 +; CHECK-NEXT: ret <16 x i1> [[DOTASSEMBLED_VECT45]] +; 
+  %1 = icmp eq <16 x i32> %src1, %src2 +  ret <16 x i1> %1 +} + +define <2 x i1> @should_work_with_constant_value(<2 x i32> %src1) { +; CHECK-LABEL: define <2 x i1> @should_work_with_constant_value( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT:    [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT:    [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[SRC1_SCALAR]], 4 +; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[SRC1_SCALAR1]], 8 +; CHECK-NEXT:    [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT:    [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT:    ret <2 x i1> [[DOTASSEMBLED_VECT2]] +; +  %1 = icmp eq <2 x i32> %src1, <i32 4, i32 8> +  ret <2 x i1> %1 +} + +define <2 x i1> @should_work_with_different_comparison_type(<2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define <2 x i1> @should_work_with_different_comparison_type( +; CHECK-SAME: <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT:    [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT:    [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT:    [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT:    [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT:    [[TMP1:%.*]] = icmp uge i32 [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT:    [[TMP2:%.*]] = icmp uge i32 [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT:    [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT:    [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT:    ret <2 x i1> [[DOTASSEMBLED_VECT3]] +; +  %1 = icmp uge <2 x i32> %src1, %src2 +  ret <2 x i1> %1 +} + +; triangulating with @should_work_with_different_instruction_type +define <2 x i1> @should_work_with_different_comparison_type_2(<2 x float> %src1,
 <2 x float> %src2) { +; CHECK-LABEL: define <2 x i1> @should_work_with_different_comparison_type_2( +; CHECK-SAME: <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT:    [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT:    [[SRC2_SCALAR2:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT:    [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT:    [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT:    [[TMP1:%.*]] = fcmp false float [[SRC1_SCALAR]], [[SRC2_SCALAR]] +; CHECK-NEXT:    [[TMP2:%.*]] = fcmp false float [[SRC1_SCALAR1]], [[SRC2_SCALAR2]] +; CHECK-NEXT:    [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x i1> undef, i1 [[TMP1]], i32 0 +; CHECK-NEXT:    [[DOTASSEMBLED_VECT3:%.*]] = insertelement <2 x i1> [[DOTASSEMBLED_VECT]], i1 [[TMP2]], i32 1 +; CHECK-NEXT:    ret <2 x i1> [[DOTASSEMBLED_VECT3]] +; +  %1 = fcmp false <2 x float> %src1, %src2 +  ret <2 x i1> %1 +} + +define <2 x i1> @should_not_scalarize_two_constants() { +; CHECK-LABEL: define <2 x i1> @should_not_scalarize_two_constants() { +; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <2 x i32> , +; CHECK-NEXT:    ret <2 x i1> [[TMP1]] +; +  %1 = icmp eq <2 x i32> , +  ret <2 x i1> %1 +} diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-getelementptr-instruction-typed-pointers.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-getelementptr-instruction-typed-pointers.ll new file mode 100644 index 000000000000..c57b0533194a --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-getelementptr-instruction-typed-pointers.ll @@ -0,0 +1,190 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize -S
< %s | FileCheck %s +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define double @basic(<2 x double*> %pointers) { +; CHECK-LABEL: define double @basic( +; CHECK-SAME: <2 x double*> [[POINTERS:%.*]]) { +; CHECK-NEXT: [[POINTERS_SCALAR:%.*]] = extractelement <2 x double*> [[POINTERS]], i32 0 +; CHECK-NEXT: [[POINTERS_SCALAR1:%.*]] = extractelement <2 x double*> [[POINTERS]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE2:%.*]] = getelementptr double, double* [[POINTERS_SCALAR]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT:%.*]] = insertelement <2 x double*> undef, double* [[POINTER_TO_DOUBLE2]], i32 0 +; CHECK-NEXT: [[POINTER_TO_DOUBLE3:%.*]] = getelementptr double, double* [[POINTERS_SCALAR1]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT4:%.*]] = insertelement <2 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT]], double* [[POINTER_TO_DOUBLE3]], i32 1 +; CHECK-NEXT: [[VAL0:%.*]] = load double, double* [[POINTER_TO_DOUBLE2]], align 8 +; CHECK-NEXT: [[VAL1:%.*]] = load double, double* [[POINTER_TO_DOUBLE3]], align 8 +; CHECK-NEXT: [[RETURN:%.*]] = fadd double [[VAL0]], [[VAL1]] +; CHECK-NEXT: ret double [[RETURN]] +; + %pointer_to_double = getelementptr double, <2 x double*> %pointers, i32 1 + + %ptr0 = extractelement <2 x double*> %pointer_to_double, i32 0 + %ptr1 = extractelement <2 x double*> %pointer_to_double, i32 1 + %val0 = load double, double* %ptr0 + %val1 = load double, double* %ptr1 + %return = fadd double %val0, %val1 + ret double %return +} + +define double @should_work_with_vector_of_indices(<2 x double*> %pointers) { +; CHECK-LABEL: define double @should_work_with_vector_of_indices( +; CHECK-SAME: <2 x double*> [[POINTERS:%.*]]) { +; CHECK-NEXT: [[POINTERS_SCALAR:%.*]] = extractelement <2 x double*> [[POINTERS]], i32 0 +; CHECK-NEXT: 
[[POINTERS_SCALAR1:%.*]] = extractelement <2 x double*> [[POINTERS]], i32 1 +; CHECK-NEXT:    [[POINTERS_TO_DOUBLE2:%.*]] = getelementptr double, double* [[POINTERS_SCALAR]], i32 0 +; CHECK-NEXT:    [[POINTERS_TO_DOUBLE_ASSEMBLED_VECT:%.*]] = insertelement <2 x double*> undef, double* [[POINTERS_TO_DOUBLE2]], i32 0 +; CHECK-NEXT:    [[POINTERS_TO_DOUBLE3:%.*]] = getelementptr double, double* [[POINTERS_SCALAR1]], i32 1 +; CHECK-NEXT:    [[POINTERS_TO_DOUBLE_ASSEMBLED_VECT4:%.*]] = insertelement <2 x double*> [[POINTERS_TO_DOUBLE_ASSEMBLED_VECT]], double* [[POINTERS_TO_DOUBLE3]], i32 1 +; CHECK-NEXT:    [[VAL0:%.*]] = load double, double* [[POINTERS_TO_DOUBLE2]], align 8 +; CHECK-NEXT:    [[VAL1:%.*]] = load double, double* [[POINTERS_TO_DOUBLE3]], align 8 +; CHECK-NEXT:    [[RETURN:%.*]] = fadd double [[VAL0]], [[VAL1]] +; CHECK-NEXT:    ret double [[RETURN]] +; +  %pointers_to_double = getelementptr double, <2 x double*> %pointers, <2 x i32> <i32 0, i32 1> + +  %ptr0 = extractelement <2 x double*> %pointers_to_double, i32 0 +  %ptr1 = extractelement <2 x double*> %pointers_to_double, i32 1 +  %val0 = load double, double* %ptr0 +  %val1 = load double, double* %ptr1 +  %return = fadd double %val0, %val1 +  ret double %return +} + +define i64 @should_work_with_different_value_type(<2 x i64*> %pointers) { +; CHECK-LABEL: define i64 @should_work_with_different_value_type( +; CHECK-SAME: <2 x i64*> [[POINTERS:%.*]]) { +; CHECK-NEXT:    [[POINTERS_SCALAR:%.*]] = extractelement <2 x i64*> [[POINTERS]], i32 0 +; CHECK-NEXT:    [[POINTERS_SCALAR1:%.*]] = extractelement <2 x i64*> [[POINTERS]], i32 1 +; CHECK-NEXT:    [[POINTER_TO_I642:%.*]] = getelementptr i64, i64* [[POINTERS_SCALAR]], i32 1 +; CHECK-NEXT:    [[POINTER_TO_I64_ASSEMBLED_VECT:%.*]] = insertelement <2 x i64*> undef, i64* [[POINTER_TO_I642]], i32 0 +; CHECK-NEXT:    [[POINTER_TO_I643:%.*]] = getelementptr i64, i64* [[POINTERS_SCALAR1]], i32 1 +; CHECK-NEXT:    [[POINTER_TO_I64_ASSEMBLED_VECT4:%.*]] = insertelement <2 x i64*> [[POINTER_TO_I64_ASSEMBLED_VECT]], i64*
[[POINTER_TO_I643]], i32 1 +; CHECK-NEXT: [[VAL0:%.*]] = load i64, i64* [[POINTER_TO_I642]], align 4 +; CHECK-NEXT: [[VAL1:%.*]] = load i64, i64* [[POINTER_TO_I643]], align 4 +; CHECK-NEXT: [[RETURN:%.*]] = add i64 [[VAL0]], [[VAL1]] +; CHECK-NEXT: ret i64 [[RETURN]] +; + %pointer_to_i64 = getelementptr i64, <2 x i64*> %pointers, i32 1 + + %ptr0 = extractelement <2 x i64*> %pointer_to_i64, i32 0 + %ptr1 = extractelement <2 x i64*> %pointer_to_i64, i32 1 + %val0 = load i64, i64* %ptr0 + %val1 = load i64, i64* %ptr1 + %return = add i64 %val0, %val1 + ret i64 %return +} + +define double @should_work_with_larger_vector_size(<16 x double*> %pointers) { +; CHECK-LABEL: define double @should_work_with_larger_vector_size( +; CHECK-SAME: <16 x double*> [[POINTERS:%.*]]) { +; CHECK-NEXT: [[POINTERS_SCALAR:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 0 +; CHECK-NEXT: [[POINTERS_SCALAR1:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 1 +; CHECK-NEXT: [[POINTERS_SCALAR2:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 2 +; CHECK-NEXT: [[POINTERS_SCALAR3:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 3 +; CHECK-NEXT: [[POINTERS_SCALAR4:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 4 +; CHECK-NEXT: [[POINTERS_SCALAR5:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 5 +; CHECK-NEXT: [[POINTERS_SCALAR6:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 6 +; CHECK-NEXT: [[POINTERS_SCALAR7:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 7 +; CHECK-NEXT: [[POINTERS_SCALAR8:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 8 +; CHECK-NEXT: [[POINTERS_SCALAR9:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 9 +; CHECK-NEXT: [[POINTERS_SCALAR10:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 10 +; CHECK-NEXT: [[POINTERS_SCALAR11:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 11 +; CHECK-NEXT: [[POINTERS_SCALAR12:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 12 +; CHECK-NEXT: 
[[POINTERS_SCALAR13:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 13 +; CHECK-NEXT: [[POINTERS_SCALAR14:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 14 +; CHECK-NEXT: [[POINTERS_SCALAR15:%.*]] = extractelement <16 x double*> [[POINTERS]], i32 15 +; CHECK-NEXT: [[POINTER_TO_DOUBLE16:%.*]] = getelementptr double, double* [[POINTERS_SCALAR]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT:%.*]] = insertelement <16 x double*> undef, double* [[POINTER_TO_DOUBLE16]], i32 0 +; CHECK-NEXT: [[POINTER_TO_DOUBLE17:%.*]] = getelementptr double, double* [[POINTERS_SCALAR1]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT18:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT]], double* [[POINTER_TO_DOUBLE17]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE19:%.*]] = getelementptr double, double* [[POINTERS_SCALAR2]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT20:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT18]], double* [[POINTER_TO_DOUBLE19]], i32 2 +; CHECK-NEXT: [[POINTER_TO_DOUBLE21:%.*]] = getelementptr double, double* [[POINTERS_SCALAR3]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT22:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT20]], double* [[POINTER_TO_DOUBLE21]], i32 3 +; CHECK-NEXT: [[POINTER_TO_DOUBLE23:%.*]] = getelementptr double, double* [[POINTERS_SCALAR4]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT24:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT22]], double* [[POINTER_TO_DOUBLE23]], i32 4 +; CHECK-NEXT: [[POINTER_TO_DOUBLE25:%.*]] = getelementptr double, double* [[POINTERS_SCALAR5]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT26:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT24]], double* [[POINTER_TO_DOUBLE25]], i32 5 +; CHECK-NEXT: [[POINTER_TO_DOUBLE27:%.*]] = getelementptr double, double* [[POINTERS_SCALAR6]], i32 1 +; CHECK-NEXT: 
[[POINTER_TO_DOUBLE_ASSEMBLED_VECT28:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT26]], double* [[POINTER_TO_DOUBLE27]], i32 6 +; CHECK-NEXT: [[POINTER_TO_DOUBLE29:%.*]] = getelementptr double, double* [[POINTERS_SCALAR7]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT30:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT28]], double* [[POINTER_TO_DOUBLE29]], i32 7 +; CHECK-NEXT: [[POINTER_TO_DOUBLE31:%.*]] = getelementptr double, double* [[POINTERS_SCALAR8]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT32:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT30]], double* [[POINTER_TO_DOUBLE31]], i32 8 +; CHECK-NEXT: [[POINTER_TO_DOUBLE33:%.*]] = getelementptr double, double* [[POINTERS_SCALAR9]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT34:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT32]], double* [[POINTER_TO_DOUBLE33]], i32 9 +; CHECK-NEXT: [[POINTER_TO_DOUBLE35:%.*]] = getelementptr double, double* [[POINTERS_SCALAR10]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT36:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT34]], double* [[POINTER_TO_DOUBLE35]], i32 10 +; CHECK-NEXT: [[POINTER_TO_DOUBLE37:%.*]] = getelementptr double, double* [[POINTERS_SCALAR11]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT38:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT36]], double* [[POINTER_TO_DOUBLE37]], i32 11 +; CHECK-NEXT: [[POINTER_TO_DOUBLE39:%.*]] = getelementptr double, double* [[POINTERS_SCALAR12]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT40:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT38]], double* [[POINTER_TO_DOUBLE39]], i32 12 +; CHECK-NEXT: [[POINTER_TO_DOUBLE41:%.*]] = getelementptr double, double* [[POINTERS_SCALAR13]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT42:%.*]] = insertelement <16 x double*> 
[[POINTER_TO_DOUBLE_ASSEMBLED_VECT40]], double* [[POINTER_TO_DOUBLE41]], i32 13 +; CHECK-NEXT: [[POINTER_TO_DOUBLE43:%.*]] = getelementptr double, double* [[POINTERS_SCALAR14]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT44:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT42]], double* [[POINTER_TO_DOUBLE43]], i32 14 +; CHECK-NEXT: [[POINTER_TO_DOUBLE45:%.*]] = getelementptr double, double* [[POINTERS_SCALAR15]], i32 1 +; CHECK-NEXT: [[POINTER_TO_DOUBLE_ASSEMBLED_VECT46:%.*]] = insertelement <16 x double*> [[POINTER_TO_DOUBLE_ASSEMBLED_VECT44]], double* [[POINTER_TO_DOUBLE45]], i32 15 +; CHECK-NEXT: [[VAL0:%.*]] = load double, double* [[POINTER_TO_DOUBLE16]], align 8 +; CHECK-NEXT: [[VAL1:%.*]] = load double, double* [[POINTER_TO_DOUBLE17]], align 8 +; CHECK-NEXT: [[RETURN:%.*]] = fadd double [[VAL0]], [[VAL1]] +; CHECK-NEXT: ret double [[RETURN]] +; + %pointer_to_double = getelementptr double, <16 x double*> %pointers, i32 1 + + %ptr0 = extractelement <16 x double*> %pointer_to_double, i32 0 + %ptr1 = extractelement <16 x double*> %pointer_to_double, i32 1 + %val0 = load double, double* %ptr0 + %val1 = load double, double* %ptr1 + %return = fadd double %val0, %val1 + ret double %return +} + +%some_type = type {i64, i32} + +; checks early out for non: = getelementptr , , +define i64 @should_not_scalarize_with_more_then_one_index(%some_type* %pointer) { +; CHECK-LABEL: @should_not_scalarize_with_more_then_one_index( +; CHECK-NEXT: [[POINTER_TO_INT:%.*]] = getelementptr [[SOME_TYPE:%.*]], %some_type* [[POINTER:%.*]], i32 0, i32 0 +; CHECK-NEXT: [[VAL0:%.*]] = load i64, i64* [[POINTER_TO_INT]], align 4 +; CHECK-NEXT: [[RETURN:%.*]] = add i64 [[VAL0]], [[VAL0]] +; CHECK-NEXT: ret i64 [[RETURN]] +; + %pointer_to_int = getelementptr %some_type, %some_type* %pointer, i32 0, i32 0 + + %val0 = load i64, i64* %pointer_to_int + %return = add i64 %val0, %val0 + ret i64 %return +} + +; checks early out for non: = getelementptr , , +define i64 
@should_scalarize_only_vectors(%some_type* %pointer) { +; CHECK-LABEL: @should_scalarize_only_vectors( +; CHECK-NEXT: [[POINTER_SOME_TYPE:%.*]] = getelementptr [[SOME_TYPE:%.*]], %some_type* [[POINTER:%.*]], i32 1 +; CHECK-NEXT: [[VAL:%.*]] = load [[SOME_TYPE]], %some_type* [[POINTER_SOME_TYPE]], align 4 +; CHECK-NEXT: [[VAL0:%.*]] = extractvalue [[SOME_TYPE]] [[VAL]], 0 +; CHECK-NEXT: [[VAL1:%.*]] = extractvalue [[SOME_TYPE]] [[VAL]], 0 +; CHECK-NEXT: [[RETURN:%.*]] = add i64 [[VAL0]], [[VAL1]] +; CHECK-NEXT: ret i64 [[RETURN]] +; + %pointer_some_type = getelementptr %some_type, %some_type* %pointer, i32 1 + + %val = load %some_type, %some_type* %pointer_some_type + + %val0 = extractvalue %some_type %val, 0 + %val1 = extractvalue %some_type %val, 0 + %return = add i64 %val0, %val1 + ret i64 %return +} \ No newline at end of file diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-getelementptr-instruction.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-getelementptr-instruction.ll new file mode 100644 index 000000000000..7d064bcc58ce --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-getelementptr-instruction.ll @@ -0,0 +1,82 @@ +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize --opaque-pointers -S < %s | FileCheck %s +; REQUIRES: llvm-14-plus, opaque-ptr-fix +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations + +; add checks when the pass will support opaque pointers (and remove "opaque-ptr-fix") +; ------------------------------------------------ + +define double @basic(<2 x ptr> %pointers) { + %pointer_to_double = getelementptr double, <2 x ptr> %pointers, i32 1 + + 
%ptr0 = extractelement <2 x ptr> %pointer_to_double, i32 0 + %ptr1 = extractelement <2 x ptr> %pointer_to_double, i32 1 + %val0 = load double, ptr %ptr0 + %val1 = load double, ptr %ptr1 + %return = fadd double %val0, %val1 + ret double %return +} + +define double @should_work_with_vector_of_indices(<2 x ptr> %pointers) { + %pointers_to_double = getelementptr double, <2 x ptr> %pointers, <2 x i32> <i32 0, i32 1> + + %ptr0 = extractelement <2 x ptr> %pointers_to_double, i32 0 + %ptr1 = extractelement <2 x ptr> %pointers_to_double, i32 1 + %val0 = load double, ptr %ptr0 + %val1 = load double, ptr %ptr1 + %return = fadd double %val0, %val1 + ret double %return +} + +define i64 @should_work_with_different_value_type(<2 x ptr> %pointers) { + %pointer_to_i64 = getelementptr i64, <2 x ptr> %pointers, i32 1 + + %ptr0 = extractelement <2 x ptr> %pointer_to_i64, i32 0 + %ptr1 = extractelement <2 x ptr> %pointer_to_i64, i32 1 + %val0 = load i64, ptr %ptr0 + %val1 = load i64, ptr %ptr1 + %return = add i64 %val0, %val1 + ret i64 %return +} + +define double @should_work_with_larger_vector_size(<16 x ptr> %pointers) { + %pointer_to_double = getelementptr double, <16 x ptr> %pointers, i32 1 + + %ptr0 = extractelement <16 x ptr> %pointer_to_double, i32 0 + %ptr1 = extractelement <16 x ptr> %pointer_to_double, i32 1 + %val0 = load double, ptr %ptr0 + %val1 = load double, ptr %ptr1 + %return = fadd double %val0, %val1 + ret double %return +} + +%some_type = type {i64, i32} + +define i64 @should_not_scalarize_with_more_then_index(ptr %pointer) { + %pointer_to_int = getelementptr %some_type, ptr %pointer, i32 0, i32 0 + + %val0 = load i64, ptr %pointer_to_int + %return = add i64 %val0, %val0 + ret i64 %return +} + +define i64 @should_scalarize_only_vectors(ptr %pointer) { + %pointer_some_type = getelementptr %some_type, ptr %pointer, i32 1 + + %val = load %some_type, ptr %pointer_some_type + + %val0 = extractvalue %some_type %val, 0 + %val1 = extractvalue %some_type %val, 0 + %return = add i64 %val0, 
%val1 + ret i64 %return +} diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-phi-instruction-typed-pointers.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-phi-instruction-typed-pointers.ll new file mode 100644 index 000000000000..70b9a72ced7d --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-phi-instruction-typed-pointers.ll @@ -0,0 +1,587 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize -S < %s | FileCheck %s +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define <2 x i32> @basic(i1 %switch, <2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define <2 x i32> @basic( +; CHECK-SAME: i1 [[SWITCH:%.*]], <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR4:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT1:%.*]] = phi i32 [ [[SRC1_SCALAR]], %[[FIRST]] ], [ [[SRC2_SCALAR]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT2:%.*]] = phi i32 [ [[SRC1_SCALAR3]], %[[FIRST]] ], [ [[SRC2_SCALAR4]], %[[SECOND]] ] +; 
CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[RESULT1]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT5:%.*]] = insertelement <2 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT2]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[RESULT_ASSEMBLED_VECT5]] +; +entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <2 x i32> [ %src1, %first], [ %src2, %second] + ret <2 x i32> %result +} + +define <2 x float> @should_work_with_different_value_type(i1 %switch, <2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define <2 x float> @should_work_with_different_value_type( +; CHECK-SAME: i1 [[SWITCH:%.*]], <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR4:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT1:%.*]] = phi float [ [[SRC1_SCALAR]], %[[FIRST]] ], [ [[SRC2_SCALAR]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT2:%.*]] = phi float [ [[SRC1_SCALAR3]], %[[FIRST]] ], [ [[SRC2_SCALAR4]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[RESULT1]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT5:%.*]] = insertelement <2 x float> [[RESULT_ASSEMBLED_VECT]], float [[RESULT2]], i32 1 +; CHECK-NEXT: ret <2 x float> [[RESULT_ASSEMBLED_VECT5]] +; +entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <2 x float> [ %src1, %first], [ %src2, 
%second] + ret <2 x float> %result +} + +; triangulating with @should_work_with_different_value_type +; update checks if fast will be preserved +define <2 x float> @should_work_with_fast_math_flags(i1 %switch, <2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define <2 x float> @should_work_with_fast_math_flags( +; CHECK-SAME: i1 [[SWITCH:%.*]], <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR4:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT1:%.*]] = phi float [ [[SRC1_SCALAR]], %[[FIRST]] ], [ [[SRC2_SCALAR]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT2:%.*]] = phi float [ [[SRC1_SCALAR3]], %[[FIRST]] ], [ [[SRC2_SCALAR4]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[RESULT1]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT5:%.*]] = insertelement <2 x float> [[RESULT_ASSEMBLED_VECT]], float [[RESULT2]], i32 1 +; CHECK-NEXT: ret <2 x float> [[RESULT_ASSEMBLED_VECT5]] +; + entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi fast <2 x float> [ %src1, %first], [ %src2, %second] + ret <2 x float> %result +} + +define <16 x i32> @should_work_with_larger_vector_size(i1 %switch, <16 x i32> %src1, <16 x i32> %src2) { +; CHECK-LABEL: define <16 x i32> @should_work_with_larger_vector_size( +; CHECK-SAME: i1 [[SWITCH:%.*]], <16 x i32> [[SRC1:%.*]], <16 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: 
[[SRC2_SCALAR:%.*]] = extractelement <16 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR32:%.*]] = extractelement <16 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR33:%.*]] = extractelement <16 x i32> [[SRC2]], i32 2 +; CHECK-NEXT: [[SRC2_SCALAR34:%.*]] = extractelement <16 x i32> [[SRC2]], i32 3 +; CHECK-NEXT: [[SRC2_SCALAR35:%.*]] = extractelement <16 x i32> [[SRC2]], i32 4 +; CHECK-NEXT: [[SRC2_SCALAR36:%.*]] = extractelement <16 x i32> [[SRC2]], i32 5 +; CHECK-NEXT: [[SRC2_SCALAR37:%.*]] = extractelement <16 x i32> [[SRC2]], i32 6 +; CHECK-NEXT: [[SRC2_SCALAR38:%.*]] = extractelement <16 x i32> [[SRC2]], i32 7 +; CHECK-NEXT: [[SRC2_SCALAR39:%.*]] = extractelement <16 x i32> [[SRC2]], i32 8 +; CHECK-NEXT: [[SRC2_SCALAR40:%.*]] = extractelement <16 x i32> [[SRC2]], i32 9 +; CHECK-NEXT: [[SRC2_SCALAR41:%.*]] = extractelement <16 x i32> [[SRC2]], i32 10 +; CHECK-NEXT: [[SRC2_SCALAR42:%.*]] = extractelement <16 x i32> [[SRC2]], i32 11 +; CHECK-NEXT: [[SRC2_SCALAR43:%.*]] = extractelement <16 x i32> [[SRC2]], i32 12 +; CHECK-NEXT: [[SRC2_SCALAR44:%.*]] = extractelement <16 x i32> [[SRC2]], i32 13 +; CHECK-NEXT: [[SRC2_SCALAR45:%.*]] = extractelement <16 x i32> [[SRC2]], i32 14 +; CHECK-NEXT: [[SRC2_SCALAR46:%.*]] = extractelement <16 x i32> [[SRC2]], i32 15 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <16 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR17:%.*]] = extractelement <16 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR18:%.*]] = extractelement <16 x i32> [[SRC1]], i32 2 +; CHECK-NEXT: [[SRC1_SCALAR19:%.*]] = extractelement <16 x i32> [[SRC1]], i32 3 +; CHECK-NEXT: [[SRC1_SCALAR20:%.*]] = extractelement <16 x i32> [[SRC1]], i32 4 +; CHECK-NEXT: [[SRC1_SCALAR21:%.*]] = extractelement <16 x i32> [[SRC1]], i32 5 +; CHECK-NEXT: [[SRC1_SCALAR22:%.*]] = extractelement <16 x i32> [[SRC1]], i32 6 +; CHECK-NEXT: [[SRC1_SCALAR23:%.*]] = extractelement <16 x i32> [[SRC1]], i32 7 +; CHECK-NEXT: [[SRC1_SCALAR24:%.*]] = extractelement <16 x i32> 
[[SRC1]], i32 8 +; CHECK-NEXT: [[SRC1_SCALAR25:%.*]] = extractelement <16 x i32> [[SRC1]], i32 9 +; CHECK-NEXT: [[SRC1_SCALAR26:%.*]] = extractelement <16 x i32> [[SRC1]], i32 10 +; CHECK-NEXT: [[SRC1_SCALAR27:%.*]] = extractelement <16 x i32> [[SRC1]], i32 11 +; CHECK-NEXT: [[SRC1_SCALAR28:%.*]] = extractelement <16 x i32> [[SRC1]], i32 12 +; CHECK-NEXT: [[SRC1_SCALAR29:%.*]] = extractelement <16 x i32> [[SRC1]], i32 13 +; CHECK-NEXT: [[SRC1_SCALAR30:%.*]] = extractelement <16 x i32> [[SRC1]], i32 14 +; CHECK-NEXT: [[SRC1_SCALAR31:%.*]] = extractelement <16 x i32> [[SRC1]], i32 15 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT1:%.*]] = phi i32 [ [[SRC1_SCALAR]], %[[FIRST]] ], [ [[SRC2_SCALAR]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT2:%.*]] = phi i32 [ [[SRC1_SCALAR17]], %[[FIRST]] ], [ [[SRC2_SCALAR32]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT3:%.*]] = phi i32 [ [[SRC1_SCALAR18]], %[[FIRST]] ], [ [[SRC2_SCALAR33]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT4:%.*]] = phi i32 [ [[SRC1_SCALAR19]], %[[FIRST]] ], [ [[SRC2_SCALAR34]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT5:%.*]] = phi i32 [ [[SRC1_SCALAR20]], %[[FIRST]] ], [ [[SRC2_SCALAR35]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT6:%.*]] = phi i32 [ [[SRC1_SCALAR21]], %[[FIRST]] ], [ [[SRC2_SCALAR36]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT7:%.*]] = phi i32 [ [[SRC1_SCALAR22]], %[[FIRST]] ], [ [[SRC2_SCALAR37]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT8:%.*]] = phi i32 [ [[SRC1_SCALAR23]], %[[FIRST]] ], [ [[SRC2_SCALAR38]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT9:%.*]] = phi i32 [ [[SRC1_SCALAR24]], %[[FIRST]] ], [ [[SRC2_SCALAR39]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT10:%.*]] = phi i32 [ [[SRC1_SCALAR25]], %[[FIRST]] ], [ [[SRC2_SCALAR40]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT11:%.*]] = phi i32 [ [[SRC1_SCALAR26]], %[[FIRST]] ], [ 
[[SRC2_SCALAR41]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT12:%.*]] = phi i32 [ [[SRC1_SCALAR27]], %[[FIRST]] ], [ [[SRC2_SCALAR42]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT13:%.*]] = phi i32 [ [[SRC1_SCALAR28]], %[[FIRST]] ], [ [[SRC2_SCALAR43]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT14:%.*]] = phi i32 [ [[SRC1_SCALAR29]], %[[FIRST]] ], [ [[SRC2_SCALAR44]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT15:%.*]] = phi i32 [ [[SRC1_SCALAR30]], %[[FIRST]] ], [ [[SRC2_SCALAR45]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT16:%.*]] = phi i32 [ [[SRC1_SCALAR31]], %[[FIRST]] ], [ [[SRC2_SCALAR46]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <16 x i32> undef, i32 [[RESULT1]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT47:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT2]], i32 1 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT48:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT47]], i32 [[RESULT3]], i32 2 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT49:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT48]], i32 [[RESULT4]], i32 3 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT50:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT49]], i32 [[RESULT5]], i32 4 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT51:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT50]], i32 [[RESULT6]], i32 5 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT52:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT51]], i32 [[RESULT7]], i32 6 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT53:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT52]], i32 [[RESULT8]], i32 7 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT54:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT53]], i32 [[RESULT9]], i32 8 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT55:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT54]], i32 [[RESULT10]], i32 9 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT56:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT55]], i32 [[RESULT11]], i32 10 +; CHECK-NEXT: 
[[RESULT_ASSEMBLED_VECT57:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT56]], i32 [[RESULT12]], i32 11 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT58:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT57]], i32 [[RESULT13]], i32 12 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT59:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT58]], i32 [[RESULT14]], i32 13 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT60:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT59]], i32 [[RESULT15]], i32 14 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT61:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT60]], i32 [[RESULT16]], i32 15 +; CHECK-NEXT: ret <16 x i32> [[RESULT_ASSEMBLED_VECT61]] +; +entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <16 x i32> [ %src1, %first], [ %src2, %second] + ret <16 x i32> %result +} + +define <2 x i32> @should_work_with_constant_value(i1 %switch, <2 x i32> %src1) { +; CHECK-LABEL: define <2 x i32> @should_work_with_constant_value( +; CHECK-SAME: i1 [[SWITCH:%.*]], <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT1:%.*]] = phi i32 [ [[SRC1_SCALAR]], %[[FIRST]] ], [ 2, %[[SECOND]] ] +; CHECK-NEXT: [[RESULT2:%.*]] = phi i32 [ [[SRC1_SCALAR3]], %[[FIRST]] ], [ 4, %[[SECOND]] ] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[RESULT1]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT4:%.*]] = insertelement <2 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT2]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[RESULT_ASSEMBLED_VECT4]] +; +entry: + br i1 %switch, label 
%first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <2 x i32> [ %src1, %first], [ <i32 2, i32 4>, %second] + ret <2 x i32> %result +} + + +define <8 x i32> @should_not_scalarize(i1 %switch, <8 x i32> %src1) { +; CHECK-LABEL: define <8 x i32> @should_not_scalarize( +; CHECK-SAME: i1 [[SWITCH:%.*]], <8 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = call <8 x i32> @llvm.genx.GenISA.vmeSendIME2(<8 x i32> [[SRC1]], i32 1, i32 2, i32 3, i32 0) +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT:%.*]] = phi <8 x i32> [ [[SRC1]], %[[FIRST]] ], [ [[SHOULD_NOT_SCALARIZE_ME]], %[[SECOND]] ] +; CHECK-NEXT: ret <8 x i32> [[RESULT]] +; +entry: + %should_not_scalarize_me = call <8 x i32> @llvm.genx.GenISA.vmeSendIME2(<8 x i32> %src1, i32 1, i32 2, i32 3, i32 0) + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <8 x i32> [ %src1, %first], [ %should_not_scalarize_me, %second] + ret <8 x i32> %result +} + +define <4 x i32> @should_not_scalarize_2(i1 %switch, <4 x i32> %src1) { +; CHECK-LABEL: define <4 x i32> @should_not_scalarize_2( +; CHECK-SAME: i1 [[SWITCH:%.*]], <4 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = call <4 x i32> @llvm.genx.GenISA.vmeSendFBR2(<4 x i32> [[SRC1]], i32 1, i32 2, i32 3) +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT:%.*]] = phi <4 x i32> [ [[SRC1]], %[[FIRST]] ], [ [[SHOULD_NOT_SCALARIZE_ME]], %[[SECOND]] ] +; CHECK-NEXT: ret <4 x i32> [[RESULT]] +; +entry: + %should_not_scalarize_me = 
call <4 x i32> @llvm.genx.GenISA.vmeSendFBR2(<4 x i32> %src1, i32 1, i32 2, i32 3) + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <4 x i32> [ %src1, %first], [ %should_not_scalarize_me, %second] + ret <4 x i32> %result +} + +define <4 x i32> @should_not_scalarize_3(i1 %switch, <4 x i32> %src1) { +; CHECK-LABEL: define <4 x i32> @should_not_scalarize_3( +; CHECK-SAME: i1 [[SWITCH:%.*]], <4 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = call <4 x i32> @llvm.genx.GenISA.vmeSendSIC2(<4 x i32> [[SRC1]], i32 1, i32 2, i32 3) +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT:%.*]] = phi <4 x i32> [ [[SRC1]], %[[FIRST]] ], [ [[SHOULD_NOT_SCALARIZE_ME]], %[[SECOND]] ] +; CHECK-NEXT: ret <4 x i32> [[RESULT]] +; +entry: + %should_not_scalarize_me = call <4 x i32> @llvm.genx.GenISA.vmeSendSIC2(<4 x i32> %src1, i32 1, i32 2, i32 3) + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <4 x i32> [ %src1, %first], [ %should_not_scalarize_me, %second] + ret <4 x i32> %result +} + +define <8 x float> @should_not_scalarize_4(i1 %switch, <8 x float> %src1, <8 x float> %src2) { +; CHECK-LABEL: define <8 x float> @should_not_scalarize_4( +; CHECK-SAME: i1 [[SWITCH:%.*]], <8 x float> [[SRC1:%.*]], <8 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, <8 x i16>* @vector.8x.i16, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* @vector.8x.i32, align 32 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: 
+; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi <8 x float> [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: [[RETURN:%.*]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> [[SHOULD_NOT_SCALARIZE_ME]], <8 x i16> [[TMP0]], <8 x i32> [[TMP1]], i32 11, i32 11, i32 8, i32 8, i1 false) +; CHECK-NEXT: ret <8 x float> [[RETURN]] +; +entry: + %0 = load <8 x i16>, <8 x i16>* @vector.8x.i16 + %1 = load <8 x i32>, <8 x i32>* @vector.8x.i32 + + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %should_not_scalarize_me = phi <8 x float> [ %src1, %first], [ %src2, %second] + %return = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %should_not_scalarize_me, <8 x i16> %0, <8 x i32> %1, i32 11, i32 11, i32 8, i32 8, i1 false) + ret <8 x float> %return +} + +define i32 @should_not_scalarize_5(i1 %switch, <8 x i32> %src1, <8 x i32> %src2) { +; CHECK-LABEL: define i32 @should_not_scalarize_5( +; CHECK-SAME: i1 [[SWITCH:%.*]], <8 x i32> [[SRC1:%.*]], <8 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* @vector.8x.i32, align 32 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi <8 x i32> [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: [[RETURN:%.*]] = call i32 @llvm.genx.GenISA.dpas.v8i32(<8 x i32> [[SHOULD_NOT_SCALARIZE_ME]], <8 x i32> [[TMP0]], <8 x i32> [[TMP0]], i32 7, i32 7, i32 8, i32 1, i1 false) +; CHECK-NEXT: ret i32 [[RETURN]] +; +entry: + %0 = load <8 x i32>, <8 x i32>* @vector.8x.i32 + + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %should_not_scalarize_me = phi <8 x i32> [ %src1, 
%first], [ %src2, %second] + %return = call i32 @llvm.genx.GenISA.dpas.v8i32(<8 x i32> %should_not_scalarize_me, <8 x i32> %0, <8 x i32> %0, i32 7, i32 7, i32 8, i32 1, i1 false) + ret i32 %return +} + +define spir_kernel void @should_not_scalarize_6(i1 %switch, <2 x i32> addrspace(1)* %src1, <2 x i32> addrspace(1)* %src2) { +; CHECK-LABEL: define spir_kernel void @should_not_scalarize_6( +; CHECK-SAME: i1 [[SWITCH:%.*]], <2 x i32> addrspace(1)* [[SRC1:%.*]], <2 x i32> addrspace(1)* [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi <2 x i32> addrspace(1)* [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: call void @llvm.genx.GenISA.simdBlockWrite.v2i32.p1v2i32(<2 x i32> addrspace(1)* [[SHOULD_NOT_SCALARIZE_ME]], <2 x i32> ) +; CHECK-NEXT: ret void +; +entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %should_not_scalarize_me = phi <2 x i32> addrspace(1)* [ %src1, %first], [ %src2, %second] + call void @llvm.genx.GenISA.simdBlockWrite.v2i32.p1v2i32(<2 x i32> addrspace(1)* %should_not_scalarize_me, <2 x i32> ) + ret void +} + +define spir_kernel void @should_not_scalarize_7(i1 %switch, <2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define spir_kernel void @should_not_scalarize_7( +; CHECK-SAME: i1 [[SWITCH:%.*]], <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = alloca i8, align 1, addrspace(2490368) +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi <2 x 
float> [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: call void @llvm.genx.GenISA.simdBlockWriteBindless.p2490368i8.v2f32.i32(i8 addrspace(2490368)* [[TMP0]], <2 x float> [[SHOULD_NOT_SCALARIZE_ME]], i32 0) +; CHECK-NEXT: ret void +; +entry: + %0 = alloca i8, addrspace(2490368) + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %should_not_scalarize_me = phi <2 x float> [ %src1, %first], [ %src2, %second] + call void @llvm.genx.GenISA.simdBlockWriteBindless.p2490368i8.v2f32.i32(i8 addrspace(2490368)* %0, <2 x float> %should_not_scalarize_me, i32 0) + ret void +} + +define spir_kernel void @should_not_scalarize_8(i1 %switch, <16 x i16> %src1, <16 x i16> %src2) { +; CHECK-LABEL: define spir_kernel void @should_not_scalarize_8( +; CHECK-SAME: i1 [[SWITCH:%.*]], <16 x i16> [[SRC1:%.*]], <16 x i16> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi <16 x i16> [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: call void @llvm.genx.GenISA.simdMediaBlockWrite.v16i16(i32 1, i32 1, i32 1, i32 1, <16 x i16> [[SHOULD_NOT_SCALARIZE_ME]]) +; CHECK-NEXT: ret void +; +entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %should_not_scalarize_me = phi <16 x i16> [ %src1, %first], [ %src2, %second] + call void @llvm.genx.GenISA.simdMediaBlockWrite.v16i16(i32 1, i32 1, i32 1, i32 1, <16 x i16> %should_not_scalarize_me) + ret void +} + +define spir_kernel void @should_not_scalarize_9(i1 %switch, <8 x i32> %src1, <8 x i32> %src2) { +; CHECK-LABEL: define spir_kernel void @should_not_scalarize_9( +; CHECK-SAME: i1 [[SWITCH:%.*]], <8 x i32> [[SRC1:%.*]], <8 x i32> [[SRC2:%.*]]) { +; 
CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi <8 x i32> [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: call void @llvm.genx.GenISA.LSC2DBlockWrite.p0i32(i64 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i1 false, i1 true, i32 11, <8 x i32> [[SHOULD_NOT_SCALARIZE_ME]]) +; CHECK-NEXT: ret void +; +entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %should_not_scalarize_me = phi <8 x i32> [ %src1, %first], [ %src2, %second] + call void @llvm.genx.GenISA.LSC2DBlockWrite.p0i32(i64 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i1 false, i1 true, i32 11, <8 x i32> %should_not_scalarize_me) + ret void +} + +define spir_kernel void @should_not_scalarize_10(i1 %switch, <16 x i16> %src1, <16 x i16> %src2) { +; CHECK-LABEL: define spir_kernel void @should_not_scalarize_10( +; CHECK-SAME: i1 [[SWITCH:%.*]], <16 x i16> [[SRC1:%.*]], <16 x i16> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi <16 x i16> [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: call void @llvm.genx.GenISA.LSC2DBlockWriteAddrPayload.p0i32.v16i16(i32* [[TMP0]], i32 0, i32 0, i32 16, i32 16, i32 16, i32 1, i1 false, i1 false, i32 0, <16 x i16> [[SHOULD_NOT_SCALARIZE_ME]]) +; CHECK-NEXT: ret void +; +entry: + %0 = alloca i32 + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + 
%should_not_scalarize_me = phi <16 x i16> [ %src1, %first], [ %src2, %second] + call void @llvm.genx.GenISA.LSC2DBlockWriteAddrPayload.p0i32.v16i16(i32* %0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 1, i1 false, i1 false, i32 0, <16 x i16> %should_not_scalarize_me) + ret void +} + +define spir_kernel void @should_work_with_nested_phi(i1 %switch) { +; CHECK-LABEL: @should_work_with_nested_phi( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[VECTINT:%.*]] = add <8 x i32> , zeroinitializer +; CHECK-NEXT: [[VECTINT_SCALAR:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 0 +; CHECK-NEXT: [[VECTINT_SCALAR9:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 1 +; CHECK-NEXT: [[VECTINT_SCALAR10:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 2 +; CHECK-NEXT: [[VECTINT_SCALAR11:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 3 +; CHECK-NEXT: [[VECTINT_SCALAR12:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 4 +; CHECK-NEXT: [[VECTINT_SCALAR13:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 5 +; CHECK-NEXT: [[VECTINT_SCALAR14:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 6 +; CHECK-NEXT: [[VECTINT_SCALAR15:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 7 +; CHECK-NEXT: br i1 [[SWITCH:%.*]], label [[FIRST:%.*]], label [[SECOND:%.*]] +; CHECK: proxy: +; CHECK-NEXT: br i1 [[SWITCH]], label [[FIRST]], label [[SECOND]] +; CHECK: first: +; CHECK-NEXT: [[RESULT11:%.*]] = phi i32 [ 0, [[PROXY:%.*]] ], [ [[VECTINT_SCALAR]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RESULT12:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR9]], [[ENTRY]] ] +; CHECK-NEXT: [[RESULT13:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR10]], [[ENTRY]] ] +; CHECK-NEXT: [[RESULT14:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR11]], [[ENTRY]] ] +; CHECK-NEXT: [[RESULT15:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR12]], [[ENTRY]] ] +; CHECK-NEXT: [[RESULT16:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR13]], [[ENTRY]] ] +; CHECK-NEXT: [[RESULT17:%.*]] = phi i32 [ 0, [[PROXY]] ], [ 
[[VECTINT_SCALAR14]], [[ENTRY]] ] +; CHECK-NEXT: [[RESULT18:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR15]], [[ENTRY]] ] +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: second: +; CHECK-NEXT: [[RESULT224:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR]], [[ENTRY]] ] +; CHECK-NEXT: [[RESULT225:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR9]], [[ENTRY]] ] +; CHECK-NEXT: [[RESULT226:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR10]], [[ENTRY]] ] +; CHECK-NEXT: [[RESULT227:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR11]], [[ENTRY]] ] +; CHECK-NEXT: [[RESULT228:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR12]], [[ENTRY]] ] +; CHECK-NEXT: [[RESULT229:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR13]], [[ENTRY]] ] +; CHECK-NEXT: [[RESULT230:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR14]], [[ENTRY]] ] +; CHECK-NEXT: [[RESULT231:%.*]] = phi i32 [ 0, [[PROXY]] ], [ [[VECTINT_SCALAR15]], [[ENTRY]] ] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[RESULT316:%.*]] = phi i32 [ [[RESULT11]], [[FIRST]] ], [ [[RESULT224]], [[SECOND]] ] +; CHECK-NEXT: [[RESULT317:%.*]] = phi i32 [ [[RESULT12]], [[FIRST]] ], [ [[RESULT225]], [[SECOND]] ] +; CHECK-NEXT: [[RESULT318:%.*]] = phi i32 [ [[RESULT13]], [[FIRST]] ], [ [[RESULT226]], [[SECOND]] ] +; CHECK-NEXT: [[RESULT319:%.*]] = phi i32 [ [[RESULT14]], [[FIRST]] ], [ [[RESULT227]], [[SECOND]] ] +; CHECK-NEXT: [[RESULT320:%.*]] = phi i32 [ [[RESULT15]], [[FIRST]] ], [ [[RESULT228]], [[SECOND]] ] +; CHECK-NEXT: [[RESULT321:%.*]] = phi i32 [ [[RESULT16]], [[FIRST]] ], [ [[RESULT229]], [[SECOND]] ] +; CHECK-NEXT: [[RESULT322:%.*]] = phi i32 [ [[RESULT17]], [[FIRST]] ], [ [[RESULT230]], [[SECOND]] ] +; CHECK-NEXT: [[RESULT323:%.*]] = phi i32 [ [[RESULT18]], [[FIRST]] ], [ [[RESULT231]], [[SECOND]] ] +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT:%.*]] = insertelement <8 x i32> undef, i32 [[RESULT316]], i32 0 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT32:%.*]] = insertelement <8 x 
i32> [[RESULT3_ASSEMBLED_VECT]], i32 [[RESULT317]], i32 1 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT33:%.*]] = insertelement <8 x i32> [[RESULT3_ASSEMBLED_VECT32]], i32 [[RESULT318]], i32 2 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT34:%.*]] = insertelement <8 x i32> [[RESULT3_ASSEMBLED_VECT33]], i32 [[RESULT319]], i32 3 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT35:%.*]] = insertelement <8 x i32> [[RESULT3_ASSEMBLED_VECT34]], i32 [[RESULT320]], i32 4 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT36:%.*]] = insertelement <8 x i32> [[RESULT3_ASSEMBLED_VECT35]], i32 [[RESULT321]], i32 5 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT37:%.*]] = insertelement <8 x i32> [[RESULT3_ASSEMBLED_VECT36]], i32 [[RESULT322]], i32 6 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT38:%.*]] = insertelement <8 x i32> [[RESULT3_ASSEMBLED_VECT37]], i32 [[RESULT323]], i32 7 +; CHECK-NEXT: [[NEWDATA:%.*]] = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> [[RESULT3_ASSEMBLED_VECT38]]) +; CHECK-NEXT: ret void +; +entry: + %vectint = add <8 x i32> , zeroinitializer + br i1 %switch, label %first, label %second +proxy: + br i1 %switch, label %first, label %second +first: + %result1 = phi <8 x i32> [ zeroinitializer, %proxy ], [ %vectint, %entry] + br label %exit +second: + %result2 = phi <8 x i32> [ zeroinitializer, %proxy ], [ %vectint, %entry] + br label %exit +exit: + %result3 = phi <8 x i32> [ %result1, %first], [ %result2, %second] + %newdata = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> %result3) + ret void +} + +@vector.8x.float = global <8 x float> +@vector.8x.i16 = global <8 x i16> +@vector.8x.i32 = global <8 x i32> + +declare <8 x i32> @llvm.genx.GenISA.vmeSendIME2(<8 x i32>, i32, i32, i32, i32) +declare <4 x i32> @llvm.genx.GenISA.vmeSendFBR2(<4 x i32>, i32, i32, i32) +declare <4 x i32> @llvm.genx.GenISA.vmeSendSIC2(<4 x i32>, i32, i32, i32) +declare <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1) +declare i32 
@llvm.genx.GenISA.dpas.v8i32(<8 x i32>, <8 x i32>, <8 x i32>, i32, i32, i32, i32, i1) +declare void @llvm.genx.GenISA.simdBlockWrite.v2i32.p1v2i32(<2 x i32> addrspace(1)*, <2 x i32>) +declare void @llvm.genx.GenISA.simdBlockWriteBindless.p2490368i8.v2f32.i32(i8 addrspace(2490368)*, <2 x float>, i32) +declare void @llvm.genx.GenISA.simdMediaBlockWrite.v16i16(i32, i32, i32, i32, <16 x i16>) +declare void @llvm.genx.GenISA.LSC2DBlockWrite.p0i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32, <8 x i32>) +declare void @llvm.genx.GenISA.LSC2DBlockWriteAddrPayload.p0i32.v16i16(i32*, i32, i32, i32, i32, i32, i32, i1, i1, i32, <16 x i16>) +declare spir_func <8 x i32> @do_math_v8i32_v8i32(<8 x i32>) #1 diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-phi-instruction.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-phi-instruction.ll new file mode 100644 index 000000000000..64c7b15f9aff --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-phi-instruction.ll @@ -0,0 +1,589 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize --opaque-pointers -S < %s | FileCheck %s +; REQUIRES: llvm-14-plus +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define <2 x i32> @basic(i1 %switch, <2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define <2 x i32> @basic( +; CHECK-SAME: i1 [[SWITCH:%.*]], <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x 
i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR4:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT1:%.*]] = phi i32 [ [[SRC1_SCALAR]], %[[FIRST]] ], [ [[SRC2_SCALAR]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT2:%.*]] = phi i32 [ [[SRC1_SCALAR3]], %[[FIRST]] ], [ [[SRC2_SCALAR4]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[RESULT1]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT5:%.*]] = insertelement <2 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT2]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[RESULT_ASSEMBLED_VECT5]] +; +entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <2 x i32> [ %src1, %first], [ %src2, %second] + ret <2 x i32> %result +} + +define <2 x float> @should_work_with_different_value_type(i1 %switch, <2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define <2 x float> @should_work_with_different_value_type( +; CHECK-SAME: i1 [[SWITCH:%.*]], <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR4:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: 
+; CHECK-NEXT: [[RESULT1:%.*]] = phi float [ [[SRC1_SCALAR]], %[[FIRST]] ], [ [[SRC2_SCALAR]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT2:%.*]] = phi float [ [[SRC1_SCALAR3]], %[[FIRST]] ], [ [[SRC2_SCALAR4]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[RESULT1]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT5:%.*]] = insertelement <2 x float> [[RESULT_ASSEMBLED_VECT]], float [[RESULT2]], i32 1 +; CHECK-NEXT: ret <2 x float> [[RESULT_ASSEMBLED_VECT5]] +; +entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <2 x float> [ %src1, %first], [ %src2, %second] + ret <2 x float> %result +} + +; triangulating with @should_work_with_different_value_type +; update checks if fast will be preserved +define <2 x float> @should_work_with_fast_math_flags(i1 %switch, <2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define <2 x float> @should_work_with_fast_math_flags( +; CHECK-SAME: i1 [[SWITCH:%.*]], <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR4:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT1:%.*]] = phi float [ [[SRC1_SCALAR]], %[[FIRST]] ], [ [[SRC2_SCALAR]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT2:%.*]] = phi float [ [[SRC1_SCALAR3]], %[[FIRST]] ], [ [[SRC2_SCALAR4]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[RESULT1]], i32 0 +; CHECK-NEXT: 
[[RESULT_ASSEMBLED_VECT5:%.*]] = insertelement <2 x float> [[RESULT_ASSEMBLED_VECT]], float [[RESULT2]], i32 1 +; CHECK-NEXT: ret <2 x float> [[RESULT_ASSEMBLED_VECT5]] +; + entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi fast <2 x float> [ %src1, %first], [ %src2, %second] + ret <2 x float> %result +} + +define <16 x i32> @should_work_with_larger_vector_size(i1 %switch, <16 x i32> %src1, <16 x i32> %src2) { +; CHECK-LABEL: define <16 x i32> @should_work_with_larger_vector_size( +; CHECK-SAME: i1 [[SWITCH:%.*]], <16 x i32> [[SRC1:%.*]], <16 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <16 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR32:%.*]] = extractelement <16 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR33:%.*]] = extractelement <16 x i32> [[SRC2]], i32 2 +; CHECK-NEXT: [[SRC2_SCALAR34:%.*]] = extractelement <16 x i32> [[SRC2]], i32 3 +; CHECK-NEXT: [[SRC2_SCALAR35:%.*]] = extractelement <16 x i32> [[SRC2]], i32 4 +; CHECK-NEXT: [[SRC2_SCALAR36:%.*]] = extractelement <16 x i32> [[SRC2]], i32 5 +; CHECK-NEXT: [[SRC2_SCALAR37:%.*]] = extractelement <16 x i32> [[SRC2]], i32 6 +; CHECK-NEXT: [[SRC2_SCALAR38:%.*]] = extractelement <16 x i32> [[SRC2]], i32 7 +; CHECK-NEXT: [[SRC2_SCALAR39:%.*]] = extractelement <16 x i32> [[SRC2]], i32 8 +; CHECK-NEXT: [[SRC2_SCALAR40:%.*]] = extractelement <16 x i32> [[SRC2]], i32 9 +; CHECK-NEXT: [[SRC2_SCALAR41:%.*]] = extractelement <16 x i32> [[SRC2]], i32 10 +; CHECK-NEXT: [[SRC2_SCALAR42:%.*]] = extractelement <16 x i32> [[SRC2]], i32 11 +; CHECK-NEXT: [[SRC2_SCALAR43:%.*]] = extractelement <16 x i32> [[SRC2]], i32 12 +; CHECK-NEXT: [[SRC2_SCALAR44:%.*]] = extractelement <16 x i32> [[SRC2]], i32 13 +; CHECK-NEXT: [[SRC2_SCALAR45:%.*]] = extractelement <16 x i32> [[SRC2]], i32 14 +; CHECK-NEXT: [[SRC2_SCALAR46:%.*]] = extractelement <16 x i32> [[SRC2]], i32 15 +; CHECK-NEXT: 
[[SRC1_SCALAR:%.*]] = extractelement <16 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR17:%.*]] = extractelement <16 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR18:%.*]] = extractelement <16 x i32> [[SRC1]], i32 2 +; CHECK-NEXT: [[SRC1_SCALAR19:%.*]] = extractelement <16 x i32> [[SRC1]], i32 3 +; CHECK-NEXT: [[SRC1_SCALAR20:%.*]] = extractelement <16 x i32> [[SRC1]], i32 4 +; CHECK-NEXT: [[SRC1_SCALAR21:%.*]] = extractelement <16 x i32> [[SRC1]], i32 5 +; CHECK-NEXT: [[SRC1_SCALAR22:%.*]] = extractelement <16 x i32> [[SRC1]], i32 6 +; CHECK-NEXT: [[SRC1_SCALAR23:%.*]] = extractelement <16 x i32> [[SRC1]], i32 7 +; CHECK-NEXT: [[SRC1_SCALAR24:%.*]] = extractelement <16 x i32> [[SRC1]], i32 8 +; CHECK-NEXT: [[SRC1_SCALAR25:%.*]] = extractelement <16 x i32> [[SRC1]], i32 9 +; CHECK-NEXT: [[SRC1_SCALAR26:%.*]] = extractelement <16 x i32> [[SRC1]], i32 10 +; CHECK-NEXT: [[SRC1_SCALAR27:%.*]] = extractelement <16 x i32> [[SRC1]], i32 11 +; CHECK-NEXT: [[SRC1_SCALAR28:%.*]] = extractelement <16 x i32> [[SRC1]], i32 12 +; CHECK-NEXT: [[SRC1_SCALAR29:%.*]] = extractelement <16 x i32> [[SRC1]], i32 13 +; CHECK-NEXT: [[SRC1_SCALAR30:%.*]] = extractelement <16 x i32> [[SRC1]], i32 14 +; CHECK-NEXT: [[SRC1_SCALAR31:%.*]] = extractelement <16 x i32> [[SRC1]], i32 15 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT1:%.*]] = phi i32 [ [[SRC1_SCALAR]], %[[FIRST]] ], [ [[SRC2_SCALAR]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT2:%.*]] = phi i32 [ [[SRC1_SCALAR17]], %[[FIRST]] ], [ [[SRC2_SCALAR32]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT3:%.*]] = phi i32 [ [[SRC1_SCALAR18]], %[[FIRST]] ], [ [[SRC2_SCALAR33]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT4:%.*]] = phi i32 [ [[SRC1_SCALAR19]], %[[FIRST]] ], [ [[SRC2_SCALAR34]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT5:%.*]] = phi i32 [ [[SRC1_SCALAR20]], 
%[[FIRST]] ], [ [[SRC2_SCALAR35]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT6:%.*]] = phi i32 [ [[SRC1_SCALAR21]], %[[FIRST]] ], [ [[SRC2_SCALAR36]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT7:%.*]] = phi i32 [ [[SRC1_SCALAR22]], %[[FIRST]] ], [ [[SRC2_SCALAR37]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT8:%.*]] = phi i32 [ [[SRC1_SCALAR23]], %[[FIRST]] ], [ [[SRC2_SCALAR38]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT9:%.*]] = phi i32 [ [[SRC1_SCALAR24]], %[[FIRST]] ], [ [[SRC2_SCALAR39]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT10:%.*]] = phi i32 [ [[SRC1_SCALAR25]], %[[FIRST]] ], [ [[SRC2_SCALAR40]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT11:%.*]] = phi i32 [ [[SRC1_SCALAR26]], %[[FIRST]] ], [ [[SRC2_SCALAR41]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT12:%.*]] = phi i32 [ [[SRC1_SCALAR27]], %[[FIRST]] ], [ [[SRC2_SCALAR42]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT13:%.*]] = phi i32 [ [[SRC1_SCALAR28]], %[[FIRST]] ], [ [[SRC2_SCALAR43]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT14:%.*]] = phi i32 [ [[SRC1_SCALAR29]], %[[FIRST]] ], [ [[SRC2_SCALAR44]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT15:%.*]] = phi i32 [ [[SRC1_SCALAR30]], %[[FIRST]] ], [ [[SRC2_SCALAR45]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT16:%.*]] = phi i32 [ [[SRC1_SCALAR31]], %[[FIRST]] ], [ [[SRC2_SCALAR46]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <16 x i32> undef, i32 [[RESULT1]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT47:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT2]], i32 1 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT48:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT47]], i32 [[RESULT3]], i32 2 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT49:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT48]], i32 [[RESULT4]], i32 3 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT50:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT49]], i32 [[RESULT5]], i32 4 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT51:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT50]], i32 
[[RESULT6]], i32 5 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT52:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT51]], i32 [[RESULT7]], i32 6 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT53:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT52]], i32 [[RESULT8]], i32 7 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT54:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT53]], i32 [[RESULT9]], i32 8 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT55:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT54]], i32 [[RESULT10]], i32 9 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT56:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT55]], i32 [[RESULT11]], i32 10 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT57:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT56]], i32 [[RESULT12]], i32 11 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT58:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT57]], i32 [[RESULT13]], i32 12 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT59:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT58]], i32 [[RESULT14]], i32 13 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT60:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT59]], i32 [[RESULT15]], i32 14 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT61:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT60]], i32 [[RESULT16]], i32 15 +; CHECK-NEXT: ret <16 x i32> [[RESULT_ASSEMBLED_VECT61]] +; +entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <16 x i32> [ %src1, %first], [ %src2, %second] + ret <16 x i32> %result +} + +define <2 x i32> @should_work_with_constant_value(i1 %switch, <2 x i32> %src1) { +; CHECK-LABEL: define <2 x i32> @should_work_with_constant_value( +; CHECK-SAME: i1 [[SWITCH:%.*]], <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: br i1 [[SWITCH]], label 
%[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT1:%.*]] = phi i32 [ [[SRC1_SCALAR]], %[[FIRST]] ], [ 2, %[[SECOND]] ] +; CHECK-NEXT: [[RESULT2:%.*]] = phi i32 [ [[SRC1_SCALAR3]], %[[FIRST]] ], [ 4, %[[SECOND]] ] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[RESULT1]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT4:%.*]] = insertelement <2 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT2]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[RESULT_ASSEMBLED_VECT4]] +; +entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <2 x i32> [ %src1, %first], [ <i32 2, i32 4>, %second] + ret <2 x i32> %result +} + + +define <8 x i32> @should_not_scalarize(i1 %switch, <8 x i32> %src1) { +; CHECK-LABEL: define <8 x i32> @should_not_scalarize( +; CHECK-SAME: i1 [[SWITCH:%.*]], <8 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = call <8 x i32> @llvm.genx.GenISA.vmeSendIME2(<8 x i32> [[SRC1]], i32 1, i32 2, i32 3, i32 0) +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT:%.*]] = phi <8 x i32> [ [[SRC1]], %[[FIRST]] ], [ [[SHOULD_NOT_SCALARIZE_ME]], %[[SECOND]] ] +; CHECK-NEXT: ret <8 x i32> [[RESULT]] +; +entry: + %should_not_scalarize_me = call <8 x i32> @llvm.genx.GenISA.vmeSendIME2(<8 x i32> %src1, i32 1, i32 2, i32 3, i32 0) + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <8 x i32> [ %src1, %first], [ %should_not_scalarize_me, %second] + ret <8 x i32> %result +} + +define <4 x i32> @should_not_scalarize_2(i1 %switch, <4 x i32> %src1) { +; CHECK-LABEL:
define <4 x i32> @should_not_scalarize_2( +; CHECK-SAME: i1 [[SWITCH:%.*]], <4 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = call <4 x i32> @llvm.genx.GenISA.vmeSendFBR2(<4 x i32> [[SRC1]], i32 1, i32 2, i32 3) +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT:%.*]] = phi <4 x i32> [ [[SRC1]], %[[FIRST]] ], [ [[SHOULD_NOT_SCALARIZE_ME]], %[[SECOND]] ] +; CHECK-NEXT: ret <4 x i32> [[RESULT]] +; +entry: + %should_not_scalarize_me = call <4 x i32> @llvm.genx.GenISA.vmeSendFBR2(<4 x i32> %src1, i32 1, i32 2, i32 3) + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <4 x i32> [ %src1, %first], [ %should_not_scalarize_me, %second] + ret <4 x i32> %result +} + +define <4 x i32> @should_not_scalarize_3(i1 %switch, <4 x i32> %src1) { +; CHECK-LABEL: define <4 x i32> @should_not_scalarize_3( +; CHECK-SAME: i1 [[SWITCH:%.*]], <4 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = call <4 x i32> @llvm.genx.GenISA.vmeSendSIC2(<4 x i32> [[SRC1]], i32 1, i32 2, i32 3) +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT:%.*]] = phi <4 x i32> [ [[SRC1]], %[[FIRST]] ], [ [[SHOULD_NOT_SCALARIZE_ME]], %[[SECOND]] ] +; CHECK-NEXT: ret <4 x i32> [[RESULT]] +; +entry: + %should_not_scalarize_me = call <4 x i32> @llvm.genx.GenISA.vmeSendSIC2(<4 x i32> %src1, i32 1, i32 2, i32 3) + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %result = phi <4 x i32> [ %src1, %first], [ %should_not_scalarize_me, %second] + 
ret <4 x i32> %result +} + +define <8 x float> @should_not_scalarize_4(i1 %switch, <8 x float> %src1, <8 x float> %src2) { +; CHECK-LABEL: define <8 x float> @should_not_scalarize_4( +; CHECK-SAME: i1 [[SWITCH:%.*]], <8 x float> [[SRC1:%.*]], <8 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr @vector.8x.i16, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @vector.8x.i32, align 32 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi <8 x float> [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: [[RETURN:%.*]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> [[SHOULD_NOT_SCALARIZE_ME]], <8 x i16> [[TMP0]], <8 x i32> [[TMP1]], i32 11, i32 11, i32 8, i32 8, i1 false) +; CHECK-NEXT: ret <8 x float> [[RETURN]] +; +entry: + %0 = load <8 x i16>, ptr @vector.8x.i16 + %1 = load <8 x i32>, ptr @vector.8x.i32 + + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %should_not_scalarize_me = phi <8 x float> [ %src1, %first], [ %src2, %second] + %return = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %should_not_scalarize_me, <8 x i16> %0, <8 x i32> %1, i32 11, i32 11, i32 8, i32 8, i1 false) + ret <8 x float> %return +} + +define i32 @should_not_scalarize_5(i1 %switch, <8 x i32> %src1, <8 x i32> %src2) { +; CHECK-LABEL: define i32 @should_not_scalarize_5( +; CHECK-SAME: i1 [[SWITCH:%.*]], <8 x i32> [[SRC1:%.*]], <8 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @vector.8x.i32, align 32 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label 
%[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi <8 x i32> [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: [[RETURN:%.*]] = call i32 @llvm.genx.GenISA.dpas.v8i32(<8 x i32> [[SHOULD_NOT_SCALARIZE_ME]], <8 x i32> [[TMP0]], <8 x i32> [[TMP0]], i32 7, i32 7, i32 8, i32 1, i1 false) +; CHECK-NEXT: ret i32 [[RETURN]] +; +entry: + %0 = load <8 x i32>, ptr @vector.8x.i32 + + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %should_not_scalarize_me = phi <8 x i32> [ %src1, %first], [ %src2, %second] + %return = call i32 @llvm.genx.GenISA.dpas.v8i32(<8 x i32> %should_not_scalarize_me, <8 x i32> %0, <8 x i32> %0, i32 7, i32 7, i32 8, i32 1, i1 false) + ret i32 %return +} + +define spir_kernel void @should_not_scalarize_6(i1 %switch, ptr addrspace(1) %src1, ptr addrspace(1) %src2) { +; CHECK-LABEL: define spir_kernel void @should_not_scalarize_6( +; CHECK-SAME: i1 [[SWITCH:%.*]], ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi ptr addrspace(1) [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: call void @llvm.genx.GenISA.simdBlockWrite.v2i32.p1v2i32(ptr addrspace(1) [[SHOULD_NOT_SCALARIZE_ME]], <2 x i32> <i32 1, i32 2>) +; CHECK-NEXT: ret void +; +entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %should_not_scalarize_me = phi ptr addrspace(1) [ %src1, %first], [ %src2, %second] + call void @llvm.genx.GenISA.simdBlockWrite.v2i32.p1v2i32(ptr addrspace(1) %should_not_scalarize_me, <2 x i32> <i32 1, i32 2>) + ret void +} + +define spir_kernel void
@should_not_scalarize_7(i1 %switch, <2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define spir_kernel void @should_not_scalarize_7( +; CHECK-SAME: i1 [[SWITCH:%.*]], <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = alloca i8, align 1, addrspace(2490368) +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi <2 x float> [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: call void @llvm.genx.GenISA.simdBlockWriteBindless.p2490368i8.v2f32.i32(ptr addrspace(2490368) [[TMP0]], <2 x float> [[SHOULD_NOT_SCALARIZE_ME]], i32 0) +; CHECK-NEXT: ret void +; +entry: + %0 = alloca i8, addrspace(2490368) + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %should_not_scalarize_me = phi <2 x float> [ %src1, %first], [ %src2, %second] + call void @llvm.genx.GenISA.simdBlockWriteBindless.p2490368i8.v2f32.i32(ptr addrspace(2490368) %0, <2 x float> %should_not_scalarize_me, i32 0) + ret void +} + +define spir_kernel void @should_not_scalarize_8(i1 %switch, <16 x i16> %src1, <16 x i16> %src2) { +; CHECK-LABEL: define spir_kernel void @should_not_scalarize_8( +; CHECK-SAME: i1 [[SWITCH:%.*]], <16 x i16> [[SRC1:%.*]], <16 x i16> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi <16 x i16> [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: call void @llvm.genx.GenISA.simdMediaBlockWrite.v16i16(i32 1, i32 1, i32 1, i32 1, <16 x i16> [[SHOULD_NOT_SCALARIZE_ME]]) +; CHECK-NEXT: ret 
void +; +entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %should_not_scalarize_me = phi <16 x i16> [ %src1, %first], [ %src2, %second] + call void @llvm.genx.GenISA.simdMediaBlockWrite.v16i16(i32 1, i32 1, i32 1, i32 1, <16 x i16> %should_not_scalarize_me) + ret void +} + +define spir_kernel void @should_not_scalarize_9(i1 %switch, <8 x i32> %src1, <8 x i32> %src2) { +; CHECK-LABEL: define spir_kernel void @should_not_scalarize_9( +; CHECK-SAME: i1 [[SWITCH:%.*]], <8 x i32> [[SRC1:%.*]], <8 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi <8 x i32> [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: call void @llvm.genx.GenISA.LSC2DBlockWrite.p0i32(i64 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i1 false, i1 true, i32 11, <8 x i32> [[SHOULD_NOT_SCALARIZE_ME]]) +; CHECK-NEXT: ret void +; +entry: + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %should_not_scalarize_me = phi <8 x i32> [ %src1, %first], [ %src2, %second] + call void @llvm.genx.GenISA.LSC2DBlockWrite.p0i32(i64 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i1 false, i1 true, i32 11, <8 x i32> %should_not_scalarize_me) + ret void +} + +define spir_kernel void @should_not_scalarize_10(i1 %switch, <16 x i16> %src1, <16 x i16> %src2) { +; CHECK-LABEL: define spir_kernel void @should_not_scalarize_10( +; CHECK-SAME: i1 [[SWITCH:%.*]], <16 x i16> [[SRC1:%.*]], <16 x i16> [[SRC2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: 
br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SHOULD_NOT_SCALARIZE_ME:%.*]] = phi <16 x i16> [ [[SRC1]], %[[FIRST]] ], [ [[SRC2]], %[[SECOND]] ] +; CHECK-NEXT: call void @llvm.genx.GenISA.LSC2DBlockWriteAddrPayload.p0i32.v16i16(ptr [[TMP0]], i32 0, i32 0, i32 16, i32 16, i32 16, i32 1, i1 false, i1 false, i32 0, <16 x i16> [[SHOULD_NOT_SCALARIZE_ME]]) +; CHECK-NEXT: ret void +; +entry: + %0 = alloca i32 + br i1 %switch, label %first, label %second +first: + br label %exit +second: + br label %exit +exit: + %should_not_scalarize_me = phi <16 x i16> [ %src1, %first], [ %src2, %second] + call void @llvm.genx.GenISA.LSC2DBlockWriteAddrPayload.p0i32.v16i16(ptr %0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 1, i1 false, i1 false, i32 0, <16 x i16> %should_not_scalarize_me) + ret void +} + +define spir_kernel void @should_work_with_nested_phi(i1 %switch) { +; CHECK-LABEL: define spir_kernel void @should_work_with_nested_phi( +; CHECK-SAME: i1 [[SWITCH:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[VECTINT:%.*]] = add <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, zeroinitializer +; CHECK-NEXT: [[VECTINT_SCALAR:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 0 +; CHECK-NEXT: [[VECTINT_SCALAR9:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 1 +; CHECK-NEXT: [[VECTINT_SCALAR10:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 2 +; CHECK-NEXT: [[VECTINT_SCALAR11:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 3 +; CHECK-NEXT: [[VECTINT_SCALAR12:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 4 +; CHECK-NEXT: [[VECTINT_SCALAR13:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 5 +; CHECK-NEXT: [[VECTINT_SCALAR14:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 6 +; CHECK-NEXT: [[VECTINT_SCALAR15:%.*]] = extractelement <8 x i32> [[VECTINT]], i32 7 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[PROXY:.*]]: +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST]], label %[[SECOND]] +; CHECK:
[[FIRST]]: +; CHECK-NEXT: [[RESULT11:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT12:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR9]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT13:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR10]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT14:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR11]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT15:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR12]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT16:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR13]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT17:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR14]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT18:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR15]], %[[ENTRY]] ] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: [[RESULT224:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT225:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR9]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT226:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR10]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT227:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR11]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT228:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR12]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT229:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR13]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT230:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR14]], %[[ENTRY]] ] +; CHECK-NEXT: [[RESULT231:%.*]] = phi i32 [ 0, %[[PROXY]] ], [ [[VECTINT_SCALAR15]], %[[ENTRY]] ] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT316:%.*]] = phi i32 [ [[RESULT11]], %[[FIRST]] ], [ [[RESULT224]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT317:%.*]] = phi i32 [ [[RESULT12]], %[[FIRST]] ], [ [[RESULT225]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT318:%.*]] = phi i32 [ [[RESULT13]], %[[FIRST]] ], [ [[RESULT226]], 
%[[SECOND]] ] +; CHECK-NEXT: [[RESULT319:%.*]] = phi i32 [ [[RESULT14]], %[[FIRST]] ], [ [[RESULT227]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT320:%.*]] = phi i32 [ [[RESULT15]], %[[FIRST]] ], [ [[RESULT228]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT321:%.*]] = phi i32 [ [[RESULT16]], %[[FIRST]] ], [ [[RESULT229]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT322:%.*]] = phi i32 [ [[RESULT17]], %[[FIRST]] ], [ [[RESULT230]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT323:%.*]] = phi i32 [ [[RESULT18]], %[[FIRST]] ], [ [[RESULT231]], %[[SECOND]] ] +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT:%.*]] = insertelement <8 x i32> undef, i32 [[RESULT316]], i32 0 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT32:%.*]] = insertelement <8 x i32> [[RESULT3_ASSEMBLED_VECT]], i32 [[RESULT317]], i32 1 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT33:%.*]] = insertelement <8 x i32> [[RESULT3_ASSEMBLED_VECT32]], i32 [[RESULT318]], i32 2 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT34:%.*]] = insertelement <8 x i32> [[RESULT3_ASSEMBLED_VECT33]], i32 [[RESULT319]], i32 3 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT35:%.*]] = insertelement <8 x i32> [[RESULT3_ASSEMBLED_VECT34]], i32 [[RESULT320]], i32 4 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT36:%.*]] = insertelement <8 x i32> [[RESULT3_ASSEMBLED_VECT35]], i32 [[RESULT321]], i32 5 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT37:%.*]] = insertelement <8 x i32> [[RESULT3_ASSEMBLED_VECT36]], i32 [[RESULT322]], i32 6 +; CHECK-NEXT: [[RESULT3_ASSEMBLED_VECT38:%.*]] = insertelement <8 x i32> [[RESULT3_ASSEMBLED_VECT37]], i32 [[RESULT323]], i32 7 +; CHECK-NEXT: [[NEWDATA:%.*]] = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> [[RESULT3_ASSEMBLED_VECT38]]) +; CHECK-NEXT: ret void +; +entry: + %vectint = add <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, zeroinitializer + br i1 %switch, label %first, label %second +proxy: + br i1 %switch, label %first, label %second +first: + %result1 = phi <8 x i32> [ zeroinitializer, %proxy ], [ %vectint, %entry] + br label %exit +second: + %result2 = phi <8 x i32> [ zeroinitializer, %proxy ], [ 
%vectint, %entry] + br label %exit +exit: + %result3 = phi <8 x i32> [ %result1, %first], [ %result2, %second] + %newdata = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> %result3) + ret void +} + +@vector.8x.float = global <8 x float> zeroinitializer +@vector.8x.i16 = global <8 x i16> zeroinitializer +@vector.8x.i32 = global <8 x i32> zeroinitializer + +declare <8 x i32> @llvm.genx.GenISA.vmeSendIME2(<8 x i32>, i32, i32, i32, i32) +declare <4 x i32> @llvm.genx.GenISA.vmeSendFBR2(<4 x i32>, i32, i32, i32) +declare <4 x i32> @llvm.genx.GenISA.vmeSendSIC2(<4 x i32>, i32, i32, i32) +declare <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1) +declare i32 @llvm.genx.GenISA.dpas.v8i32(<8 x i32>, <8 x i32>, <8 x i32>, i32, i32, i32, i32, i1) +declare void @llvm.genx.GenISA.simdBlockWrite.v2i32.p1v2i32(ptr addrspace(1), <2 x i32>) +declare void @llvm.genx.GenISA.simdBlockWriteBindless.p2490368i8.v2f32.i32(ptr addrspace(2490368), <2 x float>, i32) +declare void @llvm.genx.GenISA.simdMediaBlockWrite.v16i16(i32, i32, i32, i32, <16 x i16>) +declare void @llvm.genx.GenISA.LSC2DBlockWrite.p0i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32, <8 x i32>) +declare void @llvm.genx.GenISA.LSC2DBlockWriteAddrPayload.p0i32.v16i16(ptr, i32, i32, i32, i32, i32, i32, i1, i1, i32, <16 x i16>) +declare spir_func <8 x i32> @do_math_v8i32_v8i32(<8 x i32>) #1 diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-select-instruction-typed-pointers.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-select-instruction-typed-pointers.ll new file mode 100644 index 000000000000..29016636148b --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-select-instruction-typed-pointers.ll @@ -0,0 +1,221 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +;
SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize -S < %s | FileCheck %s +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define <2 x i32> @basic(<2 x i1> %switch, <2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define <2 x i32> @basic( +; CHECK-SAME: <2 x i1> [[SWITCH:%.*]], <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SWITCH_SCALAR:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 0 +; CHECK-NEXT: [[SWITCH_SCALAR3:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[RESULT4:%.*]] = select i1 [[SWITCH_SCALAR]], i32 [[SRC1_SCALAR]], i32 [[SRC2_SCALAR]] +; CHECK-NEXT: [[RESULT5:%.*]] = select i1 [[SWITCH_SCALAR3]], i32 [[SRC1_SCALAR1]], i32 [[SRC2_SCALAR2]] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[RESULT4]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT6:%.*]] = insertelement <2 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT5]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[RESULT_ASSEMBLED_VECT6]] +; + %result = select <2 x i1> %switch, <2 x i32> %src1, <2 x i32> %src2 + ret <2 x i32> %result +} + +define <2 x float> @should_work_with_different_value_type(<2 x i1> %switch, <2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define <2 x float> @should_work_with_different_value_type( +; CHECK-SAME: <2 x i1> [[SWITCH:%.*]], <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: 
[[SWITCH_SCALAR:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 0 +; CHECK-NEXT: [[SWITCH_SCALAR3:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[RESULT4:%.*]] = select i1 [[SWITCH_SCALAR]], float [[SRC1_SCALAR]], float [[SRC2_SCALAR]] +; CHECK-NEXT: [[RESULT5:%.*]] = select i1 [[SWITCH_SCALAR3]], float [[SRC1_SCALAR1]], float [[SRC2_SCALAR2]] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[RESULT4]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT6:%.*]] = insertelement <2 x float> [[RESULT_ASSEMBLED_VECT]], float [[RESULT5]], i32 1 +; CHECK-NEXT: ret <2 x float> [[RESULT_ASSEMBLED_VECT6]] +; + %result = select <2 x i1> %switch, <2 x float> %src1, <2 x float> %src2 + ret <2 x float> %result +} + +; triangulating with @should_work_with_different_value_type +; update checks if fast will be preserved +define <2 x float> @should_work_with_fast_math_flags(<2 x i1> %switch, <2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define <2 x float> @should_work_with_fast_math_flags( +; CHECK-SAME: <2 x i1> [[SWITCH:%.*]], <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SWITCH_SCALAR:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 0 +; CHECK-NEXT: [[SWITCH_SCALAR3:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[RESULT4:%.*]] = select i1 
[[SWITCH_SCALAR]], float [[SRC1_SCALAR]], float [[SRC2_SCALAR]] +; CHECK-NEXT: [[RESULT5:%.*]] = select i1 [[SWITCH_SCALAR3]], float [[SRC1_SCALAR1]], float [[SRC2_SCALAR2]] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[RESULT4]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT6:%.*]] = insertelement <2 x float> [[RESULT_ASSEMBLED_VECT]], float [[RESULT5]], i32 1 +; CHECK-NEXT: ret <2 x float> [[RESULT_ASSEMBLED_VECT6]] +; + %result = select fast <2 x i1> %switch, <2 x float> %src1, <2 x float> %src2 + ret <2 x float> %result +} + +define <16 x i32> @should_work_with_larger_vector_size(<16 x i1> %switch, <16 x i32> %src1, <16 x i32> %src2) { +; CHECK-LABEL: define <16 x i32> @should_work_with_larger_vector_size( +; CHECK-SAME: <16 x i1> [[SWITCH:%.*]], <16 x i32> [[SRC1:%.*]], <16 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SWITCH_SCALAR:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 0 +; CHECK-NEXT: [[SWITCH_SCALAR31:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 1 +; CHECK-NEXT: [[SWITCH_SCALAR32:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 2 +; CHECK-NEXT: [[SWITCH_SCALAR33:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 3 +; CHECK-NEXT: [[SWITCH_SCALAR34:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 4 +; CHECK-NEXT: [[SWITCH_SCALAR35:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 5 +; CHECK-NEXT: [[SWITCH_SCALAR36:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 6 +; CHECK-NEXT: [[SWITCH_SCALAR37:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 7 +; CHECK-NEXT: [[SWITCH_SCALAR38:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 8 +; CHECK-NEXT: [[SWITCH_SCALAR39:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 9 +; CHECK-NEXT: [[SWITCH_SCALAR40:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 10 +; CHECK-NEXT: [[SWITCH_SCALAR41:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 11 +; CHECK-NEXT: [[SWITCH_SCALAR42:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 12 +; CHECK-NEXT: [[SWITCH_SCALAR43:%.*]] = 
extractelement <16 x i1> [[SWITCH]], i32 13 +; CHECK-NEXT: [[SWITCH_SCALAR44:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 14 +; CHECK-NEXT: [[SWITCH_SCALAR45:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 15 +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <16 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR16:%.*]] = extractelement <16 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR17:%.*]] = extractelement <16 x i32> [[SRC2]], i32 2 +; CHECK-NEXT: [[SRC2_SCALAR18:%.*]] = extractelement <16 x i32> [[SRC2]], i32 3 +; CHECK-NEXT: [[SRC2_SCALAR19:%.*]] = extractelement <16 x i32> [[SRC2]], i32 4 +; CHECK-NEXT: [[SRC2_SCALAR20:%.*]] = extractelement <16 x i32> [[SRC2]], i32 5 +; CHECK-NEXT: [[SRC2_SCALAR21:%.*]] = extractelement <16 x i32> [[SRC2]], i32 6 +; CHECK-NEXT: [[SRC2_SCALAR22:%.*]] = extractelement <16 x i32> [[SRC2]], i32 7 +; CHECK-NEXT: [[SRC2_SCALAR23:%.*]] = extractelement <16 x i32> [[SRC2]], i32 8 +; CHECK-NEXT: [[SRC2_SCALAR24:%.*]] = extractelement <16 x i32> [[SRC2]], i32 9 +; CHECK-NEXT: [[SRC2_SCALAR25:%.*]] = extractelement <16 x i32> [[SRC2]], i32 10 +; CHECK-NEXT: [[SRC2_SCALAR26:%.*]] = extractelement <16 x i32> [[SRC2]], i32 11 +; CHECK-NEXT: [[SRC2_SCALAR27:%.*]] = extractelement <16 x i32> [[SRC2]], i32 12 +; CHECK-NEXT: [[SRC2_SCALAR28:%.*]] = extractelement <16 x i32> [[SRC2]], i32 13 +; CHECK-NEXT: [[SRC2_SCALAR29:%.*]] = extractelement <16 x i32> [[SRC2]], i32 14 +; CHECK-NEXT: [[SRC2_SCALAR30:%.*]] = extractelement <16 x i32> [[SRC2]], i32 15 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <16 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <16 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR2:%.*]] = extractelement <16 x i32> [[SRC1]], i32 2 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <16 x i32> [[SRC1]], i32 3 +; CHECK-NEXT: [[SRC1_SCALAR4:%.*]] = extractelement <16 x i32> [[SRC1]], i32 4 +; CHECK-NEXT: [[SRC1_SCALAR5:%.*]] = extractelement <16 x i32> [[SRC1]], i32 5 +; 
CHECK-NEXT: [[SRC1_SCALAR6:%.*]] = extractelement <16 x i32> [[SRC1]], i32 6 +; CHECK-NEXT: [[SRC1_SCALAR7:%.*]] = extractelement <16 x i32> [[SRC1]], i32 7 +; CHECK-NEXT: [[SRC1_SCALAR8:%.*]] = extractelement <16 x i32> [[SRC1]], i32 8 +; CHECK-NEXT: [[SRC1_SCALAR9:%.*]] = extractelement <16 x i32> [[SRC1]], i32 9 +; CHECK-NEXT: [[SRC1_SCALAR10:%.*]] = extractelement <16 x i32> [[SRC1]], i32 10 +; CHECK-NEXT: [[SRC1_SCALAR11:%.*]] = extractelement <16 x i32> [[SRC1]], i32 11 +; CHECK-NEXT: [[SRC1_SCALAR12:%.*]] = extractelement <16 x i32> [[SRC1]], i32 12 +; CHECK-NEXT: [[SRC1_SCALAR13:%.*]] = extractelement <16 x i32> [[SRC1]], i32 13 +; CHECK-NEXT: [[SRC1_SCALAR14:%.*]] = extractelement <16 x i32> [[SRC1]], i32 14 +; CHECK-NEXT: [[SRC1_SCALAR15:%.*]] = extractelement <16 x i32> [[SRC1]], i32 15 +; CHECK-NEXT: [[RESULT46:%.*]] = select i1 [[SWITCH_SCALAR]], i32 [[SRC1_SCALAR]], i32 [[SRC2_SCALAR]] +; CHECK-NEXT: [[RESULT47:%.*]] = select i1 [[SWITCH_SCALAR31]], i32 [[SRC1_SCALAR1]], i32 [[SRC2_SCALAR16]] +; CHECK-NEXT: [[RESULT48:%.*]] = select i1 [[SWITCH_SCALAR32]], i32 [[SRC1_SCALAR2]], i32 [[SRC2_SCALAR17]] +; CHECK-NEXT: [[RESULT49:%.*]] = select i1 [[SWITCH_SCALAR33]], i32 [[SRC1_SCALAR3]], i32 [[SRC2_SCALAR18]] +; CHECK-NEXT: [[RESULT50:%.*]] = select i1 [[SWITCH_SCALAR34]], i32 [[SRC1_SCALAR4]], i32 [[SRC2_SCALAR19]] +; CHECK-NEXT: [[RESULT51:%.*]] = select i1 [[SWITCH_SCALAR35]], i32 [[SRC1_SCALAR5]], i32 [[SRC2_SCALAR20]] +; CHECK-NEXT: [[RESULT52:%.*]] = select i1 [[SWITCH_SCALAR36]], i32 [[SRC1_SCALAR6]], i32 [[SRC2_SCALAR21]] +; CHECK-NEXT: [[RESULT53:%.*]] = select i1 [[SWITCH_SCALAR37]], i32 [[SRC1_SCALAR7]], i32 [[SRC2_SCALAR22]] +; CHECK-NEXT: [[RESULT54:%.*]] = select i1 [[SWITCH_SCALAR38]], i32 [[SRC1_SCALAR8]], i32 [[SRC2_SCALAR23]] +; CHECK-NEXT: [[RESULT55:%.*]] = select i1 [[SWITCH_SCALAR39]], i32 [[SRC1_SCALAR9]], i32 [[SRC2_SCALAR24]] +; CHECK-NEXT: [[RESULT56:%.*]] = select i1 [[SWITCH_SCALAR40]], i32 [[SRC1_SCALAR10]], i32 
[[SRC2_SCALAR25]] +; CHECK-NEXT: [[RESULT57:%.*]] = select i1 [[SWITCH_SCALAR41]], i32 [[SRC1_SCALAR11]], i32 [[SRC2_SCALAR26]] +; CHECK-NEXT: [[RESULT58:%.*]] = select i1 [[SWITCH_SCALAR42]], i32 [[SRC1_SCALAR12]], i32 [[SRC2_SCALAR27]] +; CHECK-NEXT: [[RESULT59:%.*]] = select i1 [[SWITCH_SCALAR43]], i32 [[SRC1_SCALAR13]], i32 [[SRC2_SCALAR28]] +; CHECK-NEXT: [[RESULT60:%.*]] = select i1 [[SWITCH_SCALAR44]], i32 [[SRC1_SCALAR14]], i32 [[SRC2_SCALAR29]] +; CHECK-NEXT: [[RESULT61:%.*]] = select i1 [[SWITCH_SCALAR45]], i32 [[SRC1_SCALAR15]], i32 [[SRC2_SCALAR30]] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <16 x i32> undef, i32 [[RESULT46]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT62:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT47]], i32 1 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT63:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT62]], i32 [[RESULT48]], i32 2 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT64:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT63]], i32 [[RESULT49]], i32 3 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT65:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT64]], i32 [[RESULT50]], i32 4 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT66:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT65]], i32 [[RESULT51]], i32 5 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT67:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT66]], i32 [[RESULT52]], i32 6 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT68:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT67]], i32 [[RESULT53]], i32 7 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT69:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT68]], i32 [[RESULT54]], i32 8 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT70:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT69]], i32 [[RESULT55]], i32 9 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT71:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT70]], i32 [[RESULT56]], i32 10 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT72:%.*]] = 
insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT71]], i32 [[RESULT57]], i32 11 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT73:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT72]], i32 [[RESULT58]], i32 12 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT74:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT73]], i32 [[RESULT59]], i32 13 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT75:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT74]], i32 [[RESULT60]], i32 14 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT76:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT75]], i32 [[RESULT61]], i32 15 +; CHECK-NEXT: ret <16 x i32> [[RESULT_ASSEMBLED_VECT76]] +; + %result = select <16 x i1> %switch, <16 x i32> %src1, <16 x i32> %src2 + ret <16 x i32> %result +} + +define <2 x i32> @should_work_with_constant_value(<2 x i1> %switch, <2 x i32> %src1) { +; CHECK-LABEL: define <2 x i32> @should_work_with_constant_value( +; CHECK-SAME: <2 x i1> [[SWITCH:%.*]], <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SWITCH_SCALAR:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 0 +; CHECK-NEXT: [[SWITCH_SCALAR2:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[RESULT3:%.*]] = select i1 [[SWITCH_SCALAR]], i32 [[SRC1_SCALAR]], i32 2 +; CHECK-NEXT: [[RESULT4:%.*]] = select i1 [[SWITCH_SCALAR2]], i32 [[SRC1_SCALAR1]], i32 4 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[RESULT3]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT5:%.*]] = insertelement <2 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT4]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[RESULT_ASSEMBLED_VECT5]] +; + %result = select <2 x i1> %switch, <2 x i32> %src1, <2 x i32> <i32 2, i32 4> + ret <2 x i32> %result +} + +define <2 x i32> @should_work_with_non_vector_condition(i1 %switch, <2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define <2 x i32> 
@should_work_with_non_vector_condition( +; CHECK-SAME: i1 [[SWITCH:%.*]], <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[RESULT3:%.*]] = select i1 [[SWITCH]], i32 [[SRC1_SCALAR]], i32 [[SRC2_SCALAR]] +; CHECK-NEXT: [[RESULT4:%.*]] = select i1 [[SWITCH]], i32 [[SRC1_SCALAR1]], i32 [[SRC2_SCALAR2]] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[RESULT3]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT5:%.*]] = insertelement <2 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT4]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[RESULT_ASSEMBLED_VECT5]] +; + %result = select i1 %switch, <2 x i32> %src1, <2 x i32> %src2 + ret <2 x i32> %result +} + +define <2 x i32> @should_not_select_from_the_same_value(<2 x i1> %switch, <2 x i32> %src1) { +; CHECK-LABEL: define <2 x i32> @should_not_select_from_the_same_value( +; CHECK-SAME: <2 x i1> [[SWITCH:%.*]], <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SWITCH_SCALAR:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 0 +; CHECK-NEXT: [[SWITCH_SCALAR2:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[SRC1_SCALAR]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT3:%.*]] = insertelement <2 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[SRC1_SCALAR1]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[RESULT_ASSEMBLED_VECT3]] +; + %result = select <2 x i1> %switch, <2 x i32> %src1, <2 x i32> %src1 + ret <2 x i32> %result +} + +define i32 @should_not_scalarize_scalar(i1 
%switch, i32 %src1, i32 %src2) { +; CHECK-LABEL: @should_not_scalarize_scalar( +; CHECK-NEXT: [[RESULT:%.*]] = select i1 [[SWITCH:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]] +; CHECK-NEXT: ret i32 [[RESULT]] +; + %result = select i1 %switch, i32 %src1, i32 %src2 + ret i32 %result +} diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-select-instruction.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-select-instruction.ll new file mode 100644 index 000000000000..0e44c872d8ab --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-select-instruction.ll @@ -0,0 +1,223 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize --opaque-pointers -S < %s | FileCheck %s +; REQUIRES: llvm-14-plus +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define <2 x i32> @basic(<2 x i1> %switch, <2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define <2 x i32> @basic( +; CHECK-SAME: <2 x i1> [[SWITCH:%.*]], <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SWITCH_SCALAR:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 0 +; CHECK-NEXT: [[SWITCH_SCALAR3:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: 
[[RESULT4:%.*]] = select i1 [[SWITCH_SCALAR]], i32 [[SRC1_SCALAR]], i32 [[SRC2_SCALAR]] +; CHECK-NEXT: [[RESULT5:%.*]] = select i1 [[SWITCH_SCALAR3]], i32 [[SRC1_SCALAR1]], i32 [[SRC2_SCALAR2]] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[RESULT4]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT6:%.*]] = insertelement <2 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT5]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[RESULT_ASSEMBLED_VECT6]] +; + %result = select <2 x i1> %switch, <2 x i32> %src1, <2 x i32> %src2 + ret <2 x i32> %result +} + +define <2 x float> @should_work_with_different_value_type(<2 x i1> %switch, <2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define <2 x float> @should_work_with_different_value_type( +; CHECK-SAME: <2 x i1> [[SWITCH:%.*]], <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SWITCH_SCALAR:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 0 +; CHECK-NEXT: [[SWITCH_SCALAR3:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[RESULT4:%.*]] = select i1 [[SWITCH_SCALAR]], float [[SRC1_SCALAR]], float [[SRC2_SCALAR]] +; CHECK-NEXT: [[RESULT5:%.*]] = select i1 [[SWITCH_SCALAR3]], float [[SRC1_SCALAR1]], float [[SRC2_SCALAR2]] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[RESULT4]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT6:%.*]] = insertelement <2 x float> [[RESULT_ASSEMBLED_VECT]], float [[RESULT5]], i32 1 +; CHECK-NEXT: ret <2 x float> [[RESULT_ASSEMBLED_VECT6]] +; + %result = select <2 x i1> %switch, <2 x float> %src1, <2 x float> %src2 + ret <2 x float> %result +} + +; triangulating with 
@should_work_with_different_value_type +; update checks if fast will be preserved +define <2 x float> @should_work_with_fast_math_flags(<2 x i1> %switch, <2 x float> %src1, <2 x float> %src2) { +; CHECK-LABEL: define <2 x float> @should_work_with_fast_math_flags( +; CHECK-SAME: <2 x i1> [[SWITCH:%.*]], <2 x float> [[SRC1:%.*]], <2 x float> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SWITCH_SCALAR:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 0 +; CHECK-NEXT: [[SWITCH_SCALAR3:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x float> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x float> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[RESULT4:%.*]] = select i1 [[SWITCH_SCALAR]], float [[SRC1_SCALAR]], float [[SRC2_SCALAR]] +; CHECK-NEXT: [[RESULT5:%.*]] = select i1 [[SWITCH_SCALAR3]], float [[SRC1_SCALAR1]], float [[SRC2_SCALAR2]] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[RESULT4]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT6:%.*]] = insertelement <2 x float> [[RESULT_ASSEMBLED_VECT]], float [[RESULT5]], i32 1 +; CHECK-NEXT: ret <2 x float> [[RESULT_ASSEMBLED_VECT6]] +; + %result = select fast <2 x i1> %switch, <2 x float> %src1, <2 x float> %src2 + ret <2 x float> %result +} + +define <16 x i32> @should_work_with_larger_vector_size(<16 x i1> %switch, <16 x i32> %src1, <16 x i32> %src2) { +; CHECK-LABEL: define <16 x i32> @should_work_with_larger_vector_size( +; CHECK-SAME: <16 x i1> [[SWITCH:%.*]], <16 x i32> [[SRC1:%.*]], <16 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SWITCH_SCALAR:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 0 +; CHECK-NEXT: [[SWITCH_SCALAR31:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 1 +; CHECK-NEXT: [[SWITCH_SCALAR32:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 2 +; 
CHECK-NEXT: [[SWITCH_SCALAR33:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 3 +; CHECK-NEXT: [[SWITCH_SCALAR34:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 4 +; CHECK-NEXT: [[SWITCH_SCALAR35:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 5 +; CHECK-NEXT: [[SWITCH_SCALAR36:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 6 +; CHECK-NEXT: [[SWITCH_SCALAR37:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 7 +; CHECK-NEXT: [[SWITCH_SCALAR38:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 8 +; CHECK-NEXT: [[SWITCH_SCALAR39:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 9 +; CHECK-NEXT: [[SWITCH_SCALAR40:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 10 +; CHECK-NEXT: [[SWITCH_SCALAR41:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 11 +; CHECK-NEXT: [[SWITCH_SCALAR42:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 12 +; CHECK-NEXT: [[SWITCH_SCALAR43:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 13 +; CHECK-NEXT: [[SWITCH_SCALAR44:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 14 +; CHECK-NEXT: [[SWITCH_SCALAR45:%.*]] = extractelement <16 x i1> [[SWITCH]], i32 15 +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <16 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR16:%.*]] = extractelement <16 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC2_SCALAR17:%.*]] = extractelement <16 x i32> [[SRC2]], i32 2 +; CHECK-NEXT: [[SRC2_SCALAR18:%.*]] = extractelement <16 x i32> [[SRC2]], i32 3 +; CHECK-NEXT: [[SRC2_SCALAR19:%.*]] = extractelement <16 x i32> [[SRC2]], i32 4 +; CHECK-NEXT: [[SRC2_SCALAR20:%.*]] = extractelement <16 x i32> [[SRC2]], i32 5 +; CHECK-NEXT: [[SRC2_SCALAR21:%.*]] = extractelement <16 x i32> [[SRC2]], i32 6 +; CHECK-NEXT: [[SRC2_SCALAR22:%.*]] = extractelement <16 x i32> [[SRC2]], i32 7 +; CHECK-NEXT: [[SRC2_SCALAR23:%.*]] = extractelement <16 x i32> [[SRC2]], i32 8 +; CHECK-NEXT: [[SRC2_SCALAR24:%.*]] = extractelement <16 x i32> [[SRC2]], i32 9 +; CHECK-NEXT: [[SRC2_SCALAR25:%.*]] = extractelement <16 x i32> [[SRC2]], i32 10 +; CHECK-NEXT: 
[[SRC2_SCALAR26:%.*]] = extractelement <16 x i32> [[SRC2]], i32 11 +; CHECK-NEXT: [[SRC2_SCALAR27:%.*]] = extractelement <16 x i32> [[SRC2]], i32 12 +; CHECK-NEXT: [[SRC2_SCALAR28:%.*]] = extractelement <16 x i32> [[SRC2]], i32 13 +; CHECK-NEXT: [[SRC2_SCALAR29:%.*]] = extractelement <16 x i32> [[SRC2]], i32 14 +; CHECK-NEXT: [[SRC2_SCALAR30:%.*]] = extractelement <16 x i32> [[SRC2]], i32 15 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <16 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <16 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR2:%.*]] = extractelement <16 x i32> [[SRC1]], i32 2 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <16 x i32> [[SRC1]], i32 3 +; CHECK-NEXT: [[SRC1_SCALAR4:%.*]] = extractelement <16 x i32> [[SRC1]], i32 4 +; CHECK-NEXT: [[SRC1_SCALAR5:%.*]] = extractelement <16 x i32> [[SRC1]], i32 5 +; CHECK-NEXT: [[SRC1_SCALAR6:%.*]] = extractelement <16 x i32> [[SRC1]], i32 6 +; CHECK-NEXT: [[SRC1_SCALAR7:%.*]] = extractelement <16 x i32> [[SRC1]], i32 7 +; CHECK-NEXT: [[SRC1_SCALAR8:%.*]] = extractelement <16 x i32> [[SRC1]], i32 8 +; CHECK-NEXT: [[SRC1_SCALAR9:%.*]] = extractelement <16 x i32> [[SRC1]], i32 9 +; CHECK-NEXT: [[SRC1_SCALAR10:%.*]] = extractelement <16 x i32> [[SRC1]], i32 10 +; CHECK-NEXT: [[SRC1_SCALAR11:%.*]] = extractelement <16 x i32> [[SRC1]], i32 11 +; CHECK-NEXT: [[SRC1_SCALAR12:%.*]] = extractelement <16 x i32> [[SRC1]], i32 12 +; CHECK-NEXT: [[SRC1_SCALAR13:%.*]] = extractelement <16 x i32> [[SRC1]], i32 13 +; CHECK-NEXT: [[SRC1_SCALAR14:%.*]] = extractelement <16 x i32> [[SRC1]], i32 14 +; CHECK-NEXT: [[SRC1_SCALAR15:%.*]] = extractelement <16 x i32> [[SRC1]], i32 15 +; CHECK-NEXT: [[RESULT46:%.*]] = select i1 [[SWITCH_SCALAR]], i32 [[SRC1_SCALAR]], i32 [[SRC2_SCALAR]] +; CHECK-NEXT: [[RESULT47:%.*]] = select i1 [[SWITCH_SCALAR31]], i32 [[SRC1_SCALAR1]], i32 [[SRC2_SCALAR16]] +; CHECK-NEXT: [[RESULT48:%.*]] = select i1 [[SWITCH_SCALAR32]], i32 [[SRC1_SCALAR2]], i32 
[[SRC2_SCALAR17]] +; CHECK-NEXT: [[RESULT49:%.*]] = select i1 [[SWITCH_SCALAR33]], i32 [[SRC1_SCALAR3]], i32 [[SRC2_SCALAR18]] +; CHECK-NEXT: [[RESULT50:%.*]] = select i1 [[SWITCH_SCALAR34]], i32 [[SRC1_SCALAR4]], i32 [[SRC2_SCALAR19]] +; CHECK-NEXT: [[RESULT51:%.*]] = select i1 [[SWITCH_SCALAR35]], i32 [[SRC1_SCALAR5]], i32 [[SRC2_SCALAR20]] +; CHECK-NEXT: [[RESULT52:%.*]] = select i1 [[SWITCH_SCALAR36]], i32 [[SRC1_SCALAR6]], i32 [[SRC2_SCALAR21]] +; CHECK-NEXT: [[RESULT53:%.*]] = select i1 [[SWITCH_SCALAR37]], i32 [[SRC1_SCALAR7]], i32 [[SRC2_SCALAR22]] +; CHECK-NEXT: [[RESULT54:%.*]] = select i1 [[SWITCH_SCALAR38]], i32 [[SRC1_SCALAR8]], i32 [[SRC2_SCALAR23]] +; CHECK-NEXT: [[RESULT55:%.*]] = select i1 [[SWITCH_SCALAR39]], i32 [[SRC1_SCALAR9]], i32 [[SRC2_SCALAR24]] +; CHECK-NEXT: [[RESULT56:%.*]] = select i1 [[SWITCH_SCALAR40]], i32 [[SRC1_SCALAR10]], i32 [[SRC2_SCALAR25]] +; CHECK-NEXT: [[RESULT57:%.*]] = select i1 [[SWITCH_SCALAR41]], i32 [[SRC1_SCALAR11]], i32 [[SRC2_SCALAR26]] +; CHECK-NEXT: [[RESULT58:%.*]] = select i1 [[SWITCH_SCALAR42]], i32 [[SRC1_SCALAR12]], i32 [[SRC2_SCALAR27]] +; CHECK-NEXT: [[RESULT59:%.*]] = select i1 [[SWITCH_SCALAR43]], i32 [[SRC1_SCALAR13]], i32 [[SRC2_SCALAR28]] +; CHECK-NEXT: [[RESULT60:%.*]] = select i1 [[SWITCH_SCALAR44]], i32 [[SRC1_SCALAR14]], i32 [[SRC2_SCALAR29]] +; CHECK-NEXT: [[RESULT61:%.*]] = select i1 [[SWITCH_SCALAR45]], i32 [[SRC1_SCALAR15]], i32 [[SRC2_SCALAR30]] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <16 x i32> undef, i32 [[RESULT46]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT62:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT47]], i32 1 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT63:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT62]], i32 [[RESULT48]], i32 2 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT64:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT63]], i32 [[RESULT49]], i32 3 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT65:%.*]] = insertelement <16 x i32> 
[[RESULT_ASSEMBLED_VECT64]], i32 [[RESULT50]], i32 4 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT66:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT65]], i32 [[RESULT51]], i32 5 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT67:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT66]], i32 [[RESULT52]], i32 6 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT68:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT67]], i32 [[RESULT53]], i32 7 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT69:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT68]], i32 [[RESULT54]], i32 8 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT70:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT69]], i32 [[RESULT55]], i32 9 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT71:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT70]], i32 [[RESULT56]], i32 10 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT72:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT71]], i32 [[RESULT57]], i32 11 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT73:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT72]], i32 [[RESULT58]], i32 12 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT74:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT73]], i32 [[RESULT59]], i32 13 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT75:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT74]], i32 [[RESULT60]], i32 14 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT76:%.*]] = insertelement <16 x i32> [[RESULT_ASSEMBLED_VECT75]], i32 [[RESULT61]], i32 15 +; CHECK-NEXT: ret <16 x i32> [[RESULT_ASSEMBLED_VECT76]] +; + %result = select <16 x i1> %switch, <16 x i32> %src1, <16 x i32> %src2 + ret <16 x i32> %result +} + +define <2 x i32> @should_work_with_constant_value(<2 x i1> %switch, <2 x i32> %src1) { +; CHECK-LABEL: define <2 x i32> @should_work_with_constant_value( +; CHECK-SAME: <2 x i1> [[SWITCH:%.*]], <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SWITCH_SCALAR:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 0 +; CHECK-NEXT: [[SWITCH_SCALAR2:%.*]] = extractelement <2 x i1> [[SWITCH]], 
i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[RESULT3:%.*]] = select i1 [[SWITCH_SCALAR]], i32 [[SRC1_SCALAR]], i32 2 +; CHECK-NEXT: [[RESULT4:%.*]] = select i1 [[SWITCH_SCALAR2]], i32 [[SRC1_SCALAR1]], i32 4 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[RESULT3]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT5:%.*]] = insertelement <2 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT4]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[RESULT_ASSEMBLED_VECT5]] +; + %result = select <2 x i1> %switch, <2 x i32> %src1, <2 x i32> <i32 2, i32 4> + ret <2 x i32> %result +} + +define <2 x i32> @should_work_with_non_vector_condition(i1 %switch, <2 x i32> %src1, <2 x i32> %src2) { +; CHECK-LABEL: define <2 x i32> @should_work_with_non_vector_condition( +; CHECK-SAME: i1 [[SWITCH:%.*]], <2 x i32> [[SRC1:%.*]], <2 x i32> [[SRC2:%.*]]) { +; CHECK-NEXT: [[SRC2_SCALAR:%.*]] = extractelement <2 x i32> [[SRC2]], i32 0 +; CHECK-NEXT: [[SRC2_SCALAR2:%.*]] = extractelement <2 x i32> [[SRC2]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[RESULT3:%.*]] = select i1 [[SWITCH]], i32 [[SRC1_SCALAR]], i32 [[SRC2_SCALAR]] +; CHECK-NEXT: [[RESULT4:%.*]] = select i1 [[SWITCH]], i32 [[SRC1_SCALAR1]], i32 [[SRC2_SCALAR2]] +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[RESULT3]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT5:%.*]] = insertelement <2 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[RESULT4]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[RESULT_ASSEMBLED_VECT5]] +; + %result = select i1 %switch, <2 x i32> %src1, <2 x i32> %src2 + ret <2 x i32> %result +} + +define <2 x i32> @should_not_select_from_the_same_value(<2 x i1> %switch, <2 x i32> %src1) { +; CHECK-LABEL: define <2 x i32> 
@should_not_select_from_the_same_value( +; CHECK-SAME: <2 x i1> [[SWITCH:%.*]], <2 x i32> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SWITCH_SCALAR:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 0 +; CHECK-NEXT: [[SWITCH_SCALAR2:%.*]] = extractelement <2 x i1> [[SWITCH]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x i32> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x i32> [[SRC1]], i32 1 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[SRC1_SCALAR]], i32 0 +; CHECK-NEXT: [[RESULT_ASSEMBLED_VECT3:%.*]] = insertelement <2 x i32> [[RESULT_ASSEMBLED_VECT]], i32 [[SRC1_SCALAR1]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[RESULT_ASSEMBLED_VECT3]] +; + %result = select <2 x i1> %switch, <2 x i32> %src1, <2 x i32> %src1 + ret <2 x i32> %result +} + +define i32 @should_not_scalarize_scalar(i1 %switch, i32 %src1, i32 %src2) { +; CHECK-LABEL: define i32 @should_not_scalarize_scalar( +; CHECK-SAME: i1 [[SWITCH:%.*]], i32 [[SRC1:%.*]], i32 [[SRC2:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = select i1 [[SWITCH]], i32 [[SRC1]], i32 [[SRC2]] +; CHECK-NEXT: ret i32 [[RESULT]] +; + %result = select i1 %switch, i32 %src1, i32 %src2 + ret i32 %result +} diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-unary-instruction-typed-pointers.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-unary-instruction-typed-pointers.ll new file mode 100644 index 000000000000..3b33266b812b --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-unary-instruction-typed-pointers.ll @@ -0,0 +1,187 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize -S < %s | FileCheck %s +; 
------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define spir_kernel void @basic(<2 x float> %src1) { +; CHECK-LABEL: define spir_kernel void @basic( +; CHECK-SAME: <2 x float> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x float>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = fneg float [[SRC1_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = fneg float [[SRC1_SCALAR1]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x float> [[DOTASSEMBLED_VECT]], float [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x float> [[DOTASSEMBLED_VECT2]], <2 x float>* [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x float> + %2 = fneg <2 x float> %src1 + store <2 x float> %2, <2 x float>* %1 + ret void +} + +define spir_kernel void @should_work_with_different_type(<2 x double> %src1) { +; CHECK-LABEL: define spir_kernel void @should_work_with_different_type( +; CHECK-SAME: <2 x double> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x double> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x double> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x double>, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = fneg double [[SRC1_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = fneg double [[SRC1_SCALAR1]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x double> [[DOTASSEMBLED_VECT]], double [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x double> [[DOTASSEMBLED_VECT2]], <2 x double>* 
[[TMP1]], align 16 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x double> + %2 = fneg <2 x double> %src1 + store <2 x double> %2, <2 x double>* %1 + ret void +} + +define spir_kernel void @should_work_with_fast_math_flags(<2 x double> %src1) { +; CHECK-LABEL: @should_work_with_fast_math_flags( +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x double> [[SRC1:%.*]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x double> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x double>, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = fneg fast double [[SRC1_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = fneg fast double [[SRC1_SCALAR1]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x double> [[DOTASSEMBLED_VECT]], double [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x double> [[DOTASSEMBLED_VECT2]], <2 x double>* [[TMP1]], align 16 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x double> + %2 = fneg fast <2 x double> %src1 + store <2 x double> %2, <2 x double>* %1 + ret void +} + +define spir_kernel void @should_work_with_larger_vector_size(<16 x float> %src1) { +; CHECK-LABEL: define spir_kernel void @should_work_with_larger_vector_size( +; CHECK-SAME: <16 x float> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <16 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <16 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR2:%.*]] = extractelement <16 x float> [[SRC1]], i32 2 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <16 x float> [[SRC1]], i32 3 +; CHECK-NEXT: [[SRC1_SCALAR4:%.*]] = extractelement <16 x float> [[SRC1]], i32 4 +; CHECK-NEXT: [[SRC1_SCALAR5:%.*]] = extractelement <16 x float> [[SRC1]], i32 5 +; CHECK-NEXT: [[SRC1_SCALAR6:%.*]] = extractelement <16 x float> [[SRC1]], i32 6 +; CHECK-NEXT: [[SRC1_SCALAR7:%.*]] = extractelement <16 x float> [[SRC1]], i32 7 +; CHECK-NEXT: 
[[SRC1_SCALAR8:%.*]] = extractelement <16 x float> [[SRC1]], i32 8 +; CHECK-NEXT: [[SRC1_SCALAR9:%.*]] = extractelement <16 x float> [[SRC1]], i32 9 +; CHECK-NEXT: [[SRC1_SCALAR10:%.*]] = extractelement <16 x float> [[SRC1]], i32 10 +; CHECK-NEXT: [[SRC1_SCALAR11:%.*]] = extractelement <16 x float> [[SRC1]], i32 11 +; CHECK-NEXT: [[SRC1_SCALAR12:%.*]] = extractelement <16 x float> [[SRC1]], i32 12 +; CHECK-NEXT: [[SRC1_SCALAR13:%.*]] = extractelement <16 x float> [[SRC1]], i32 13 +; CHECK-NEXT: [[SRC1_SCALAR14:%.*]] = extractelement <16 x float> [[SRC1]], i32 14 +; CHECK-NEXT: [[SRC1_SCALAR15:%.*]] = extractelement <16 x float> [[SRC1]], i32 15 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <16 x float>, align 64 +; CHECK-NEXT: [[TMP2:%.*]] = fneg float [[SRC1_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = fneg float [[SRC1_SCALAR1]] +; CHECK-NEXT: [[TMP4:%.*]] = fneg float [[SRC1_SCALAR2]] +; CHECK-NEXT: [[TMP5:%.*]] = fneg float [[SRC1_SCALAR3]] +; CHECK-NEXT: [[TMP6:%.*]] = fneg float [[SRC1_SCALAR4]] +; CHECK-NEXT: [[TMP7:%.*]] = fneg float [[SRC1_SCALAR5]] +; CHECK-NEXT: [[TMP8:%.*]] = fneg float [[SRC1_SCALAR6]] +; CHECK-NEXT: [[TMP9:%.*]] = fneg float [[SRC1_SCALAR7]] +; CHECK-NEXT: [[TMP10:%.*]] = fneg float [[SRC1_SCALAR8]] +; CHECK-NEXT: [[TMP11:%.*]] = fneg float [[SRC1_SCALAR9]] +; CHECK-NEXT: [[TMP12:%.*]] = fneg float [[SRC1_SCALAR10]] +; CHECK-NEXT: [[TMP13:%.*]] = fneg float [[SRC1_SCALAR11]] +; CHECK-NEXT: [[TMP14:%.*]] = fneg float [[SRC1_SCALAR12]] +; CHECK-NEXT: [[TMP15:%.*]] = fneg float [[SRC1_SCALAR13]] +; CHECK-NEXT: [[TMP16:%.*]] = fneg float [[SRC1_SCALAR14]] +; CHECK-NEXT: [[TMP17:%.*]] = fneg float [[SRC1_SCALAR15]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT16:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT]], float [[TMP3]], i32 1 +; CHECK-NEXT: [[DOTASSEMBLED_VECT17:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT16]], float [[TMP4]], i32 2 +; 
CHECK-NEXT: [[DOTASSEMBLED_VECT18:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT17]], float [[TMP5]], i32 3 +; CHECK-NEXT: [[DOTASSEMBLED_VECT19:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT18]], float [[TMP6]], i32 4 +; CHECK-NEXT: [[DOTASSEMBLED_VECT20:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT19]], float [[TMP7]], i32 5 +; CHECK-NEXT: [[DOTASSEMBLED_VECT21:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT20]], float [[TMP8]], i32 6 +; CHECK-NEXT: [[DOTASSEMBLED_VECT22:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT21]], float [[TMP9]], i32 7 +; CHECK-NEXT: [[DOTASSEMBLED_VECT23:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT22]], float [[TMP10]], i32 8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT24:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT23]], float [[TMP11]], i32 9 +; CHECK-NEXT: [[DOTASSEMBLED_VECT25:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT24]], float [[TMP12]], i32 10 +; CHECK-NEXT: [[DOTASSEMBLED_VECT26:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT25]], float [[TMP13]], i32 11 +; CHECK-NEXT: [[DOTASSEMBLED_VECT27:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT26]], float [[TMP14]], i32 12 +; CHECK-NEXT: [[DOTASSEMBLED_VECT28:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT27]], float [[TMP15]], i32 13 +; CHECK-NEXT: [[DOTASSEMBLED_VECT29:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT28]], float [[TMP16]], i32 14 +; CHECK-NEXT: [[DOTASSEMBLED_VECT30:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT29]], float [[TMP17]], i32 15 +; CHECK-NEXT: store <16 x float> [[DOTASSEMBLED_VECT30]], <16 x float>* [[TMP1]], align 64 +; CHECK-NEXT: ret void +; + %1 = alloca <16 x float> + %2 = fneg <16 x float> %src1 + store <16 x float> %2, <16 x float>* %1 + ret void +} + +define void @should_not_scalarize_optnone(<2 x float> %src1) #0 { +; CHECK-LABEL: @should_not_scalarize_optnone( +; CHECK: fneg <2 x float> %src1 + %1 = alloca <2 x float> + %2 = fneg <2 x float> %src1 + 
store <2 x float> %2, <2 x float>* %1 + ret void +} + +define <2 x float> @should_not_scalarize_const(<2 x float> %src1) { +; CHECK-LABEL: @should_not_scalarize_const( +; CHECK-NEXT: [[TMP1:%.*]] = fneg <2 x float> +; CHECK-NEXT: ret <2 x float> [[TMP1]] +; + %1 = fneg <2 x float> <float 1.000000e+00, float 2.000000e+00> + ret <2 x float> %1 +} + +define spir_kernel void @should_not_scalarize_scalar(float %src1) { +; CHECK-LABEL: @should_not_scalarize_scalar( +; CHECK-NEXT: [[TMP1:%.*]] = alloca float, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fneg float [[SRC1:%.*]] +; CHECK-NEXT: store float [[TMP2]], float* [[TMP1]], align 4 +; CHECK-NEXT: ret void +; + %1 = alloca float + %2 = fneg float %src1 + store float %2, float* %1 + ret void +} + +define void @test_fneg_optnone(<4 x float> %src, <3 x float> addrspace(1)* %out) #0 { + +; CHECK-LABEL: @test_fneg_optnone( +; +; CHECK: [[EE0:%.*]] = extractelement <4 x float> %src, i32 0 +; CHECK: [[EE1:%.*]] = extractelement <4 x float> %src, i32 1 +; CHECK: [[EE2:%.*]] = extractelement <4 x float> %src, i32 2 +; CHECK: [[EE3:%.*]] = extractelement <4 x float> %src, i32 3 +; CHECK: [[IE0:%.*]] = insertelement <3 x float> undef, float [[EE0]], i32 0 +; CHECK: [[IE1:%.*]] = insertelement <3 x float> [[IE0]], float [[EE1]], i32 1 +; CHECK: [[IE2:%.*]] = insertelement <3 x float> [[IE1]], float [[EE2]], i32 2 +; CHECK: [[FNEG:%.*]] = fneg <3 x float> [[IE2]] +; CHECK: store <3 x float> [[FNEG]], <3 x float> addrspace(1)* %out, align 4 + +; CHECK-NOT: fneg <3 x float> undef + + %1 = shufflevector <4 x float> %src, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> + %2 = fneg <3 x float> %1 + store <3 x float> %2, <3 x float> addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { noinline optnone } \ No newline at end of file diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-unary-instruction.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-unary-instruction.ll new file mode 100644 index 000000000000..c8b0720820e6 --- /dev/null +++ 
b/IGC/Compiler/tests/ScalarizeFunction/scalarize-unary-instruction.ll @@ -0,0 +1,194 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize --opaque-pointers -S < %s | FileCheck %s +; REQUIRES: llvm-14-plus +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define spir_kernel void @basic(<2 x float> %src1) { +; CHECK-LABEL: define spir_kernel void @basic( +; CHECK-SAME: <2 x float> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x float> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x float>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = fneg float [[SRC1_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = fneg float [[SRC1_SCALAR1]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x float> [[DOTASSEMBLED_VECT]], float [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x float> [[DOTASSEMBLED_VECT2]], ptr [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x float> + %2 = fneg <2 x float> %src1 + store <2 x float> %2, ptr %1 + ret void +} + +define spir_kernel void @should_work_with_different_type(<2 x double> %src1) { +; CHECK-LABEL: define spir_kernel void @should_work_with_different_type( +; CHECK-SAME: <2 x double> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x double> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = 
extractelement <2 x double> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x double>, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = fneg double [[SRC1_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = fneg double [[SRC1_SCALAR1]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x double> [[DOTASSEMBLED_VECT]], double [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x double> [[DOTASSEMBLED_VECT2]], ptr [[TMP1]], align 16 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x double> + %2 = fneg <2 x double> %src1 + store <2 x double> %2, ptr %1 + ret void +} + +define spir_kernel void @should_work_with_fast_math_flags(<2 x double> %src1) { +; CHECK-LABEL: define spir_kernel void @should_work_with_fast_math_flags( +; CHECK-SAME: <2 x double> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <2 x double> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <2 x double> [[SRC1]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x double>, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = fneg fast double [[SRC1_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = fneg fast double [[SRC1_SCALAR1]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT2:%.*]] = insertelement <2 x double> [[DOTASSEMBLED_VECT]], double [[TMP3]], i32 1 +; CHECK-NEXT: store <2 x double> [[DOTASSEMBLED_VECT2]], ptr [[TMP1]], align 16 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x double> + %2 = fneg fast <2 x double> %src1 + store <2 x double> %2, ptr %1 + ret void +} + +define spir_kernel void @should_work_with_larger_vector_size(<16 x float> %src1) { +; CHECK-LABEL: define spir_kernel void @should_work_with_larger_vector_size( +; CHECK-SAME: <16 x float> [[SRC1:%.*]]) { +; CHECK-NEXT: [[SRC1_SCALAR:%.*]] = extractelement <16 x float> [[SRC1]], i32 0 +; CHECK-NEXT: [[SRC1_SCALAR1:%.*]] = extractelement <16 x float> 
[[SRC1]], i32 1 +; CHECK-NEXT: [[SRC1_SCALAR2:%.*]] = extractelement <16 x float> [[SRC1]], i32 2 +; CHECK-NEXT: [[SRC1_SCALAR3:%.*]] = extractelement <16 x float> [[SRC1]], i32 3 +; CHECK-NEXT: [[SRC1_SCALAR4:%.*]] = extractelement <16 x float> [[SRC1]], i32 4 +; CHECK-NEXT: [[SRC1_SCALAR5:%.*]] = extractelement <16 x float> [[SRC1]], i32 5 +; CHECK-NEXT: [[SRC1_SCALAR6:%.*]] = extractelement <16 x float> [[SRC1]], i32 6 +; CHECK-NEXT: [[SRC1_SCALAR7:%.*]] = extractelement <16 x float> [[SRC1]], i32 7 +; CHECK-NEXT: [[SRC1_SCALAR8:%.*]] = extractelement <16 x float> [[SRC1]], i32 8 +; CHECK-NEXT: [[SRC1_SCALAR9:%.*]] = extractelement <16 x float> [[SRC1]], i32 9 +; CHECK-NEXT: [[SRC1_SCALAR10:%.*]] = extractelement <16 x float> [[SRC1]], i32 10 +; CHECK-NEXT: [[SRC1_SCALAR11:%.*]] = extractelement <16 x float> [[SRC1]], i32 11 +; CHECK-NEXT: [[SRC1_SCALAR12:%.*]] = extractelement <16 x float> [[SRC1]], i32 12 +; CHECK-NEXT: [[SRC1_SCALAR13:%.*]] = extractelement <16 x float> [[SRC1]], i32 13 +; CHECK-NEXT: [[SRC1_SCALAR14:%.*]] = extractelement <16 x float> [[SRC1]], i32 14 +; CHECK-NEXT: [[SRC1_SCALAR15:%.*]] = extractelement <16 x float> [[SRC1]], i32 15 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <16 x float>, align 64 +; CHECK-NEXT: [[TMP2:%.*]] = fneg float [[SRC1_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = fneg float [[SRC1_SCALAR1]] +; CHECK-NEXT: [[TMP4:%.*]] = fneg float [[SRC1_SCALAR2]] +; CHECK-NEXT: [[TMP5:%.*]] = fneg float [[SRC1_SCALAR3]] +; CHECK-NEXT: [[TMP6:%.*]] = fneg float [[SRC1_SCALAR4]] +; CHECK-NEXT: [[TMP7:%.*]] = fneg float [[SRC1_SCALAR5]] +; CHECK-NEXT: [[TMP8:%.*]] = fneg float [[SRC1_SCALAR6]] +; CHECK-NEXT: [[TMP9:%.*]] = fneg float [[SRC1_SCALAR7]] +; CHECK-NEXT: [[TMP10:%.*]] = fneg float [[SRC1_SCALAR8]] +; CHECK-NEXT: [[TMP11:%.*]] = fneg float [[SRC1_SCALAR9]] +; CHECK-NEXT: [[TMP12:%.*]] = fneg float [[SRC1_SCALAR10]] +; CHECK-NEXT: [[TMP13:%.*]] = fneg float [[SRC1_SCALAR11]] +; CHECK-NEXT: [[TMP14:%.*]] = fneg float [[SRC1_SCALAR12]] +; 
CHECK-NEXT: [[TMP15:%.*]] = fneg float [[SRC1_SCALAR13]] +; CHECK-NEXT: [[TMP16:%.*]] = fneg float [[SRC1_SCALAR14]] +; CHECK-NEXT: [[TMP17:%.*]] = fneg float [[SRC1_SCALAR15]] +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT16:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT]], float [[TMP3]], i32 1 +; CHECK-NEXT: [[DOTASSEMBLED_VECT17:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT16]], float [[TMP4]], i32 2 +; CHECK-NEXT: [[DOTASSEMBLED_VECT18:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT17]], float [[TMP5]], i32 3 +; CHECK-NEXT: [[DOTASSEMBLED_VECT19:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT18]], float [[TMP6]], i32 4 +; CHECK-NEXT: [[DOTASSEMBLED_VECT20:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT19]], float [[TMP7]], i32 5 +; CHECK-NEXT: [[DOTASSEMBLED_VECT21:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT20]], float [[TMP8]], i32 6 +; CHECK-NEXT: [[DOTASSEMBLED_VECT22:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT21]], float [[TMP9]], i32 7 +; CHECK-NEXT: [[DOTASSEMBLED_VECT23:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT22]], float [[TMP10]], i32 8 +; CHECK-NEXT: [[DOTASSEMBLED_VECT24:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT23]], float [[TMP11]], i32 9 +; CHECK-NEXT: [[DOTASSEMBLED_VECT25:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT24]], float [[TMP12]], i32 10 +; CHECK-NEXT: [[DOTASSEMBLED_VECT26:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT25]], float [[TMP13]], i32 11 +; CHECK-NEXT: [[DOTASSEMBLED_VECT27:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT26]], float [[TMP14]], i32 12 +; CHECK-NEXT: [[DOTASSEMBLED_VECT28:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT27]], float [[TMP15]], i32 13 +; CHECK-NEXT: [[DOTASSEMBLED_VECT29:%.*]] = insertelement <16 x float> [[DOTASSEMBLED_VECT28]], float [[TMP16]], i32 14 +; CHECK-NEXT: [[DOTASSEMBLED_VECT30:%.*]] = 
insertelement <16 x float> [[DOTASSEMBLED_VECT29]], float [[TMP17]], i32 15 +; CHECK-NEXT: store <16 x float> [[DOTASSEMBLED_VECT30]], ptr [[TMP1]], align 64 +; CHECK-NEXT: ret void +; + %1 = alloca <16 x float> + %2 = fneg <16 x float> %src1 + store <16 x float> %2, ptr %1 + ret void +} + +define void @should_not_scalarize_optnone(<2 x float> %src1) #0 { +; CHECK-LABEL: define void @should_not_scalarize_optnone( +; CHECK-SAME: <2 x float> [[SRC1:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = alloca <2 x float>, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = fneg <2 x float> [[SRC1]] +; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[TMP1]], align 8 +; CHECK-NEXT: ret void +; + %1 = alloca <2 x float> + %2 = fneg <2 x float> %src1 + store <2 x float> %2, ptr %1 + ret void +} + +define <2 x float> @should_not_scalarize_const(<2 x float> %src1) { +; CHECK-LABEL: define <2 x float> @should_not_scalarize_const( +; CHECK-SAME: <2 x float> [[SRC1:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = fneg <2 x float> +; CHECK-NEXT: ret <2 x float> [[TMP1]] +; + %1 = fneg <2 x float> <float 1.000000e+00, float 2.000000e+00> + ret <2 x float> %1 +} + +define spir_kernel void @should_not_scalarize_scalar(float %src1) { +; CHECK-LABEL: define spir_kernel void @should_not_scalarize_scalar( +; CHECK-SAME: float [[SRC1:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = alloca float, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fneg float [[SRC1]] +; CHECK-NEXT: store float [[TMP2]], ptr [[TMP1]], align 4 +; CHECK-NEXT: ret void +; + %1 = alloca float + %2 = fneg float %src1 + store float %2, ptr %1 + ret void +} + +define void @test_fneg_optnone(<4 x float> %src, ptr addrspace(1) %out) #0 { +; CHECK-LABEL: define void @test_fneg_optnone( +; CHECK-SAME: <4 x float> [[SRC:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SRC_SCALAR:%.*]] = extractelement <4 x float> [[SRC]], i32 0 +; CHECK-NEXT: [[SRC_SCALAR1:%.*]] = extractelement <4 x float> [[SRC]], i32 1 +; CHECK-NEXT: [[SRC_SCALAR2:%.*]] = extractelement <4 x float> [[SRC]], i32 2 +; 
CHECK-NEXT: [[SRC_SCALAR3:%.*]] = extractelement <4 x float> [[SRC]], i32 3 +; CHECK-NEXT: [[DOTASSEMBLED_VECT:%.*]] = insertelement <3 x float> undef, float [[SRC_SCALAR]], i32 0 +; CHECK-NEXT: [[DOTASSEMBLED_VECT4:%.*]] = insertelement <3 x float> [[DOTASSEMBLED_VECT]], float [[SRC_SCALAR1]], i32 1 +; CHECK-NEXT: [[DOTASSEMBLED_VECT5:%.*]] = insertelement <3 x float> [[DOTASSEMBLED_VECT4]], float [[SRC_SCALAR2]], i32 2 +; CHECK-NEXT: [[TMP1:%.*]] = fneg <3 x float> [[DOTASSEMBLED_VECT5]] +; CHECK-NEXT: store <3 x float> [[TMP1]], ptr addrspace(1) [[OUT]], align 4 +; CHECK-NEXT: ret void +; + %1 = shufflevector <4 x float> %src, <4 x float> undef, <3 x i32> + %2 = fneg <3 x float> %1 + store <3 x float> %2, ptr addrspace(1) %out, align 4 + ret void +} + +attributes #0 = { noinline optnone } diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-vector-instructions-typed-pointers.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-vector-instructions-typed-pointers.ll new file mode 100644 index 000000000000..af285857e35f --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-vector-instructions-typed-pointers.ll @@ -0,0 +1,145 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize -S < %s | FileCheck %s +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should break vector operation into many scalar operations +; ------------------------------------------------ + +define i32 @basic(i32 %src) { +; CHECK-LABEL: define i32 @basic( +; CHECK-SAME: i32 [[SRC:%.*]]) { +; CHECK-NEXT: ret i32 [[SRC]] +; + %vector = insertelement <2 x i32> undef, i32 
%src, i32 0 + %result = extractelement <2 x i32> %vector, i32 0 + ret i32 %result +} + +define float @should_work_with_different_value_type(float %src) { +; CHECK-LABEL: define float @should_work_with_different_value_type( +; CHECK-SAME: float [[SRC:%.*]]) { +; CHECK-NEXT: ret float [[SRC]] +; + %vector = insertelement <2 x float> undef, float %src, i32 0 + %result = extractelement <2 x float> %vector, i32 0 + ret float %result +} + +define i32 @should_return_undef(i32 %src) { +; CHECK-LABEL: define i32 @should_return_undef( +; CHECK-SAME: i32 [[SRC:%.*]]) { +; CHECK-NEXT: ret i32 undef +; + %vector = insertelement <2 x i32> undef, i32 %src, i32 0 + %result = extractelement <2 x i32> %vector, i32 1 + ret i32 %result +} + + +define i32 @should_work_with_larger_vector_size() { +; CHECK-LABEL: define i32 @should_work_with_larger_vector_size() { +; CHECK-NEXT: [[RESULT:%.*]] = add i32 0, 8 +; CHECK-NEXT: ret i32 [[RESULT]] +; + %1 = insertelement <16 x i32> undef, i32 0, i32 0 + %2 = insertelement <16 x i32> %1, i32 1, i32 1 + %3 = insertelement <16 x i32> %2, i32 2, i32 2 + %4 = insertelement <16 x i32> %3, i32 3, i32 3 + %5 = insertelement <16 x i32> %4, i32 4, i32 4 + %6 = insertelement <16 x i32> %5, i32 5, i32 5 + %7 = insertelement <16 x i32> %6, i32 6, i32 6 + %8 = insertelement <16 x i32> %7, i32 7, i32 7 + %9 = insertelement <16 x i32> %8, i32 8, i32 8 + %10 = insertelement <16 x i32> %9, i32 9, i32 9 + %11 = insertelement <16 x i32> %10, i32 10, i32 10 + %12 = insertelement <16 x i32> %11, i32 11, i32 11 + %13 = insertelement <16 x i32> %12, i32 12, i32 12 + %14 = insertelement <16 x i32> %13, i32 13, i32 13 + %15 = insertelement <16 x i32> %14, i32 14, i32 14 + %vector = insertelement <16 x i32> %15, i32 15, i32 15 + %first = extractelement <16 x i32> %vector, i32 0 + %second = extractelement <16 x i32> %vector, i32 8 + %result = add i32 %first, %second + ret i32 %result +} + +define i32 @should_work_with_shuffle_instruction() { +; CHECK-LABEL: define i32 
@should_work_with_shuffle_instruction() { +; CHECK-NEXT: [[RETURN:%.*]] = add i32 1, 7 +; CHECK-NEXT: ret i32 [[RETURN]] +; + %1 = insertelement <4 x i32> undef, i32 0, i32 0 + %2 = insertelement <4 x i32> %1, i32 1, i32 1 + %3 = insertelement <4 x i32> %2, i32 2, i32 2 + %first_vector = insertelement <4 x i32> %3, i32 3, i32 3 + + %4 = insertelement <4 x i32> undef, i32 4, i32 0 + %5 = insertelement <4 x i32> %4, i32 5, i32 1 + %6 = insertelement <4 x i32> %5, i32 6, i32 2 + %second_vector = insertelement <4 x i32> %6, i32 7, i32 3 + + %result = shufflevector <4 x i32> %first_vector, <4 x i32> %second_vector, <4 x i32> + + %first = extractelement <4 x i32> %result, i32 0 + %second = extractelement <4 x i32> %result, i32 3 + %return = add i32 %first, %second + ret i32 %return +} + + +define i32 @should_not_scalarize_with_index_as_variable(i32 %data, i32 %index) { +; CHECK-LABEL: define i32 @should_not_scalarize_with_index_as_variable( +; CHECK-SAME: i32 [[DATA:%.*]], i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[VECTOR:%.*]] = insertelement <2 x i32> undef, i32 [[DATA]], i32 [[INDEX]] +; CHECK-NEXT: [[RESULT:%.*]] = extractelement <2 x i32> [[VECTOR]], i32 [[INDEX]] +; CHECK-NEXT: ret i32 [[RESULT]] +; + %vector = insertelement <2 x i32> undef, i32 %data, i32 %index + %result = extractelement <2 x i32> %vector, i32 %index + ret i32 %result +} + +define i32 @should_work_with_shuffle_undef() { +; CHECK-LABEL: define i32 @should_work_with_shuffle_undef() { +; CHECK-NEXT: [[RETURN:%.*]] = add i32 undef, undef +; CHECK-NEXT: ret i32 [[RETURN]] +; + %result = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> + + %first = extractelement <4 x i32> %result, i32 0 + %second = extractelement <4 x i32> %result, i32 3 + %return = add i32 %first, %second + ret i32 %return +} + +define i32 @should_work_with_shuffle_undef_2() { +; CHECK-LABEL: define i32 @should_work_with_shuffle_undef_2() { +; CHECK-NEXT: [[RETURN:%.*]] = add i32 1, undef +; CHECK-NEXT: ret i32 [[RETURN]] +; + %1 
= insertelement <4 x i32> undef, i32 0, i32 0 + %2 = insertelement <4 x i32> %1, i32 1, i32 1 + %3 = insertelement <4 x i32> %2, i32 2, i32 2 + %first_vector = insertelement <4 x i32> %3, i32 3, i32 3 + + %4 = insertelement <4 x i32> undef, i32 4, i32 0 + %5 = insertelement <4 x i32> %4, i32 5, i32 1 + %6 = insertelement <4 x i32> %5, i32 6, i32 2 + %second_vector = insertelement <4 x i32> %6, i32 7, i32 3 + + %result = shufflevector <4 x i32> %first_vector, <4 x i32> %second_vector, <4 x i32> + + %first = extractelement <4 x i32> %result, i32 0 + %second = extractelement <4 x i32> %result, i32 3 + %return = add i32 %first, %second + ret i32 %return +} diff --git a/IGC/Compiler/tests/ScalarizeFunction/scalarize-vector-instructions.ll b/IGC/Compiler/tests/ScalarizeFunction/scalarize-vector-instructions.ll new file mode 100644 index 000000000000..e1672779d0b6 --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/scalarize-vector-instructions.ll @@ -0,0 +1,146 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; RUN: igc_opt --igc-scalarize --opaque-pointers -S < %s | FileCheck %s +; REQUIRES: llvm-14-plus +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; The pass should remove redundant vector operations +; ------------------------------------------------ + +define i32 @basic(i32 %src) { +; CHECK-LABEL: define i32 @basic( +; CHECK-SAME: i32 [[SRC:%.*]]) { +; CHECK-NEXT: ret i32 [[SRC]] +; + %vector = insertelement <2 x i32> undef, i32 %src, i32 0 + %result = extractelement <2 x i32> %vector, i32 0 + ret i32 %result +} + +define float @should_work_with_different_value_type(float %src) { +; 
CHECK-LABEL: define float @should_work_with_different_value_type( +; CHECK-SAME: float [[SRC:%.*]]) { +; CHECK-NEXT: ret float [[SRC]] +; + %vector = insertelement <2 x float> undef, float %src, i32 0 + %result = extractelement <2 x float> %vector, i32 0 + ret float %result +} + +define i32 @should_return_undef(i32 %src) { +; CHECK-LABEL: define i32 @should_return_undef( +; CHECK-SAME: i32 [[SRC:%.*]]) { +; CHECK-NEXT: ret i32 undef +; + %vector = insertelement <2 x i32> undef, i32 %src, i32 0 + %result = extractelement <2 x i32> %vector, i32 1 + ret i32 %result +} + + +define i32 @should_work_with_larger_vector_size() { +; CHECK-LABEL: define i32 @should_work_with_larger_vector_size() { +; CHECK-NEXT: [[RESULT:%.*]] = add i32 0, 8 +; CHECK-NEXT: ret i32 [[RESULT]] +; + %1 = insertelement <16 x i32> undef, i32 0, i32 0 + %2 = insertelement <16 x i32> %1, i32 1, i32 1 + %3 = insertelement <16 x i32> %2, i32 2, i32 2 + %4 = insertelement <16 x i32> %3, i32 3, i32 3 + %5 = insertelement <16 x i32> %4, i32 4, i32 4 + %6 = insertelement <16 x i32> %5, i32 5, i32 5 + %7 = insertelement <16 x i32> %6, i32 6, i32 6 + %8 = insertelement <16 x i32> %7, i32 7, i32 7 + %9 = insertelement <16 x i32> %8, i32 8, i32 8 + %10 = insertelement <16 x i32> %9, i32 9, i32 9 + %11 = insertelement <16 x i32> %10, i32 10, i32 10 + %12 = insertelement <16 x i32> %11, i32 11, i32 11 + %13 = insertelement <16 x i32> %12, i32 12, i32 12 + %14 = insertelement <16 x i32> %13, i32 13, i32 13 + %15 = insertelement <16 x i32> %14, i32 14, i32 14 + %vector = insertelement <16 x i32> %15, i32 15, i32 15 + %first = extractelement <16 x i32> %vector, i32 0 + %second = extractelement <16 x i32> %vector, i32 8 + %result = add i32 %first, %second + ret i32 %result +} + +define i32 @should_work_with_shuffle_instruction() { +; CHECK-LABEL: define i32 @should_work_with_shuffle_instruction() { +; CHECK-NEXT: [[RETURN:%.*]] = add i32 1, 7 +; CHECK-NEXT: ret i32 [[RETURN]] +; + %1 = insertelement <4 x i32> 
undef, i32 0, i32 0 + %2 = insertelement <4 x i32> %1, i32 1, i32 1 + %3 = insertelement <4 x i32> %2, i32 2, i32 2 + %first_vector = insertelement <4 x i32> %3, i32 3, i32 3 + + %4 = insertelement <4 x i32> undef, i32 4, i32 0 + %5 = insertelement <4 x i32> %4, i32 5, i32 1 + %6 = insertelement <4 x i32> %5, i32 6, i32 2 + %second_vector = insertelement <4 x i32> %6, i32 7, i32 3 + + %result = shufflevector <4 x i32> %first_vector, <4 x i32> %second_vector, <4 x i32> + + %first = extractelement <4 x i32> %result, i32 0 + %second = extractelement <4 x i32> %result, i32 3 + %return = add i32 %first, %second + ret i32 %return +} + + +define i32 @should_not_scalarize_with_index_as_variable(i32 %data, i32 %index) { +; CHECK-LABEL: define i32 @should_not_scalarize_with_index_as_variable( +; CHECK-SAME: i32 [[DATA:%.*]], i32 [[INDEX:%.*]]) { +; CHECK-NEXT: [[VECTOR:%.*]] = insertelement <2 x i32> undef, i32 [[DATA]], i32 [[INDEX]] +; CHECK-NEXT: [[RESULT:%.*]] = extractelement <2 x i32> [[VECTOR]], i32 [[INDEX]] +; CHECK-NEXT: ret i32 [[RESULT]] +; + %vector = insertelement <2 x i32> undef, i32 %data, i32 %index + %result = extractelement <2 x i32> %vector, i32 %index + ret i32 %result +} + +define i32 @should_work_with_shuffle_undef() { +; CHECK-LABEL: define i32 @should_work_with_shuffle_undef() { +; CHECK-NEXT: [[RETURN:%.*]] = add i32 undef, undef +; CHECK-NEXT: ret i32 [[RETURN]] +; + %result = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> + + %first = extractelement <4 x i32> %result, i32 0 + %second = extractelement <4 x i32> %result, i32 3 + %return = add i32 %first, %second + ret i32 %return +} + +define i32 @should_work_with_shuffle_undef_2() { +; CHECK-LABEL: define i32 @should_work_with_shuffle_undef_2() { +; CHECK-NEXT: [[RETURN:%.*]] = add i32 1, undef +; CHECK-NEXT: ret i32 [[RETURN]] +; + %1 = insertelement <4 x i32> undef, i32 0, i32 0 + %2 = insertelement <4 x i32> %1, i32 1, i32 1 + %3 = insertelement <4 x i32> %2, i32 2, i32 2 + 
%first_vector = insertelement <4 x i32> %3, i32 3, i32 3 + + %4 = insertelement <4 x i32> undef, i32 4, i32 0 + %5 = insertelement <4 x i32> %4, i32 5, i32 1 + %6 = insertelement <4 x i32> %5, i32 6, i32 2 + %second_vector = insertelement <4 x i32> %6, i32 7, i32 3 + + %result = shufflevector <4 x i32> %first_vector, <4 x i32> %second_vector, <4 x i32> + + %first = extractelement <4 x i32> %result, i32 0 + %second = extractelement <4 x i32> %result, i32 3 + %return = add i32 %first, %second + ret i32 %return +} diff --git a/IGC/Compiler/tests/ScalarizeFunction/selective-typed-pointers.ll b/IGC/Compiler/tests/ScalarizeFunction/selective-typed-pointers.ll new file mode 100644 index 000000000000..3a63e9914073 --- /dev/null +++ b/IGC/Compiler/tests/ScalarizeFunction/selective-typed-pointers.ll @@ -0,0 +1,352 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2024 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= +; +; REQUIRES: regkeys +; RUN: igc_opt --igc-scalarize -S --regkey=EnableSelectiveScalarizer=1 < %s | FileCheck %s +; ------------------------------------------------ +; ScalarizeFunction +; ------------------------------------------------ +; This test checks if selective scalarization leaves vectorial instructions un-scalarized. 
+; ------------------------------------------------ + +define spir_kernel void @test_selective_1(i64 %addr) #0 { +; CHECK-LABEL: @test_selective_1( +; CHECK-NEXT: [[VECTINT:%.*]] = add <8 x i32> , zeroinitializer +; CHECK-NEXT: [[VECTFLOAT:%.*]] = bitcast <8 x i32> [[VECTINT]] to <8 x float> +; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast <8 x float> [[VECTFLOAT]] to <8 x i32> +; CHECK-NEXT: call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECTCAST]]) +; CHECK-NEXT: ret void +; + +; define a vector and do some bitcasts +; nothing should get scalarized here + + %vectint = add <8 x i32> , zeroinitializer + %vectfloat = bitcast <8 x i32> %vectint to <8 x float> + %vectcast = bitcast <8 x float> %vectfloat to <8 x i32> + call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 %addr, i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> %vectcast) + + ret void +} + +define spir_kernel void @test_selective_2(i64 %addr) #0 { +; CHECK-LABEL: @test_selective_2( +; CHECK-NEXT: [[VECTINT:%.*]] = add <8 x i32> , zeroinitializer +; CHECK-NEXT: [[VECTFLOAT:%.*]] = bitcast <8 x i32> [[VECTINT]] to <8 x float> +; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast <8 x float> [[VECTFLOAT]] to <8 x i32> +; CHECK-NEXT: call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECTCAST]]) +; CHECK-NEXT: [[ANOTHERCAST:%.*]] = bitcast <8 x float> [[VECTFLOAT]] to <8 x i32> +; CHECK-NEXT: [[ANOTHERCAST_SCALAR:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 0 +; CHECK-NEXT: [[ANOTHERCAST_SCALAR1:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 1 +; CHECK-NEXT: [[ANOTHERCAST_SCALAR2:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 2 +; CHECK-NEXT: [[ANOTHERCAST_SCALAR3:%.*]] = 
extractelement <8 x i32> [[ANOTHERCAST]], i32 3 +; CHECK-NEXT: [[ANOTHERCAST_SCALAR4:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 4 +; CHECK-NEXT: [[ANOTHERCAST_SCALAR5:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 5 +; CHECK-NEXT: [[ANOTHERCAST_SCALAR6:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 6 +; CHECK-NEXT: [[ANOTHERCAST_SCALAR7:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 7 +; CHECK-NEXT: [[V3:%.*]] = add i32 [[ANOTHERCAST_SCALAR3]], [[ANOTHERCAST_SCALAR5]] +; CHECK-NEXT: ret void +; +; same as before, but %vectfloat is used in another branch of the code + %vectint = add <8 x i32> , zeroinitializer + %vectfloat = bitcast <8 x i32> %vectint to <8 x float> + %vectcast = bitcast <8 x float> %vectfloat to <8 x i32> + call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 %addr, i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> %vectcast) +; so scalarization should happen here + %anothercast = bitcast <8 x float> %vectfloat to <8 x i32> + %v1 = extractelement <8 x i32> %anothercast, i32 3 + %v2 = extractelement <8 x i32> %anothercast, i32 5 + %v3 = add i32 %v1, %v2 + ret void +} + +define spir_kernel void @test_selective_3() { +; CHECK-LABEL: @test_selective_3( +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[DATA:%.*]] = phi <8 x i32> [ zeroinitializer, [[TMP0]] ], [ [[NEWDATA:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[NEWDATA]] = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> [[DATA]]) +; CHECK-NEXT: [[NEWOFFSET]] = add i32 [[OFFSET]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[NEWOFFSET]], 10 +; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[LOOP]] +; CHECK: end: +; CHECK-NEXT: ret void +; +; no scalarization happens here because the vectors %data and %newdata are used as whole + br label %loop + +loop: + %offset = phi i32 [ 0, %0 ], [ %newoffset, %loop 
] + + %data = phi <8 x i32> [ zeroinitializer, %0 ], [ %newdata, %loop ] + %newdata = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> %data) + + %newoffset = add i32 %offset, 1 + %1 = icmp eq i32 %newoffset, 10 + br i1 %1, label %end, label %loop + +end: + ret void +} + +define spir_kernel void @test_selective_4(i64 %addr) #0 { +; CHECK-LABEL: @test_selective_4( +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[FLOAT_VECTOR:%.*]] = phi <8 x float> [ zeroinitializer, [[TMP0]] ], [ [[NEW_FLOAT_VECTOR:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[INT_VECTOR:%.*]] = call <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 [[OFFSET]], i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0) +; CHECK-NEXT: [[NEW_FLOAT_VECTOR]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> [[FLOAT_VECTOR]], <8 x i16> , <8 x i32> [[INT_VECTOR]], i32 11, i32 11, i32 8, i32 8, i1 false) +; CHECK-NEXT: [[NEWOFFSET]] = add i32 [[OFFSET]], 16 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[NEWOFFSET]], 256 +; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[LOOP]] +; CHECK: end: +; CHECK-NEXT: ret void +; +; same here: no scalarization + br label %loop + +loop: + %offset = phi i32 [ 0, %0 ], [ %newoffset, %loop ] + %float_vector = phi <8 x float> [ zeroinitializer, %0 ], [ %new_float_vector, %loop ] + %int_vector = call <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64 %addr, i32 1023, i32 511, i32 1023, i32 %offset, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0) + %new_float_vector = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %float_vector, <8 x i16> , <8 x i32> %int_vector, i32 11, i32 11, i32 8, i32 8, i1 false) + %newoffset = add i32 %offset, 16 + %1 = icmp eq i32 %newoffset, 256 + br i1 %1, label %end, label %loop + +end: + ret void 
+} + + +define spir_kernel void @test_selective_5() { +; CHECK-LABEL: @test_selective_5( +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[DATA1:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[DATA3:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR10:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[DATA4:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR11:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[DATA5:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR12:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[DATA6:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, [[LOOP]] ] +; CHECK-NEXT: [[DATA7:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, [[LOOP]] ] +; CHECK-NEXT: [[DATA8:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, [[LOOP]] ] +; CHECK-NEXT: [[DATA9:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, [[LOOP]] ] +; CHECK-NEXT: [[DATA2_ASSEMBLED_VECT:%.*]] = insertelement <4 x i32> undef, i32 [[DATA1]], i32 0 +; CHECK-NEXT: [[DATA2_ASSEMBLED_VECT13:%.*]] = insertelement <4 x i32> [[DATA2_ASSEMBLED_VECT]], i32 [[DATA3]], i32 1 +; CHECK-NEXT: [[DATA2_ASSEMBLED_VECT14:%.*]] = insertelement <4 x i32> [[DATA2_ASSEMBLED_VECT13]], i32 [[DATA4]], i32 2 +; CHECK-NEXT: [[DATA2_ASSEMBLED_VECT15:%.*]] = insertelement <4 x i32> [[DATA2_ASSEMBLED_VECT14]], i32 [[DATA5]], i32 3 +; CHECK-NEXT: [[NEWDATA:%.*]] = call <4 x i32> @do_math_v4i32_v4i32(<4 x i32> [[DATA2_ASSEMBLED_VECT15]]) +; CHECK-NEXT: [[NEWDATA_SCALAR]] = extractelement <4 x i32> [[NEWDATA]], i32 0 +; CHECK-NEXT: [[NEWDATA_SCALAR10]] = extractelement <4 x i32> [[NEWDATA]], i32 1 +; CHECK-NEXT: [[NEWDATA_SCALAR11]] = extractelement <4 x i32> [[NEWDATA]], i32 2 +; CHECK-NEXT: [[NEWDATA_SCALAR12]] = extractelement <4 x i32> [[NEWDATA]], i32 3 +; CHECK-NEXT: [[NEWOFFSET]] = add i32 [[OFFSET]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[NEWOFFSET]], 10 +; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[LOOP]] +; CHECK: 
end: +; CHECK-NEXT: ret void +; +; here shufflevectors break vectorial nature of the arguments +; scalarization should be done + br label %loop + +loop: + %offset = phi i32 [ 0, %0 ], [ %newoffset, %loop ] + + %data = phi <8 x i32> [ zeroinitializer, %0 ], [ %newdata2, %loop ] + %data2 = shufflevector <8 x i32> %data, <8 x i32> undef, <4 x i32> + %newdata = call <4 x i32> @do_math_v4i32_v4i32(<4 x i32> %data2) + %newdata2 = shufflevector <4 x i32> %newdata, <4 x i32> undef, <8 x i32> + + %newoffset = add i32 %offset, 1 + %1 = icmp eq i32 %newoffset, 10 + br i1 %1, label %end, label %loop + +end: + ret void +} + +define spir_kernel void @test_selective_6() { +; CHECK-LABEL: @test_selective_6( +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT1:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT25:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT2:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT26:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT3:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT27:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT4:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT28:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT5:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT29:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT6:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT30:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT7:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT31:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT8:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT32:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VECTINT9:%.*]] = bitcast float [[VECTFLOAT1]] to i32 +; CHECK-NEXT: [[VECTINT10:%.*]] = bitcast float [[VECTFLOAT2]] to i32 +; CHECK-NEXT: [[VECTINT11:%.*]] = bitcast float [[VECTFLOAT3]] to i32 +; CHECK-NEXT: [[VECTINT12:%.*]] = 
bitcast float [[VECTFLOAT4]] to i32 +; CHECK-NEXT: [[VECTINT13:%.*]] = bitcast float [[VECTFLOAT5]] to i32 +; CHECK-NEXT: [[VECTINT14:%.*]] = bitcast float [[VECTFLOAT6]] to i32 +; CHECK-NEXT: [[VECTINT15:%.*]] = bitcast float [[VECTFLOAT7]] to i32 +; CHECK-NEXT: [[VECTINT16:%.*]] = bitcast float [[VECTFLOAT8]] to i32 +; CHECK-NEXT: [[VECTADD17:%.*]] = add i32 [[VECTINT9]], 1 +; CHECK-NEXT: [[VECTADD18:%.*]] = add i32 [[VECTINT10]], 2 +; CHECK-NEXT: [[VECTADD19:%.*]] = add i32 [[VECTINT11]], 3 +; CHECK-NEXT: [[VECTADD20:%.*]] = add i32 [[VECTINT12]], 4 +; CHECK-NEXT: [[VECTADD21:%.*]] = add i32 [[VECTINT13]], 5 +; CHECK-NEXT: [[VECTADD22:%.*]] = add i32 [[VECTINT14]], 6 +; CHECK-NEXT: [[VECTADD23:%.*]] = add i32 [[VECTINT15]], 7 +; CHECK-NEXT: [[VECTADD24:%.*]] = add i32 [[VECTINT16]], 8 +; CHECK-NEXT: [[VECTFLOAT_NEXT25]] = bitcast i32 [[VECTADD17]] to float +; CHECK-NEXT: [[VECTFLOAT_NEXT26]] = bitcast i32 [[VECTADD18]] to float +; CHECK-NEXT: [[VECTFLOAT_NEXT27]] = bitcast i32 [[VECTADD19]] to float +; CHECK-NEXT: [[VECTFLOAT_NEXT28]] = bitcast i32 [[VECTADD20]] to float +; CHECK-NEXT: [[VECTFLOAT_NEXT29]] = bitcast i32 [[VECTADD21]] to float +; CHECK-NEXT: [[VECTFLOAT_NEXT30]] = bitcast i32 [[VECTADD22]] to float +; CHECK-NEXT: [[VECTFLOAT_NEXT31]] = bitcast i32 [[VECTADD23]] to float +; CHECK-NEXT: [[VECTFLOAT_NEXT32]] = bitcast i32 [[VECTADD24]] to float +; CHECK-NEXT: [[NEWOFFSET]] = add i32 [[OFFSET]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[NEWOFFSET]], 10 +; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[LOOP]] +; CHECK: end: +; CHECK-NEXT: ret void +; +; scalarization should not be prevented due to elementwise bitcasts +; such bitcasts can be part of a chain of vector instructions, but +; should not be at the end of it + br label %loop + +loop: + %offset = phi i32 [ 0, %0 ], [ %newoffset, %loop ] + %vectfloat = phi <8 x float> [ zeroinitializer, %0 ], [ %vectfloat.next, %loop ] + + %vectint = bitcast <8 x float> %vectfloat to <8 x i32> + 
%vectadd = add <8 x i32> %vectint, + %vectfloat.next = bitcast <8 x i32> %vectadd to <8 x float> + + %newoffset = add i32 %offset, 1 + %1 = icmp eq i32 %newoffset, 10 + br i1 %1, label %end, label %loop +end: + ret void +} + +define spir_kernel void @test_selective_7() { +; CHECK-LABEL: @test_selective_7( +; CHECK-NEXT: [[VECTINT:%.*]] = add <4 x i16> , zeroinitializer +; CHECK-NEXT: [[VECTFLOAT:%.*]] = bitcast <4 x i16> [[VECTINT]] to <4 x half> +; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast <4 x half> [[VECTFLOAT]] to i64 +; CHECK-NEXT: ret void +; +; non-elementwise bitcasts (result type is scalar) should prevent scalarization, +; thus no scalarization should happen here + %vectint = add <4 x i16> , zeroinitializer + %vectfloat = bitcast <4 x i16> %vectint to <4 x half> + %vectcast = bitcast <4 x half> %vectfloat to i64 + + ret void +} + +define spir_kernel void @test_selective_8() { +; CHECK-LABEL: @test_selective_8( +; CHECK-NEXT: [[VECTINT:%.*]] = add <4 x i16> , zeroinitializer +; CHECK-NEXT: [[VECTFLOAT:%.*]] = bitcast <4 x i16> [[VECTINT]] to <4 x half> +; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast <4 x half> [[VECTFLOAT]] to <2 x i32> +; CHECK-NEXT: ret void +; +; non-elementwise bitcasts (result is different sized vector) should prevent scalarization, +; thus no scalarization should happen here + %vectint = add <4 x i16> , zeroinitializer + %vectfloat = bitcast <4 x i16> %vectint to <4 x half> + %vectcast = bitcast <4 x half> %vectfloat to <2 x i32> + + ret void +} + +define <32 x i1> @test_selective_9(i64 %addr) #0 { +; CHECK-LABEL: @test_selective_9( +; CHECK-NEXT: [[INT:%.*]] = add i32 1, 0 +; CHECK-NEXT: [[FLOAT:%.*]] = bitcast i32 [[INT]] to float +; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast float [[FLOAT]] to <32 x i1> +; CHECK-NEXT: ret <32 x i1> [[VECTCAST]] +; + %int = add i32 1, zeroinitializer + %float = bitcast i32 %int to float + %vectcast = bitcast float %float to <32 x i1> + ret <32 x i1> %vectcast +} + +define <2 x i32> 
@should_not_scalarize_if_the_index_is_not_a_constant(i32 %src) { +; CHECK-LABEL: @should_not_scalarize_if_the_index_is_not_a_constant( +; CHECK-NEXT: [[VECTOR_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[SRC:%.*]], i32 0 +; CHECK-NEXT: [[VECTOR_ASSEMBLED_VECT1:%.*]] = insertelement <2 x i32> [[VECTOR_ASSEMBLED_VECT]], i32 undef, i32 1 +; CHECK-NEXT: [[RESULT:%.*]] = insertelement <2 x i32> [[VECTOR_ASSEMBLED_VECT1]], i32 3, i32 [[SRC]] +; CHECK-NEXT: ret <2 x i32> [[RESULT]] +; + %vector = insertelement <2 x i32> undef, i32 %src, i32 0 + %result = insertelement <2 x i32> %vector, i32 3, i32 %src + ret <2 x i32> %result +} + +define i32 @should_not_scalarize_if_the_index_is_not_a_constant_2(i32 %src) { +; CHECK-LABEL: @should_not_scalarize_if_the_index_is_not_a_constant_2( +; CHECK-NEXT: [[VECTOR_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[SRC:%.*]], i32 0 +; CHECK-NEXT: [[VECTOR_ASSEMBLED_VECT1:%.*]] = insertelement <2 x i32> [[VECTOR_ASSEMBLED_VECT]], i32 undef, i32 1 +; CHECK-NEXT: [[RESULT:%.*]] = extractelement <2 x i32> [[VECTOR_ASSEMBLED_VECT1]], i32 [[SRC]] +; CHECK-NEXT: ret i32 [[RESULT]] +; + %vector = insertelement <2 x i32> undef, i32 %src, i32 0 + %result = extractelement <2 x i32> %vector, i32 %src + ret i32 %result +} + +define spir_kernel void @should_not_scalarize_nested_phi(i1 %switch) { +; CHECK-LABEL: @should_not_scalarize_nested_phi( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[VECTINT:%.*]] = add <8 x i32> , zeroinitializer +; CHECK-NEXT: br i1 [[SWITCH:%.*]], label [[FIRST:%.*]], label [[SECOND:%.*]] +; CHECK: proxy: +; CHECK-NEXT: br i1 [[SWITCH]], label [[FIRST]], label [[SECOND]] +; CHECK: first: +; CHECK-NEXT: [[RESULT1:%.*]] = phi <8 x i32> [ zeroinitializer, [[PROXY:%.*]] ], [ [[VECTINT]], [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[EXIT:%.*]] +; CHECK: second: +; CHECK-NEXT: [[RESULT2:%.*]] = phi <8 x i32> [ zeroinitializer, [[PROXY]] ], [ [[VECTINT]], [[ENTRY]] ] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: 
+; CHECK-NEXT: [[RESULT3:%.*]] = phi <8 x i32> [ [[RESULT1]], [[FIRST]] ], [ [[RESULT2]], [[SECOND]] ] +; CHECK-NEXT: [[NEWDATA:%.*]] = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> [[RESULT3]]) +; CHECK-NEXT: ret void +; +entry: + %vectint = add <8 x i32> , zeroinitializer + br i1 %switch, label %first, label %second +proxy: + br i1 %switch, label %first, label %second +first: + %result1 = phi <8 x i32> [ zeroinitializer, %proxy ], [ %vectint, %entry] + br label %exit +second: + %result2 = phi <8 x i32> [ zeroinitializer, %proxy ], [ %vectint, %entry] + br label %exit +exit: + %result3 = phi <8 x i32> [ %result1, %first], [ %result2, %second] + %newdata = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> %result3) + ret void +} + +declare spir_func void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32, <8 x i32>) #1 +declare spir_func <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32) #1 +declare spir_func <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1) #1 +declare spir_func <4 x i32> @do_math_v4i32_v4i32(<4 x i32>) #1 +declare spir_func <8 x i32> @do_math_v8i32_v8i32(<8 x i32>) #1 + +attributes #0 = { convergent nounwind } +attributes #1 = { nounwind } diff --git a/IGC/Compiler/tests/ScalarizeFunction/selective.ll b/IGC/Compiler/tests/ScalarizeFunction/selective.ll index 7c39694a61ce..9312bcae45ed 100644 --- a/IGC/Compiler/tests/ScalarizeFunction/selective.ll +++ b/IGC/Compiler/tests/ScalarizeFunction/selective.ll @@ -1,14 +1,15 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ;=========================== begin_copyright_notice ============================ ; -; Copyright (C) 2022 Intel Corporation +; Copyright (C) 2024 Intel Corporation ; ; 
SPDX-License-Identifier: MIT ; ;============================ end_copyright_notice ============================= ; ; REQUIRES: regkeys -; RUN: igc_opt --igc-scalarize -S --regkey=EnableSelectiveScalarizer=1 < %s | FileCheck %s +; RUN: igc_opt --igc-scalarize --opaque-pointers -S --regkey=EnableSelectiveScalarizer=1 < %s | FileCheck %s +; REQUIRES: llvm-14-plus ; ------------------------------------------------ ; ScalarizeFunction ; ------------------------------------------------ @@ -16,11 +17,12 @@ ; ------------------------------------------------ define spir_kernel void @test_selective_1(i64 %addr) #0 { -; CHECK-LABEL: @test_selective_1( +; CHECK-LABEL: define spir_kernel void @test_selective_1( +; CHECK-SAME: i64 [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[VECTINT:%.*]] = add <8 x i32> , zeroinitializer ; CHECK-NEXT: [[VECTFLOAT:%.*]] = bitcast <8 x i32> [[VECTINT]] to <8 x float> ; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast <8 x float> [[VECTFLOAT]] to <8 x i32> -; CHECK-NEXT: call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECTCAST]]) +; CHECK-NEXT: call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 [[ADDR]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECTCAST]]) ; CHECK-NEXT: ret void ; @@ -36,11 +38,12 @@ define spir_kernel void @test_selective_1(i64 %addr) #0 { } define spir_kernel void @test_selective_2(i64 %addr) #0 { -; CHECK-LABEL: @test_selective_2( +; CHECK-LABEL: define spir_kernel void @test_selective_2( +; CHECK-SAME: i64 [[ADDR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[VECTINT:%.*]] = add <8 x i32> , zeroinitializer ; CHECK-NEXT: [[VECTFLOAT:%.*]] = bitcast <8 x i32> [[VECTINT]] to <8 x float> ; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast <8 x float> [[VECTFLOAT]] to <8 x i32> -; CHECK-NEXT: call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 
[[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECTCAST]]) +; CHECK-NEXT: call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 [[ADDR]], i32 1023, i32 511, i32 1023, i32 0, i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0, <8 x i32> [[VECTCAST]]) ; CHECK-NEXT: [[ANOTHERCAST:%.*]] = bitcast <8 x float> [[VECTFLOAT]] to <8 x i32> ; CHECK-NEXT: [[ANOTHERCAST_SCALAR:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 0 ; CHECK-NEXT: [[ANOTHERCAST_SCALAR1:%.*]] = extractelement <8 x i32> [[ANOTHERCAST]], i32 1 @@ -67,18 +70,19 @@ define spir_kernel void @test_selective_2(i64 %addr) #0 { } define spir_kernel void @test_selective_3() { -; CHECK-LABEL: @test_selective_3( -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[DATA:%.*]] = phi <8 x i32> [ zeroinitializer, [[TMP0]] ], [ [[NEWDATA:%.*]], [[LOOP]] ] +; CHECK-LABEL: define spir_kernel void @test_selective_3() { +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[DATA:%.*]] = phi <8 x i32> [ zeroinitializer, [[TMP0]] ], [ [[NEWDATA:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[NEWDATA]] = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> [[DATA]]) ; CHECK-NEXT: [[NEWOFFSET]] = add i32 [[OFFSET]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[NEWOFFSET]], 10 -; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[LOOP]] -; CHECK: end: +; CHECK-NEXT: br i1 [[TMP1]], label %[[END:.*]], label %[[LOOP]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; + ; no scalarization happens here because the vectors %data and %newdata are used as whole br label %loop @@ -97,19 +101,21 @@ end: } define spir_kernel void @test_selective_4(i64 %addr) #0 { -; CHECK-LABEL: @test_selective_4( -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: 
loop: -; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[FLOAT_VECTOR:%.*]] = phi <8 x float> [ zeroinitializer, [[TMP0]] ], [ [[NEW_FLOAT_VECTOR:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[INT_VECTOR:%.*]] = call <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64 [[ADDR:%.*]], i32 1023, i32 511, i32 1023, i32 [[OFFSET]], i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0) +; CHECK-LABEL: define spir_kernel void @test_selective_4( +; CHECK-SAME: i64 [[ADDR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[FLOAT_VECTOR:%.*]] = phi <8 x float> [ zeroinitializer, [[TMP0]] ], [ [[NEW_FLOAT_VECTOR:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[INT_VECTOR:%.*]] = call <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64 [[ADDR]], i32 1023, i32 511, i32 1023, i32 [[OFFSET]], i32 0, i32 32, i32 16, i32 8, i32 1, i1 false, i1 false, i32 0) ; CHECK-NEXT: [[NEW_FLOAT_VECTOR]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> [[FLOAT_VECTOR]], <8 x i16> , <8 x i32> [[INT_VECTOR]], i32 11, i32 11, i32 8, i32 8, i1 false) ; CHECK-NEXT: [[NEWOFFSET]] = add i32 [[OFFSET]], 16 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[NEWOFFSET]], 256 -; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[LOOP]] -; CHECK: end: +; CHECK-NEXT: br i1 [[TMP1]], label %[[END:.*]], label %[[LOOP]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; + ; same here: no scalarization br label %loop @@ -128,18 +134,18 @@ end: define spir_kernel void @test_selective_5() { -; CHECK-LABEL: @test_selective_5( -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[DATA1:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[DATA3:%.*]] = phi i32 [ 0, 
[[TMP0]] ], [ [[NEWDATA_SCALAR10:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[DATA4:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR11:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[DATA5:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR12:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[DATA6:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, [[LOOP]] ] -; CHECK-NEXT: [[DATA7:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, [[LOOP]] ] -; CHECK-NEXT: [[DATA8:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, [[LOOP]] ] -; CHECK-NEXT: [[DATA9:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, [[LOOP]] ] +; CHECK-LABEL: define spir_kernel void @test_selective_5() { +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[DATA1:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[DATA3:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR10:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[DATA4:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR11:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[DATA5:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[NEWDATA_SCALAR12:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[DATA6:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, %[[LOOP]] ] +; CHECK-NEXT: [[DATA7:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, %[[LOOP]] ] +; CHECK-NEXT: [[DATA8:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, %[[LOOP]] ] +; CHECK-NEXT: [[DATA9:%.*]] = phi i32 [ 0, [[TMP0]] ], [ undef, %[[LOOP]] ] ; CHECK-NEXT: [[DATA2_ASSEMBLED_VECT:%.*]] = insertelement <4 x i32> undef, i32 [[DATA1]], i32 0 ; CHECK-NEXT: [[DATA2_ASSEMBLED_VECT13:%.*]] = insertelement <4 x i32> [[DATA2_ASSEMBLED_VECT]], i32 [[DATA3]], i32 1 ; CHECK-NEXT: [[DATA2_ASSEMBLED_VECT14:%.*]] = insertelement <4 x i32> [[DATA2_ASSEMBLED_VECT13]], i32 [[DATA4]], i32 2 @@ -151,10 +157,11 @@ define spir_kernel void @test_selective_5() { ; CHECK-NEXT: [[NEWDATA_SCALAR12]] = extractelement <4 x i32> [[NEWDATA]], i32 3 ; CHECK-NEXT: [[NEWOFFSET]] = add i32 [[OFFSET]], 1 ; CHECK-NEXT: 
[[TMP1:%.*]] = icmp eq i32 [[NEWOFFSET]], 10 -; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[LOOP]] -; CHECK: end: +; CHECK-NEXT: br i1 [[TMP1]], label %[[END:.*]], label %[[LOOP]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; + ; here shufflevectors break vectorial nature of the arguments ; scalarization should be done br label %loop @@ -176,18 +183,18 @@ end: } define spir_kernel void @test_selective_6() { -; CHECK-LABEL: @test_selective_6( -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[VECTFLOAT1:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT25:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[VECTFLOAT2:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT26:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[VECTFLOAT3:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT27:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[VECTFLOAT4:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT28:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[VECTFLOAT5:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT29:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[VECTFLOAT6:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT30:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[VECTFLOAT7:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT31:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[VECTFLOAT8:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT32:%.*]], [[LOOP]] ] +; CHECK-LABEL: define spir_kernel void @test_selective_6() { +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[OFFSET:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[NEWOFFSET:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT1:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT25:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT2:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT26:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT3:%.*]] = phi float [ 
0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT27:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT4:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT28:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT5:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT29:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT6:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT30:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT7:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT31:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[VECTFLOAT8:%.*]] = phi float [ 0.000000e+00, [[TMP0]] ], [ [[VECTFLOAT_NEXT32:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[VECTINT9:%.*]] = bitcast float [[VECTFLOAT1]] to i32 ; CHECK-NEXT: [[VECTINT10:%.*]] = bitcast float [[VECTFLOAT2]] to i32 ; CHECK-NEXT: [[VECTINT11:%.*]] = bitcast float [[VECTFLOAT3]] to i32 @@ -214,10 +221,11 @@ define spir_kernel void @test_selective_6() { ; CHECK-NEXT: [[VECTFLOAT_NEXT32]] = bitcast i32 [[VECTADD24]] to float ; CHECK-NEXT: [[NEWOFFSET]] = add i32 [[OFFSET]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[NEWOFFSET]], 10 -; CHECK-NEXT: br i1 [[TMP1]], label [[END:%.*]], label [[LOOP]] -; CHECK: end: +; CHECK-NEXT: br i1 [[TMP1]], label %[[END:.*]], label %[[LOOP]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; + ; scalarization should not be prevented due to elementwise bitcasts ; such bitcasts can be part of a chain of vector instructions, but ; should not be at the end of it @@ -239,12 +247,13 @@ end: } define spir_kernel void @test_selective_7() { -; CHECK-LABEL: @test_selective_7( +; CHECK-LABEL: define spir_kernel void @test_selective_7() { ; CHECK-NEXT: [[VECTINT:%.*]] = add <4 x i16> , zeroinitializer ; CHECK-NEXT: [[VECTFLOAT:%.*]] = bitcast <4 x i16> [[VECTINT]] to <4 x half> ; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast <4 x half> [[VECTFLOAT]] to i64 ; CHECK-NEXT: ret void ; + ; non-elementwise bitcasts (result type is scalar) should prevent scalarization, ; thus no scalarization should happen here 
%vectint = add <4 x i16> , zeroinitializer @@ -255,12 +264,13 @@ define spir_kernel void @test_selective_7() { } define spir_kernel void @test_selective_8() { -; CHECK-LABEL: @test_selective_8( +; CHECK-LABEL: define spir_kernel void @test_selective_8() { ; CHECK-NEXT: [[VECTINT:%.*]] = add <4 x i16> , zeroinitializer ; CHECK-NEXT: [[VECTFLOAT:%.*]] = bitcast <4 x i16> [[VECTINT]] to <4 x half> ; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast <4 x half> [[VECTFLOAT]] to <2 x i32> ; CHECK-NEXT: ret void ; + ; non-elementwise bitcasts (result is different sized vector) should prevent scalarization, ; thus no scalarization should happen here %vectint = add <4 x i16> , zeroinitializer @@ -270,6 +280,82 @@ define spir_kernel void @test_selective_8() { ret void } +define <32 x i1> @test_selective_9(i64 %addr) #0 { +; CHECK-LABEL: define <32 x i1> @test_selective_9( +; CHECK-SAME: i64 [[ADDR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[INT:%.*]] = add i32 1, 0 +; CHECK-NEXT: [[FLOAT:%.*]] = bitcast i32 [[INT]] to float +; CHECK-NEXT: [[VECTCAST:%.*]] = bitcast float [[FLOAT]] to <32 x i1> +; CHECK-NEXT: ret <32 x i1> [[VECTCAST]] +; + %int = add i32 1, zeroinitializer + %float = bitcast i32 %int to float + %vectcast = bitcast float %float to <32 x i1> + ret <32 x i1> %vectcast +} + +define <2 x i32> @should_not_scalarize_if_the_index_is_not_a_constant(i32 %src) { +; CHECK-LABEL: define <2 x i32> @should_not_scalarize_if_the_index_is_not_a_constant( +; CHECK-SAME: i32 [[SRC:%.*]]) { +; CHECK-NEXT: [[VECTOR_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[SRC]], i32 0 +; CHECK-NEXT: [[VECTOR_ASSEMBLED_VECT1:%.*]] = insertelement <2 x i32> [[VECTOR_ASSEMBLED_VECT]], i32 undef, i32 1 +; CHECK-NEXT: [[RESULT:%.*]] = insertelement <2 x i32> [[VECTOR_ASSEMBLED_VECT1]], i32 3, i32 [[SRC]] +; CHECK-NEXT: ret <2 x i32> [[RESULT]] +; + %vector = insertelement <2 x i32> undef, i32 %src, i32 0 + %result = insertelement <2 x i32> %vector, i32 3, i32 %src + ret <2 x i32> %result +} + +define 
i32 @should_not_scalarize_if_the_index_is_not_a_constant_2(i32 %src) { +; CHECK-LABEL: define i32 @should_not_scalarize_if_the_index_is_not_a_constant_2( +; CHECK-SAME: i32 [[SRC:%.*]]) { +; CHECK-NEXT: [[VECTOR_ASSEMBLED_VECT:%.*]] = insertelement <2 x i32> undef, i32 [[SRC]], i32 0 +; CHECK-NEXT: [[VECTOR_ASSEMBLED_VECT1:%.*]] = insertelement <2 x i32> [[VECTOR_ASSEMBLED_VECT]], i32 undef, i32 1 +; CHECK-NEXT: [[RESULT:%.*]] = extractelement <2 x i32> [[VECTOR_ASSEMBLED_VECT1]], i32 [[SRC]] +; CHECK-NEXT: ret i32 [[RESULT]] +; + %vector = insertelement <2 x i32> undef, i32 %src, i32 0 + %result = extractelement <2 x i32> %vector, i32 %src + ret i32 %result +} + +define spir_kernel void @should_not_scalarize_nested_phi(i1 %switch) { +; CHECK-LABEL: define spir_kernel void @should_not_scalarize_nested_phi( +; CHECK-SAME: i1 [[SWITCH:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[VECTINT:%.*]] = add <8 x i32> , zeroinitializer +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; CHECK: [[PROXY:.*]]: +; CHECK-NEXT: br i1 [[SWITCH]], label %[[FIRST]], label %[[SECOND]] +; CHECK: [[FIRST]]: +; CHECK-NEXT: [[RESULT1:%.*]] = phi <8 x i32> [ zeroinitializer, %[[PROXY]] ], [ [[VECTINT]], %[[ENTRY]] ] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SECOND]]: +; CHECK-NEXT: [[RESULT2:%.*]] = phi <8 x i32> [ zeroinitializer, %[[PROXY]] ], [ [[VECTINT]], %[[ENTRY]] ] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RESULT3:%.*]] = phi <8 x i32> [ [[RESULT1]], %[[FIRST]] ], [ [[RESULT2]], %[[SECOND]] ] +; CHECK-NEXT: [[NEWDATA:%.*]] = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> [[RESULT3]]) +; CHECK-NEXT: ret void +; +entry: + %vectint = add <8 x i32> , zeroinitializer + br i1 %switch, label %first, label %second +proxy: + br i1 %switch, label %first, label %second +first: + %result1 = phi <8 x i32> [ zeroinitializer, %proxy ], [ %vectint, %entry] + br label %exit +second: + %result2 = phi <8 x i32> [ zeroinitializer, 
%proxy ], [ %vectint, %entry] + br label %exit +exit: + %result3 = phi <8 x i32> [ %result1, %first], [ %result2, %second] + %newdata = call <8 x i32> @do_math_v8i32_v8i32(<8 x i32> %result3) + ret void +} + declare spir_func void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32, <8 x i32>) #1 declare spir_func <8 x i32> @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32) #1 declare spir_func <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1) #1