diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 92e4f4264a..4f1096001a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -337,11 +337,22 @@ set( MIOpen_Source solver/softmarginloss/forward_softmarginloss.cpp solver/softmax/attn_softmax.cpp solver/softmax/softmax.cpp + solver/tensorOp/Op1dTensorGeneric.cpp + solver/tensorOp/Op2dTensorGeneric.cpp + solver/tensorOp/Op2dTensorLite.cpp + solver/tensorOp/Op2dTensorSquash.cpp + solver/tensorOp/Op3dTensorGeneric.cpp + solver/tensorOp/OpTensorFwdBias.cpp + solver/tensorOp/Op4dTensorLite.cpp + solver/tensorOp/OpTensorLeadingOnes.cpp + solver/tensorOp/Op4dTensorGeneric.cpp + solver/tensorOp/Op5dTensorGeneric.cpp subbuffers.cpp t5layernorm_api.cpp target_properties.cpp temp_file.cpp tensor.cpp + tensorOp/problem_description.cpp tensor_api.cpp transformers_adam_w_api.cpp seq_tensor.cpp diff --git a/src/include/miopen/names.hpp b/src/include/miopen/names.hpp index 17b96b8732..bdf59c361c 100644 --- a/src/include/miopen/names.hpp +++ b/src/include/miopen/names.hpp @@ -34,6 +34,7 @@ struct NetworkConfig { NetworkConfig() = default; explicit NetworkConfig(const std::string& value_) : value(value_) {} + explicit NetworkConfig(std::string&& value_) noexcept : value(std::move(value_)) {} operator std::string() const { return value; } const std::string& ToString() const { return value; } diff --git a/src/include/miopen/tensorOp/invoke_params.hpp b/src/include/miopen/tensorOp/invoke_params.hpp new file mode 100644 index 0000000000..6b8f2ca88c --- /dev/null +++ b/src/include/miopen/tensorOp/invoke_params.hpp @@ -0,0 +1,78 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+
+#pragma once
+
+#include <miopen/invoke_params.hpp>
+#include <miopen/tensor.hpp>
+
+namespace miopen {
+
+namespace tensorOp {
+
+struct InvokeParams : public miopen::InvokeParams
+{
+    InvokeParams(const void* alpha0_,
+                 ConstData_t ATensor_,
+                 const void* alpha1_,
+                 ConstData_t BTensor_,
+                 const void* beta_,
+                 Data_t CTensor_,
+                 const size_t Aoffset_,
+                 const size_t Boffset_,
+                 const size_t Coffset_)
+        : alpha0(alpha0_),
+          alpha1(alpha1_),
+          beta(beta_),
+          ATensor(ATensor_),
+          BTensor(BTensor_),
+          CTensor(CTensor_),
+          Aoffset(Aoffset_),
+          Boffset(Boffset_),
+          Coffset(Coffset_)
+    {
+    }
+
+    size_t GetWorkspaceSize() const { return 0; }
+    Data_t GetWorkspace() const { return nullptr; }
+
+public:
+    const void* alpha0;
+    const void* alpha1;
+    const void* beta;
+
+    ConstData_t ATensor;
+    ConstData_t BTensor;
+    Data_t CTensor;
+
+    size_t Aoffset;
+    size_t Boffset;
+    size_t Coffset;
+};
+
+} // namespace tensorOp
+
+} // namespace miopen
diff --git a/src/include/miopen/tensorOp/problem_description.hpp b/src/include/miopen/tensorOp/problem_description.hpp
new file mode 100644
index 0000000000..dc60a3c7c9
--- /dev/null
+++ b/src/include/miopen/tensorOp/problem_description.hpp
@@ -0,0 +1,130 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#pragma once
+
+#include <miopen/problem_description_base.hpp>
+#include <miopen/tensor.hpp>
+
+namespace miopen {
+
+struct NetworkConfig;
+
+namespace tensorOp {
+
+struct ProblemDescription : ProblemDescriptionBase
+{
+    ProblemDescription(const miopenTensorOp_t tensorOp_,
+                       const void* beta_,
+                       const TensorDescriptor& aTensorDesc_,
+                       const TensorDescriptor& bTensorDesc_,
+                       const TensorDescriptor& cTensorDesc_,
+                       const bool nonStandardSquash_)
+        : tensorOp(tensorOp_),
+          aTensorDesc(aTensorDesc_),
+          bTensorDesc(bTensorDesc_),
+          cTensorDesc(cTensorDesc_),
+          nonStandardSquash(nonStandardSquash_)
+    {
+        if(beta_ == nullptr)
+        {
+            MIOPEN_THROW(miopenStatusBadParm, "Beta value is nullptr");
+        }
+
+        beta = *(static_cast<const float*>(beta_));
+
+        if(aTensorDesc.GetElementSize() != cTensorDesc.GetElementSize())
+        {
+            MIOPEN_THROW("A and C Tensors do not match");
+        }
+
+        if(bTensorDesc.GetType() != cTensorDesc.GetType())
+        {
+            MIOPEN_THROW("Datatypes for B and C tensors do not match!");
+        }
+
+        const auto& blens = bTensorDesc.GetLengths();
+        const auto& clens = cTensorDesc.GetLengths();
+
+        if(clens.size() > 5)
+        {
+            MIOPEN_THROW("Tensor dimension larger than 5: " + std::to_string(clens.size()));
+        }
+
+        if(blens.size() != clens.size())
+        {
+            MIOPEN_THROW("Number of dims in B and C Tensors do not match: " +
+                         std::to_string(blens.size()) + ", " + std::to_string(clens.size()));
+        }
+
+        if(!nonStandardSquash)
+        {
+            constexpr auto comparator = [](size_t c, size_t b) { return b == 1 || b == c; };
+            const auto [c_diff, b_diff] =
+                std::mismatch(clens.begin(), clens.end(), blens.begin(), comparator);
+            if(c_diff != clens.end())
+                MIOPEN_THROW("BTensor dim != 1 && BTensor dim != CTensor dim:" +
+                             std::to_string(std::distance(clens.begin(), c_diff)));
+        }
+        else
+        {
+            // non-standard behavior because blens[1] may not be equal to clens[1]
+            if(!(clens.size() == 3 && blens[0] == 1 && clens[0] == 1 && blens[2] == clens[2]))
+            {
+                MIOPEN_THROW(
+                    "Non-standard squashed operation is supported only for 3D tensors and for "
+                    "the specific configuration");
+            }
+        }
+    }
+
+    miopenTensorOp_t GetTensorOp() const { return tensorOp; }
+
+    float GetBeta() const { return beta; }
+
+    const TensorDescriptor& GetATensorDesc() const { return aTensorDesc; }
+    const TensorDescriptor& GetBTensorDesc() const { return bTensorDesc; }
+    const TensorDescriptor& GetCTensorDesc() const { return cTensorDesc; }
+
+    bool GetNonStandardSquash() const { return nonStandardSquash; }
+
+    NetworkConfig MakeNetworkConfig() const override;
+
+private:
+    const miopenTensorOp_t tensorOp;
+
+    float beta;
+
+    TensorDescriptor aTensorDesc;
+    TensorDescriptor bTensorDesc;
+    TensorDescriptor cTensorDesc;
+
+    const bool nonStandardSquash;
+};
+
+} // namespace tensorOp
+
+} // namespace miopen
diff --git a/src/include/miopen/tensorOp/solvers.hpp b/src/include/miopen/tensorOp/solvers.hpp
new file mode 100644
index 0000000000..635d0ab777
--- /dev/null
+++ b/src/include/miopen/tensorOp/solvers.hpp
@@ -0,0 +1,216 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#pragma once
+
+#include <miopen/solver.hpp>
+#include <miopen/tensorOp/problem_description.hpp>
+
+#include <utility>
+
+namespace miopen {
+
+namespace solver {
+
+namespace tensorOp {
+
+using TensorOpSolver =
+    NonTunableSolverBase<ExecutionContext, miopen::tensorOp::ProblemDescription>;
+
+struct Op1dTensorGeneric final : TensorOpSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<Op1dTensorGeneric>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    std::size_t
+    GetWorkspaceSize(const ExecutionContext& context,
+                     const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    bool MayNeedWorkspace() const override { return false; }
+};
+
+struct Op2dTensorGeneric final : TensorOpSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<Op2dTensorGeneric>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    std::size_t
+    GetWorkspaceSize(const ExecutionContext& context,
+                     const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    bool MayNeedWorkspace() const override { return false; }
+};
+
+struct Op2dTensorLite final : TensorOpSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<Op2dTensorLite>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    std::size_t
+    GetWorkspaceSize(const ExecutionContext& context,
+                     const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    bool MayNeedWorkspace() const override { return false; }
+};
+
+struct Op2dTensorSquash final : TensorOpSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<Op2dTensorSquash>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    std::size_t
+    GetWorkspaceSize(const ExecutionContext& context,
+                     const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    bool MayNeedWorkspace() const override { return false; }
+};
+
+struct Op3dTensorGeneric final : TensorOpSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<Op3dTensorGeneric>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    std::size_t
+    GetWorkspaceSize(const ExecutionContext& context,
+                     const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    bool MayNeedWorkspace() const override { return false; }
+};
+
+struct OpTensorFwdBias final : TensorOpSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<OpTensorFwdBias>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    std::size_t
+    GetWorkspaceSize(const ExecutionContext& context,
+                     const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    bool MayNeedWorkspace() const override { return false; }
+};
+
+struct Op4dTensorLite final : TensorOpSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<Op4dTensorLite>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    std::size_t
+    GetWorkspaceSize(const ExecutionContext& context,
+                     const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    bool MayNeedWorkspace() const override { return false; }
+};
+
+struct OpTensorLeadingOnes final : TensorOpSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<OpTensorLeadingOnes>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    std::size_t
+    GetWorkspaceSize(const ExecutionContext& context,
+                     const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    bool MayNeedWorkspace() const override { return false; }
+};
+
+struct Op4dTensorGeneric final : TensorOpSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<Op4dTensorGeneric>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    std::size_t
+    GetWorkspaceSize(const ExecutionContext& context,
+                     const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    bool MayNeedWorkspace() const override { return false; }
+};
+
+struct Op5dTensorGeneric final : TensorOpSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<Op5dTensorGeneric>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::tensorOp::ProblemDescription& problem) const override;
+
+    std::size_t
GetWorkspaceSize(const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const override; + + bool MayNeedWorkspace() const override { return false; } +}; + +} // namespace tensorOp + +} // namespace solver + +} // namespace miopen diff --git a/src/include/miopen/tensor_ops.hpp b/src/include/miopen/tensor_ops.hpp index 25d838598b..c19eb333f2 100644 --- a/src/include/miopen/tensor_ops.hpp +++ b/src/include/miopen/tensor_ops.hpp @@ -189,6 +189,22 @@ MIOPEN_INTERNALS_EXPORT void OpTensor(const Handle& handle, size_t Coffset = 0, bool nonStandardSquash = false); +MIOPEN_INTERNALS_EXPORT void OpTensor2(Handle& handle, + miopenTensorOp_t tensorOp, + const void* alpha0, + const TensorDescriptor& aTensorDesc, + ConstData_t ATensor, + const void* alpha1, + const TensorDescriptor& bTensorDesc, + ConstData_t BTensor, + const void* beta, + const TensorDescriptor& cTensorDesc, + Data_t CTensor, + size_t Aoffset = 0, + size_t Boffset = 0, + size_t Coffset = 0, + bool nonStandardSquash = false); + MIOPEN_INTERNALS_EXPORT void CopyTensor(const Handle& handle, const TensorDescriptor& srcDesc, ConstData_t src, diff --git a/src/kernels/MIOpenTensorKernels.cl b/src/kernels/MIOpenTensorKernels.cl index cc47d8e6ce..842d3d4d6b 100644 --- a/src/kernels/MIOpenTensorKernels.cl +++ b/src/kernels/MIOpenTensorKernels.cl @@ -24,24 +24,6 @@ * *******************************************************************************/ -#if MIOPEN_USE_FP16 == 1 -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -#define _FLOAT half -#ifndef HALF_MAX -#define MAX_VAL 65504 /* max value */ -#else -#define MAX_VAL HALF_MAX -#endif -#endif -#if MIOPEN_USE_FP32 == 1 -#define _FLOAT float -#ifndef FLT_MAX -#define MAX_VAL 3.402823466e+38F /* max value */ -#else -#define MAX_VAL FLT_MAX -#endif -#endif - /* Only works for NCHW * bitmap tracks which dims are the same between 'a' and 'c'. * Example: 0, 1, 1, 0 means that C and H dims are the same and the rest are ones diff --git a/src/solver/tensorOp/Op1dTensorGeneric.cpp b/src/solver/tensorOp/Op1dTensorGeneric.cpp new file mode 100644 index 0000000000..896d75d50c --- /dev/null +++ b/src/solver/tensorOp/Op1dTensorGeneric.cpp @@ -0,0 +1,173 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "tensor_op_helpers.hpp" +#include +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace tensorOp { + +bool Op1dTensorGeneric::IsApplicable([[maybe_unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& alens = aTensorDesc.GetLengths(); + auto asize = alens.size(); + + if(aTensorDesc.GetType() == miopenDouble) + { + return false; + } + + if(asize == 1) + { + return true; + } + + return false; +} + +std::size_t Op1dTensorGeneric::GetWorkspaceSize( + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const +{ + return 0; +} + +ConvSolution +Op1dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + const size_t b_n = bTensorDesc.GetLengths()[0]; + const size_t c_n = cTensorDesc.GetLengths()[0]; + + const size_t a_nstrides = aTensorDesc.GetStrides()[0]; + const size_t b_nstrides = bTensorDesc.GetStrides()[0]; + const size_t c_nstrides = cTensorDesc.GetStrides()[0]; + + miopenDataType_t data_type = bTensorDesc.GetType(); + bool fit_into_int = aTensorDesc.AllDimsFitIntoInt(); + + size_t local_threads = 256; + size_t max_num_wg = 4096; + + auto num_wg = std::clamp(c_n / local_threads, size_t(1), size_t(max_num_wg)); + num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; + size_t global_threads = num_wg * local_threads; + + const std::array vld{local_threads, 1, 1}; + const std::array vgd{global_threads, 1, 1}; + + KernelBuildParameters build_params = KernelBuildParameters{}; + + GetCommonParams(build_params, problem, true); + + build_params.Define("USE_1D_TENSOR_GENERIC"); + + auto kernel = KernelInfo{}; + + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + kernel.kernel_file = "MIOpenTensorKernelsHip.cpp"; + kernel.kernel_name = "Op1dTensorGeneric"; + + using std::begin, std::end; + + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); + + result.invoker_factory = + [data_type, fit_into_int, b_n, c_n, a_nstrides, b_nstrides, c_nstrides]( + const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(data_type, [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + if(fit_into_int) + { + kernel(params.ATensor, + params.BTensor, + params.CTensor, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(a_nstrides), + static_cast(b_n == 1 ? 
0 : b_nstrides), + static_cast(c_nstrides), + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(c_n), + !float_equal(miopen_beta, 0.0)); + } + else + { + kernel(params.ATensor, + params.BTensor, + params.CTensor, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(a_nstrides), + static_cast(b_n == 1 ? 0 : b_nstrides), + static_cast(c_nstrides), + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(c_n), + !float_equal(miopen_beta, 0.0)); + } + }); + }; + }; + result.construction_params.push_back(kernel); + + return result; +} + +} // namespace tensorOp + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/tensorOp/Op2dTensorGeneric.cpp b/src/solver/tensorOp/Op2dTensorGeneric.cpp new file mode 100644 index 0000000000..41fca78068 --- /dev/null +++ b/src/solver/tensorOp/Op2dTensorGeneric.cpp @@ -0,0 +1,186 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "tensor_op_helpers.hpp" +#include +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace tensorOp { + +bool Op2dTensorGeneric::IsApplicable([[maybe_unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& alens = aTensorDesc.GetLengths(); + auto asize = alens.size(); + + if(aTensorDesc.GetType() == miopenDouble) + { + return false; + } + + if(asize == 2) + { + return true; + } + + return false; +} + +std::size_t Op2dTensorGeneric::GetWorkspaceSize( + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const +{ + return 0; +} + +ConvSolution +Op2dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + std::array blens; + std::array clens; + std::tie(blens[0], blens[1]) = miopen::tien<2>(bTensorDesc.GetLengths()); + std::tie(clens[0], clens[1]) = miopen::tien<2>(cTensorDesc.GetLengths()); + + std::array astrides; + std::array bstrides; + std::array cstrides; + std::tie(astrides[0], astrides[1]) = miopen::tien<2>(aTensorDesc.GetStrides()); + std::tie(bstrides[0], bstrides[1]) = miopen::tien<2>(bTensorDesc.GetStrides()); + std::tie(cstrides[0], cstrides[1]) = miopen::tien<2>(cTensorDesc.GetStrides()); + + miopenDataType_t data_type = bTensorDesc.GetType(); + bool fit_into_int = aTensorDesc.AllDimsFitIntoInt(); + + size_t local_threads = 32; + size_t max_num_wg = 4096; + + auto num_wg = std::clamp((clens[0] * clens[1]) / local_threads, size_t(1), size_t(max_num_wg)); + num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; + size_t global_threads = num_wg * local_threads; + + const std::array vld{local_threads, 1, 1}; + const std::array vgd{global_threads, 1, 1}; + + KernelBuildParameters build_params = KernelBuildParameters{}; + + GetCommonParams(build_params, problem, true); + + build_params.Define("USE_2D_TENSOR_GENERIC"); + + auto kernel = KernelInfo{}; + + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + kernel.kernel_file = "MIOpenTensorKernelsHip.cpp"; + kernel.kernel_name = "Op2dTensorGeneric"; + + using std::begin, std::end; + + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); + + result.invoker_factory = [data_type, fit_into_int, blens, clens, astrides, bstrides, cstrides]( + const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(data_type, [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + if(fit_into_int) + { + kernel(params.ATensor, + params.BTensor, + params.CTensor, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(blens[1] == 1 ? 
clens[1] : blens[1]), + static_cast(clens[1]), + static_cast(astrides[0]), + static_cast(astrides[1]), + static_cast(blens[0] == 1 ? 0 : bstrides[0]), + static_cast(blens[1] == 1 ? 0 : bstrides[1]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(clens[0]), + !float_equal(miopen_beta, 0.0)); + } + else + { + kernel(params.ATensor, + params.BTensor, + params.CTensor, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(blens[1] == 1 ? clens[1] : blens[1]), + static_cast(clens[1]), + static_cast(astrides[0]), + static_cast(astrides[1]), + static_cast(blens[0] == 1 ? 0 : bstrides[0]), + static_cast(blens[1] == 1 ? 0 : bstrides[1]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(clens[0]), + !float_equal(miopen_beta, 0.0)); + } + }); + }; + }; + result.construction_params.push_back(kernel); + + return result; +} + +} // namespace tensorOp + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/tensorOp/Op2dTensorLite.cpp b/src/solver/tensorOp/Op2dTensorLite.cpp new file mode 100644 index 0000000000..2b7b030a2f --- /dev/null +++ b/src/solver/tensorOp/Op2dTensorLite.cpp @@ -0,0 +1,193 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "tensor_op_helpers.hpp" + +#include +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace tensorOp { + +bool Op2dTensorLite::IsApplicable([[maybe_unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + const auto& alens = aTensorDesc.GetLengths(); + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); + + auto asize = alens.size(); + + if(aTensorDesc.GetType() == miopenDouble) + { + return false; + } + + if(asize == 3) + { + size_t local_threads = 256; + int max_num_wg = 4096; + + // for naive tensor ops + size_t RD_BLCK = (clens[2] % 4 == 0) ? 4 : (clens[2] % 2 == 0) ? 
2 : 1; + size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1)); + size_t grp_sz = (total_work + local_threads - 1) / local_threads; + + // opencl kernels are no longer supported, fallback to generic case + bool lite_applicable = grp_sz <= size_t(max_num_wg); + + bool is_lite = clens[0] == 1 && blens[0] == 1 && alens[0] == 1 && + (blens[1] == clens[1] || blens[1] == 1) && blens[2] == clens[2]; + + if(lite_applicable && is_lite) + { + return true; + } + } + + return false; +} + +std::size_t Op2dTensorLite::GetWorkspaceSize( + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const +{ + return 0; +} + +ConvSolution Op2dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); + + const size_t a_cstride = aTensorDesc.GetStrides()[1]; + const size_t b_cstride = bTensorDesc.GetStrides()[1]; + const size_t c_cstride = cTensorDesc.GetStrides()[1]; + + miopenDataType_t data_type = bTensorDesc.GetType(); + + auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); + + int max_num_wg = 4096; + num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; + + size_t local_threads = 256; + + // for naive tensor ops + auto&& [RD_BLCK, READ_TYPE] = GetRDBLCKandREADTYPE(clens[2], data_type); + + size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1)); + size_t grp_sz = (total_work + local_threads - 1) / local_threads; + + grp_sz = std::min(size_t(max_num_wg), grp_sz); + size_t glb_sz = local_threads * grp_sz; + + size_t local_threads2 = 64; + size_t total_work2 = clens[1]; + size_t grp_sz2 = (total_work2 + local_threads2 - 1) / local_threads2; + grp_sz2 = std::min(size_t(max_num_wg / grp_sz), grp_sz2); + size_t glb_sz2 = local_threads2 * grp_sz2; + + const std::array vld{local_threads, 1, 1}; + const std::array vgd{glb_sz, glb_sz2, 1}; + + KernelBuildParameters build_params = KernelBuildParameters{}; + + GetCommonParams(build_params, problem, false); + + build_params.Define("USE_2D_TENSOR_LITE"); + build_params.Define("RD_BLCK", std::to_string(RD_BLCK)); + build_params.Define("READ_TYPE", READ_TYPE); + + auto kernel = KernelInfo{}; + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + kernel.kernel_file = "MIOpenTensorKernels.cl"; + kernel.kernel_name = "Op2dTensorLite"; + + using std::begin, std::end; + + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); + + result.invoker_factory = + [data_type, b_c = blens[1], a_cstride, b_cstride, c_cstride, total_work, total_work2]( + const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(data_type, [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + kernel(params.ATensor, + static_cast(a_cstride), + params.BTensor, + static_cast(b_cstride), + params.CTensor, + static_cast(c_cstride), + 
miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(total_work), + static_cast(total_work2), + static_cast(!float_equal(miopen_beta, 0.0)), + static_cast(b_c == 1)); + }); + }; + }; + result.construction_params.push_back(kernel); + + return result; +} + +} // namespace tensorOp + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/tensorOp/Op2dTensorSquash.cpp b/src/solver/tensorOp/Op2dTensorSquash.cpp new file mode 100644 index 0000000000..d6ca7cfa3b --- /dev/null +++ b/src/solver/tensorOp/Op2dTensorSquash.cpp @@ -0,0 +1,175 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "tensor_op_helpers.hpp" +#include +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace tensorOp { + +bool Op2dTensorSquash::IsApplicable([[maybe_unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + const auto& alens = aTensorDesc.GetLengths(); + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); + + auto asize = alens.size(); + + if(aTensorDesc.GetType() == miopenDouble) + { + return false; + } + + if(asize == 3) + { + bool is_lite = clens[0] == 1 && blens[0] == 1 && alens[0] == 1 && + (blens[1] == clens[1] || blens[1] == 1) && blens[2] == clens[2]; + + bool is_squashed = + problem.GetNonStandardSquash() && !is_lite && + (blens[0] == 1 && clens[0] == 1 && clens[1] == 1 && blens[2] == clens[2]); + + if(is_squashed) + { + return true; + } + } + + return false; +} + +std::size_t Op2dTensorSquash::GetWorkspaceSize( + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const +{ + return 0; +} + +ConvSolution +Op2dTensorSquash::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); + + const size_t b_nstride = bTensorDesc.GetStrides()[1]; + + miopenDataType_t data_type = bTensorDesc.GetType(); + + auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); + + int max_num_wg = 4096; + num_wg = num_wg > max_num_wg ? 
max_num_wg : num_wg; + + size_t local_threads = 256; + + // for naive tensor ops + auto&& [RD_BLCK, READ_TYPE] = GetRDBLCKandREADTYPE(clens[2], data_type); + + size_t total_work = std::max(clens[2] / RD_BLCK, size_t(1)); + size_t grp_sz = (total_work + local_threads - 1) / local_threads; + + grp_sz = std::min(size_t(max_num_wg), grp_sz); + size_t glb_sz = local_threads * grp_sz; + + const std::array vld{local_threads, 1, 1}; + const std::array vgd{glb_sz, 1, 1}; + + KernelBuildParameters build_params = KernelBuildParameters{}; + + GetCommonParams(build_params, problem, false); + + build_params.Define("USE_2D_TENSOR_SQUASH"); + build_params.Define("RD_BLCK", std::to_string(RD_BLCK)); + build_params.Define("READ_TYPE", READ_TYPE); + + auto kernel = KernelInfo{}; + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + kernel.kernel_file = "MIOpenTensorKernels.cl"; + kernel.kernel_name = "Op2dTensorSquash"; + + using std::begin, std::end; + + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); + + result.invoker_factory = + [data_type, b_c = blens[1], b_nstride, total_work](const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(data_type, [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + kernel(params.ATensor, + params.BTensor, + static_cast(b_c), + static_cast(b_nstride), + params.CTensor, + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(total_work), + static_cast(!float_equal(miopen_alpha0, 0.0)), + static_cast(!float_equal(miopen_alpha1, 0.0)), + static_cast(!float_equal(miopen_beta, 0.0))); + }); + }; + }; + result.construction_params.push_back(kernel); + + return result; +} + +} // namespace tensorOp + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/tensorOp/Op3dTensorGeneric.cpp b/src/solver/tensorOp/Op3dTensorGeneric.cpp new file mode 100644 index 0000000000..2bafc6abaa --- /dev/null +++ b/src/solver/tensorOp/Op3dTensorGeneric.cpp @@ -0,0 +1,164 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "tensor_op_helpers.hpp" +#include +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace tensorOp { + +bool Op3dTensorGeneric::IsApplicable([[maybe_unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& alens = aTensorDesc.GetLengths(); + auto asize = alens.size(); + + if(aTensorDesc.GetType() == miopenDouble) + { + return false; + } + + if(asize == 3) + { + return true; + } + + return false; +} + +std::size_t Op3dTensorGeneric::GetWorkspaceSize( + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const +{ + return 0; +} + +ConvSolution +Op3dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); + + std::array astrides; + std::array bstrides; + std::array cstrides; + std::tie(astrides[0], astrides[1], astrides[2]) = miopen::tien<3>(aTensorDesc.GetStrides()); + std::tie(bstrides[0], bstrides[1], bstrides[2]) = miopen::tien<3>(bTensorDesc.GetStrides()); + std::tie(cstrides[0], cstrides[1], cstrides[2]) = miopen::tien<3>(cTensorDesc.GetStrides()); + + miopenDataType_t data_type = bTensorDesc.GetType(); + + auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); + + int max_num_wg = 4096; + num_wg = num_wg > max_num_wg ? 
max_num_wg : num_wg; + + size_t local_threads = 256; + size_t global_threads = num_wg * local_threads; + + const std::array vld{local_threads, 1, 1}; + const std::array vgd{global_threads, 1, 1}; + + KernelBuildParameters build_params = KernelBuildParameters{}; + + GetCommonParams(build_params, problem, false); + + build_params.Define("USE_3D_TENSOR_GENERIC"); + + auto kernel = KernelInfo{}; + + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + kernel.kernel_file = "MIOpenTensorKernelsHip.cpp"; + kernel.kernel_name = "Op3dTensorGeneric"; + + using std::begin, std::end; + + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); + + result.invoker_factory = + [data_type, blens, clens, astrides, bstrides, cstrides](const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(data_type, [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + kernel(params.ATensor, + params.BTensor, + params.CTensor, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(blens[1] == 1 ? clens[1] : blens[1]), // b_c, + static_cast(blens[2] == 1 ? clens[2] : blens[2]), // b_h, + static_cast(clens[1]), // c_c, + static_cast(clens[2]), // c_h, + static_cast(astrides[0]), // a_nstride, + static_cast(astrides[1]), // a_cstride, + static_cast(astrides[2]), // a_hstride, + static_cast(blens[0] == 1 ? 0 : bstrides[0]), // b_nstride, + static_cast(blens[1] == 1 ? 0 : bstrides[1]), // b_cstride, + static_cast(blens[2] == 1 ? 0 : bstrides[2]), // b_hstride, + static_cast(cstrides[0]), // c_nstride, + static_cast(cstrides[1]), // c_cstride, + static_cast(cstrides[2]), // c_hstride, + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(clens[0]), + !float_equal(miopen_beta, 0.0)); + }); + }; + }; + result.construction_params.push_back(kernel); + + return result; +} + +} // namespace tensorOp + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/tensorOp/Op4dTensorGeneric.cpp b/src/solver/tensorOp/Op4dTensorGeneric.cpp new file mode 100644 index 0000000000..3c67a3411f --- /dev/null +++ b/src/solver/tensorOp/Op4dTensorGeneric.cpp @@ -0,0 +1,170 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "tensor_op_helpers.hpp" +#include +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace tensorOp { + +bool Op4dTensorGeneric::IsApplicable([[maybe_unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& alens = aTensorDesc.GetLengths(); + auto asize = alens.size(); + + if(aTensorDesc.GetType() == miopenDouble) + { + return false; + } + + if(asize == 4) + { + return true; + } + + return false; +} + +std::size_t Op4dTensorGeneric::GetWorkspaceSize( + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const +{ + return 0; +} + +ConvSolution +Op4dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + std::array blens; + std::array clens; + std::tie(blens[0], blens[1], blens[2], blens[3]) = miopen::tien<4>(bTensorDesc.GetLengths()); + std::tie(clens[0], clens[1], clens[2], clens[3]) = miopen::tien<4>(cTensorDesc.GetLengths()); + + std::array astrides; + std::array bstrides; + std::array cstrides; + std::tie(astrides[0], astrides[1], astrides[2], astrides[3]) = + miopen::tien<4>(aTensorDesc.GetStrides()); + std::tie(bstrides[0], bstrides[1], bstrides[2], bstrides[3]) = + miopen::tien<4>(bTensorDesc.GetStrides()); + std::tie(cstrides[0], cstrides[1], cstrides[2], cstrides[3]) = + miopen::tien<4>(cTensorDesc.GetStrides()); + + miopenDataType_t data_type = bTensorDesc.GetType(); + + int max_num_wg = 4096; + + auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads] = + Get4dParams(problem, false); + + const std::array vld{local_threads, 1, 1}; + const std::array vgd{global_threads, 1, 1}; + + KernelBuildParameters build_params = KernelBuildParameters{}; + + GetCommonParams(build_params, problem, false); + + build_params.Define("USE_4D_TENSOR_GENERIC"); + build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); + auto kernel = KernelInfo{}; + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + kernel.kernel_file = "MIOpenTensorKernels.cl"; + kernel.kernel_name = "Op4dTensorGeneric"; + + using std::begin, std::end; + + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); + + result.invoker_factory = + [data_type, blens, clens, astrides, bstrides, cstrides, work_per_wg, num_wg_orig, bitmap]( + const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(data_type, [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = 
as_float(*(static_cast(params.beta))); + + kernel(params.ATensor, + static_cast(astrides[0]), // a_nstride, + static_cast(astrides[1]), // a_cstride, + static_cast(astrides[2]), // a_hstride, + params.BTensor, + static_cast(blens[1]), // b_c, + static_cast(blens[2]), // b_h, + static_cast(blens[3]), // b_w, + static_cast(bstrides[0]), // b_nstride, + static_cast(bstrides[1]), // b_cstride, + static_cast(bstrides[2]), // b_hstride, + params.CTensor, + static_cast(clens[1]), // c_c, + static_cast(clens[2]), // c_h, + static_cast(clens[3]), // c_w, + static_cast(cstrides[0]), // c_nstride, + static_cast(cstrides[1]), // c_cstride, + static_cast(cstrides[2]), // c_hstride, + miopen_alpha0, + miopen_alpha1, + miopen_beta, + bitmap, + work_per_wg, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig)); + }); + }; + }; + result.construction_params.push_back(kernel); + + return result; +} + +} // namespace tensorOp + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/tensorOp/Op4dTensorLite.cpp b/src/solver/tensorOp/Op4dTensorLite.cpp new file mode 100644 index 0000000000..a53174507e --- /dev/null +++ b/src/solver/tensorOp/Op4dTensorLite.cpp @@ -0,0 +1,165 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+#include "tensor_op_helpers.hpp"
+#include <miopen/tensorOp/solvers.hpp>
+#include <miopen/tensorOp/invoke_params.hpp>
+#include <miopen/tensor.hpp>
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/float_equal.hpp>
+#include <miopen/visit_float.hpp>
+
+namespace miopen {
+
+namespace solver {
+
+namespace tensorOp {
+
+bool Op4dTensorLite::IsApplicable([[maybe_unused]] const ExecutionContext& context,
+                                  const miopen::tensorOp::ProblemDescription& problem) const
+{
+    const auto& aTensorDesc = problem.GetATensorDesc();
+    const auto& bTensorDesc = problem.GetBTensorDesc();
+    const auto& cTensorDesc = problem.GetCTensorDesc();
+
+    const auto& alens = aTensorDesc.GetLengths();
+    const auto& blens = bTensorDesc.GetLengths();
+    const auto& clens = cTensorDesc.GetLengths();
+
+    auto asize = alens.size();
+
+    if(aTensorDesc.GetType() == miopenDouble)
+    {
+        return false;
+    }
+
+    if(asize == 4)
+    {
+        auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens);
+
+        // quick fix for btensor = <1, 1, 1, 1>
+        if(bTensorDesc.GetElementSize() == 1)
+            bitmap = 4;
+
+        bool fwd_conv_bias = (bitmap == (1 << 2));
+
+        bool packed_tensor = true;
+        packed_tensor &= aTensorDesc.IsPacked();
+        packed_tensor &= bTensorDesc.IsPacked();
+        packed_tensor &= cTensorDesc.IsPacked();
+
+        bool packed_equal_tensor =
+            packed_tensor && (bTensorDesc.GetElementSize() == cTensorDesc.GetElementSize());
+
+        if(!fwd_conv_bias && packed_equal_tensor)
+        {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+std::size_t Op4dTensorLite::GetWorkspaceSize(
+    [[maybe_unused]] const ExecutionContext& context,
+    [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const
+{
+    return 0;
+}
+
+ConvSolution Op4dTensorLite::GetSolution([[maybe_unused]] const ExecutionContext& context,
+                                         const miopen::tensorOp::ProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    const auto& bTensorDesc = problem.GetBTensorDesc();
+    const auto& cTensorDesc = problem.GetCTensorDesc();
+
+    miopenDataType_t data_type = bTensorDesc.GetType();
+
+    auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads] =
+        Get4dParams(problem, true);
+
+    auto&& [RD_BLCK, READ_TYPE] =
+        GetRDBLCKandREADTYPE(cTensorDesc.GetElementSize(), bTensorDesc.GetType());
+
+    size_t total_work = std::max(cTensorDesc.GetElementSize() / RD_BLCK, size_t(1));
+
+    const std::array<size_t, 3> vld{local_threads, 1, 1};
+    const std::array<size_t, 3> vgd{global_threads, 1, 1};
+
+    KernelBuildParameters build_params = KernelBuildParameters{};
+
+    GetCommonParams(build_params, problem, false);
+
+    build_params.Define("USE_4D_TENSOR_LITE");
+    build_params.Define("RD_BLCK", std::to_string(RD_BLCK));
+    build_params.Define("READ_TYPE", READ_TYPE);
+
+    auto kernel = KernelInfo{};
+
+    kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{});
+    kernel.kernel_file  = "MIOpenTensorKernels.cl";
+    kernel.kernel_name  = "Op4dTensorLite";
+
+    using std::begin, std::end;
+
+    kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld));
+    kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd));
+
+    result.invoker_factory = [data_type, total_work](const std::vector<Kernel> kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::tensorOp::InvokeParams>();
+
+            visit_float(data_type, [&](auto as_float) {
+                auto miopen_alpha0 = as_float(*(static_cast<const float*>(params.alpha0)));
+                auto miopen_alpha1 = as_float(*(static_cast<const float*>(params.alpha1)));
+                auto miopen_beta   = as_float(*(static_cast<const float*>(params.beta)));
+
+                kernel(params.ATensor,
+                       params.BTensor,
+                       params.CTensor,
+                       miopen_alpha0,
+                       miopen_alpha1,
+                       miopen_beta,
+                       static_cast<int64_t>(params.Aoffset),
+                       static_cast<int64_t>(params.Boffset),
+                       static_cast<int64_t>(params.Coffset),
+                       static_cast<int64_t>(total_work),
+                       static_cast<int>(!float_equal(miopen_beta, 0.0)));
+            });
+        };
+    };
+    result.construction_params.push_back(kernel);
+
+    return result;
+}
+
+} // namespace tensorOp
+
+} // namespace solver
+
+} // namespace miopen
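For reference, the "lite" path above relies on the vector-read selection provided by GetRDBLCKandREADTYPE in tensor_op_helpers.hpp. The following self-contained sketch (not part of the change; function and variable names are illustrative only) mirrors that logic: pick the widest block (4, 2, or 1) that divides the element count and derive the OpenCL read type from the base type name.

    #include <cstddef>
    #include <iostream>
    #include <string>
    #include <tuple>

    // Pick the widest vector width that divides the element count and build the
    // matching OpenCL read type string, mirroring the helper used by the solver.
    std::tuple<std::size_t, std::string> PickReadBlock(std::size_t len, const std::string& data_type)
    {
        const std::size_t rd_blck = (len % 4 == 0) ? 4 : (len % 2 == 0) ? 2 : 1;
        const std::string read_type =
            (rd_blck == 1) ? data_type : data_type + std::to_string(rd_blck);
        return {rd_blck, read_type};
    }

    int main()
    {
        for(std::size_t len : {1024, 514, 513})
        {
            auto [rd_blck, read_type] = PickReadBlock(len, "float");
            // e.g. 1024 -> float4, 514 -> float2, 513 -> float
            std::cout << len << " -> RD_BLCK=" << rd_blck << ", READ_TYPE=" << read_type << '\n';
        }
        return 0;
    }

The same quantity also determines total_work above, since each work-item then processes RD_BLCK packed elements.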
diff --git a/src/solver/tensorOp/Op5dTensorGeneric.cpp b/src/solver/tensorOp/Op5dTensorGeneric.cpp
new file mode 100644
index 0000000000..35ef705f5b
--- /dev/null
+++ b/src/solver/tensorOp/Op5dTensorGeneric.cpp
@@ -0,0 +1,179 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "tensor_op_helpers.hpp"
+#include <miopen/tensorOp/solvers.hpp>
+#include <miopen/tensorOp/invoke_params.hpp>
+#include <miopen/tensor.hpp>
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/float_equal.hpp>
+#include <miopen/visit_float.hpp>
+
+namespace miopen {
+
+namespace solver {
+
+namespace tensorOp {
+
+bool Op5dTensorGeneric::IsApplicable([[maybe_unused]] const ExecutionContext& context,
+                                     const miopen::tensorOp::ProblemDescription& problem) const
+{
+    const auto& aTensorDesc = problem.GetATensorDesc();
+    const auto& alens       = aTensorDesc.GetLengths();
+    auto asize              = alens.size();
+
+    if(aTensorDesc.GetType() == miopenDouble)
+    {
+        return false;
+    }
+
+    if(asize == 5)
+    {
+        return true;
+    }
+
+    return false;
+}
+
+std::size_t Op5dTensorGeneric::GetWorkspaceSize(
+    [[maybe_unused]] const ExecutionContext& context,
+    [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const
+{
+    return 0;
+}
+
+ConvSolution
+Op5dTensorGeneric::GetSolution([[maybe_unused]] const ExecutionContext& context,
+                               const miopen::tensorOp::ProblemDescription& problem) const
+{
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    const auto& aTensorDesc = problem.GetATensorDesc();
+    const auto& bTensorDesc = problem.GetBTensorDesc();
+    const auto& cTensorDesc = problem.GetCTensorDesc();
+
+    const auto& blens = bTensorDesc.GetLengths();
+    const auto& clens = cTensorDesc.GetLengths();
+
+    std::array<size_t, 5> astrides;
+    std::array<size_t, 5> bstrides;
+    std::array<size_t, 5> cstrides;
+    std::tie(astrides[0], astrides[1], astrides[2], astrides[3], astrides[4]) =
+        miopen::tien<5>(aTensorDesc.GetStrides());
+    std::tie(bstrides[0], bstrides[1], bstrides[2], bstrides[3], bstrides[4]) =
+        miopen::tien<5>(bTensorDesc.GetStrides());
+    std::tie(cstrides[0], cstrides[1], cstrides[2], cstrides[3], cstrides[4]) =
+        miopen::tien<5>(cTensorDesc.GetStrides());
+
+    miopenDataType_t data_type = bTensorDesc.GetType();
+
+    auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens);
+
+    int num_wg_orig = num_wg;
+    int max_num_wg  = 4096;
+    num_wg          = num_wg > max_num_wg ? max_num_wg : num_wg;
+
+    size_t local_threads  = 256;
+    size_t global_threads = num_wg * local_threads;
+
+    const std::array<size_t, 3> vld{local_threads, 1, 1};
+    const std::array<size_t, 3> vgd{global_threads, 1, 1};
+
+    KernelBuildParameters build_params = KernelBuildParameters{};
+
+    GetCommonParams(build_params, problem, false);
+
+    build_params.Define("USE_5D_TENSOR_GENERIC");
+    build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg));
+
+    auto kernel = KernelInfo{};
+
+    kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{});
+    kernel.kernel_file  = "MIOpenTensorKernels.cl";
+    kernel.kernel_name  = "Op5dTensorGeneric";
+
+    using std::begin, std::end;
+
+    kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld));
+    kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd));
+
+    result.invoker_factory =
+        [data_type, blens, clens, astrides, bstrides, cstrides, bitmap, work_per_wg, num_wg_orig](
+            const std::vector<Kernel> kernels) {
+            return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+                decltype(auto) kernel = handle_.Run(kernels.front());
+                decltype(auto) params = raw_params.CastTo<miopen::tensorOp::InvokeParams>();
+
+                visit_float(data_type, [&](auto as_float) {
+                    auto miopen_alpha0 = as_float(*(static_cast<const float*>(params.alpha0)));
+                    auto miopen_alpha1 = as_float(*(static_cast<const float*>(params.alpha1)));
+                    auto miopen_beta   = as_float(*(static_cast<const float*>(params.beta)));
+
+                    kernel(params.ATensor,
+                           static_cast<int>(astrides[0]),
+                           static_cast<int>(astrides[1]),
+                           static_cast<int>(astrides[2]),
+                           static_cast<int>(astrides[3]),
+                           params.BTensor,
+                           static_cast<int>(blens[1]),    // b_c,
+                           static_cast<int>(blens[2]),    // b_d,
+                           static_cast<int>(blens[3]),    // b_h,
+                           static_cast<int>(blens[4]),    // b_w,
+                           static_cast<int>(bstrides[0]), // b_nstride,
+                           static_cast<int>(bstrides[1]), // b_cstride,
+                           static_cast<int>(bstrides[2]), // b_dstride,
+                           static_cast<int>(bstrides[3]), // b_hstride,
+                           params.CTensor,
+                           static_cast<int>(clens[1]),    // c_c,
+                           static_cast<int>(clens[2]),    // c_d,
+                           static_cast<int>(clens[3]),    // c_h,
+                           static_cast<int>(clens[4]),    // c_w,
+                           static_cast<int>(cstrides[0]), // c_nstride,
+                           static_cast<int>(cstrides[1]), // c_cstride,
+                           static_cast<int>(cstrides[2]), // c_dstride,
+                           static_cast<int>(cstrides[3]), // c_hstride,
+                           miopen_alpha0,
+                           miopen_alpha1,
+                           miopen_beta,
+                           bitmap,
+                           work_per_wg,
+                           static_cast<int64_t>(params.Aoffset),
+                           static_cast<int64_t>(params.Boffset),
+                           static_cast<int64_t>(params.Coffset),
+                           static_cast<int>(num_wg_orig));
+                });
+            };
+        };
+    result.construction_params.push_back(kernel);
+
+    return result;
+}
+
+} // namespace tensorOp
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/tensorOp/OpTensorFwdBias.cpp b/src/solver/tensorOp/OpTensorFwdBias.cpp
new file mode 100644
index 0000000000..9df036df8c
--- /dev/null
+++ b/src/solver/tensorOp/OpTensorFwdBias.cpp
@@ -0,0 +1,224 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
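All of the generic solvers key their applicability and launch geometry off the broadcast bitmap produced by GetBitmapAndWgInfo (defined later in tensor_op_helpers.hpp). The self-contained sketch below mirrors that computation, omitting the zero-length quick fix; the names and the example shapes are illustrative only. It shows how a bias-shaped B tensor yields exactly the (1 << 2) pattern the solvers test for as "forward conv bias".

    #include <algorithm>
    #include <cstddef>
    #include <functional>
    #include <iostream>
    #include <numeric>
    #include <vector>

    // Bit (rank - 1 - i) of the bitmap is set when B's dimension i is not
    // broadcast; broadcast dimensions fold into the per-work-group work instead.
    struct WgInfo
    {
        int num_wg;
        int work_per_wg;
        unsigned int bitmap;
    };

    WgInfo ComputeWgInfo(const std::vector<std::size_t>& blens, const std::vector<std::size_t>& clens)
    {
        auto first_not_one =
            std::find_if(blens.rbegin(), blens.rend(), [](std::size_t v) { return v != 1; });
        auto d = std::distance(blens.begin(), first_not_one.base());

        int num_wg = first_not_one != blens.rend() ? static_cast<int>(*first_not_one) : 1;
        int work_per_wg =
            std::accumulate(clens.begin() + d, clens.end(), 1, std::multiplies<int>());

        unsigned int bitmap = 1u << (blens.size() - d);
        for(int i = static_cast<int>(d) - 2; i >= 0; i--)
        {
            if(blens[i] != 1)
            {
                bitmap |= 1u << (blens.size() - (i + 1));
                num_wg *= static_cast<int>(blens[i]);
            }
            else
            {
                work_per_wg *= static_cast<int>(clens[i]);
            }
        }
        return {num_wg, work_per_wg, bitmap};
    }

    int main()
    {
        // Bias-like B broadcast over N, H, W: only the C dimension is "real".
        auto info = ComputeWgInfo({1, 8, 1, 1}, {16, 8, 32, 32});
        // Expected: num_wg = 8, work_per_wg = 16 * 32 * 32 = 16384, bitmap = 0b0100,
        // i.e. exactly the (1 << 2) pattern treated as forward conv bias.
        std::cout << info.num_wg << ' ' << info.work_per_wg << ' ' << info.bitmap << '\n';
        return 0;
    }

Get4dParams then rebalances precisely this situation: when there are few work-groups each doing a lot of work (num_wg < 640 and work_per_wg > 256), it folds the batch dimension into the grid and records incr_wg = 1, which is what OpTensorFwdBias passes through to the kernel.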
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "tensor_op_helpers.hpp" +#include +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace tensorOp { + +bool OpTensorFwdBias::IsApplicable([[maybe_unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + const auto& alens = aTensorDesc.GetLengths(); + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); + + auto asize = alens.size(); + + if(aTensorDesc.GetType() == miopenDouble) + { + return false; + } + + if(asize == 4) + { + auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); + + // quick fix for btensor = <1, 1, 1, 1> + if(bTensorDesc.GetElementSize() == 1) + bitmap = 4; + + bool fwd_conv_bias = (bitmap == (1 << 2)); + + if(fwd_conv_bias) + { + return true; + } + } + return false; +} + +std::size_t OpTensorFwdBias::GetWorkspaceSize( + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const +{ + return 0; +} + +ConvSolution OpTensorFwdBias::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + std::array blens; + std::array clens; + std::tie(blens[0], blens[1], blens[2], blens[3]) = miopen::tien<4>(bTensorDesc.GetLengths()); + std::tie(clens[0], clens[1], clens[2], clens[3]) = miopen::tien<4>(cTensorDesc.GetLengths()); + + std::array astrides; + std::array bstrides; + std::array cstrides; + std::tie(astrides[0], astrides[1], astrides[2], astrides[3]) = + miopen::tien<4>(aTensorDesc.GetStrides()); + std::tie(bstrides[0], bstrides[1], bstrides[2], bstrides[3]) = + miopen::tien<4>(bTensorDesc.GetStrides()); + std::tie(cstrides[0], cstrides[1], cstrides[2], cstrides[3]) = + miopen::tien<4>(cTensorDesc.GetStrides()); + + miopenDataType_t data_type = bTensorDesc.GetType(); + + int max_num_wg = 4096; + + auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads] = + Get4dParams(problem, false); + + const std::array vld{local_threads, 1, 1}; + const std::array vgd{global_threads, 1, 1}; + + bool packed_tensor = true; + packed_tensor &= aTensorDesc.IsPacked(); + packed_tensor &= bTensorDesc.IsPacked(); + packed_tensor &= cTensorDesc.IsPacked(); + + KernelBuildParameters build_params = KernelBuildParameters{}; + + GetCommonParams(build_params, problem, false); + + build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); + + auto kernel = KernelInfo{}; + + if(packed_tensor) + { + 
build_params.Define("USE_FWD_BIAS"); + kernel.kernel_name = "OpTensorFwdBias"; + } + else + { + build_params.Define("USE_FWD_BIAS_GENERIC"); + kernel.kernel_name = "OpTensorFwdBiasGeneric"; + } + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + kernel.kernel_file = "MIOpenTensorKernels.cl"; + + using std::begin, std::end; + + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); + + result.invoker_factory = [data_type, + blens, + clens, + astrides, + bstrides, + cstrides, + work_per_wg, + num_wg_orig, + incr_wg, + packed_tensor](const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(data_type, [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + if(packed_tensor) + { // OpTensorFwdBias + kernel(params.ATensor, + params.BTensor, + static_cast(blens[1]), + params.CTensor, + static_cast(clens[0]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + work_per_wg, + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig), + static_cast(incr_wg)); + } + else + { // OpTensorFwdBiasGeneric + kernel(params.ATensor, + static_cast(astrides[0]), + static_cast(astrides[1]), + static_cast(astrides[2]), + params.BTensor, + static_cast(blens[1]), + static_cast(bstrides[1]), + params.CTensor, + static_cast(clens[0]), + static_cast(clens[3]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + static_cast(cstrides[2]), + miopen_alpha0, + miopen_alpha1, + miopen_beta, + work_per_wg, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig), + static_cast(incr_wg)); + } + }); + }; + }; + result.construction_params.push_back(kernel); + + return result; +} + +} // namespace tensorOp + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/tensorOp/OpTensorLeadingOnes.cpp b/src/solver/tensorOp/OpTensorLeadingOnes.cpp new file mode 100644 index 0000000000..d930da0da6 --- /dev/null +++ b/src/solver/tensorOp/OpTensorLeadingOnes.cpp @@ -0,0 +1,241 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "tensor_op_helpers.hpp" +#include +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace tensorOp { + +bool OpTensorLeadingOnes::IsApplicable([[maybe_unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + const auto& alens = aTensorDesc.GetLengths(); + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); + + auto asize = alens.size(); + + if(aTensorDesc.GetType() == miopenDouble) + { + return false; + } + + if(asize == 4) + { + + auto&& [num_wg, work_per_wg, bitmap] = GetBitmapAndWgInfo(blens, clens); + + // quick fix for btensor = <1, 1, 1, 1> + if(bTensorDesc.GetElementSize() == 1) + bitmap = 4; + + bool fwd_conv_bias = (bitmap == (1 << 2)); + + bool packed_tensor = true; + packed_tensor &= aTensorDesc.IsPacked(); + packed_tensor &= bTensorDesc.IsPacked(); + packed_tensor &= cTensorDesc.IsPacked(); + + bool packed_equal_tensor = + packed_tensor && (bTensorDesc.GetElementSize() == cTensorDesc.GetElementSize()); + + // first_not_one is incorrect if btensor size equal to 1 + auto first_not_one = + std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); + auto d = std::distance(blens.begin(), first_not_one.base()); + + bool leading_ones = IsBitmapLeadingOnes(bitmap, clens.size(), static_cast(d - 2)); + + if(!fwd_conv_bias && !packed_equal_tensor && leading_ones) + { + return true; + } + } + + return false; +} + +std::size_t OpTensorLeadingOnes::GetWorkspaceSize( + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::tensorOp::ProblemDescription& problem) const +{ + return 0; +} + +ConvSolution +OpTensorLeadingOnes::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::tensorOp::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + const auto& aTensorDesc = problem.GetATensorDesc(); + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + std::array clens; + std::tie(clens[0], clens[1], clens[2], clens[3]) = miopen::tien<4>(cTensorDesc.GetLengths()); + + std::array astrides; + std::array bstrides; + std::array cstrides; + std::tie(astrides[0], astrides[1], astrides[2], astrides[3]) = + miopen::tien<4>(aTensorDesc.GetStrides()); + std::tie(bstrides[0], bstrides[1], bstrides[2], bstrides[3]) = + miopen::tien<4>(bTensorDesc.GetStrides()); + std::tie(cstrides[0], cstrides[1], cstrides[2], cstrides[3]) = + miopen::tien<4>(cTensorDesc.GetStrides()); + + miopenDataType_t data_type = bTensorDesc.GetType(); + + int max_num_wg = 4096; + + auto&& [num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads] = + Get4dParams(problem, false); + + const std::array vld{local_threads, 1, 1}; + const std::array vgd{global_threads, 1, 1}; + + bool packed_tensor = true; + packed_tensor &= aTensorDesc.IsPacked(); + packed_tensor &= bTensorDesc.IsPacked(); + 
packed_tensor &= cTensorDesc.IsPacked(); + + KernelBuildParameters build_params = KernelBuildParameters{}; + + GetCommonParams(build_params, problem, false); + + build_params.Define("MAX_NUM_WG", std::to_string(max_num_wg)); + auto kernel = KernelInfo{}; + + if(packed_tensor) + { + build_params.Define("USE_LEADING_ONES"); + kernel.kernel_name = "OpTensorLeadingOnes"; + } + else + { + build_params.Define("USE_LEADING_ONES_GENERIC"); + kernel.kernel_name = "OpTensorLeadingOnesGeneric"; + } + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + kernel.kernel_file = "MIOpenTensorKernels.cl"; + + using std::begin, std::end; + + kernel.l_wk.insert(end(kernel.l_wk), begin(vld), end(vld)); + kernel.g_wk.insert(end(kernel.g_wk), begin(vgd), end(vgd)); + + result.invoker_factory = [data_type, + clens, + astrides, + bstrides, + cstrides, + work_per_wg, + num_wg_orig, + bitmap, + packed_tensor](const std::vector kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + visit_float(data_type, [&](auto as_float) { + auto miopen_alpha0 = as_float(*(static_cast(params.alpha0))); + auto miopen_alpha1 = as_float(*(static_cast(params.alpha1))); + auto miopen_beta = as_float(*(static_cast(params.beta))); + + if(packed_tensor) + { // OpTensorLeadingOnes + kernel(params.ATensor, + params.BTensor, + params.CTensor, + static_cast(clens[1]), + static_cast(clens[2]), + static_cast(clens[3]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + work_per_wg, + miopen_alpha0, + miopen_alpha1, + miopen_beta, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig), + bitmap); + } + else + { // OpTensorLeadingOnesGeneric + kernel(params.ATensor, + static_cast(astrides[0]), + static_cast(astrides[1]), + static_cast(astrides[2]), + params.BTensor, + static_cast(bstrides[0]), + static_cast(bstrides[1]), + static_cast(bstrides[2]), + params.CTensor, + static_cast(clens[1]), + static_cast(clens[2]), + static_cast(clens[3]), + static_cast(cstrides[0]), + static_cast(cstrides[1]), + static_cast(cstrides[2]), + miopen_alpha0, + miopen_alpha1, + miopen_beta, + work_per_wg, + static_cast(params.Aoffset), + static_cast(params.Boffset), + static_cast(params.Coffset), + static_cast(num_wg_orig), + bitmap); + } + }); + }; + }; + result.construction_params.push_back(kernel); + + return result; +} + +} // namespace tensorOp + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/tensorOp/tensor_op_helpers.hpp b/src/solver/tensorOp/tensor_op_helpers.hpp new file mode 100644 index 0000000000..46ce39e4a0 --- /dev/null +++ b/src/solver/tensorOp/tensor_op_helpers.hpp @@ -0,0 +1,218 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
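OpTensorLeadingOnes additionally requires that every leading dimension of B, up to its last non-broadcast one, be marked in the bitmap. A small standalone sketch of that test, mirroring IsBitmapLeadingOnes from the helper header below; the example values are illustrative only:

    #include <iostream>

    // Bit (n_size - 1 - i) corresponds to dimension i; "leading ones" means all
    // dimensions 0..first_not_one are non-broadcast.
    bool BitmapHasLeadingOnes(unsigned int bitmap, int n_size, int first_not_one)
    {
        bool leading_ones = true;
        for(int i = first_not_one; i >= 0; i--)
        {
            leading_ones &= (bitmap & (1u << (n_size - 1 - i))) != 0u;
        }
        return leading_ones;
    }

    int main()
    {
        // 4-d case: bits 3..0 map to dimensions 0..3.
        std::cout << BitmapHasLeadingOnes(0b1100, 4, 1) << '\n'; // dims 0 and 1 set -> 1
        std::cout << BitmapHasLeadingOnes(0b0100, 4, 1) << '\n'; // dim 0 missing   -> 0
        return 0;
    }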
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include +#include +#include + +#include + +namespace miopen { + +namespace solver { + +namespace tensorOp { + +inline void GetCommonParams(KernelBuildParameters& build_params, + const miopen::tensorOp::ProblemDescription& problem, + bool is64bSupported) +{ + build_params.Define("MIOPEN_TYPE", miopen::GetDataType(problem.GetBTensorDesc().GetType())); + + switch(problem.GetTensorOp()) + { + case 0: build_params.Define("MIOPEN_TENSOR_OP", "miopenAdd"); break; + case 1: build_params.Define("MIOPEN_TENSOR_OP", "miopenMul"); break; + case 2: build_params.Define("MIOPEN_TENSOR_OP", "miopenMin"); break; + case 3: build_params.Define("MIOPEN_TENSOR_OP", "miopenMax"); break; + } + + if(is64bSupported && problem.GetATensorDesc().AllDimsFitIntoInt()) + { + build_params.Define("DIM_TYPE", "uint32_t"); + } + else + { + build_params.Define("DIM_TYPE", "uint64_t"); + } +} + +inline std::tuple GetRDBLCKandREADTYPE(size_t len, miopenDataType_t type) +{ + const std::string data_type = GetDataType(type); + size_t RD_BLCK = (len % 4 == 0) ? 4 : (len % 2 == 0) ? 2 : 1; + return std::make_tuple(RD_BLCK, + (RD_BLCK == 1) ? data_type : data_type + std::to_string(RD_BLCK)); +} + +inline std::tuple GetBitmapAndWgInfo(const std::vector& blens, + const std::vector& clens) +{ + // first_not_one is incorrect if btensor size equal to 1 + auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); + auto d = std::distance(blens.begin(), first_not_one.base()); + + // quick fix + int num_wg = first_not_one != blens.rend() + ? static_cast(*first_not_one == 0 ? 
1 : *first_not_one) + : 1; + + int work_per_wg = std::accumulate(clens.begin() + d, clens.end(), 1, std::multiplies()); + + unsigned int bitmap = 0; + // update bitmap for first_not_one + bitmap |= (1 << (blens.size() - d)); + + for(int i = (d - 2); i >= 0; i--) + { + if(blens[i] != 1) + { + bitmap |= (1 << (blens.size() - (i + 1))); + num_wg *= blens[i]; + } + else + { + work_per_wg *= clens[i]; + } + } + + return std::make_tuple(num_wg, work_per_wg, bitmap); +} + +inline bool IsBitmapLeadingOnes(unsigned int bitmap, int n_size, int first_not_one) +{ + bool leading_ones = true; + for(int i = first_not_one; i >= 0; i--) + { + bool is_one = (bitmap & (1 << (n_size - 1 - i))) != 0u; + leading_ones &= is_one; + } + return leading_ones; +} + +inline std::tuple +Get4dParams(const miopen::tensorOp::ProblemDescription& problem, bool is4dLite) +{ + const auto& bTensorDesc = problem.GetBTensorDesc(); + const auto& cTensorDesc = problem.GetCTensorDesc(); + + const auto& blens = bTensorDesc.GetLengths(); + const auto& clens = cTensorDesc.GetLengths(); + + auto dims = clens.size(); + + // first_not_one is incorrect if btensor size equal to 1 + auto first_not_one = std::find_if(blens.rbegin(), blens.rend(), [](int i) { return i != 1; }); + auto d = std::distance(blens.begin(), first_not_one.base()); + + // quick fix + int num_wg = first_not_one != blens.rend() + ? static_cast(*first_not_one == 0 ? 1 : *first_not_one) + : 1; + + int work_per_wg = std::accumulate(clens.begin() + d, clens.end(), 1, std::multiplies()); + + unsigned int bitmap = 0; + // update bitmap for first_not_one + bitmap |= (1 << (blens.size() - d)); + + for(int i = (d - 2); i >= 0; i--) + { + if(blens[i] != 1) + { + bitmap |= (1 << (blens.size() - (i + 1))); + num_wg *= blens[i]; + } + else + { + work_per_wg *= clens[i]; + } + } + + // quick fix for btensor = <1, 1, 1, 1> + if(bTensorDesc.GetElementSize() == 1) + bitmap = 4; + + int incr_wg = 0; + // Forward Convolution Bias specialization + // for fwd-bias, bitmap looks like <0, 1, 0, 0> + // Is the no. of work-groups and the work for each wg balanced? + auto fwd_conv_bias = bitmap == (1 << 2) ? 1 : 0; + // This block gives off indexing for 5d tensors, skipping + if(fwd_conv_bias == 1 && dims < 5 && num_wg < 640 && work_per_wg > 256 && clens[0] > 0) + { // 640 workgroups of size 256 needed to completely fill the GPU + + work_per_wg /= clens[0]; // c_n; + num_wg *= clens[0]; // c_n; + incr_wg = 1; + } + + int num_wg_orig = num_wg; + int max_num_wg = 4096; + num_wg = num_wg > max_num_wg ? max_num_wg : num_wg; + + size_t local_threads = 256; + + bool leading_ones = IsBitmapLeadingOnes(bitmap, clens.size(), static_cast(d - 2)); + + if(leading_ones && work_per_wg < 64) + { + local_threads = 64; + } + + // Special case for adding tensors in place + size_t global_threads = + (static_cast(leading_ones) == 1 && (d - 1) == 3) ? num_wg : num_wg * local_threads; + global_threads = (global_threads < local_threads) ? local_threads : global_threads; + + if(is4dLite) + { + // for naive tensor ops + const std::string data_type = GetDataType(bTensorDesc.GetType()); + + size_t TENS_LEN = cTensorDesc.GetElementSize(); + size_t RD_BLCK = (TENS_LEN % 4 == 0) ? 4 : (TENS_LEN % 2 == 0) ? 2 : 1; + const std::string READ_TYPE = + (RD_BLCK == 1) ? 
data_type : data_type + std::to_string(RD_BLCK); + + size_t total_work = std::max(TENS_LEN / RD_BLCK, size_t(1)); + size_t grp_sz = (total_work + local_threads - 1) / local_threads; + grp_sz = std::min(size_t(max_num_wg), grp_sz); + size_t glb_sz = local_threads * grp_sz; + + global_threads = glb_sz; + } + + return std::make_tuple( + num_wg_orig, work_per_wg, incr_wg, bitmap, local_threads, global_threads); +} + +} // namespace tensorOp + +} // namespace solver + +} // namespace miopen diff --git a/src/tensor.cpp b/src/tensor.cpp index 3e5190bc25..c1bd709267 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -28,6 +28,10 @@ #include #include #include +#include +#include +#include +#include #include @@ -868,6 +872,57 @@ void from_json(const nlohmann::json& j, TensorDescriptor& descriptor) j.at("type").get_to(descriptor.type); } +void OpTensor2(Handle& handle, + miopenTensorOp_t tensorOp, + const void* alpha0, + const TensorDescriptor& aTensorDesc, + ConstData_t ATensor, + const void* alpha1, + const TensorDescriptor& bTensorDesc, + ConstData_t BTensor, + const void* beta, + const TensorDescriptor& cTensorDesc, + Data_t CTensor, + const size_t Aoffset, + const size_t Boffset, + const size_t Coffset, + bool nonStandardSquash) +{ + if(ATensor == nullptr || BTensor == nullptr || CTensor == nullptr) + { + MIOPEN_THROW(miopenStatusBadParm); + } + + if(alpha0 == nullptr) + { + MIOPEN_THROW(miopenStatusBadParm, "Alpha0 value is nullptr"); + } + + if(alpha1 == nullptr) + { + MIOPEN_THROW(miopenStatusBadParm, "Alpha1 value is nullptr"); + } + + const auto problem = tensorOp::ProblemDescription{ + tensorOp, beta, aTensorDesc, bTensorDesc, cTensorDesc, nonStandardSquash}; + + const auto invoke_params = tensorOp::InvokeParams{ + alpha0, ATensor, alpha1, BTensor, beta, CTensor, Aoffset, Boffset, Coffset}; + + const auto algo = AlgorithmName{"TensorOpSolver"}; + const auto solvers = solver::SolverContainer{} + + solver::SolverContainer{} + + solver::SolverContainer{} + + solver::SolverContainer{} + + solver::SolverContainer{} + + solver::SolverContainer{} + + solver::SolverContainer{} + + solver::SolverContainer{} + + solver::SolverContainer{} + + solver::SolverContainer{}; + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); +} + } // namespace miopen int miopenGetTensorIndex(miopenTensorDescriptor_t tensorDesc, std::initializer_list indices) diff --git a/src/tensorOp/problem_description.cpp b/src/tensorOp/problem_description.cpp new file mode 100644 index 0000000000..6053e7f1a0 --- /dev/null +++ b/src/tensorOp/problem_description.cpp @@ -0,0 +1,76 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
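The OpTensor2 entry point added to tensor.cpp above keeps the argument order of the existing OpTensor call. A hypothetical call site is sketched below; the include paths, the miopen::ConstData_t/Data_t qualifications, and the assumption that the caller already owns valid descriptors and device allocations are mine, not part of the change.

    // Assumed headers; OpTensor2 is expected to be declared alongside OpTensor.
    #include <miopen/handle.hpp>
    #include <miopen/tensor.hpp>
    #include <miopen/tensor_ops.hpp>

    // With beta = 0 the previous contents of C are ignored, so this computes
    // C = alpha0 * A + alpha1 * B element-wise.
    void AddTensors(miopen::Handle& handle,
                    const miopen::TensorDescriptor& aDesc, miopen::ConstData_t a,
                    const miopen::TensorDescriptor& bDesc, miopen::ConstData_t b,
                    const miopen::TensorDescriptor& cDesc, miopen::Data_t c)
    {
        const float alpha0 = 1.0f;
        const float alpha1 = 1.0f;
        const float beta   = 0.0f;

        miopen::OpTensor2(handle,
                          miopenTensorOpAdd,
                          &alpha0, aDesc, a,
                          &alpha1, bDesc, b,
                          &beta, cDesc, c,
                          0, 0, 0, // A/B/C element offsets
                          false);  // keep the standard squash behaviour
    }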
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/tensorOp/problem_description.hpp>
+#include <miopen/names.hpp>
+#include <miopen/float_equal.hpp>
+
+namespace miopen {
+
+namespace tensorOp {
+
+NetworkConfig ProblemDescription::MakeNetworkConfig() const
+{
+    std::string ss;
+
+    const auto& alens = aTensorDesc.GetLengths();
+    const auto& blens = bTensorDesc.GetLengths();
+
+    const auto& astrides = aTensorDesc.GetStrides();
+    const auto& bstrides = bTensorDesc.GetStrides();
+    const auto& cstrides = cTensorDesc.GetStrides();
+
+    auto printDims = [&ss, dims = alens.size() - 1](const auto& dim) {
+        for(uint32_t i = 0; i < dims; i++)
+        {
+            ss.append(std::to_string(dim[i]));
+            ss += 'x';
+        }
+        ss += std::to_string(dim.back());
+        ss += '-';
+    };
+
+    ss.reserve(1024);
+    ss.append(std::string_view("TensorOp-"));
+    ss += std::to_string(aTensorDesc.GetType());
+    ss += '-';
+    ss += std::to_string(tensorOp);
+    ss += '-';
+
+    printDims(alens);
+    printDims(blens);
+    printDims(astrides);
+    printDims(bstrides);
+    printDims(cstrides);
+
+    ss += (float_equal(beta, 0.0f) ? '1' : '0');
+
+    return NetworkConfig(std::move(ss));
+}
+
+} // namespace tensorOp
+
+} // namespace miopen
diff --git a/test/tensor_ops.cpp b/test/tensor_ops.cpp
index 3121715e8a..1df83044b2 100644
--- a/test/tensor_ops.cpp
+++ b/test/tensor_ops.cpp
@@ -181,24 +181,24 @@ struct verify_tensor_ops
         auto a_dev = handle.Write(a.data);
         auto b_dev = handle.Write(b.data);
 
-        miopen::OpTensor(handle,
-                         // miopenTensorOpAdd,
-                         // miopenTensorOpMax,
-                         // miopenTensorOpMin,
-                         miopenTensorOpMul,
-                         &alpha0,
-                         a.desc,
-                         a_dev.get(),
-                         &alpha1,
-                         b.desc,
-                         b_dev.get(),
-                         &beta,
-                         c.desc,
-                         c_dev.get(),
-                         Aoffset,
-                         Boffset,
-                         Coffset,
-                         false); // it does not verify non-standard behaviour
+        miopen::OpTensor2(handle,
+                          // miopenTensorOpAdd,
+                          // miopenTensorOpMax,
+                          // miopenTensorOpMin,
+                          miopenTensorOpMul,
+                          &alpha0,
+                          a.desc,
+                          a_dev.get(),
+                          &alpha1,
+                          b.desc,
+                          b_dev.get(),
+                          &beta,
+                          c.desc,
+                          c_dev.get(),
+                          Aoffset,
+                          Boffset,
+                          Coffset,
+                          false); // it does not verify non-standard behaviour
 
         if(not no_validate)
         {