From d4dc46e6465e47ce8d8b3b4e74b35479638d17e2 Mon Sep 17 00:00:00 2001
From: Christian Sonnabend
Date: Thu, 16 May 2024 09:32:45 +0200
Subject: [PATCH 01/23] Copying kernels to implement NN clusterizer

---
 GPU/GPUTracking/Global/GPUChainTracking.cxx   |   2 +-
 GPU/GPUTracking/Global/GPUChainTracking.h     |   2 +-
 .../Global/GPUChainTrackingClusterizer.cxx    |  19 +-
 .../TPCClusterFinder/GPUTPCNNClusterizer.cxx  | 271 ++++++++++++++++++
 .../TPCClusterFinder/GPUTPCNNClusterizer.h    |  76 +++++
 5 files changed, 364 insertions(+), 6 deletions(-)
 create mode 100644 GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
 create mode 100644 GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h

diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx
index bd1cd9859cbd2..68615f47d05db 100644
--- a/GPU/GPUTracking/Global/GPUChainTracking.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx
@@ -722,7 +722,7 @@ int GPUChainTracking::RunChain()
       return 1;
     }
   } else if (mIOPtrs.tpcPackedDigits || mIOPtrs.tpcZS) {
-    if (runRecoStep(RecoStep::TPCClusterFinding, &GPUChainTracking::RunTPCClusterizer, false)) {
+    if (runRecoStep(RecoStep::TPCClusterFinding, &GPUChainTracking::RunTPCClusterizer, false, true)) { // FIXME: This enables the neural network clusterization -> Need to actually make this configurable
      return 1;
     }
   }

diff --git a/GPU/GPUTracking/Global/GPUChainTracking.h b/GPU/GPUTracking/Global/GPUChainTracking.h
index 89f2ecd10f65f..032ad0524ccff 100644
--- a/GPU/GPUTracking/Global/GPUChainTracking.h
+++ b/GPU/GPUTracking/Global/GPUChainTracking.h
@@ -161,7 +161,7 @@ class GPUChainTracking : public GPUChain, GPUReconstructionHelpers::helperDelegate
   void SetQAFromForeignChain(GPUChainTracking* chain) { mQAFromForeignChain = chain; }

   // Processing functions
-  int RunTPCClusterizer(bool synchronizeOutput = true);
+  int RunTPCClusterizer(bool synchronizeOutput = true, bool applyNNclusterizer = false);
   int ForwardTPCDigits();
   int RunTPCTrackingSlices();
   int RunTPCTrackingMerger(bool synchronizeOutput = true);

diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index 29bbf34b46135..7b2c5539439be 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -566,7 +566,7 @@ int GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers)
 #endif

 // TODO: Clusterizer not working with OCL1 (Clusterizer on CPU, Tracking on GPU)
-int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
+int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput, bool applyNNclusterizer)
 {
   if (param().rec.fwdTPCDigitsAsClusters) {
     return ForwardTPCDigits();
@@ -835,8 +835,14 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       if (clusterer.mPmemory->counters.nPeaks == 0) {
         continue;
       }
-      runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
-      runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::updatePeaks>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
+      if (!applyNNclusterizer) {
+        runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
+        runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::updatePeaks>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
+      } else {
+        // FIXME: This needs to be removed when I actually apply the NN! For now it's only here to make the code work
+        runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
+        runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::updatePeaks>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
+      }
       DoDebugAndDump(RecoStep::TPCClusterFinding, 262144, clusterer, &GPUTPCClusterFinder::DumpSuppressedPeaks, *mDebugFile);

       RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 1, doGPU, lane);
@@ -870,7 +876,12 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       if (doGPU) {
         SynchronizeStream(lane);
       }
-      runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1);
+      if (!applyNNclusterizer) {
+        runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1);
+      } else {
+        // FIXME: Here I need to apply the neural network
+        runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1);
+      }
     }
     if (GetProcessingSettings().debugLevel >= 3) {
       GPUInfo("Sector %02d Fragment %02d Lane %d: Found clusters: digits %u peaks %u clusters %u", iSlice, fragment.index, lane, (int)clusterer.mPmemory->counters.nPositions, (int)clusterer.mPmemory->counters.nPeaks, (int)clusterer.mPmemory->counters.nClusters);

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
new file mode 100644
index 0000000000000..3097d3adecb3d
--- /dev/null
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
@@ -0,0 +1,271 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file GPUTPCNNClusterizer.cxx
+/// \author Christian Sonnabend
+
+#include "GPUTPCNNClusterizer.h"
+
+#include "CfConsts.h"
+#include "CfUtils.h"
+#include "ClusterAccumulator.h"
+#if !defined(GPUCA_GPUCODE)
+#include "GPUHostDataTypes.h"
+#include "MCLabelAccumulator.h"
+#endif
+
+using namespace GPUCA_NAMESPACE::gpu;
+using namespace GPUCA_NAMESPACE::gpu::tpccf;
+
+template <>
+GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char onlyMC)
+{
+  Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
+  CPU_ONLY(
+    MCLabelAccumulator labelAcc(clusterer));
+
+  tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
+
+  GPUTPCNNClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
+}
+
+GPUdii() void GPUTPCNNClusterizer::computeClustersImpl(int nBlocks, int nThreads, int iBlock, int iThread,
+                                                       processorType& clusterer,
+                                                       const CfFragment& fragment,
+                                                       GPUSharedMemory& smem,
+                                                       const Array2D<PackedCharge>& chargeMap,
+                                                       const ChargePos* filteredPeakPositions,
+                                                       const GPUSettingsRec& calib,
+                                                       MCLabelAccumulator* labelAcc,
+                                                       uint clusternum,
+                                                       uint maxClusterPerRow,
+                                                       uint* clusterInRow,
+                                                       tpc::ClusterNative* clusterByRow,
+                                                       uint* clusterPosInRow)
+{
+  uint idx = get_global_id(0);
+
+  // For certain configurations dummy work items are added, so the total
+  // number of work items is divisible by 64.
+  // These dummy items also compute the last cluster but discard the result.
+  ChargePos pos = filteredPeakPositions[CAMath::Min(idx, clusternum - 1)];
+  Charge charge = chargeMap[pos].unpack();
+
+  ClusterAccumulator pc;
+  CPU_ONLY(labelAcc->collect(pos, charge));
+
+  buildCluster(
+    calib,
+    chargeMap,
+    pos,
+    smem.posBcast,
+    smem.buf,
+    smem.innerAboveThreshold,
+    &pc,
+    labelAcc);
+
+  if (idx >= clusternum) {
+    return;
+  }
+  if (fragment.isOverlap(pos.time())) {
+    if (clusterPosInRow) {
+      clusterPosInRow[idx] = maxClusterPerRow;
+    }
+    return;
+  }
+  pc.finalize(pos, charge, fragment.start, clusterer.Param().tpcGeometry);
+
+  tpc::ClusterNative myCluster;
+  bool rejectCluster = !pc.toNative(pos, charge, myCluster, clusterer.Param());
+
+  if (rejectCluster) {
+    if (clusterPosInRow) {
+      clusterPosInRow[idx] = maxClusterPerRow;
+    }
+    return;
+  }
+
+  uint rowIndex = 0;
+  if (clusterByRow != nullptr) {
+    rowIndex = sortIntoBuckets(
+      clusterer,
+      myCluster,
+      pos.row(),
+      maxClusterPerRow,
+      clusterInRow,
+      clusterByRow);
+    if (clusterPosInRow != nullptr) {
+      clusterPosInRow[idx] = rowIndex;
+    }
+  } else if (clusterPosInRow) {
+    rowIndex = clusterPosInRow[idx];
+  }
+
+  CPU_ONLY(labelAcc->commit(pos.row(), rowIndex, maxClusterPerRow));
+}
+
+GPUdii() void GPUTPCNNClusterizer::updateClusterInner(
+  const GPUSettingsRec& calib,
+  ushort lid,
+  ushort N,
+  const PackedCharge* buf,
+  const ChargePos& pos,
+  ClusterAccumulator* cluster,
+  MCLabelAccumulator* labelAcc,
+  uchar* innerAboveThreshold)
+{
+  uchar aboveThreshold = 0;
+
+  GPUCA_UNROLL(U(), U())
+  for (ushort i = 0; i < N; i++) {
+    Delta2 d = cfconsts::InnerNeighbors[i];
+
+    PackedCharge p = buf[N * lid + i];
+
+    Charge q = cluster->updateInner(p, d);
+
+    CPU_ONLY(
+      labelAcc->collect(pos.delta(d), q));
+
+    aboveThreshold |= (uchar(q > calib.tpc.cfInnerThreshold) << i);
+  }
+
+  innerAboveThreshold[lid] = aboveThreshold;
+
+  GPUbarrier();
+}
+
+GPUdii() void GPUTPCNNClusterizer::updateClusterOuter(
+  ushort lid,
+  ushort N,
+  ushort M,
+  ushort offset,
+  const PackedCharge* buf,
+  const ChargePos& pos,
+  ClusterAccumulator* cluster,
+  MCLabelAccumulator* labelAcc)
+{
+  GPUCA_UNROLL(U(), U())
+  for (ushort i = offset; i < M + offset; i++) {
+    PackedCharge p = buf[N * lid + i];
+
+    Delta2 d = cfconsts::OuterNeighbors[i];
+
+    Charge q = cluster->updateOuter(p, d);
+    static_cast<void>(q); // Avoid unused variable warning on GPU.
+ + CPU_ONLY( + labelAcc->collect(pos.delta(d), q)); + } +} + +GPUdii() void GPUTPCNNClusterizer::buildCluster( + const GPUSettingsRec& calib, + const Array2D& chargeMap, + ChargePos pos, + ChargePos* posBcast, + PackedCharge* buf, + uchar* innerAboveThreshold, + ClusterAccumulator* myCluster, + MCLabelAccumulator* labelAcc) +{ + ushort ll = get_local_id(0); + + posBcast[ll] = pos; + GPUbarrier(); + + CfUtils::blockLoad( + chargeMap, + SCRATCH_PAD_WORK_GROUP_SIZE, + SCRATCH_PAD_WORK_GROUP_SIZE, + ll, + 0, + 8, + cfconsts::InnerNeighbors, + posBcast, + buf); + updateClusterInner( + calib, + ll, + 8, + buf, + pos, + myCluster, + labelAcc, + innerAboveThreshold); + + ushort wgSizeHalf = (SCRATCH_PAD_WORK_GROUP_SIZE + 1) / 2; + + bool inGroup1 = ll < wgSizeHalf; + + ushort llhalf = (inGroup1) ? ll : (ll - wgSizeHalf); + + CfUtils::condBlockLoad( + chargeMap, + wgSizeHalf, + SCRATCH_PAD_WORK_GROUP_SIZE, + ll, + 0, + 16, + cfconsts::OuterNeighbors, + posBcast, + innerAboveThreshold, + buf); + + if (inGroup1) { + updateClusterOuter( + llhalf, + 16, + 16, + 0, + buf, + pos, + myCluster, + labelAcc); + } + +#if defined(GPUCA_GPUCODE) + CfUtils::condBlockLoad( + chargeMap, + wgSizeHalf, + SCRATCH_PAD_WORK_GROUP_SIZE, + ll, + 0, + 16, + cfconsts::OuterNeighbors, + posBcast + wgSizeHalf, + innerAboveThreshold + wgSizeHalf, + buf); + if (!inGroup1) { + updateClusterOuter( + llhalf, + 16, + 16, + 0, + buf, + pos, + myCluster, + labelAcc); + } +#endif +} + +GPUd() uint GPUTPCNNClusterizer::sortIntoBuckets(processorType& clusterer, const tpc::ClusterNative& cluster, uint row, uint maxElemsPerBucket, uint* elemsInBucket, tpc::ClusterNative* buckets) +{ + uint index = CAMath::AtomicAdd(&elemsInBucket[row], 1u); + if (index < maxElemsPerBucket) { + buckets[maxElemsPerBucket * row + index] = cluster; + } else { + clusterer.raiseError(GPUErrors::ERROR_CF_ROW_CLUSTER_OVERFLOW, clusterer.mISlice * 1000 + row, index, maxElemsPerBucket); + CAMath::AtomicExch(&elemsInBucket[row], maxElemsPerBucket); + } + return index; +} diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h new file mode 100644 index 0000000000000..f2b92c5f50d40 --- /dev/null +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -0,0 +1,76 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+ +/// \file GPUTPCNNClusterizer.h +/// \author Christian Sonnabend + +#ifndef O2_GPU_CLUSTERIZER_H +#define O2_GPU_CLUSTERIZER_H + +#include "clusterFinderDefs.h" +#include "GPUGeneralKernels.h" +#include "GPUConstantMem.h" +#include "GPUTPCClusterFinder.h" +#include "Array2D.h" +#include "PackedCharge.h" + +namespace o2::tpc +{ +struct ClusterNative; +} // namespace o2::tpc + +namespace GPUCA_NAMESPACE::gpu +{ + +class ClusterAccumulator; +class MCLabelAccumulator; + +class GPUTPCNNClusterizer : public GPUKernelTemplate +{ + public: + static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFClusterizer); + struct GPUSharedMemory { + ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; + PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_BUILD_N]; + uchar innerAboveThreshold[SCRATCH_PAD_WORK_GROUP_SIZE]; + }; + +#ifdef GPUCA_HAVE_O2HEADERS + typedef GPUTPCClusterFinder processorType; + GPUhdi() static processorType* Processor(GPUConstantMem& processors) + { + return processors.tpcClusterer; + } +#endif + + GPUhdi() CONSTEXPR static GPUDataTypes::RecoStep GetRecoStep() + { + return GPUDataTypes::RecoStep::TPCClusterFinding; + } + + template + GPUd() static void Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char); + + static GPUd() void computeClustersImpl(int, int, int, int, processorType&, const CfFragment&, GPUSharedMemory&, const Array2D&, const ChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint, uint, uint*, tpc::ClusterNative*, uint*); + + private: + static GPUd() void updateClusterInner(const GPUSettingsRec&, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uchar*); + + static GPUd() void updateClusterOuter(ushort, ushort, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*); + + static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D&, ChargePos, ChargePos*, PackedCharge*, uchar*, ClusterAccumulator*, MCLabelAccumulator*); + + static GPUd() uint sortIntoBuckets(processorType&, const tpc::ClusterNative&, uint, uint, uint*, tpc::ClusterNative*); +}; + +} // namespace GPUCA_NAMESPACE::gpu + +#endif From 05831efed4629001198fbc3b053c8bb41b2e13f7 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 27 May 2024 10:16:18 +0200 Subject: [PATCH 02/23] First version of clusterizer in GPU code --- Common/ML/CMakeLists.txt | 16 ++ Common/ML/include/ML/onnx_interface.h | 88 +++++++++ Common/ML/src/onnx_interface.cxx | 184 ++++++++++++++++++ GPU/GPUTracking/CMakeLists.txt | 3 + .../Global/GPUChainTrackingClusterizer.cxx | 5 +- GPU/GPUTracking/ML/onnx_interface.cxx | 184 ++++++++++++++++++ GPU/GPUTracking/ML/onnx_interface.h | 88 +++++++++ .../TPCClusterFinder/ClusterAccumulator.h | 17 ++ .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 133 +++++++++++++ .../TPCClusterFinder/GPUTPCNNClusterizer.h | 26 +++ 10 files changed, 743 insertions(+), 1 deletion(-) create mode 100644 Common/ML/CMakeLists.txt create mode 100644 Common/ML/include/ML/onnx_interface.h create mode 100644 Common/ML/src/onnx_interface.cxx create mode 100644 GPU/GPUTracking/ML/onnx_interface.cxx create mode 100644 GPU/GPUTracking/ML/onnx_interface.h diff --git a/Common/ML/CMakeLists.txt b/Common/ML/CMakeLists.txt new file mode 100644 index 0000000000000..60a07041da2e0 --- /dev/null +++ b/Common/ML/CMakeLists.txt @@ -0,0 +1,16 @@ +# Copyright 2019-2020 CERN and copyright holders of ALICE O2. 
+# See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+# All rights not expressly granted are reserved.
+#
+# This software is distributed under the terms of the GNU General Public
+# License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+#
+# In applying this license CERN does not waive the privileges and immunities
+# granted to it by virtue of its status as an Intergovernmental Organization
+# or submit itself to any jurisdiction.
+
+o2_add_library(ML
+               SOURCES src/onnx_interface.cxx
+               TARGETVARNAME targetName
+               PUBLIC_LINK_LIBRARIES O2::Framework ONNXRuntime::ONNXRuntime
+)
\ No newline at end of file
diff --git a/Common/ML/include/ML/onnx_interface.h b/Common/ML/include/ML/onnx_interface.h
new file mode 100644
index 0000000000000..506311c067351
--- /dev/null
+++ b/Common/ML/include/ML/onnx_interface.h
@@ -0,0 +1,88 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+///
+/// \file model.h
+///
+/// \author Christian Sonnabend
+///
+/// \brief A general-purpose class for ONNX models
+///
+
+#ifndef GPU_ML_ONNX_INTERFACE_H
+#define GPU_ML_ONNX_INTERFACE_H
+
+// C++ and system includes
+#include <onnxruntime/core/session/experimental_onnxruntime_cxx_api.h>
+#include <string>
+#include <vector>
+#include <sstream>
+#include <memory>
+#include <thread>
+
+// O2 includes
+#include "Framework/Logger.h"
+
+namespace o2
+{
+
+namespace ml
+{
+
+class OnnxModel
+{
+
+ public:
+  OnnxModel() = default;
+  ~OnnxModel() = default;
+
+  // Inferencing
+  void init(std::string, bool = false, int = 0);
+  // float* inference(std::vector<Ort::Value>, int = 0);
+  // float* inference(std::vector<float>, int = 0);
+  template <class T>
+  float* inference(T input, unsigned int size);
+  template <class T>
+  std::vector<float> inference_vector(T input, unsigned int size);
+
+  // Reset session
+  void resetSession() { mSession.reset(new Ort::Experimental::Session{*mEnv, modelPath, sessionOptions}); }
+
+  // Getters & Setters
+  Ort::SessionOptions* getSessionOptions() { return &sessionOptions; } // For optimizations in post
+  std::shared_ptr<Ort::Experimental::Session> getSession() { return mSession; }
+  std::vector<std::vector<int64_t>> getNumInputNodes() const { return mInputShapes; }
+  std::vector<std::vector<int64_t>> getNumOutputNodes() const { return mOutputShapes; }
+  void setActiveThreads(int);
+
+ private:
+  // Environment variables for the ONNX runtime
+  std::shared_ptr<Ort::Env> mEnv = nullptr;
+  std::shared_ptr<Ort::Experimental::Session> mSession = nullptr;
+  Ort::SessionOptions sessionOptions;
+
+  // Input & Output specifications of the loaded network
+  std::vector<std::string> mInputNames;
+  std::vector<std::vector<int64_t>> mInputShapes;
+  std::vector<std::string> mOutputNames;
+  std::vector<std::vector<int64_t>> mOutputShapes;
+
+  // Environment settings
+  std::string modelPath;
+  int activeThreads = 0;
+
+  // Internal function for printing the shape of tensors
+  std::string printShape(const std::vector<int64_t>&);
+};
+
+} // namespace ml
+
+} // namespace o2
+
+#endif // GPU_ML_ONNX_INTERFACE_H
\ No newline at end of file
diff --git a/Common/ML/src/onnx_interface.cxx b/Common/ML/src/onnx_interface.cxx
new file mode 100644
index 0000000000000..e7c952d6b8cdc
--- /dev/null
+++ b/Common/ML/src/onnx_interface.cxx
@@ -0,0 +1,184 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+///
+/// \file model.cxx
+///
+/// \author Christian Sonnabend
+///
+/// \brief A general-purpose class with functions for ONNX model applications
+///
+
+// ONNX includes
+#include "ML/onnx_interface.h"
+
+namespace o2
+{
+
+namespace ml
+{
+
+std::string OnnxModel::printShape(const std::vector<int64_t>& v)
+{
+  std::stringstream ss("");
+  for (size_t i = 0; i < v.size() - 1; i++)
+    ss << v[i] << "x";
+  ss << v[v.size() - 1];
+  return ss.str();
+}
+
+void OnnxModel::init(std::string localPath, bool enableOptimizations, int threads)
+{
+
+  LOG(info) << "--- ONNX-ML model ---";
+  LOG(info) << "Taking model from: " << localPath;
+  modelPath = localPath;
+  activeThreads = threads;
+
+  /// Enabling optimizations
+  if (threads != 0) {
+    // sessionOptions.SetInterOpNumThreads(1);
+    if (threads == 1) {
+      sessionOptions.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);
+    } else {
+      sessionOptions.SetExecutionMode(ExecutionMode::ORT_PARALLEL);
+      sessionOptions.SetIntraOpNumThreads(threads);
+    }
+  }
+  if (enableOptimizations) {
+    // sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
+    sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
+    // uint32_t coreml_flags = 0;
+    // coreml_flags |= COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE;
+    // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CoreML(sessionOptions, coreml_flags));
+  }
+
+  mEnv = std::make_shared<Ort::Env>(ORT_LOGGING_LEVEL_WARNING, "onnx-model");
+  mSession = std::make_shared<Ort::Experimental::Session>(*mEnv, modelPath, sessionOptions);
+
+  mInputNames = mSession->GetInputNames();
+  mInputShapes = mSession->GetInputShapes();
+  mOutputNames = mSession->GetOutputNames();
+  mOutputShapes = mSession->GetOutputShapes();
+
+  LOG(info) << "Input Nodes:";
+  for (size_t i = 0; i < mInputNames.size(); i++) {
+    LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]);
+  }
+
+  LOG(info) << "Output Nodes:";
+  for (size_t i = 0; i < mOutputNames.size(); i++) {
+    LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]);
+  }
+
+  LOG(info) << "--- Model initialized! ---";
+}
+
+// float* OnnxModel::inference(std::vector<Ort::Value> input, int device_id)
+// {
+
+//   // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sessionOptions, device_id));
+
+//   try {
+//     auto outputTensors = mSession->Run(mInputNames, input, mOutputNames);
+//     float* outputValues = outputTensors[0].GetTensorMutableData<float>();
+//     return outputValues;
+//   } catch (const Ort::Exception& exception) {
+//     LOG(error) << "Error running model inference: " << exception.what();
+//   }
+//   return nullptr;
+// }
+
+// float* OnnxModel::inference(std::vector<float> input, int device_id)
+// {
+//
+//   // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sessionOptions, device_id));
+//
+//   int64_t size = input.size();
+//   assert(size % mInputShapes[0][1] == 0);
+//   std::vector<int64_t> inputShape{size / mInputShapes[0][1], mInputShapes[0][1]};
+//   std::vector<Ort::Value> inputTensors;
+//   inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor<float>(input.data(), size, inputShape));
+//   try {
+//     auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames);
+//     float* outputValues = outputTensors[0].GetTensorMutableData<float>();
+//     return outputValues;
+//   } catch (const Ort::Exception& exception) {
+//     LOG(error) << "Error running model inference: " << exception.what();
+//   }
+//   return nullptr;
+// }
+
+template <class T>
+float* OnnxModel::inference(T input, unsigned int size)
+{
+
+  std::vector<int64_t> inputShape = mInputShapes[0];
+  inputShape[0] = size;
+  std::vector<Ort::Value> inputTensors;
+  size_t mem_size = 1;
+  for (auto elem : inputShape) {
+    mem_size *= elem;
+  }
+  inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor<float>(input.data(), mem_size, inputShape));
+  // LOG(info) << "Input tensors created, memory size: " << mem_size * sizeof(float) / 1e6 << "MB";
+  try {
+    auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames);
+    float* outputValues = outputTensors[0].GetTensorMutableData<float>();
+    return outputValues;
+  } catch (const Ort::Exception& exception) {
+    LOG(error) << "Error running model inference: " << exception.what();
+  }
+  return nullptr;
+}
+
+template <class T>
+std::vector<float> OnnxModel::inference_vector(T input, unsigned int size)
+{
+
+  std::vector<int64_t> inputShape = mInputShapes[0];
+  inputShape[0] = size;
+  std::vector<Ort::Value> inputTensors;
+  // std::vector<float> outputValues;
+  size_t mem_size = 1;
+  for (auto elem : inputShape) {
+    mem_size *= elem;
+  }
+  inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor<float>(input.data(), mem_size, inputShape));
+  // LOG(info) << "Input tensors created, memory size: " << mem_size * sizeof(float) / 1e6 << "MB";
+  try {
+    auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames);
+    float* outputValues = outputTensors[0].GetTensorMutableData<float>();
+    std::vector<float> outputVector{outputValues, outputValues + size * mOutputShapes[0][1]};
+    // for (int s = 0; s < size; s++) {
+    //   for (int o = 0; o < mOutputShapes[0][1]; o++) {
+    //     outputValues.push_back(tmp_output_values[s * (int)mOutputShapes[0][1] + o]);
+    //   }
+    // }
+    return outputVector;
+  } catch (const Ort::Exception& exception) {
+    LOG(error) << "Error running model inference: " << exception.what();
+  }
+  return std::vector<float>{};
+}
+
+void OnnxModel::setActiveThreads(int threads)
+{
+  activeThreads = threads;
+}
+
+template float* OnnxModel::inference(std::vector<float>, unsigned int);
+template std::vector<float> OnnxModel::inference_vector(std::vector<float>, unsigned int);
+
+} // namespace ml
+
+} // namespace o2
\ No newline at end of file
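A minimal usage sketch of the OnnxModel interface above (the model path is a
placeholder, and a 2-D [batch, features] input layer is assumed, which is what
the inference helpers above expect):

    #include "ML/onnx_interface.h"

    void runOnnxExample()
    {
      o2::ml::OnnxModel model;
      model.init("/path/to/net_onnx.onnx", true, 1); // optimizations on, single-threaded
      // One batch entry with as many values as the model's input layer has features
      std::vector<float> input(model.getNumInputNodes()[0][1], 0.f);
      // Returns size * output_features values, or an empty vector on failure
      std::vector<float> output = model.inference_vector(input, 1);
      LOG(info) << "Example inference produced " << output.size() << " values";
    }

diff --git a/GPU/GPUTracking/CMakeLists.txt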
b/GPU/GPUTracking/CMakeLists.txt index 6266d4962b88e..63abf760bf87a 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -62,6 +62,7 @@ set(SRCS Merger/GPUTPCGlobalDebugSortKernels.cxx Merger/GPUTPCGMPhysicalTrackModel.cxx Merger/GPUTPCGMPolynomialFieldManager.cxx + ML/onnx_interface.cxx DataTypes/GPUTRDTrack.cxx TRDTracking/GPUTRDTracker.cxx TRDTracking/GPUTRDTrackletWord.cxx @@ -195,6 +196,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2" OR CONFIG_O2_EXTENSIONS) TPCClusterFinder/GPUTPCCFPeakFinder.cxx TPCClusterFinder/GPUTPCCFNoiseSuppression.cxx TPCClusterFinder/GPUTPCCFClusterizer.cxx + TPCClusterFinder/GPUTPCNNClusterizer.cxx TPCClusterFinder/GPUTPCCFDeconvolution.cxx TPCClusterFinder/GPUTPCCFMCLabelFlattener.cxx TPCClusterFinder/GPUTPCCFDecodeZS.cxx @@ -306,6 +308,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") PUBLIC_LINK_LIBRARIES O2::GPUCommon O2::ReconstructionDataFormats O2::TPCFastTransformation + ONNXRuntime::ONNXRuntime PRIVATE_LINK_LIBRARIES O2::DataFormatsTPC SOURCES ${SRCS_DATATYPE_HEADERS}) target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAVE_O2HEADERS) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 7b2c5539439be..cca00ed3a1d02 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -18,6 +18,7 @@ #include "GPUO2DataTypes.h" #include "GPUMemorySizeScalers.h" #include "GPUTrackingInputProvider.h" +#include "GPUTPCNNClusterizer.h" #include #ifdef GPUCA_O2_LIB @@ -880,7 +881,9 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput, bool applyNNclus runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); } else { // FIXME: Here I need to apply the neural network - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + // runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + GPUCA_NAMESPACE::gpu::GPUTPCNNClusterizer nn_clus; + nn_clus.exec({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); } } if (GetProcessingSettings().debugLevel >= 3) { diff --git a/GPU/GPUTracking/ML/onnx_interface.cxx b/GPU/GPUTracking/ML/onnx_interface.cxx new file mode 100644 index 0000000000000..e7c952d6b8cdc --- /dev/null +++ b/GPU/GPUTracking/ML/onnx_interface.cxx @@ -0,0 +1,184 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+ +/// +/// \file model.cxx +/// +/// \author Christian Sonnabend +/// +/// \brief A general-purpose class with functions for ONNX model applications +/// + +// ONNX includes +#include "ML/onnx_interface.h" + +namespace o2 +{ + +namespace ml +{ + +std::string OnnxModel::printShape(const std::vector& v) +{ + std::stringstream ss(""); + for (size_t i = 0; i < v.size() - 1; i++) + ss << v[i] << "x"; + ss << v[v.size() - 1]; + return ss.str(); +} + +void OnnxModel::init(std::string localPath, bool enableOptimizations, int threads) +{ + + LOG(info) << "--- ONNX-ML model ---"; + LOG(info) << "Taking model from: " << localPath; + modelPath = localPath; + activeThreads = threads; + + /// Enableing optimizations + if(threads != 0){ + // sessionOptions.SetInterOpNumThreads(1); + if(threads == 1){ + sessionOptions.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + } + else{ + sessionOptions.SetExecutionMode(ExecutionMode::ORT_PARALLEL); + sessionOptions.SetIntraOpNumThreads(threads); + } + } + if (enableOptimizations) { + // sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED); + sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); + // uint32_t coreml_flags = 0; + // coreml_flags |= COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE; + // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CoreML(sessionOptions, coreml_flags)); + } + + mEnv = std::make_shared(ORT_LOGGING_LEVEL_WARNING, "onnx-model"); + mSession = std::make_shared(*mEnv, modelPath, sessionOptions); + + mInputNames = mSession->GetInputNames(); + mInputShapes = mSession->GetInputShapes(); + mOutputNames = mSession->GetOutputNames(); + mOutputShapes = mSession->GetOutputShapes(); + + LOG(info) << "Input Nodes:"; + for (size_t i = 0; i < mInputNames.size(); i++) { + LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); + } + + LOG(info) << "Output Nodes:"; + for (size_t i = 0; i < mOutputNames.size(); i++) { + LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); + } + + LOG(info) << "--- Model initialized! 
---"; +} + +// float* OnnxModel::inference(std::vector input, int device_id) +// { + +// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sessionOptions, device_id)); + +// try { +// auto outputTensors = mSession->Run(mInputNames, input, mOutputNames); +// float* outputValues = outputTensors[0].GetTensorMutableData(); +// return outputValues; +// } catch (const Ort::Exception& exception) { +// LOG(error) << "Error running model inference: " << exception.what(); +// } +// return nullptr; +// } + +// float* OnnxModel::inference(std::vector input, int device_id) +// { +// +// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sessionOptions, device_id)); +// +// int64_t size = input.size(); +// assert(size % mInputShapes[0][1] == 0); +// std::vector inputShape{size / mInputShapes[0][1], mInputShapes[0][1]}; +// std::vector inputTensors; +// inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), size, inputShape)); +// try { +// auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); +// float* outputValues = outputTensors[0].GetTensorMutableData(); +// return outputValues; +// } catch (const Ort::Exception& exception) { +// LOG(error) << "Error running model inference: " << exception.what(); +// } +// return nullptr; +// } + +template +float* OnnxModel::inference(T input, unsigned int size) +{ + + std::vector inputShape = mInputShapes[0]; + inputShape[0] = size; + std::vector inputTensors; + size_t mem_size = 1; + for(auto elem : inputShape){ + mem_size*=elem; + } + inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); + // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; + try { + auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return outputValues; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } + return nullptr; +} + +template +std::vector OnnxModel::inference_vector(T input, unsigned int size) +{ + + std::vector inputShape = mInputShapes[0]; + inputShape[0] = size; + std::vector inputTensors; + // std::vector outputValues; + size_t mem_size = 1; + for(auto elem : inputShape){ + mem_size*=elem; + } + inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); + // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; + try { + auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); + float* outputValues = outputTensors[0].GetTensorMutableData(); + std::vector outputVector{outputValues, outputValues + size * mOutputShapes[0][1]}; + // for(int s = 0; s < size; s++){ + // for(int o = 0; o < mOutputShapes[0][1]; o++){ + // outputValues.push_back(tmp_output_values[s*(int)mOutputShapes[0][1] + o]); + // } + // } + return outputVector; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } + return std::vector{}; +} + +void OnnxModel::setActiveThreads(int threads) +{ + activeThreads = threads; +} + +template float* OnnxModel::inference(std::vector, unsigned int); +template std::vector OnnxModel::inference_vector(std::vector, unsigned int); + +} // namespace gpu + +} // namespace GPUCA_NAMESPACE \ No newline at end of file diff --git a/GPU/GPUTracking/ML/onnx_interface.h 
b/GPU/GPUTracking/ML/onnx_interface.h new file mode 100644 index 0000000000000..506311c067351 --- /dev/null +++ b/GPU/GPUTracking/ML/onnx_interface.h @@ -0,0 +1,88 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// +/// \file model.h +/// +/// \author Christian Sonnabend +/// +/// \brief A general-purpose class for ONNX models +/// + +#ifndef GPU_ML_ONNX_INTERFACE_H +#define GPU_ML_ONNX_INTERFACE_H + +// C++ and system includes +#include +#include +#include +#include +#include +#include + +// O2 includes +#include "Framework/Logger.h" + +namespace o2 +{ + +namespace ml +{ + +class OnnxModel +{ + + public: + OnnxModel() = default; + ~OnnxModel() = default; + + // Inferencing + void init(std::string, bool = false, int = 0); + // float* inference(std::vector, int = 0); + // float* inference(std::vector, int = 0); + template float* inference(T input, unsigned int size); + template std::vector inference_vector(T input, unsigned int size); + + // Reset session + void resetSession() { mSession.reset(new Ort::Experimental::Session{*mEnv, modelPath, sessionOptions}); } + + // Getters & Setters + Ort::SessionOptions* getSessionOptions() { return &sessionOptions; } // For optimizations in post + std::shared_ptr getSession() { return mSession; } + std::vector> getNumInputNodes() const { return mInputShapes; } + std::vector> getNumOutputNodes() const { return mOutputShapes; } + void setActiveThreads(int); + + private: + // Environment variables for the ONNX runtime + std::shared_ptr mEnv = nullptr; + std::shared_ptr mSession = nullptr; + Ort::SessionOptions sessionOptions; + + // Input & Output specifications of the loaded network + std::vector mInputNames; + std::vector> mInputShapes; + std::vector mOutputNames; + std::vector> mOutputShapes; + + // Environment settings + std::string modelPath; + int activeThreads = 0; + + // Internal function for printing the shape of tensors + std::string printShape(const std::vector&); +}; + +} // namespace gpu + +} // namespace GPUCA_NAMESPACE + +#endif // GPU_ML_ONNX_INTERFACE_H \ No newline at end of file diff --git a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h index 3958f6d3aa137..344a0fae3995f 100644 --- a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h +++ b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h @@ -43,6 +43,23 @@ class ClusterAccumulator GPUd() void finalize(const ChargePos&, tpccf::Charge, tpccf::TPCTime, const GPUTPCGeometry&); GPUd() bool toNative(const ChargePos&, tpccf::Charge, tpc::ClusterNative&, const GPUParam&) const; + GPUd() void setFull(float qtot, float padMean, float padSigma, float timeMean, float timeSigma, uchar splitInTime, uchar splitInPad){ + mQtot = qtot; + mPadMean = padMean; + mPadSigma = padSigma; + mTimeMean = timeMean; + mTimeSigma = timeSigma; + mSplitInTime = splitInTime; + mSplitInPad = splitInPad; + } + GPUd() void setQtot(float qtot) { mQtot = qtot; } + GPUd() void setPadMean(float padMean) { mPadMean = padMean; } + GPUd() void 
setPadSigma(float padSigma) { mPadSigma = padSigma; }
+  GPUd() void setTimeMean(float timeMean) { mTimeMean = timeMean; }
+  GPUd() void setTimeSigma(float timeSigma) { mTimeSigma = timeSigma; }
+  GPUd() void setSplitInTime(uchar splitInTime) { mSplitInTime = splitInTime; }
+  GPUd() void setSplitInPad(uchar splitInPad) { mSplitInPad = splitInPad; }
+
  private:
   float mQtot = 0;
   float mPadMean = 0;

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
index 3097d3adecb3d..6c64c54ca5193 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
@@ -37,6 +37,139 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlo
   GPUTPCNNClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
 }
 
+void GPUTPCNNClusterizer::exec(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char onlyMC)
+{
+  Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
+  CPU_ONLY(
+    MCLabelAccumulator labelAcc(clusterer));
+
+  tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
+
+  OnnxModel model_class, model_reg;
+  std::string path_class = "", path_reg = "";
+
+  model_class.init(path_class, 1, 0);
+  model_reg.init(path_reg, 1, 0);
+
+  GPUTPCNNClusterizer::nn_clusterizer(model_class, model_reg, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, 1, 0.16, 1);
+}
+
+int GPUTPCNNClusterizer::padOffset(int row_ref, int row_current)
+{
+  return (int)((pad_row_max[row_ref] - pad_row_max[row_current]) / 2);
+}
+
+// ---------------------------------
+bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift)
+{
+  if (row < 0 || pad < 0) {
+    return true;
+  } else if (row <= 62) {
+    if (pad < (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] - pad_row_max[row]) / 2 || pad > (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] + pad_row_max[row]) / 2) {
+      return true;
+    } else {
+      return false;
+    }
+  } else if (row <= 62 + global_shift) {
+    return true;
+  } else if (row <= o2::tpc::constants::MAXGLOBALPADROW-1 + global_shift) {
+    if (pad < (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] - pad_row_max[row - global_shift]) / 2 || pad > (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] + pad_row_max[row - global_shift]) / 2) {
+      return true;
+    } else {
+      return false;
+    }
+  } else if (row > o2::tpc::constants::MAXGLOBALPADROW-1 + global_shift) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+void GPUTPCNNClusterizer::nn_clusterizer(OnnxModel model_class, OnnxModel model_reg,
+                                         processorType& clusterer,
+                                         const CfFragment& fragment,
+                                         GPUSharedMemory& smem,
+                                         const Array2D<PackedCharge>& chargeMap,
+                                         const ChargePos* filteredPeakPositions,
+                                         const GPUSettingsRec& calib,
+                                         MCLabelAccumulator* labelAcc,
+                                         uint clusternum,
+                                         uint maxClusterPerRow,
+                                         uint* clusterInRow,
+                                         tpc::ClusterNative* clusterByRow,
+                                         uint* clusterPosInRow,
+                                         int in_row, int in_pad, int in_time, bool add_index_data, float class_threshold, bool sigmoid_transform)
+{
+  std::vector<float> input_data(clusterer.mPmemory->counters.nClusters * (2 * in_row + 1) * (2 * in_pad + 1) * (2 * in_time + 1));
+  float classification_threshold = class_threshold;
+  if (sigmoid_transform) {
+    // Comparing the raw network output against logit(p) = log(p / (1 - p)) is
+    // equivalent to comparing sigmoid(output) against p, so the sigmoid layer
+    // can be skipped during inference.
+    classification_threshold = (float)std::log(class_threshold / (1.f - class_threshold));
+  }
+
+  for (uint cls = 0; cls < clusterer.mPmemory->counters.nClusters; cls++) {
+    ChargePos peak = clusterer.mPfilteredPeakPositions[cls];
+    int row = peak.row(), pad = peak.pad(), time = peak.time();
+    float central_charge = chargeMap[peak].unpack();
+    unsigned int glo_idx = cls * (2 * in_row + 1) * (2 * in_pad + 1) * (2 * in_time + 1);
+    for (int r = -in_row; r <= in_row; r++) {
+      for (int p = -in_pad; p <= in_pad; p++) {
+        for (int t = -in_time; t <= in_time; t++) {
+          int offset = padOffset(row, row + r);
+          if (isBoundary(row + r, pad + p + offset, in_row)) {
+            continue;
+          } else {
+            // Index the window relative to its own corner, not with absolute coordinates
+            unsigned int idx = glo_idx + (r + in_row) * (2 * in_pad + 1) * (2 * in_time + 1) + (p + in_pad) * (2 * in_time + 1) + (t + in_time);
+            ChargePos tmp_pos(row + r, pad + p + offset, time + t);
+            input_data[idx] = (chargeMap[tmp_pos].unpack() / central_charge);
+          }
+        }
+      }
+    }
+  }
+  std::vector<float> out_class = model_class.inference_vector(input_data, clusterer.mPmemory->counters.nClusters);
+  std::vector<float> out_reg = model_reg.inference_vector(input_data, clusterer.mPmemory->counters.nClusters);
+  int num_outputs = model_reg.getNumOutputNodes()[0][1];
+
+  for (uint cls = 0; cls < clusterer.mPmemory->counters.nClusters; cls++) {
+    if (out_class[cls] > classification_threshold) {
+      int idx = cls * num_outputs;
+      ChargePos peak = clusterer.mPfilteredPeakPositions[cls];
+      ClusterAccumulator pc;
+      // Regression outputs: pad offset, time offset, pad sigma, time sigma, qTot scaling
+      pc.setFull(chargeMap[peak].unpack() * out_reg[idx + 4], peak.pad() + out_reg[idx], out_reg[idx + 2], peak.time() + out_reg[idx + 1], out_reg[idx + 3], 0, 0);
+      tpc::ClusterNative myCluster;
+      bool rejectCluster = !pc.toNative(peak, chargeMap[peak].unpack(), myCluster, clusterer.Param());
+      if (rejectCluster) {
+        if (clusterPosInRow) {
+          clusterPosInRow[cls] = maxClusterPerRow;
+        }
+        continue;
+      }
+
+      uint rowIndex = 0;
+      if (clusterByRow != nullptr) {
+        rowIndex = sortIntoBuckets(
+          clusterer,
+          myCluster,
+          peak.row(),
+          maxClusterPerRow,
+          clusterInRow,
+          clusterByRow);
+        if (clusterPosInRow != nullptr) {
+          clusterPosInRow[cls] = rowIndex;
+        }
+      } else if (clusterPosInRow) {
+        rowIndex = clusterPosInRow[cls];
+      }
+
+      CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, maxClusterPerRow));
+    }
+  }
+}
+
+
 GPUdii() void GPUTPCNNClusterizer::computeClustersImpl(int nBlocks, int nThreads, int iBlock, int iThread,
                                                        processorType& clusterer,
                                                        const CfFragment& fragment,

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
index f2b92c5f50d40..56ffcbc842223 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
@@ -21,6 +21,9 @@
 #include "GPUTPCClusterFinder.h"
 #include "Array2D.h"
 #include "PackedCharge.h"
+#include "ML/onnx_interface.h"
+
+using namespace o2::ml;
 
 namespace o2::tpc
 {
@@ -61,7 +64,30 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate
 
   static GPUd() void computeClustersImpl(int, int, int, int, processorType&, const CfFragment&, GPUSharedMemory&, const Array2D<PackedCharge>&, const ChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint, uint, uint*, tpc::ClusterNative*, uint*);
 
+  void exec(int, int, int, int, GPUSharedMemory&, processorType&, char);
+  int padOffset(int, int);
+  bool isBoundary(int, int, int);
+  void nn_clusterizer(OnnxModel, OnnxModel,
+                      processorType&,
+                      const CfFragment&,
+                      GPUSharedMemory&,
+                      const Array2D<PackedCharge>&,
+                      const ChargePos*,
+                      const GPUSettingsRec&,
+                      MCLabelAccumulator*,
+                      uint,
+                      uint,
+                      uint*,
+                      tpc::ClusterNative*,
+                      uint*,
+                      int = 3, int = 3, int = 3, bool = true, float = 0.16f, bool = true);
+
  private:
+  // ---------------------------------
+  // Maximum pad number per pad row (TPC pad-row geometry), used to center the
+  // input window across rows of different width.
+  std::vector<int> pad_row_max{
+    65, 65, 65, 67, 67, 67, 69, 69, 69, 71, 71, 71, 73, 73, 73, 73, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 81, 81, 81, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 91, 91, 91, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 99, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 79, 81, 81, 81, 83, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 101, 101, 101, 103, 103, 103, 105, 109, 109, 111, 111, 111, 113, 113, 113, 115, 115, 115, 117, 117, 117, 117, 117, 119, 119, 121, 121, 123, 123, 123, 125, 125, 127, 127, 127, 129, 129, 131, 131, 131, 133, 133, 135, 135, 137, 137};
+
   static GPUd() void updateClusterInner(const GPUSettingsRec&, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uchar*);
 
   static GPUd() void updateClusterOuter(ushort, ushort, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*);

From 3f6c934987d68cce26ca1c63c07dc2038be3850b Mon Sep 17 00:00:00 2001
From: Christian Sonnabend
Date: Wed, 29 May 2024 11:38:33 +0200
Subject: [PATCH 03/23] Adding a compiling and running version with
 single-threaded ONNX model executions. Clusters are not getting published yet
 (FIXME)

---
 GPU/GPUTracking/CMakeLists.txt                |   2 +-
 .../Definitions/GPUDefGPUParameters.h         |   6 +
 .../Global/GPUChainTrackingClusterizer.cxx    |  16 +-
 .../TPCClusterFinder/GPUTPCClusterFinder.h    |   5 +
 .../TPCClusterFinder/GPUTPCNNClusterizer.cxx  | 161 +++++++++++-------
 .../TPCClusterFinder/GPUTPCNNClusterizer.h    |  22 +--
 GPU/GPUTracking/kernels.cmake                 |   1 +
 7 files changed, 133 insertions(+), 80 deletions(-)

diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt
index 63abf760bf87a..8b3a37894810c 100644
--- a/GPU/GPUTracking/CMakeLists.txt
+++ b/GPU/GPUTracking/CMakeLists.txt
@@ -195,8 +195,8 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2" OR CONFIG_O2_EXTENSIONS)
         TPCClusterFinder/GPUTPCCFChargeMapFiller.cxx
         TPCClusterFinder/GPUTPCCFPeakFinder.cxx
         TPCClusterFinder/GPUTPCCFNoiseSuppression.cxx
-        TPCClusterFinder/GPUTPCCFClusterizer.cxx
         TPCClusterFinder/GPUTPCNNClusterizer.cxx
+        TPCClusterFinder/GPUTPCCFClusterizer.cxx
         TPCClusterFinder/GPUTPCCFDeconvolution.cxx
         TPCClusterFinder/GPUTPCCFMCLabelFlattener.cxx
         TPCClusterFinder/GPUTPCCFDecodeZS.cxx

diff --git a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
index 4bb8303ee9a96..d8eba2a9ad384 100644
--- a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
+++ b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
@@ -79,6 +79,7 @@
   #define GPUCA_LB_GPUTPCCFNoiseSuppression 512
   #define GPUCA_LB_GPUTPCCFDeconvolution 512
   #define GPUCA_LB_GPUTPCCFClusterizer 448
+  #define GPUCA_LB_GPUTPCNNClusterizer 448
   #define GPUCA_LB_COMPRESSION_GATHER 1024
   #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5
   #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
@@ -143,6 +144,7 @@
   #define GPUCA_LB_GPUTPCCFNoiseSuppression 512
   #define GPUCA_LB_GPUTPCCFDeconvolution 512
   #define GPUCA_LB_GPUTPCCFClusterizer 512
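+  // Launch bounds for the new NN clusterizer kernel, mirroring the
+  // GPUTPCCFClusterizer value of this parameter set. Note that
+  // GPUTPCNNClusterizer.h still derives its SCRATCH_PAD_WORK_GROUP_SIZE from
+  // GPUCA_LB_GPUTPCCFClusterizer, so the two defines have to stay in sync
+  // until the NN kernel gets its own tuned value.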
+ #define GPUCA_LB_GPUTPCNNClusterizer 512 #define GPUCA_LB_COMPRESSION_GATHER 1024 #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5 #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20 @@ -207,6 +209,7 @@ #define GPUCA_LB_GPUTPCCFNoiseSuppression 448 #define GPUCA_LB_GPUTPCCFDeconvolution 384 #define GPUCA_LB_GPUTPCCFClusterizer 448 + #define GPUCA_LB_GPUTPCNNClusterizer 448 #define GPUCA_LB_COMPRESSION_GATHER 1024 #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4 #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20 @@ -475,6 +478,9 @@ #ifndef GPUCA_LB_GPUTPCCFClusterizer #define GPUCA_LB_GPUTPCCFClusterizer 512 #endif + #ifndef GPUCA_LB_GPUTPCNNClusterizer + #define GPUCA_LB_GPUTPCNNClusterizer 512 + #endif #ifndef GPUCA_LB_GPUTrackingRefitKernel_mode0asGPU #define GPUCA_LB_GPUTrackingRefitKernel_mode0asGPU 256 #endif diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 8a6a899f35a1a..26878e6111bd5 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -18,7 +18,6 @@ #include "GPUO2DataTypes.h" #include "GPUMemorySizeScalers.h" #include "GPUTrackingInputProvider.h" -#include "GPUTPCNNClusterizer.h" #include #ifdef GPUCA_O2_LIB @@ -875,7 +874,15 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput, bool applyNNclus runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSlice}}, 0); + if(doGPU){ + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSlice}}, 0); + } else { + std::string path_class = "/lustre/alice/users/csonnab/PhD/jobs/clusterization/NN/output/normalized_qCenter/o2sim_150324_50Ev_10000QED_PbPb_13t7p/classification/3D_FCNN_1cls_03_04_2024_10M_FP16_addIndex/network/net_onnx.onnx", path_reg = "/lustre/alice/users/csonnab/PhD/jobs/clusterization/NN/output/normalized_qCenter/o2sim_150324_50Ev_10000QED_PbPb_13t7p/regression/3D_FCNN_1cls_05_04_2024_10M_FP16_addIndex/network/net_onnx.onnx"; + clusterer.model_class.init(path_class, 1, 1); + clusterer.model_reg.init(path_reg, 1, 1); + + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + } if (doGPU && propagateMCLabels) { TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane); if (doGPU) { @@ -886,8 +893,9 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput, bool applyNNclus } else { // FIXME: Here I need to apply the neural network // runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); - GPUCA_NAMESPACE::gpu::GPUTPCNNClusterizer nn_clus; - nn_clus.exec({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + // GPUTPCNNClusterizer nn_clus; + // nn_clus.exec({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); } } if (GetProcessingSettings().debugLevel >= 3) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index ca89053797a47..ae40ff780b25a 
100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -19,6 +19,9 @@ #include "GPUProcessor.h" #include "GPUDataTypes.h" #include "CfFragment.h" +#include "ML/onnx_interface.h" + +using namespace o2::ml; namespace o2 { @@ -141,6 +144,8 @@ class GPUTPCClusterFinder : public GPUProcessor short mZSOffsetId = -1; short mOutputId = -1; + OnnxModel model_class, model_reg; + #ifndef GPUCA_GPUCODE void DumpDigits(std::ostream& out); void DumpChargeMap(std::ostream& out, std::string_view); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 6c64c54ca5193..7c19802825eb6 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -34,10 +34,14 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlo tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; - GPUTPCNNClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); + GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, true, 0.16, true); + + // tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; +// + // GPUTPCNNClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); } -void GPUTPCNNClusterizer::exec(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char onlyMC) +GPUd() void GPUTPCNNClusterizer::exec(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char onlyMC) { Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY( @@ -45,27 +49,37 @@ void GPUTPCNNClusterizer::exec(int nBlocks, int nThreads, int iBlock, int iThrea tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; - OnnxModel model_class, model_reg; std::string path_class = "", path_reg = ""; - model_class.init(path_class, 1, 0); - model_reg.init(path_reg, 1, 0); + clusterer.model_class.init(path_class, 1, 0); + clusterer.model_reg.init(path_reg, 1, 0); - GPUTPCNNClusterizer::nn_clusterizer(model_class, model_reg, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, 1, 0.16, 1); + GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, true, 0.16, true); } int GPUTPCNNClusterizer::padOffset(int row_ref, int row_current) { + std::vector pad_row_max{ + 65, 65, 65, 67, 67, 67, 69, 69, 69, 71, 71, 71, 73, 73, 73, 73, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 81, 81, 81, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 91, 91, 91, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 99, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 79, 81, 81, 81, 83, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 101, 101, 101, 103, 103, 103, 105, 109, 109, 111, 111, 111, 113, 113, 113, 115, 115, 115, 117, 117, 117, 117, 117, 119, 119, 121, 121, 123, 123, 123, 125, 125, 127, 127, 127, 129, 129, 131, 131, 131, 133, 133, 135, 135, 137, 137 + }; return (int)((pad_row_max[row_ref] - pad_row_max[row_current]) / 2); } // --------------------------------- bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift) { + std::vector pad_row_max{ + 65, 65, 65, 67, 67, 67, 69, 69, 69, 71, 71, 71, 73, 73, 73, 73, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 81, 81, 81, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 91, 91, 91, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 99, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 79, 81, 81, 81, 83, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 101, 101, 101, 103, 103, 103, 105, 109, 109, 111, 111, 111, 113, 113, 113, 115, 115, 115, 117, 117, 117, 117, 117, 119, 119, 121, 121, 123, 123, 123, 125, 125, 127, 127, 127, 129, 129, 131, 131, 131, 133, 133, 135, 135, 137, 137 + }; if (row < 0 || pad < 0) { return true; } else if (row <= 62) { - if (pad < (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] - pad_row_max[row]) / 2 || pad > (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] + pad_row_max[row]) / 2) { + // if (pad < (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] - pad_row_max[row]) / 2 || pad > (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] + pad_row_max[row]) / 2) { + // return true; + // } else { + // return false; + // } + if (pad < 0 || pad > pad_row_max[row]) { return true; } else { return false; @@ -73,7 +87,12 @@ bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift) } else if (row <= 62 + global_shift) { return true; } else if (row <= o2::tpc::constants::MAXGLOBALPADROW-1 + global_shift) { - if (pad < (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] - pad_row_max[row - global_shift]) / 2 || pad > 
(pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] + pad_row_max[row - global_shift]) / 2) { + //if (pad < (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] - pad_row_max[row - global_shift]) / 2 || pad > (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] + pad_row_max[row - global_shift]) / 2) { + // return true; + //} else { + // return false; + //} + if (pad < 0 || pad > pad_row_max[row]) { return true; } else { return false; @@ -85,7 +104,7 @@ bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift) } } -void GPUTPCNNClusterizer::nn_clusterizer(OnnxModel model_class, OnnxModel model_reg, +GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int iBlock, int iThread, processorType& clusterer, const CfFragment& fragment, GPUSharedMemory& smem, @@ -97,73 +116,93 @@ void GPUTPCNNClusterizer::nn_clusterizer(OnnxModel model_class, OnnxModel model_ uint maxClusterPerRow, uint* clusterInRow, tpc::ClusterNative* clusterByRow, - uint* clusterPosInRow + uint* clusterPosInRow, int in_row, int in_pad, int in_time, bool add_index_data, float class_threshold, bool sigmoid_transform){ - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - std::vector input_data(clusterer.mPmemory->counters.nClusters * (2*in_row + 1) * (2*in_pad + 1) * (2*in_time + 1)); + std::vector input_data(((2*in_row + 1) * (2*in_pad + 1) * (2*in_time + 1) + (add_index_data ? 3 : 0)), -1.f); float classification_threshold = class_threshold; if(sigmoid_transform){ classification_threshold = (float)std::log(class_threshold/(1.f-class_threshold)); } - for(float cls = 0; cls < clusterer.mPmemory->counters.nClusters; cls++){ - ChargePos peak = clusterer.mPfilteredPeakPositions[cls]; - int row = peak.row(), pad = peak.pad(), time = peak.time(); - float central_charge = chargeMap[peak].unpack(); - unsigned int glo_idx = cls * ((2*in_row + 1) + (2*in_pad + 1) * (2*in_time + 1)); - for(int r = -in_row; r <= in_row; r++){ - for(int p = -in_pad; p <= in_pad; p++){ - for(int t = -in_time; t <= in_time; t++){ - int offset = padOffset(row, row + r); - if(isBoundary(row + r, pad + p + offset)){ - continue; - } else { - unsigned int idx = glo_idx + (row + r) * (2*in_pad + 1) * (2*in_time + 1) + (pad + p) * (2*in_time + 1) + (time + t); - ChargePos tmp_pos(row + r, pad + p + offset, time + t); - input_data[idx] = (chargeMap[tmp_pos].unpack() / central_charge); - } + uint idx = get_global_id(0); + uint cls = CAMath::Min(idx, clusternum - 1); + + // For certain configurations dummy work items are added, so the total + // number of work items is dividable by 64. + // These dummy items also compute the last cluster but discard the result. 
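// A minimal sketch of the two conventions used just above, in isolation. The
// names logitThreshold, kLaneMultiple and paddedItems are illustrative, not
// identifiers from this patch; CAMath::Min plays the role of std::min on the
// device.
#include <cmath>

// The sigmoid_transform branch moves the acceptance cut into the network's raw
// output space: sigmoid(x) > p is equivalent to x > log(p / (1 - p)), so no
// sigmoid has to be evaluated per cluster. E.g. p = 0.16 gives
// log(0.16 / 0.84), roughly -1.66.
inline float logitThreshold(float p)
{
  return std::log(p / (1.f - p));
}

// The index clamp below pads the launch size up to a multiple of the
// work-group size (64) and lets the surplus lanes recompute the last cluster,
// whose result is then discarded, instead of branching out early.
constexpr unsigned int kLaneMultiple = 64;
inline unsigned int paddedItems(unsigned int nClusters)
{
  return ((nClusters + kLaneMultiple - 1) / kLaneMultiple) * kLaneMultiple; // e.g. 1000 -> 1024
}
// In-kernel equivalent: unsigned int cls = std::min(idx, nClusters - 1);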
+ + ChargePos peak = clusterer.mPfilteredPeakPositions[cls]; + int row = peak.row(), pad = peak.pad(), time = peak.time(); + float central_charge = chargeMap[peak].unpack(); + CPU_ONLY(labelAcc->collect(peak, central_charge)); + // unsigned int glo_idx = cls * ((2*in_row + 1) + (2*in_pad + 1) * (2*in_time + 1)); + unsigned int write_idx = 0; + for(int r = -in_row; r <= in_row; r++){ + for(int p = -in_pad; p <= in_pad; p++){ + for(int t = -in_time; t <= in_time; t++){ + int offset = GPUTPCNNClusterizer::padOffset(row, row + r); + if(GPUTPCNNClusterizer::isBoundary(row + r, pad + p + offset, in_row)){ + continue; + } else { + // unsigned int loc_idx = (row + r) * (2*in_pad + 1) * (2*in_time + 1) + (pad + p) * (2*in_time + 1) + (time + t); + ChargePos tmp_pos(row + r, pad + p + offset, time + t); + input_data[write_idx] = (chargeMap[tmp_pos].unpack() / central_charge); + write_idx++; } } + if(idx == 100){ + LOG(info) << "[" << input_data[write_idx-7] << ", " << input_data[write_idx-6] << ", " << input_data[write_idx-5] << ", " << input_data[write_idx-4] << ", " << input_data[write_idx-3] << ", " << input_data[write_idx-2] << ", " << input_data[write_idx-1] << "]"; + } + } + } + if(add_index_data){ + input_data[input_data.size()-3] = 1; + input_data[input_data.size()-2] = (float)peak.row() / 152.f; + input_data[input_data.size()-1] = (float)peak.pad() / 138.f; + if(idx == 100){ + LOG(info) << "[" << input_data[input_data.size()-3] << ", " << input_data[input_data.size()-2] << ", " << input_data[input_data.size()-1] << "]"; } } - std::vector out_class = model_class.inference_vector(input_data, clusterer.mPmemory->counters.nClusters); - std::vector out_reg = model_reg.inference_vector(input_data, clusterer.mPmemory->counters.nClusters); - int num_outputs = model_reg.getNumOutputNodes()[0][1]; - - for(int cls = 0; cls < clusterer.mPmemory->counters.nClusters; cls++){ - if(out_class > classification_threshold){ - int idx = cls * num_outputs; - ChargePos peak = clusterer.mPfilteredPeakPositions[cls]; - ClusterAccumulator pc; - pc.setFull(chargeMap[peak].unpack() * out_reg[idx + 4], peak.pad() + out_reg[idx], out_reg[idx + 2], peak.time() + out_reg[idx + 1], out_reg[idx + 3], 0, 0); - tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(pos, chargeMap[peak].unpack(), myCluster, clusterer.Param()); - if (rejectCluster) { - if (clusterPosInRow) { - clusterPosInRow[idx] = maxClusterPerRow; - } - return; - } - uint rowIndex = 0; - if (clusterByRow != nullptr) { - rowIndex = sortIntoBuckets( - clusterer, - myCluster, - pos.row(), - maxClusterPerRow, - clusterInRow, - clusterByRow); - if (clusterPosInRow != nullptr) { - clusterPosInRow[idx] = rowIndex; - } - } else if (clusterPosInRow) { - rowIndex = clusterPosInRow[idx]; + std::vector out_class = clusterer.model_class.inference_vector(input_data, 1); + std::vector out_reg = clusterer.model_reg.inference_vector(input_data, 1); + int num_outputs = clusterer.model_reg.getNumOutputNodes()[0][1]; + + if(idx == 100){ + LOG(info) << "Classification model: " << out_class[0]; + LOG(info) << "Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4]; + } + + if(out_class[0] > classification_threshold){ + ClusterAccumulator pc; + pc.setFull(chargeMap[peak].unpack() * out_reg[4], peak.pad() + out_reg[0], out_reg[2], fragment.start + peak.time() + out_reg[1], out_reg[3], 0, 0); + tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(peak, chargeMap[peak].unpack(), myCluster, 
clusterer.Param()); + if (rejectCluster) { + if (clusterPosInRow) { + clusterPosInRow[idx] = maxClusterPerRow; } + return; + } - CPU_ONLY(labelAcc->commit(pos.row(), rowIndex, maxClusterPerRow)); + uint rowIndex = 0; + if (clusterByRow != nullptr) { + rowIndex = sortIntoBuckets( + clusterer, + myCluster, + peak.row(), + maxClusterPerRow, + clusterInRow, + clusterByRow); + if (clusterPosInRow != nullptr) { + clusterPosInRow[idx] = rowIndex; + } + } else if (clusterPosInRow) { + rowIndex = clusterPosInRow[idx]; } + + CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, maxClusterPerRow)); } } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 56ffcbc842223..905e6f860a90f 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -12,8 +12,8 @@ /// \file GPUTPCNNClusterizer.h /// \author Christian Sonnabend -#ifndef O2_GPU_CLUSTERIZER_H -#define O2_GPU_CLUSTERIZER_H +#ifndef O2_GPU_NN_CLUSTERIZER_H +#define O2_GPU_NN_CLUSTERIZER_H #include "clusterFinderDefs.h" #include "GPUGeneralKernels.h" @@ -21,9 +21,6 @@ #include "GPUTPCClusterFinder.h" #include "Array2D.h" #include "PackedCharge.h" -#include "ML/onnx_interface.h" - -using namespace o2::ml; namespace o2::tpc { @@ -39,7 +36,7 @@ class MCLabelAccumulator; class GPUTPCNNClusterizer : public GPUKernelTemplate { public: - static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFClusterizer); + static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizer); struct GPUSharedMemory { ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_BUILD_N]; @@ -64,10 +61,10 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate static GPUd() void computeClustersImpl(int, int, int, int, processorType&, const CfFragment&, GPUSharedMemory&, const Array2D&, const ChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint, uint, uint*, tpc::ClusterNative*, uint*); - void exec(int, int, int, int, GPUSharedMemory&, processorType&, char); - int padOffset(int); - bool isBoundary(int, int, int); - static void nn_clusterizer(OnnxModel, OnnxModel, + static GPUd() void exec(int, int, int, int, GPUSharedMemory&, processorType&, char); + static int padOffset(int, int); + static bool isBoundary(int, int, int); + static GPUd() void nn_clusterizer(int, int, int, int, processorType&, const CfFragment&, GPUSharedMemory&, @@ -80,13 +77,10 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate uint*, tpc::ClusterNative*, uint*, - int = 3, int = 3, int = 3, bool = true); + int = 3, int = 3, int = 3, bool = 1, float = 0.16, bool = true); private: // --------------------------------- - std::vector pad_row_max{ - 65, 65, 65, 67, 67, 67, 69, 69, 69, 71, 71, 71, 73, 73, 73, 73, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 81, 81, 81, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 91, 91, 91, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 99, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 79, 81, 81, 81, 83, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 101, 101, 101, 103, 103, 103, 105, 109, 109, 111, 111, 111, 113, 113, 113, 115, 115, 115, 117, 117, 117, 117, 117, 119, 119, 121, 121, 123, 123, 123, 125, 125, 127, 127, 127, 129, 129, 131, 131, 131, 133, 133, 135, 135, 137, 137 - }; static GPUd() void 
updateClusterInner(const GPUSettingsRec&, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uchar*); diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index d4f5ca93e9def..b0270511c2249 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -117,6 +117,7 @@ o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression" "= TPCCLUS o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks" "= TPCCLUSTERFINDER" LB single) o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB single) o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUSTERFINDER" LB single char onlyMC) +o2_gpu_add_kernel("GPUTPCNNClusterizer" "= TPCCLUSTERFINDER" LB single char onlyMC) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets" "= TPCCLUSTERFINDER" NO single) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, flatten" "= TPCCLUSTERFINDER" NO single GPUTPCLinearLabels* out) o2_gpu_add_kernel("GPUTPCCFStreamCompaction, scanStart" "= TPCCLUSTERFINDER" LB single int iBuf int stage) From 8ba6805ebd889bebf4b11972170570bdd99892cf Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 29 May 2024 21:11:04 +0200 Subject: [PATCH 04/23] Clusters now working by a hack --- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 7c19802825eb6..afee680bc0ceb 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -32,7 +32,7 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlo CPU_ONLY( MCLabelAccumulator labelAcc(clusterer)); - tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow; // (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, true, 0.16, true); @@ -142,7 +142,7 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i for(int p = -in_pad; p <= in_pad; p++){ for(int t = -in_time; t <= in_time; t++){ int offset = GPUTPCNNClusterizer::padOffset(row, row + r); - if(GPUTPCNNClusterizer::isBoundary(row + r, pad + p + offset, in_row)){ + if(GPUTPCNNClusterizer::isBoundary(row + r, pad + p, in_row)){ continue; } else { // unsigned int loc_idx = (row + r) * (2*in_pad + 1) * (2*in_time + 1) + (pad + p) * (2*in_time + 1) + (time + t); @@ -151,18 +151,18 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i write_idx++; } } - if(idx == 100){ - LOG(info) << "[" << input_data[write_idx-7] << ", " << input_data[write_idx-6] << ", " << input_data[write_idx-5] << ", " << input_data[write_idx-4] << ", " << input_data[write_idx-3] << ", " << input_data[write_idx-2] << ", " << input_data[write_idx-1] << "]"; - } + // if(idx == 100){ + // LOG(info) << "[" << input_data[write_idx-7] << ", " << input_data[write_idx-6] << ", " << input_data[write_idx-5] << ", " << input_data[write_idx-4] << ", " << input_data[write_idx-3] << ", " << input_data[write_idx-2] << ", " << input_data[write_idx-1] << "]"; + // } } } if(add_index_data){ input_data[input_data.size()-3] = 1; input_data[input_data.size()-2] = (float)peak.row() / 152.f; input_data[input_data.size()-1] = (float)peak.pad() / 138.f; - if(idx == 100){ - LOG(info) << "[" << input_data[input_data.size()-3] << ", " << input_data[input_data.size()-2] << ", " << input_data[input_data.size()-1] << "]"; - } + // if(idx == 100){ + // LOG(info) << "[" << input_data[input_data.size()-3] << ", " << input_data[input_data.size()-2] << ", " << input_data[input_data.size()-1] << "]"; + // } } std::vector out_class = clusterer.model_class.inference_vector(input_data, 1); @@ -170,16 +170,17 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i int num_outputs = clusterer.model_reg.getNumOutputNodes()[0][1]; if(idx == 100){ - LOG(info) << "Classification model: " << out_class[0]; + LOG(info) << "Classification model: " << out_class[0] << " (>? 
" << classification_threshold << ")"; LOG(info) << "Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4]; } if(out_class[0] > classification_threshold){ ClusterAccumulator pc; - pc.setFull(chargeMap[peak].unpack() * out_reg[4], peak.pad() + out_reg[0], out_reg[2], fragment.start + peak.time() + out_reg[1], out_reg[3], 0, 0); + pc.setFull(central_charge * out_reg[4], peak.pad() + out_reg[0], out_reg[2], fragment.start + peak.time() + out_reg[1], out_reg[3], 0, 0); tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(peak, chargeMap[peak].unpack(), myCluster, clusterer.Param()); + bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param()); if (rejectCluster) { + LOG(warning) << "Cluster rejected!"; if (clusterPosInRow) { clusterPosInRow[idx] = maxClusterPerRow; } From 6ec3c46d37e82b2f37f648ff3750d14f8d72f5b1 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 6 Jun 2024 17:49:38 +0200 Subject: [PATCH 05/23] Working implementation of settings via GPUSettings.h and --configKeyValues "GPU_proc.[setting]=...;..." --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 9 ++++++ GPU/GPUTracking/Global/GPUChainTracking.cxx | 2 +- GPU/GPUTracking/Global/GPUChainTracking.h | 2 +- .../Global/GPUChainTrackingClusterizer.cxx | 28 +++++++++++-------- .../TPCClusterFinder/GPUTPCClusterFinder.h | 7 +++++ .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 2 +- 6 files changed, 35 insertions(+), 15 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 777ea1e70b0d8..b3f38c6ab81d2 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -280,6 +280,15 @@ AddOption(tpcDownscaledEdx, unsigned char, 0, "", 0, "If != 0, downscale dEdx pr AddOption(tpcMaxAttachedClustersPerSectorRow, unsigned int, 51000, "", 0, "Maximum number of TPC attached clusters which can be decoded per SectorRow") AddOption(tpcUseOldCPUDecoding, bool, false, "", 0, "Enable old CPU-based TPC decoding") AddOption(RTCcacheFolder, std::string, "./rtccache/", "", 0, "Folder in which the cache file is stored") +AddOption(applyNNclusterizer, int, 0, "", 0, "(Bool, default = 0), if the neural network clsuterizer should be used.") +AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path") +AddOption(nnRegressionPath, std::string, "./network_reg.onnx", "", 0, "The regression network path") +AddOption(nnClassThreshold, float, 0.16, "", 0, "The cutoff at which clusters will be accepted / rejected.") +AddOption(nnSigmoidTrafoThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. 
This depends on how the network was trained (with a sigmoid as activation function in the last layer or not).")
+AddOption(nnAddIndexData, int, 1, "", 0, "If normalized index data (sector, row, pad) should be appended to the input")
+AddOption(nnSizeInputRow, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)")
+AddOption(nnSizeInputPad, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)")
+AddOption(nnSizeInputTime, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)")
 AddVariable(eventDisplay, GPUCA_NAMESPACE::gpu::GPUDisplayFrontendInterface*, nullptr)
 AddSubConfig(GPUSettingsProcessingRTC, rtc)
 AddSubConfig(GPUSettingsProcessingParam, param)
diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx
index 68615f47d05db..7a202c852b895 100644
--- a/GPU/GPUTracking/Global/GPUChainTracking.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx
@@ -722,7 +722,7 @@ int GPUChainTracking::RunChain()
       return 1;
     }
   } else if (mIOPtrs.tpcPackedDigits || mIOPtrs.tpcZS) {
-    if (runRecoStep(RecoStep::TPCClusterFinding, &GPUChainTracking::RunTPCClusterizer, false, true)) { // FIXME: This enables the neural network clusterization -> Need to actually set this as configurable
+    if (runRecoStep(RecoStep::TPCClusterFinding, &GPUChainTracking::RunTPCClusterizer, false)) { // FIXME: This enables the neural network clusterization -> Need to actually set this as configurable
       return 1;
     }
   }
diff --git a/GPU/GPUTracking/Global/GPUChainTracking.h
index 032ad0524ccff..89f2ecd10f65f 100644
--- a/GPU/GPUTracking/Global/GPUChainTracking.h
+++ b/GPU/GPUTracking/Global/GPUChainTracking.h
@@ -161,7 +161,7 @@ class GPUChainTracking : public GPUChain, GPUReconstructionHelpers::helperDelega
   void SetQAFromForeignChain(GPUChainTracking* chain) { mQAFromForeignChain = chain; }
   // Processing functions
-  int RunTPCClusterizer(bool synchronizeOutput = true, bool applyNNclusterizer = false);
+  int RunTPCClusterizer(bool synchronizeOutput = true);
   int ForwardTPCDigits();
   int RunTPCTrackingSlices();
   int RunTPCTrackingMerger(bool synchronizeOutput = true);
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index 26878e6111bd5..6ed3406646abb 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -566,7 +566,7 @@ int GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers)
 #endif
 // TODO: Clusterizer not working with OCL1 (Clusterizer on CPU, Tracking on GPU)
-int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput, bool applyNNclusterizer)
+int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 {
   if (param().rec.fwdTPCDigitsAsClusters) {
     return ForwardTPCDigits();
@@ -837,7 +837,7 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput, bool applyNNclus
         if (clusterer.mPmemory->counters.nPeaks == 0) {
           continue;
         }
-        if(!applyNNclusterizer){
+        if(!GetProcessingSettings().applyNNclusterizer){
           runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
           runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
         } else {
@@ -877,25 +877,29 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput, bool applyNNclus
         if(doGPU){
           runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSlice}}, 0);
         } else {
-          std::string path_class = "
"/lustre/alice/users/csonnab/PhD/jobs/clusterization/NN/output/normalized_qCenter/o2sim_150324_50Ev_10000QED_PbPb_13t7p/classification/3D_FCNN_1cls_03_04_2024_10M_FP16_addIndex/network/net_onnx.onnx", path_reg = "/lustre/alice/users/csonnab/PhD/jobs/clusterization/NN/output/normalized_qCenter/o2sim_150324_50Ev_10000QED_PbPb_13t7p/regression/3D_FCNN_1cls_05_04_2024_10M_FP16_addIndex/network/net_onnx.onnx"; - clusterer.model_class.init(path_class, 1, 1); - clusterer.model_reg.init(path_reg, 1, 1); - - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + if(GetProcessingSettings().applyNNclusterizer){ + clusterer.model_class.init(GetProcessingSettings().nnClassificationPath, 1, 1); + clusterer.model_reg.init(GetProcessingSettings().nnRegressionPath, 1, 1); + clusterer.nnSizeInputRow = GetProcessingSettings().nnSizeInputRow; + clusterer.nnSizeInputPad = GetProcessingSettings().nnSizeInputPad; + clusterer.nnSizeInputTime = GetProcessingSettings().nnSizeInputTime; + clusterer.nnAddIndexData = GetProcessingSettings().nnAddIndexData; + clusterer.nnClassThreshold = GetProcessingSettings().nnClassThreshold; + clusterer.nnSigmoidTrafoThreshold = GetProcessingSettings().nnSigmoidTrafoThreshold; + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + } else { + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + } } if (doGPU && propagateMCLabels) { TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane); if (doGPU) { SynchronizeStream(lane); } - if(!applyNNclusterizer){ + if(!GetProcessingSettings().applyNNclusterizer){ runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); } else { - // FIXME: Here I need to apply the neural network - // runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); - // GPUTPCNNClusterizer nn_clus; - // nn_clus.exec({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); } } if (GetProcessingSettings().debugLevel >= 3) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index ae40ff780b25a..a449eb23ef426 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -144,6 +144,13 @@ class GPUTPCClusterFinder : public GPUProcessor short mZSOffsetId = -1; short mOutputId = -1; + int nnSizeInputRow = 3; + int nnSizeInputPad = 3; + int nnSizeInputTime = 3; + bool nnAddIndexData = true; + float nnClassThreshold = 0.16; + bool nnSigmoidTrafoThreshold = 1; + OnnxModel model_class, model_reg; #ifndef GPUCA_GPUCODE diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index afee680bc0ceb..d2656531c6df1 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -34,7 +34,7 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlo tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow; // (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; - GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, true, 0.16, true); + GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, clusterer.nnSizeInputRow, clusterer.nnSizeInputPad, clusterer.nnSizeInputTime, clusterer.nnAddIndexData, clusterer.nnClassThreshold, clusterer.nnSigmoidTrafoThreshold); // tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; // From ab4653a78a470e740478ed719f24bfed0b8fc0cb Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 24 Jun 2024 08:08:01 +0200 Subject: [PATCH 06/23] Modifying the onnx_interface to include the right headers --- Common/ML/include/ML/onnx_interface.h | 6 +++++- GPU/GPUTracking/ML/onnx_interface.h | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/Common/ML/include/ML/onnx_interface.h b/Common/ML/include/ML/onnx_interface.h index 506311c067351..d3676b7a3f87a 100644 --- a/Common/ML/include/ML/onnx_interface.h +++ b/Common/ML/include/ML/onnx_interface.h @@ -21,7 +21,11 @@ #define GPU_ML_ONNX_INTERFACE_H // C++ and system includes -#include +#if __has_include() +#include +#else +#include +#endif #include #include #include diff --git a/GPU/GPUTracking/ML/onnx_interface.h b/GPU/GPUTracking/ML/onnx_interface.h index 506311c067351..d3676b7a3f87a 100644 --- a/GPU/GPUTracking/ML/onnx_interface.h +++ b/GPU/GPUTracking/ML/onnx_interface.h @@ -21,7 +21,11 @@ #define GPU_ML_ONNX_INTERFACE_H // C++ and system includes -#include +#if __has_include() +#include +#else +#include +#endif #include #include #include From 04084c8fd1ea9be525a7368afe5567112d4549cc Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 24 Jun 2024 13:21:09 +0200 Subject: [PATCH 07/23] Adjusting initialization for new ONNXRuntime version --- Common/ML/include/ML/onnx_interface.h | 26 ++++--- Common/ML/src/onnx_interface.cxx | 98 +++++++++++++++++++-------- GPU/GPUTracking/ML/onnx_interface.cxx | 98 +++++++++++++++++++-------- GPU/GPUTracking/ML/onnx_interface.h | 19 ++++-- 4 files changed, 171 insertions(+), 70 deletions(-) diff --git a/Common/ML/include/ML/onnx_interface.h b/Common/ML/include/ML/onnx_interface.h index d3676b7a3f87a..fcc02a49996ea 100644 --- a/Common/ML/include/ML/onnx_interface.h +++ b/Common/ML/include/ML/onnx_interface.h @@ -17,8 +17,8 @@ /// \brief A general-purpose class for ONNX models /// -#ifndef GPU_ML_ONNX_INTERFACE_H -#define GPU_ML_ONNX_INTERFACE_H +#ifndef COMMON_ML_ONNX_INTERFACE_H +#define COMMON_ML_ONNX_INTERFACE_H // C++ and system includes #if __has_include() @@ -43,10 +43,9 @@ namespace ml class OnnxModel { - public: - OnnxModel() = default; - ~OnnxModel() = default; + OnnxModel() : mMemoryInfo(Ort::MemoryInfo::CreateCpu(OrtAllocatorType, OrtMemType)) {}; + virtual ~OnnxModel() = default; // Inferencing void init(std::string, bool = false, int = 0); @@ -56,11 +55,19 @@ class OnnxModel template std::vector inference_vector(T input, unsigned int size); // Reset session 
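// A minimal, self-contained sketch of the __has_include() dispatch these two
// patches introduce. The header paths and the ModelSession alias are
// assumptions for illustration, following ONNXRuntime's usual layout; they are
// not part of the O2 code. Older ONNXRuntime releases ship an experimental C++
// API (std::string-based Run(), shape helpers); newer ones provide only the
// stable API, which needs the manual name and shape handling seen in the later
// hunks.
#if __has_include(<onnxruntime/core/session/experimental_onnxruntime_cxx_api.h>)
#include <onnxruntime/core/session/experimental_onnxruntime_cxx_api.h>
using ModelSession = Ort::Experimental::Session;
#else
#include <onnxruntime_cxx_api.h>
using ModelSession = Ort::Session;
#endif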
- void resetSession() { mSession.reset(new Ort::Experimental::Session{*mEnv, modelPath, sessionOptions}); } + #if __has_include() + void resetSession() { mSession.reset(new Ort::Experimental::Session{*mEnv, modelPath, sessionOptions}); }; + #else + void resetSession() { mSession.reset(new Ort::Session{*mEnv, modelPath.c_str(), sessionOptions}); }; + #endif // Getters & Setters Ort::SessionOptions* getSessionOptions() { return &sessionOptions; } // For optimizations in post - std::shared_ptr getSession() { return mSession; } + #if __has_include() + std::shared_ptr getSession() { return mSession; } + #else + std::shared_ptr getSession() { return mSession; } + #endif std::vector> getNumInputNodes() const { return mInputShapes; } std::vector> getNumOutputNodes() const { return mOutputShapes; } void setActiveThreads(int); @@ -68,7 +75,8 @@ class OnnxModel private: // Environment variables for the ONNX runtime std::shared_ptr mEnv = nullptr; - std::shared_ptr mSession = nullptr; + std::shared_ptr mSession = nullptr; ///< ONNX session + Ort::MemoryInfo mMemoryInfo; Ort::SessionOptions sessionOptions; // Input & Output specifications of the loaded network @@ -89,4 +97,4 @@ class OnnxModel } // namespace GPUCA_NAMESPACE -#endif // GPU_ML_ONNX_INTERFACE_H \ No newline at end of file +#endif // COMMON_ML_ONNX_INTERFACE_H \ No newline at end of file diff --git a/Common/ML/src/onnx_interface.cxx b/Common/ML/src/onnx_interface.cxx index e7c952d6b8cdc..549575600a656 100644 --- a/Common/ML/src/onnx_interface.cxx +++ b/Common/ML/src/onnx_interface.cxx @@ -43,6 +43,11 @@ void OnnxModel::init(std::string localPath, bool enableOptimizations, int thread modelPath = localPath; activeThreads = threads; +#if __has_include() +#else + mMemoryInfo = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault); +#endif + /// Enableing optimizations if(threads != 0){ // sessionOptions.SetInterOpNumThreads(1); @@ -63,12 +68,28 @@ void OnnxModel::init(std::string localPath, bool enableOptimizations, int thread } mEnv = std::make_shared(ORT_LOGGING_LEVEL_WARNING, "onnx-model"); - mSession = std::make_shared(*mEnv, modelPath, sessionOptions); - - mInputNames = mSession->GetInputNames(); - mInputShapes = mSession->GetInputShapes(); - mOutputNames = mSession->GetOutputNames(); - mOutputShapes = mSession->GetOutputShapes(); + #if __has_include() + mSession = std::make_shared(*mEnv, modelPath, sessionOptions); + mInputNames = mSession->GetInputNames(); + mInputShapes = mSession->GetInputShapes(); + mOutputNames = mSession->GetOutputNames(); + mOutputShapes = mSession->GetOutputShapes(); + #else + mSession = std::make_shared(*mEnv, modelPath.c_str(), sessionOptions); + Ort::AllocatorWithDefaultOptions tmpAllocator; + for (size_t i = 0; i < mSession->GetInputCount(); ++i) { + mInputNames.push_back(mSession->GetInputNameAllocated(i, tmpAllocator).get()); + } + for (size_t i = 0; i < mSession->GetInputCount(); ++i) { + mInputShapes.emplace_back(mSession->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + for (size_t i = 0; i < mSession->GetOutputCount(); ++i) { + mOutputNames.push_back(mSession->GetOutputNameAllocated(i, tmpAllocator).get()); + } + for (size_t i = 0; i < mSession->GetOutputCount(); ++i) { + mOutputShapes.emplace_back(mSession->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + #endif LOG(info) << "Input Nodes:"; for (size_t i = 0; i < mInputNames.size(); i++) { @@ -121,7 +142,6 @@ void OnnxModel::init(std::string localPath, bool 
enableOptimizations, int thread template float* OnnxModel::inference(T input, unsigned int size) { - std::vector inputShape = mInputShapes[0]; inputShape[0] = size; std::vector inputTensors; @@ -129,22 +149,36 @@ float* OnnxModel::inference(T input, unsigned int size) for(auto elem : inputShape){ mem_size*=elem; } +#if __has_include() inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); - // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return outputValues; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } + auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); + inputTensors.clear(); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return outputValues; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } +#else + std::vector tmpInputs; + std::vector tmpOutputs; + inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), 1)); + try { + auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); + inputTensors.clear(); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return outputValues; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } +#endif + // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; return nullptr; } template std::vector OnnxModel::inference_vector(T input, unsigned int size) { - std::vector inputShape = mInputShapes[0]; inputShape[0] = size; std::vector inputTensors; @@ -153,21 +187,29 @@ std::vector OnnxModel::inference_vector(T input, unsigned int size) for(auto elem : inputShape){ mem_size*=elem; } +#if __has_include() inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); - // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - float* outputValues = outputTensors[0].GetTensorMutableData(); - std::vector outputVector{outputValues, outputValues + size * mOutputShapes[0][1]}; - // for(int s = 0; s < size; s++){ - // for(int o = 0; o < mOutputShapes[0][1]; o++){ - // outputValues.push_back(tmp_output_values[s*(int)mOutputShapes[0][1] + o]); - // } - // } - return outputVector; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } + auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); + inputTensors.clear(); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } +#else + std::vector tmpInputs; + std::vector tmpOutputs; + inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), 1)); + try { + auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, 
tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); + inputTensors.clear(); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } +#endif return std::vector{}; } diff --git a/GPU/GPUTracking/ML/onnx_interface.cxx b/GPU/GPUTracking/ML/onnx_interface.cxx index e7c952d6b8cdc..549575600a656 100644 --- a/GPU/GPUTracking/ML/onnx_interface.cxx +++ b/GPU/GPUTracking/ML/onnx_interface.cxx @@ -43,6 +43,11 @@ void OnnxModel::init(std::string localPath, bool enableOptimizations, int thread modelPath = localPath; activeThreads = threads; +#if __has_include() +#else + mMemoryInfo = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault); +#endif + /// Enableing optimizations if(threads != 0){ // sessionOptions.SetInterOpNumThreads(1); @@ -63,12 +68,28 @@ void OnnxModel::init(std::string localPath, bool enableOptimizations, int thread } mEnv = std::make_shared(ORT_LOGGING_LEVEL_WARNING, "onnx-model"); - mSession = std::make_shared(*mEnv, modelPath, sessionOptions); - - mInputNames = mSession->GetInputNames(); - mInputShapes = mSession->GetInputShapes(); - mOutputNames = mSession->GetOutputNames(); - mOutputShapes = mSession->GetOutputShapes(); + #if __has_include() + mSession = std::make_shared(*mEnv, modelPath, sessionOptions); + mInputNames = mSession->GetInputNames(); + mInputShapes = mSession->GetInputShapes(); + mOutputNames = mSession->GetOutputNames(); + mOutputShapes = mSession->GetOutputShapes(); + #else + mSession = std::make_shared(*mEnv, modelPath.c_str(), sessionOptions); + Ort::AllocatorWithDefaultOptions tmpAllocator; + for (size_t i = 0; i < mSession->GetInputCount(); ++i) { + mInputNames.push_back(mSession->GetInputNameAllocated(i, tmpAllocator).get()); + } + for (size_t i = 0; i < mSession->GetInputCount(); ++i) { + mInputShapes.emplace_back(mSession->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + for (size_t i = 0; i < mSession->GetOutputCount(); ++i) { + mOutputNames.push_back(mSession->GetOutputNameAllocated(i, tmpAllocator).get()); + } + for (size_t i = 0; i < mSession->GetOutputCount(); ++i) { + mOutputShapes.emplace_back(mSession->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + #endif LOG(info) << "Input Nodes:"; for (size_t i = 0; i < mInputNames.size(); i++) { @@ -121,7 +142,6 @@ void OnnxModel::init(std::string localPath, bool enableOptimizations, int thread template float* OnnxModel::inference(T input, unsigned int size) { - std::vector inputShape = mInputShapes[0]; inputShape[0] = size; std::vector inputTensors; @@ -129,22 +149,36 @@ float* OnnxModel::inference(T input, unsigned int size) for(auto elem : inputShape){ mem_size*=elem; } +#if __has_include() inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); - // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return outputValues; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } + auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); + inputTensors.clear(); + 
float* outputValues = outputTensors[0].GetTensorMutableData(); + return outputValues; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } +#else + std::vector tmpInputs; + std::vector tmpOutputs; + inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), 1)); + try { + auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); + inputTensors.clear(); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return outputValues; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } +#endif + // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; return nullptr; } template std::vector OnnxModel::inference_vector(T input, unsigned int size) { - std::vector inputShape = mInputShapes[0]; inputShape[0] = size; std::vector inputTensors; @@ -153,21 +187,29 @@ std::vector OnnxModel::inference_vector(T input, unsigned int size) for(auto elem : inputShape){ mem_size*=elem; } +#if __has_include() inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); - // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - float* outputValues = outputTensors[0].GetTensorMutableData(); - std::vector outputVector{outputValues, outputValues + size * mOutputShapes[0][1]}; - // for(int s = 0; s < size; s++){ - // for(int o = 0; o < mOutputShapes[0][1]; o++){ - // outputValues.push_back(tmp_output_values[s*(int)mOutputShapes[0][1] + o]); - // } - // } - return outputVector; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } + auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); + inputTensors.clear(); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } +#else + std::vector tmpInputs; + std::vector tmpOutputs; + inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), 1)); + try { + auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); + inputTensors.clear(); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } +#endif return std::vector{}; } diff --git a/GPU/GPUTracking/ML/onnx_interface.h b/GPU/GPUTracking/ML/onnx_interface.h index d3676b7a3f87a..5ee2bd716d257 100644 --- a/GPU/GPUTracking/ML/onnx_interface.h +++ b/GPU/GPUTracking/ML/onnx_interface.h @@ -45,8 +45,8 @@ class OnnxModel { public: - OnnxModel() = default; - ~OnnxModel() = default; + OnnxModel(OrtAllocatorType allocatorType = OrtDeviceAllocator, OrtMemType memoryType = OrtMemTypeCPU) : mMemoryInfo(Ort::MemoryInfo::CreateCpu(allocatorType, memoryType)) {}; + virtual ~OnnxModel() = default; 
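// A sketch of what the stable-API code path of inference_vector() above boils
// down to: Ort::Session::Run() takes const char* arrays, so the std::string
// node names collected at init() time are re-marshalled on each call. All
// parameter names here are illustrative; outPerSample stands in for
// mOutputShapes[0][1].
#include <onnxruntime_cxx_api.h>
#include <cstdint>
#include <string>
#include <vector>

std::vector<float> runStableApi(Ort::Session& session, Ort::MemoryInfo& memInfo,
                                const std::vector<std::string>& inputNames,
                                const std::vector<std::string>& outputNames,
                                std::vector<float>& input, std::vector<int64_t>& shape,
                                size_t nSamples, size_t outPerSample)
{
  std::vector<const char*> ins, outs;
  for (const auto& n : inputNames) {
    ins.push_back(n.c_str());
  }
  for (const auto& n : outputNames) {
    outs.push_back(n.c_str());
  }
  // Wrap the caller's buffer as a tensor; no copy is made.
  Ort::Value tensor = Ort::Value::CreateTensor<float>(memInfo, input.data(), input.size(),
                                                      shape.data(), shape.size());
  auto results = session.Run(Ort::RunOptions{nullptr}, ins.data(), &tensor, 1,
                             outs.data(), outs.size());
  float* raw = results[0].GetTensorMutableData<float>();
  return std::vector<float>(raw, raw + nSamples * outPerSample);
}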
// Inferencing void init(std::string, bool = false, int = 0); @@ -56,11 +56,19 @@ class OnnxModel template std::vector inference_vector(T input, unsigned int size); // Reset session - void resetSession() { mSession.reset(new Ort::Experimental::Session{*mEnv, modelPath, sessionOptions}); } + #if __has_include() + void resetSession() { mSession.reset(new Ort::Experimental::Session{*mEnv, modelPath, sessionOptions}); }; + #else + void resetSession() { mSession.reset(new Ort::Session{*mEnv, modelPath.c_str(), sessionOptions}); }; + #endif // Getters & Setters Ort::SessionOptions* getSessionOptions() { return &sessionOptions; } // For optimizations in post - std::shared_ptr getSession() { return mSession; } + #if __has_include() + std::shared_ptr getSession() { return mSession; } + #else + std::shared_ptr getSession() { return mSession; } + #endif std::vector> getNumInputNodes() const { return mInputShapes; } std::vector> getNumOutputNodes() const { return mOutputShapes; } void setActiveThreads(int); @@ -68,7 +76,8 @@ class OnnxModel private: // Environment variables for the ONNX runtime std::shared_ptr mEnv = nullptr; - std::shared_ptr mSession = nullptr; + std::shared_ptr mSession = nullptr; ///< ONNX session + Ort::MemoryInfo mMemoryInfo; Ort::SessionOptions sessionOptions; // Input & Output specifications of the loaded network From 01dc4a1bd96f3c6094f1368604dff895754a17d3 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 26 Jun 2024 09:53:36 +0200 Subject: [PATCH 08/23] Adjusting global settings and CF code for several settings --- Common/ML/src/onnx_interface.cxx | 4 +- GPU/GPUTracking/Definitions/GPUSettingsList.h | 1 + .../Global/GPUChainTrackingClusterizer.cxx | 28 +++++------ GPU/GPUTracking/ML/onnx_interface.cxx | 49 +++++++++++-------- GPU/GPUTracking/ML/onnx_interface.h | 2 +- .../TPCClusterFinder/GPUTPCClusterFinder.h | 1 + .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 38 +++++++------- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 2 +- 8 files changed, 67 insertions(+), 58 deletions(-) diff --git a/Common/ML/src/onnx_interface.cxx b/Common/ML/src/onnx_interface.cxx index 549575600a656..c348d4577d47f 100644 --- a/Common/ML/src/onnx_interface.cxx +++ b/Common/ML/src/onnx_interface.cxx @@ -200,9 +200,9 @@ std::vector OnnxModel::inference_vector(T input, unsigned int size) #else std::vector tmpInputs; std::vector tmpOutputs; - inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), 1)); + inputTensors.emplace_back(Ort::Value::CreateTensor(input.data(), mem_size, inputShape)); try { - auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); + auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); inputTensors.clear(); float* outputValues = outputTensors[0].GetTensorMutableData(); return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 5e9d3499eda77..bc42d50d4a88a 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -283,6 +283,7 @@ AddOption(tpcMaxAttachedClustersPerSectorRow, unsigned int, 51000, "", 0, "Maxim AddOption(tpcUseOldCPUDecoding, bool, false, "", 0, "Enable old CPU-based TPC decoding") AddOption(RTCcacheFolder, std::string, "./rtccache/", "", 0, "Folder in which the cache file is stored") 
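// The options below are wired to the command line through --configKeyValues
// with the "GPU_proc." prefix (see the subject of PATCH 05/23); a hypothetical
// invocation enabling the NN clusterizer could look like:
//   --configKeyValues "GPU_proc.applyNNclusterizer=1;GPU_proc.nnClusterizerVerbosity=2;GPU_proc.nnClassThreshold=0.16"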
AddOption(applyNNclusterizer, int, 0, "", 0, "(Bool, default = 0), if the neural network clusterizer should be used.")
+AddOption(nnClusterizerVerbosity, int, 1, "", 0, "0: No messages; 1: Warnings; 2: Warnings + major debugs; >=3: All debugs")
 AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path")
 AddOption(nnRegressionPath, std::string, "./network_reg.onnx", "", 0, "The regression network path")
 AddOption(nnClassThreshold, float, 0.16, "", 0, "The cutoff at which clusters will be accepted / rejected.")
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index 6ed3406646abb..44cd1a5f62f4c 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -874,23 +874,21 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}});
         DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
-        if(doGPU){
-          runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSlice}}, 0);
+        if(GetProcessingSettings().applyNNclusterizer){
+          clusterer.model_class.init(GetProcessingSettings().nnClassificationPath, 1, 1, GetProcessingSettings().nnClusterizerVerbosity);
+          clusterer.model_reg.init(GetProcessingSettings().nnRegressionPath, 1, 1, GetProcessingSettings().nnClusterizerVerbosity);
+          clusterer.nnSizeInputRow = GetProcessingSettings().nnSizeInputRow;
+          clusterer.nnSizeInputPad = GetProcessingSettings().nnSizeInputPad;
+          clusterer.nnSizeInputTime = GetProcessingSettings().nnSizeInputTime;
+          clusterer.nnAddIndexData = GetProcessingSettings().nnAddIndexData;
+          clusterer.nnClassThreshold = GetProcessingSettings().nnClassThreshold;
+          clusterer.nnSigmoidTrafoThreshold = GetProcessingSettings().nnSigmoidTrafoThreshold;
+          clusterer.nnClusterizerVerbosity = GetProcessingSettings().nnClusterizerVerbosity;
+          runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0);
         } else {
-          if(GetProcessingSettings().applyNNclusterizer){
-            clusterer.model_class.init(GetProcessingSettings().nnClassificationPath, 1, 1);
-            clusterer.model_reg.init(GetProcessingSettings().nnRegressionPath, 1, 1);
-            clusterer.nnSizeInputRow = GetProcessingSettings().nnSizeInputRow;
-            clusterer.nnSizeInputPad = GetProcessingSettings().nnSizeInputPad;
-            clusterer.nnSizeInputTime = GetProcessingSettings().nnSizeInputTime;
-            clusterer.nnAddIndexData = GetProcessingSettings().nnAddIndexData;
-            clusterer.nnClassThreshold = GetProcessingSettings().nnClassThreshold;
-            clusterer.nnSigmoidTrafoThreshold = GetProcessingSettings().nnSigmoidTrafoThreshold;
-            runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1);
-          } else {
-            runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1);
-          }
+          runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0);
         }
+
         if (doGPU && propagateMCLabels) {
           TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane);
           if (doGPU) {
diff --git a/GPU/GPUTracking/ML/onnx_interface.cxx
index 549575600a656..9bb5137ec63dd 100644
---
a/GPU/GPUTracking/ML/onnx_interface.cxx +++ b/GPU/GPUTracking/ML/onnx_interface.cxx @@ -35,11 +35,13 @@ std::string OnnxModel::printShape(const std::vector& v) return ss.str(); } -void OnnxModel::init(std::string localPath, bool enableOptimizations, int threads) +void OnnxModel::init(std::string localPath, bool enableOptimizations, int threads, int verbosity) { - LOG(info) << "--- ONNX-ML model ---"; - LOG(info) << "Taking model from: " << localPath; + if(verbosity > 1){ + LOG(info) << "--- ONNX-ML model ---"; + LOG(info) << "Taking model from: " << localPath; + } modelPath = localPath; activeThreads = threads; @@ -91,17 +93,18 @@ void OnnxModel::init(std::string localPath, bool enableOptimizations, int thread } #endif - LOG(info) << "Input Nodes:"; - for (size_t i = 0; i < mInputNames.size(); i++) { - LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); - } + if(verbosity > 1){ + LOG(info) << "Input Nodes:"; + for (size_t i = 0; i < mInputNames.size(); i++) { + LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); + } - LOG(info) << "Output Nodes:"; - for (size_t i = 0; i < mOutputNames.size(); i++) { - LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); + LOG(info) << "Output Nodes:"; + for (size_t i = 0; i < mOutputNames.size(); i++) { + LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); + } + LOG(info) << "--- Model initialized! ---"; } - - LOG(info) << "--- Model initialized! ---"; } // float* OnnxModel::inference(std::vector input, int device_id) @@ -200,15 +203,21 @@ std::vector OnnxModel::inference_vector(T input, unsigned int size) #else std::vector tmpInputs; std::vector tmpOutputs; - inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), 1)); + for (unsigned int i = 0; i < mInputNames.size(); i++) { + tmpInputs.emplace_back(mInputNames[i].c_str()); + } + for (unsigned int i = 0; i < mOutputNames.size(); i++) { + tmpOutputs.emplace_back(mOutputNames[i].c_str()); + } + inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); try { - auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } + auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); + inputTensors.clear(); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } #endif return std::vector{}; } diff --git a/GPU/GPUTracking/ML/onnx_interface.h b/GPU/GPUTracking/ML/onnx_interface.h index 5ee2bd716d257..17c45f439dc63 100644 --- a/GPU/GPUTracking/ML/onnx_interface.h +++ b/GPU/GPUTracking/ML/onnx_interface.h @@ -49,7 +49,7 @@ class OnnxModel virtual ~OnnxModel() = default; // Inferencing - void init(std::string, bool = false, int = 0); + void init(std::string, bool = false, int = 0, int = 0); // float* 
inference(std::vector, int = 0); // float* inference(std::vector, int = 0); template float* inference(T input, unsigned int size); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index a449eb23ef426..aed00623ef167 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -150,6 +150,7 @@ class GPUTPCClusterFinder : public GPUProcessor bool nnAddIndexData = true; float nnClassThreshold = 0.16; bool nnSigmoidTrafoThreshold = 1; + int nnClusterizerVerbosity = 1; OnnxModel model_class, model_reg; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index d2656531c6df1..d7e3226e0d54c 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -34,28 +34,28 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlo tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow; // (onlyMC) ? nullptr : clusterer.mPclusterByRow; - GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, clusterer.nnSizeInputRow, clusterer.nnSizeInputPad, clusterer.nnSizeInputTime, clusterer.nnAddIndexData, clusterer.nnClassThreshold, clusterer.nnSigmoidTrafoThreshold); + GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, clusterer.nnSizeInputRow, clusterer.nnSizeInputPad, clusterer.nnSizeInputTime, clusterer.nnAddIndexData, clusterer.nnClassThreshold, clusterer.nnSigmoidTrafoThreshold, clusterer.nnClusterizerVerbosity); // tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; // // GPUTPCNNClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); } -GPUd() void GPUTPCNNClusterizer::exec(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char onlyMC) -{ - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - CPU_ONLY( - MCLabelAccumulator labelAcc(clusterer)); - - tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; - - std::string path_class = "", path_reg = ""; - - clusterer.model_class.init(path_class, 1, 0); - clusterer.model_reg.init(path_reg, 1, 0); - - GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, true, 0.16, true); -} +// GPUd() void GPUTPCNNClusterizer::exec(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char onlyMC) +// { +// Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); +// CPU_ONLY( +// MCLabelAccumulator labelAcc(clusterer)); +// +// tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; +// +// std::string path_class = "", path_reg = ""; +// +// clusterer.model_class.init(path_class, 1, 0); +// clusterer.model_reg.init(path_reg, 1, 0); +// +// GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, true, 0.16, true); +// } int GPUTPCNNClusterizer::padOffset(int row_ref, int row_current) { @@ -117,7 +117,7 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i uint* clusterInRow, tpc::ClusterNative* clusterByRow, uint* clusterPosInRow, - int in_row, int in_pad, int in_time, bool add_index_data, float class_threshold, bool sigmoid_transform){ + int in_row, int in_pad, int in_time, bool add_index_data, float class_threshold, bool sigmoid_transform, int verbosity){ std::vector input_data(((2*in_row + 1) * (2*in_pad + 1) * (2*in_time + 1) + (add_index_data ? 3 : 0)), -1.f); float classification_threshold = class_threshold; @@ -169,7 +169,7 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i std::vector out_reg = clusterer.model_reg.inference_vector(input_data, 1); int num_outputs = clusterer.model_reg.getNumOutputNodes()[0][1]; - if(idx == 100){ + if((verbosity > 4) && idx == 100){ LOG(info) << "Classification model: " << out_class[0] << " (>? 
" << classification_threshold << ")"; LOG(info) << "Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4]; } @@ -179,7 +179,7 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i pc.setFull(central_charge * out_reg[4], peak.pad() + out_reg[0], out_reg[2], fragment.start + peak.time() + out_reg[1], out_reg[3], 0, 0); tpc::ClusterNative myCluster; bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param()); - if (rejectCluster) { + if ((verbosity > 0) && rejectCluster) { LOG(warning) << "Cluster rejected!"; if (clusterPosInRow) { clusterPosInRow[idx] = maxClusterPerRow; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 905e6f860a90f..7fbf5a806a916 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -77,7 +77,7 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate uint*, tpc::ClusterNative*, uint*, - int = 3, int = 3, int = 3, bool = 1, float = 0.16, bool = true); + int = 3, int = 3, int = 3, bool = 1, float = 0.16, bool = true, int = 1); private: // --------------------------------- From accd7abaac7a2fce98a280ec6e4d8fa2e8eb6254 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 3 Jul 2024 13:39:11 +0200 Subject: [PATCH 09/23] Adding return statement if cluster is rejected --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index d7e3226e0d54c..3c2dadaf660b1 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -202,8 +202,12 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i } else if (clusterPosInRow) { rowIndex = clusterPosInRow[idx]; } - CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, maxClusterPerRow)); + } else { + if (clusterPosInRow) { + clusterPosInRow[idx] = maxClusterPerRow; + } + return; } } From 3473a066755dc4ae23ce7965d7b77cb7d5ffb020 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 4 Jul 2024 14:10:58 +0200 Subject: [PATCH 10/23] Adding some statements back --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 3c2dadaf660b1..98f7cdee72b0c 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -32,7 +32,7 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlo CPU_ONLY( MCLabelAccumulator labelAcc(clusterer)); - tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow; // (onlyMC) ? nullptr : clusterer.mPclusterByRow; + tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, clusterer.nnSizeInputRow, clusterer.nnSizeInputPad, clusterer.nnSizeInputTime, clusterer.nnAddIndexData, clusterer.nnClassThreshold, clusterer.nnSigmoidTrafoThreshold, clusterer.nnClusterizerVerbosity); @@ -210,6 +210,10 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i return; } + if((verbosity > 4) && idx == 100){ + LOG(info) << "Clusterization done!"; + } + } From df21c963bc4cd132eb5eb175160bf5c76e264fe3 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 17 Oct 2024 14:09:20 +0200 Subject: [PATCH 11/23] Update to latest status of gpu clusterization --- Common/CMakeLists.txt | 1 + Common/ML/CMakeLists.txt | 5 +- Common/ML/include/ML/3rdparty/GPUORTFloat16.h | 867 ++++++++++++++++++ Common/ML/include/ML/onnx_interface.h | 100 -- Common/ML/include/ML/ort_interface.h | 94 ++ Common/ML/src/onnx_interface.cxx | 226 ----- Common/ML/src/ort_interface.cxx | 262 ++++++ GPU/GPUTracking/CMakeLists.txt | 3 +- GPU/GPUTracking/Definitions/GPUSettingsList.h | 28 +- GPU/GPUTracking/Global/GPUChainTracking.cxx | 2 +- .../Global/GPUChainTrackingClusterizer.cxx | 67 +- GPU/GPUTracking/ML/onnx_interface.cxx | 235 ----- GPU/GPUTracking/ML/onnx_interface.h | 101 -- GPU/GPUTracking/TPCClusterFinder/ChargePos.h | 1 + .../TPCClusterFinder/GPUTPCClusterFinder.h | 25 +- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 446 ++++++--- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 12 +- 17 files changed, 1651 insertions(+), 824 deletions(-) create mode 100644 Common/ML/include/ML/3rdparty/GPUORTFloat16.h delete mode 100644 Common/ML/include/ML/onnx_interface.h create mode 100644 Common/ML/include/ML/ort_interface.h delete mode 100644 Common/ML/src/onnx_interface.cxx create mode 100644 Common/ML/src/ort_interface.cxx delete mode 100644 GPU/GPUTracking/ML/onnx_interface.cxx delete mode 100644 GPU/GPUTracking/ML/onnx_interface.h diff --git a/Common/CMakeLists.txt b/Common/CMakeLists.txt index f435e269575aa..0b92758e45f43 100644 --- a/Common/CMakeLists.txt +++ b/Common/CMakeLists.txt @@ -16,5 +16,6 @@ add_subdirectory(Types) add_subdirectory(Utils) add_subdirectory(SimConfig) add_subdirectory(DCAFitter) +add_subdirectory(ML) o2_data_file(COPY maps DESTINATION Common) diff --git a/Common/ML/CMakeLists.txt b/Common/ML/CMakeLists.txt index 60a07041da2e0..954d29d6e2793 100644 --- a/Common/ML/CMakeLists.txt +++ b/Common/ML/CMakeLists.txt @@ -10,7 +10,6 @@ # or submit itself to any jurisdiction. o2_add_library(ML - SOURCES src/onnx_interface.cxx + SOURCES src/ort_interface.cxx TARGETVARNAME targetName - PUBLIC_LINK_LIBRARIES O2::Framework ONNXRuntime::ONNXRuntime -) \ No newline at end of file + PRIVATE_LINK_LIBRARIES O2::Framework ONNXRuntime::ONNXRuntime) \ No newline at end of file diff --git a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h new file mode 100644 index 0000000000000..db65328409d3c --- /dev/null +++ b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h @@ -0,0 +1,867 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +// This code was created from: +// - https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_float16.h +// - https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_cxx_api.h + +#include +#include +#include +#include + +namespace o2 +{ + +namespace OrtDataType +{ + +namespace detail +{ + +enum class endian { +#if defined(_WIN32) + little = 0, + big = 1, + native = little, +#elif defined(__GNUC__) || defined(__clang__) + little = __ORDER_LITTLE_ENDIAN__, + big = __ORDER_BIG_ENDIAN__, + native = __BYTE_ORDER__, +#else +#error OrtDataType::detail::endian is not implemented in this environment. +#endif +}; + +static_assert( + endian::native == endian::little || endian::native == endian::big, + "Only little-endian or big-endian native byte orders are supported."); + +} // namespace detail + +/// +/// Shared implementation between public and internal classes. CRTP pattern. +/// +template +struct Float16Impl { + protected: + /// + /// Converts from float to uint16_t float16 representation + /// + /// + /// + constexpr static uint16_t ToUint16Impl(float v) noexcept; + + /// + /// Converts float16 to float + /// + /// float representation of float16 value + float ToFloatImpl() const noexcept; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + uint16_t AbsImpl() const noexcept + { + return static_cast(val & ~kSignMask); + } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + uint16_t NegateImpl() const noexcept + { + return IsNaN() ? val : static_cast(val ^ kSignMask); + } + + public: + // uint16_t special values + static constexpr uint16_t kSignMask = 0x8000U; + static constexpr uint16_t kBiasedExponentMask = 0x7C00U; + static constexpr uint16_t kPositiveInfinityBits = 0x7C00U; + static constexpr uint16_t kNegativeInfinityBits = 0xFC00U; + static constexpr uint16_t kPositiveQNaNBits = 0x7E00U; + static constexpr uint16_t kNegativeQNaNBits = 0xFE00U; + static constexpr uint16_t kEpsilonBits = 0x4170U; + static constexpr uint16_t kMinValueBits = 0xFBFFU; // Minimum normal number + static constexpr uint16_t kMaxValueBits = 0x7BFFU; // Largest normal number + static constexpr uint16_t kOneBits = 0x3C00U; + static constexpr uint16_t kMinusOneBits = 0xBC00U; + + uint16_t val{0}; + + Float16Impl() = default; + + /// + /// Checks if the value is negative + /// + /// true if negative + bool IsNegative() const noexcept + { + return static_cast(val) < 0; + } + + /// + /// Tests if the value is NaN + /// + /// true if NaN + bool IsNaN() const noexcept + { + return AbsImpl() > kPositiveInfinityBits; + } + + /// + /// Tests if the value is finite + /// + /// true if finite + bool IsFinite() const noexcept + { + return AbsImpl() < kPositiveInfinityBits; + } + + /// + /// Tests if the value represents positive infinity. + /// + /// true if positive infinity + bool IsPositiveInfinity() const noexcept + { + return val == kPositiveInfinityBits; + } + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + bool IsNegativeInfinity() const noexcept + { + return val == kNegativeInfinityBits; + } + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + bool IsInfinity() const noexcept + { + return AbsImpl() == kPositiveInfinityBits; + } + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. 
+ bool IsNaNOrZero() const noexcept + { + auto abs = AbsImpl(); + return (abs == 0 || abs > kPositiveInfinityBits); + } + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). + /// + /// True if so + bool IsNormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) != 0); // is not subnormal (has a non-zero exponent) + } + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + bool IsSubnormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) == 0); // is subnormal (has a zero exponent) + } + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + Derived Abs() const noexcept { return Derived::FromBits(AbsImpl()); } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + Derived Negate() const noexcept { return Derived::FromBits(NegateImpl()); } + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + static bool AreZero(const Float16Impl& lhs, const Float16Impl& rhs) noexcept + { + return static_cast((lhs.val | rhs.val) & ~kSignMask) == 0; + } + + bool operator==(const Float16Impl& rhs) const noexcept + { + if (IsNaN() || rhs.IsNaN()) { + // IEEE defines that NaN is not equal to anything, including itself. + return false; + } + return val == rhs.val; + } + + bool operator!=(const Float16Impl& rhs) const noexcept { return !(*this == rhs); } + + bool operator<(const Float16Impl& rhs) const noexcept + { + if (IsNaN() || rhs.IsNaN()) { + // IEEE defines that NaN is unordered with respect to everything, including itself. + return false; + } + + const bool left_is_negative = IsNegative(); + if (left_is_negative != rhs.IsNegative()) { + // When the signs of left and right differ, we know that left is less than right if it is + // the negative value. The exception to this is if both values are zero, in which case IEEE + // says they should be equal, even if the signs differ. + return left_is_negative && !AreZero(*this, rhs); + } + return (val != rhs.val) && ((val < rhs.val) ^ left_is_negative); + } +}; + +// The following Float16_t conversions are based on the code from +// Eigen library. + +// The conversion routines are Copyright (c) Fabian Giesen, 2016. +// The original license follows: +// +// Copyright (c) Fabian Giesen, 2016 +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +namespace detail +{ +union float32_bits { + unsigned int u; + float f; +}; +}; // namespace detail + +template +inline constexpr uint16_t Float16Impl::ToUint16Impl(float v) noexcept +{ + detail::float32_bits f{}; + f.f = v; + + constexpr detail::float32_bits f32infty = {255 << 23}; + constexpr detail::float32_bits f16max = {(127 + 16) << 23}; + constexpr detail::float32_bits denorm_magic = {((127 - 15) + (23 - 10) + 1) << 23}; + constexpr unsigned int sign_mask = 0x80000000u; + uint16_t val = static_cast(0x0u); + + unsigned int sign = f.u & sign_mask; + f.u ^= sign; + + // NOTE all the integer compares in this function can be safely + // compiled into signed compares since all operands are below + // 0x80000000. Important if you want fast straight SSE2 code + // (since there's no unsigned PCMPGTD). + + if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) + val = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf + } else { // (De)normalized number or zero + if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero + // use a magic value to align our 10 mantissa bits at the bottom of + // the float. as long as FP addition is round-to-nearest-even this + // just works. + f.f += denorm_magic.f; + + // and one integer subtract of the bias later, we have our final float! + val = static_cast(f.u - denorm_magic.u); + } else { + unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd + + // update exponent, rounding bias part 1 + // Equivalent to `f.u += ((unsigned int)(15 - 127) << 23) + 0xfff`, but + // without arithmetic overflow. + f.u += 0xc8000fffU; + // rounding bias part 2 + f.u += mant_odd; + // take the bits! + val = static_cast(f.u >> 13); + } + } + + val |= static_cast(sign >> 16); + return val; +} + +template +inline float Float16Impl::ToFloatImpl() const noexcept +{ + constexpr detail::float32_bits magic = {113 << 23}; + constexpr unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift + detail::float32_bits o{}; + + o.u = (val & 0x7fff) << 13; // exponent/mantissa bits + unsigned int exp = shifted_exp & o.u; // just the exponent + o.u += (127 - 15) << 23; // exponent adjust + + // handle exponent special cases + if (exp == shifted_exp) { // Inf/NaN? + o.u += (128 - 16) << 23; // extra exp adjust + } else if (exp == 0) { // Zero/Denormal? + o.u += 1 << 23; // extra exp adjust + o.f -= magic.f; // re-normalize + } + + // Attempt to workaround the Internal Compiler Error on ARM64 + // for bitwise | operator, including std::bitset +#if (defined _MSC_VER) && (defined _M_ARM || defined _M_ARM64 || defined _M_ARM64EC) + if (IsNegative()) { + return -o.f; + } +#else + // original code: + o.u |= (val & 0x8000U) << 16U; // sign bit +#endif + return o.f; +} + +/// Shared implementation between public and internal classes. CRTP pattern. 
+template +struct BFloat16Impl { + protected: + /// + /// Converts from float to uint16_t float16 representation + /// + /// + /// + static uint16_t ToUint16Impl(float v) noexcept; + + /// + /// Converts bfloat16 to float + /// + /// float representation of bfloat16 value + float ToFloatImpl() const noexcept; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + uint16_t AbsImpl() const noexcept + { + return static_cast(val & ~kSignMask); + } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + uint16_t NegateImpl() const noexcept + { + return IsNaN() ? val : static_cast(val ^ kSignMask); + } + + public: + // uint16_t special values + static constexpr uint16_t kSignMask = 0x8000U; + static constexpr uint16_t kBiasedExponentMask = 0x7F80U; + static constexpr uint16_t kPositiveInfinityBits = 0x7F80U; + static constexpr uint16_t kNegativeInfinityBits = 0xFF80U; + static constexpr uint16_t kPositiveQNaNBits = 0x7FC1U; + static constexpr uint16_t kNegativeQNaNBits = 0xFFC1U; + static constexpr uint16_t kSignaling_NaNBits = 0x7F80U; + static constexpr uint16_t kEpsilonBits = 0x0080U; + static constexpr uint16_t kMinValueBits = 0xFF7FU; + static constexpr uint16_t kMaxValueBits = 0x7F7FU; + static constexpr uint16_t kRoundToNearest = 0x7FFFU; + static constexpr uint16_t kOneBits = 0x3F80U; + static constexpr uint16_t kMinusOneBits = 0xBF80U; + + uint16_t val{0}; + + BFloat16Impl() = default; + + /// + /// Checks if the value is negative + /// + /// true if negative + bool IsNegative() const noexcept + { + return static_cast(val) < 0; + } + + /// + /// Tests if the value is NaN + /// + /// true if NaN + bool IsNaN() const noexcept + { + return AbsImpl() > kPositiveInfinityBits; + } + + /// + /// Tests if the value is finite + /// + /// true if finite + bool IsFinite() const noexcept + { + return AbsImpl() < kPositiveInfinityBits; + } + + /// + /// Tests if the value represents positive infinity. + /// + /// true if positive infinity + bool IsPositiveInfinity() const noexcept + { + return val == kPositiveInfinityBits; + } + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + bool IsNegativeInfinity() const noexcept + { + return val == kNegativeInfinityBits; + } + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + bool IsInfinity() const noexcept + { + return AbsImpl() == kPositiveInfinityBits; + } + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. + bool IsNaNOrZero() const noexcept + { + auto abs = AbsImpl(); + return (abs == 0 || abs > kPositiveInfinityBits); + } + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). + /// + /// True if so + bool IsNormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) != 0); // is not subnormal (has a non-zero exponent) + } + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + bool IsSubnormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) == 0); // is subnormal (has a zero exponent) + } + + /// + /// Creates an instance that represents absolute value. 
+ /// + /// Absolute value + Derived Abs() const noexcept { return Derived::FromBits(AbsImpl()); } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + Derived Negate() const noexcept { return Derived::FromBits(NegateImpl()); } + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + static bool AreZero(const BFloat16Impl& lhs, const BFloat16Impl& rhs) noexcept + { + // IEEE defines that positive and negative zero are equal, this gives us a quick equality check + // for two values by or'ing the private bits together and stripping the sign. They are both zero, + // and therefore equivalent, if the resulting value is still zero. + return static_cast((lhs.val | rhs.val) & ~kSignMask) == 0; + } +}; + +template +inline uint16_t BFloat16Impl::ToUint16Impl(float v) noexcept +{ + uint16_t result; + if (std::isnan(v)) { + result = kPositiveQNaNBits; + } else { + auto get_msb_half = [](float fl) { + uint16_t result; +#ifdef __cpp_if_constexpr + if constexpr (detail::endian::native == detail::endian::little) +#else + if (detail::endian::native == detail::endian::little) +#endif + { + std::memcpy(&result, reinterpret_cast(&fl) + sizeof(uint16_t), sizeof(uint16_t)); + } else { + std::memcpy(&result, &fl, sizeof(uint16_t)); + } + return result; + }; + + uint16_t upper_bits = get_msb_half(v); + union { + uint32_t U32; + float F32; + }; + F32 = v; + U32 += (upper_bits & 1) + kRoundToNearest; + result = get_msb_half(F32); + } + return result; +} + +template +inline float BFloat16Impl::ToFloatImpl() const noexcept +{ + if (IsNaN()) { + return std::numeric_limits::quiet_NaN(); + } + float result; + char* const first = reinterpret_cast(&result); + char* const second = first + sizeof(uint16_t); +#ifdef __cpp_if_constexpr + if constexpr (detail::endian::native == detail::endian::little) +#else + if (detail::endian::native == detail::endian::little) +#endif + { + std::memset(first, 0, sizeof(uint16_t)); + std::memcpy(second, &val, sizeof(uint16_t)); + } else { + std::memcpy(first, &val, sizeof(uint16_t)); + std::memset(second, 0, sizeof(uint16_t)); + } + return result; +} + +/** \brief IEEE 754 half-precision floating point data type + * + * \details This struct is used for converting float to float16 and back + * so the user could feed inputs and fetch outputs using these type. + * + * The size of the structure should align with uint16_t and one can freely cast + * uint16_t buffers to/from Ort::Float16_t to feed and retrieve data. + * + * \code{.unparsed} + * // This example demonstrates converion from float to float16 + * constexpr float values[] = {1.f, 2.f, 3.f, 4.f, 5.f}; + * std::vector fp16_values; + * fp16_values.reserve(std::size(values)); + * std::transform(std::begin(values), std::end(values), std::back_inserter(fp16_values), + * [](float value) { return Ort::Float16_t(value); }); + * + * \endcode + */ +struct Float16_t : OrtDataType::Float16Impl { + private: + /// + /// Constructor from a 16-bit representation of a float16 value + /// No conversion is done here. 
+ /// + /// 16-bit representation + constexpr explicit Float16_t(uint16_t v) noexcept { val = v; } + + public: + using Base = OrtDataType::Float16Impl; + + /// + /// Default constructor + /// + Float16_t() = default; + + /// + /// Explicit conversion to uint16_t representation of float16. + /// + /// uint16_t bit representation of float16 + /// new instance of Float16_t + constexpr static Float16_t FromBits(uint16_t v) noexcept { return Float16_t(v); } + + /// + /// __ctor from float. Float is converted into float16 16-bit representation. + /// + /// float value + explicit Float16_t(float v) noexcept { val = Base::ToUint16Impl(v); } + + /// + /// Converts float16 to float + /// + /// float representation of float16 value + float ToFloat() const noexcept { return Base::ToFloatImpl(); } + + /// + /// Checks if the value is negative + /// + /// true if negative + using Base::IsNegative; + + /// + /// Tests if the value is NaN + /// + /// true if NaN + using Base::IsNaN; + + /// + /// Tests if the value is finite + /// + /// true if finite + using Base::IsFinite; + + /// + /// Tests if the value represents positive infinity. + /// + /// true if positive infinity + using Base::IsPositiveInfinity; + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + using Base::IsNegativeInfinity; + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + using Base::IsInfinity; + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. + using Base::IsNaNOrZero; + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). + /// + /// True if so + using Base::IsNormal; + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + using Base::IsSubnormal; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + using Base::Abs; + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + using Base::Negate; + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + using Base::AreZero; + + /// + /// User defined conversion operator. Converts Float16_t to float. + /// + explicit operator float() const noexcept { return ToFloat(); } + + using Base::operator==; + using Base::operator!=; + using Base::operator<; +}; + +static_assert(sizeof(Float16_t) == sizeof(uint16_t), "Sizes must match"); + +/** \brief bfloat16 (Brain Floating Point) data type + * + * \details This struct is used for converting float to bfloat16 and back + * so the user could feed inputs and fetch outputs using these type. + * + * The size of the structure should align with uint16_t and one can freely cast + * uint16_t buffers to/from Ort::BFloat16_t to feed and retrieve data. 
+ * + * \code{.unparsed} + * // This example demonstrates converion from float to float16 + * constexpr float values[] = {1.f, 2.f, 3.f, 4.f, 5.f}; + * std::vector bfp16_values; + * bfp16_values.reserve(std::size(values)); + * std::transform(std::begin(values), std::end(values), std::back_inserter(bfp16_values), + * [](float value) { return Ort::BFloat16_t(value); }); + * + * \endcode + */ +struct BFloat16_t : OrtDataType::BFloat16Impl { + private: + /// + /// Constructor from a uint16_t representation of bfloat16 + /// used in FromBits() to escape overload resolution issue with + /// constructor from float. + /// No conversion is done. + /// + /// 16-bit bfloat16 value + constexpr explicit BFloat16_t(uint16_t v) noexcept { val = v; } + + public: + using Base = OrtDataType::BFloat16Impl; + + BFloat16_t() = default; + + /// + /// Explicit conversion to uint16_t representation of bfloat16. + /// + /// uint16_t bit representation of bfloat16 + /// new instance of BFloat16_t + static constexpr BFloat16_t FromBits(uint16_t v) noexcept { return BFloat16_t(v); } + + /// + /// __ctor from float. Float is converted into bfloat16 16-bit representation. + /// + /// float value + explicit BFloat16_t(float v) noexcept { val = Base::ToUint16Impl(v); } + + /// + /// Converts bfloat16 to float + /// + /// float representation of bfloat16 value + float ToFloat() const noexcept { return Base::ToFloatImpl(); } + + /// + /// Checks if the value is negative + /// + /// true if negative + using Base::IsNegative; + + /// + /// Tests if the value is NaN + /// + /// true if NaN + using Base::IsNaN; + + /// + /// Tests if the value is finite + /// + /// true if finite + using Base::IsFinite; + + /// + /// Tests if the value represents positive infinity. + /// + /// true if positive infinity + using Base::IsPositiveInfinity; + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + using Base::IsNegativeInfinity; + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + using Base::IsInfinity; + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. + using Base::IsNaNOrZero; + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). + /// + /// True if so + using Base::IsNormal; + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + using Base::IsSubnormal; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + using Base::Abs; + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + using Base::Negate; + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + using Base::AreZero; + + /// + /// User defined conversion operator. Converts BFloat16_t to float. 
+ /// + explicit operator float() const noexcept { return ToFloat(); } + + // We do not have an inherited impl for the below operators + // as the internal class implements them a little differently + bool operator==(const BFloat16_t& rhs) const noexcept; + bool operator!=(const BFloat16_t& rhs) const noexcept { return !(*this == rhs); } + bool operator<(const BFloat16_t& rhs) const noexcept; +}; + +static_assert(sizeof(BFloat16_t) == sizeof(uint16_t), "Sizes must match"); + +} // namespace OrtDataType + +} // namespace o2 \ No newline at end of file diff --git a/Common/ML/include/ML/onnx_interface.h b/Common/ML/include/ML/onnx_interface.h deleted file mode 100644 index fcc02a49996ea..0000000000000 --- a/Common/ML/include/ML/onnx_interface.h +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. - -/// -/// \file model.h -/// -/// \author Christian Sonnabend -/// -/// \brief A general-purpose class for ONNX models -/// - -#ifndef COMMON_ML_ONNX_INTERFACE_H -#define COMMON_ML_ONNX_INTERFACE_H - -// C++ and system includes -#if __has_include() -#include -#else -#include -#endif -#include -#include -#include -#include -#include - -// O2 includes -#include "Framework/Logger.h" - -namespace o2 -{ - -namespace ml -{ - -class OnnxModel -{ - public: - OnnxModel() : mMemoryInfo(Ort::MemoryInfo::CreateCpu(OrtAllocatorType, OrtMemType)) {}; - virtual ~OnnxModel() = default; - - // Inferencing - void init(std::string, bool = false, int = 0); - // float* inference(std::vector, int = 0); - // float* inference(std::vector, int = 0); - template float* inference(T input, unsigned int size); - template std::vector inference_vector(T input, unsigned int size); - - // Reset session - #if __has_include() - void resetSession() { mSession.reset(new Ort::Experimental::Session{*mEnv, modelPath, sessionOptions}); }; - #else - void resetSession() { mSession.reset(new Ort::Session{*mEnv, modelPath.c_str(), sessionOptions}); }; - #endif - - // Getters & Setters - Ort::SessionOptions* getSessionOptions() { return &sessionOptions; } // For optimizations in post - #if __has_include() - std::shared_ptr getSession() { return mSession; } - #else - std::shared_ptr getSession() { return mSession; } - #endif - std::vector> getNumInputNodes() const { return mInputShapes; } - std::vector> getNumOutputNodes() const { return mOutputShapes; } - void setActiveThreads(int); - - private: - // Environment variables for the ONNX runtime - std::shared_ptr mEnv = nullptr; - std::shared_ptr mSession = nullptr; ///< ONNX session - Ort::MemoryInfo mMemoryInfo; - Ort::SessionOptions sessionOptions; - - // Input & Output specifications of the loaded network - std::vector mInputNames; - std::vector> mInputShapes; - std::vector mOutputNames; - std::vector> mOutputShapes; - - // Environment settings - std::string modelPath; - int activeThreads = 0; - - // Internal function for printing the shape of tensors - std::string printShape(const std::vector&); -}; - -} // namespace gpu - -} // namespace GPUCA_NAMESPACE - -#endif // 
COMMON_ML_ONNX_INTERFACE_H
\ No newline at end of file
diff --git a/Common/ML/include/ML/ort_interface.h b/Common/ML/include/ML/ort_interface.h
new file mode 100644
index 0000000000000..a365860db3279
--- /dev/null
+++ b/Common/ML/include/ML/ort_interface.h
@@ -0,0 +1,94 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file ort_interface.h
+/// \author Christian Sonnabend
+/// \brief A header library for loading ONNX models and inferencing them on CPU and GPU
+
+#ifndef O2_ML_ORT_INTERFACE_H
+#define O2_ML_ORT_INTERFACE_H
+
+// C++ and system includes
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+// O2 includes
+#include "Framework/Logger.h"
+
+namespace o2
+{
+
+namespace ml
+{
+
+class OrtModel
+{
+
+ public:
+  // Constructor
+  OrtModel() = default;
+  OrtModel(std::unordered_map<std::string, std::string> optionsMap) { reset(optionsMap); }
+  void init(std::unordered_map<std::string, std::string> optionsMap) { reset(optionsMap); }
+  void reset(std::unordered_map<std::string, std::string>);
+
+  virtual ~OrtModel() = default;
+
+  // Conversion
+  template <class I, class O>
+  std::vector<O> v2v(std::vector<I>&, bool = true);
+
+  // Inferencing
+  template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/3rdparty/GPUORTFloat16.h
+  std::vector<O> inference(std::vector<I>&);
+
+  template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
+  std::vector<O> inference(std::vector<std::vector<I>>&);
+
+  // template <class I, class T, class O> // class I is the input data type, e.g. float, class T the throughput data type and class O is the output data type
+  // std::vector<O> inference(std::vector<I>&);
+
+  // Reset session
+  void resetSession();
+
+  std::vector<std::vector<int64_t>> getNumInputNodes() const { return mInputShapes; }
+  std::vector<std::vector<int64_t>> getNumOutputNodes() const { return mOutputShapes; }
+  std::vector<std::string> getInputNames() const { return mInputNames; }
+  std::vector<std::string> getOutputNames() const { return mOutputNames; }
+
+  void setActiveThreads(int threads) { intraOpNumThreads = threads; }
+
+ private:
+  // ORT variables -> need to be hidden as Pimpl
+  struct OrtVariables;
+  OrtVariables* pImplOrt;
+
+  // Input & Output specifications of the loaded network
+  std::vector<const char*> inputNamesChar, outputNamesChar;
+  std::vector<std::string> mInputNames, mOutputNames;
+  std::vector<std::vector<int64_t>> mInputShapes, mOutputShapes;
+
+  // Environment settings
+  std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda
+  int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0;
+
+  std::string printShape(const std::vector<int64_t>&);
+};
+
+} // namespace ml
+
+} // namespace o2
+
+#endif // O2_ML_ORT_INTERFACE_H
\ No newline at end of file
diff --git a/Common/ML/src/onnx_interface.cxx b/Common/ML/src/onnx_interface.cxx
deleted file mode 100644
index c348d4577d47f..0000000000000
--- a/Common/ML/src/onnx_interface.cxx
+++ /dev/null
@@ -1,226 +0,0 @@
-// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
-// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. - -/// -/// \file model.cxx -/// -/// \author Christian Sonnabend -/// -/// \brief A general-purpose class with functions for ONNX model applications -/// - -// ONNX includes -#include "ML/onnx_interface.h" - -namespace o2 -{ - -namespace ml -{ - -std::string OnnxModel::printShape(const std::vector& v) -{ - std::stringstream ss(""); - for (size_t i = 0; i < v.size() - 1; i++) - ss << v[i] << "x"; - ss << v[v.size() - 1]; - return ss.str(); -} - -void OnnxModel::init(std::string localPath, bool enableOptimizations, int threads) -{ - - LOG(info) << "--- ONNX-ML model ---"; - LOG(info) << "Taking model from: " << localPath; - modelPath = localPath; - activeThreads = threads; - -#if __has_include() -#else - mMemoryInfo = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault); -#endif - - /// Enableing optimizations - if(threads != 0){ - // sessionOptions.SetInterOpNumThreads(1); - if(threads == 1){ - sessionOptions.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); - } - else{ - sessionOptions.SetExecutionMode(ExecutionMode::ORT_PARALLEL); - sessionOptions.SetIntraOpNumThreads(threads); - } - } - if (enableOptimizations) { - // sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED); - sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); - // uint32_t coreml_flags = 0; - // coreml_flags |= COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE; - // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CoreML(sessionOptions, coreml_flags)); - } - - mEnv = std::make_shared(ORT_LOGGING_LEVEL_WARNING, "onnx-model"); - #if __has_include() - mSession = std::make_shared(*mEnv, modelPath, sessionOptions); - mInputNames = mSession->GetInputNames(); - mInputShapes = mSession->GetInputShapes(); - mOutputNames = mSession->GetOutputNames(); - mOutputShapes = mSession->GetOutputShapes(); - #else - mSession = std::make_shared(*mEnv, modelPath.c_str(), sessionOptions); - Ort::AllocatorWithDefaultOptions tmpAllocator; - for (size_t i = 0; i < mSession->GetInputCount(); ++i) { - mInputNames.push_back(mSession->GetInputNameAllocated(i, tmpAllocator).get()); - } - for (size_t i = 0; i < mSession->GetInputCount(); ++i) { - mInputShapes.emplace_back(mSession->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } - for (size_t i = 0; i < mSession->GetOutputCount(); ++i) { - mOutputNames.push_back(mSession->GetOutputNameAllocated(i, tmpAllocator).get()); - } - for (size_t i = 0; i < mSession->GetOutputCount(); ++i) { - mOutputShapes.emplace_back(mSession->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } - #endif - - LOG(info) << "Input Nodes:"; - for (size_t i = 0; i < mInputNames.size(); i++) { - LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); - } - - LOG(info) << "Output Nodes:"; - for (size_t i = 0; i < mOutputNames.size(); i++) { - LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); - } - - LOG(info) << "--- Model initialized! 
---"; -} - -// float* OnnxModel::inference(std::vector input, int device_id) -// { - -// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sessionOptions, device_id)); - -// try { -// auto outputTensors = mSession->Run(mInputNames, input, mOutputNames); -// float* outputValues = outputTensors[0].GetTensorMutableData(); -// return outputValues; -// } catch (const Ort::Exception& exception) { -// LOG(error) << "Error running model inference: " << exception.what(); -// } -// return nullptr; -// } - -// float* OnnxModel::inference(std::vector input, int device_id) -// { -// -// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sessionOptions, device_id)); -// -// int64_t size = input.size(); -// assert(size % mInputShapes[0][1] == 0); -// std::vector inputShape{size / mInputShapes[0][1], mInputShapes[0][1]}; -// std::vector inputTensors; -// inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), size, inputShape)); -// try { -// auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); -// float* outputValues = outputTensors[0].GetTensorMutableData(); -// return outputValues; -// } catch (const Ort::Exception& exception) { -// LOG(error) << "Error running model inference: " << exception.what(); -// } -// return nullptr; -// } - -template -float* OnnxModel::inference(T input, unsigned int size) -{ - std::vector inputShape = mInputShapes[0]; - inputShape[0] = size; - std::vector inputTensors; - size_t mem_size = 1; - for(auto elem : inputShape){ - mem_size*=elem; - } -#if __has_include() - inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); - try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return outputValues; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } -#else - std::vector tmpInputs; - std::vector tmpOutputs; - inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), 1)); - try { - auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return outputValues; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } -#endif - // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; - return nullptr; -} - -template -std::vector OnnxModel::inference_vector(T input, unsigned int size) -{ - std::vector inputShape = mInputShapes[0]; - inputShape[0] = size; - std::vector inputTensors; - // std::vector outputValues; - size_t mem_size = 1; - for(auto elem : inputShape){ - mem_size*=elem; - } -#if __has_include() - inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); - try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } -#else - std::vector tmpInputs; - 
std::vector tmpOutputs; - inputTensors.emplace_back(Ort::Value::CreateTensor(input.data(), mem_size, inputShape)); - try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } -#endif - return std::vector{}; -} - -void OnnxModel::setActiveThreads(int threads) -{ - activeThreads = threads; -} - -template float* OnnxModel::inference(std::vector, unsigned int); -template std::vector OnnxModel::inference_vector(std::vector, unsigned int); - -} // namespace gpu - -} // namespace GPUCA_NAMESPACE \ No newline at end of file diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx new file mode 100644 index 0000000000000..84a06ce1da068 --- /dev/null +++ b/Common/ML/src/ort_interface.cxx @@ -0,0 +1,262 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file ort_interface.cxx +/// \author Christian Sonnabend +/// \brief A header library for loading ONNX models and inferencing them on CPU and GPU + +#include "ML/ort_interface.h" +#include "ML/3rdparty/GPUORTFloat16.h" + +// ONNX includes +#include + +namespace o2 +{ + +namespace ml +{ + +struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file + // ORT runtime objects + Ort::RunOptions runOptions; + std::shared_ptr env = nullptr; + std::shared_ptr session = nullptr; ///< ONNX session + Ort::SessionOptions sessionOptions; + Ort::AllocatorWithDefaultOptions allocator; + Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); +}; + +void OrtModel::reset(std::unordered_map optionsMap){ + + pImplOrt = new OrtVariables(); + + // Load from options map + if(!optionsMap.contains("model-path")){ + LOG(fatal) << "(ORT) Model path cannot be empty!"; + } + modelPath = optionsMap["model-path"]; + device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU"); + dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); + deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); + allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); + intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); + loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 0); + enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); + enableOptimizations = (optionsMap.contains("enable-optimizations") ? 
std::stoi(optionsMap["enable-optimizations"]) : 0); + + std::string dev_mem_str = "Hip"; +#ifdef ORT_ROCM_BUILD + if(device == "ROCM") { + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId)); + LOG(info) << "(ORT) ROCM execution provider set"; + } +#endif +#ifdef ORT_MIGRAPHX_BUILD + if(device == "MIGRAPHX") { + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId)); + LOG(info) << "(ORT) MIGraphX execution provider set"; + } +#endif +#ifdef ORT_CUDA_BUILD + if(device == "CUDA") { + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId)); + LOG(info) << "(ORT) CUDA execution provider set"; + dev_mem_str = "Cuda"; + } +#endif + + if(allocateDeviceMemory){ + pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); + LOG(info) << "(ORT) Memory info set to on-device memory"; + } + + if(device == "CPU") { + (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); + if(intraOpNumThreads > 1){ + (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); + } else if(intraOpNumThreads == 1){ + (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + } + LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " threads"; + } + + (pImplOrt->sessionOptions).DisableMemPattern(); + (pImplOrt->sessionOptions).DisableCpuMemArena(); + + if(enableProfiling){ + if(optionsMap.contains("profiling-output-path")){ + (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); + } else { + LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. Disabling profiling for now."; + (pImplOrt->sessionOptions).DisableProfiling(); + } + } else { + (pImplOrt->sessionOptions).DisableProfiling(); + } + (pImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); + (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); + + pImplOrt->env = std::make_shared(OrtLoggingLevel(loggingLevel), (optionsMap["onnx-environment-name"].empty() ? 
"onnx_model_inference" : optionsMap["onnx-environment-name"].c_str())); + (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); + + for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { + mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { + mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { + mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { + mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + + inputNamesChar.resize(mInputNames.size(), nullptr); + std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), + [&](const std::string& str) { return str.c_str(); }); + outputNamesChar.resize(mOutputNames.size(), nullptr); + std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), + [&](const std::string& str) { return str.c_str(); }); + + // Print names + if(loggingLevel > 1) { + LOG(info) << "Input Nodes:"; + for (size_t i = 0; i < mInputNames.size(); i++) { + LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); + } + + LOG(info) << "Output Nodes:"; + for (size_t i = 0; i < mOutputNames.size(); i++) { + LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); + } + } +} + +void OrtModel::resetSession() { + (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); +} + +template +std::vector OrtModel::v2v(std::vector& input, bool clearInput) { + if constexpr (std::is_same_v){ + return input; + } else { + std::vector output(input.size()); + std::transform(std::begin(input), std::end(input), std::begin(output), [](I f) { return O(f); }); + if(clearInput) input.clear(); + return output; + } +} + +template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector& input){ + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(input)).data(), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template // class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector>& input){ + std::vector inputTensor; + for(auto i : input){ + std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(i)).data(), i.size(), inputShape.data(), inputShape.size())); + } + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +std::string OrtModel::printShape(const std::vector& v) +{ + std::stringstream ss(""); + for (size_t i = 0; i < v.size() - 1; i++) + ss << v[i] << "x"; + ss << v[v.size() - 1]; + return ss.str(); +} + +template <> std::vector OrtModel::inference(std::vector& input) { + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + float* outputValues = outputTensors[0].template GetTensorMutableData(); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template <> std::vector OrtModel::inference(std::vector& input) { + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + float* outputValues = outputTensors[0].template GetTensorMutableData(); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template <> std::vector OrtModel::inference(std::vector& input) { + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template <> std::vector OrtModel::inference(std::vector& input) { + 
std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template <> std::vector OrtModel::inference(std::vector>& input) { + std::vector inputTensor; + for(auto i : input){ + std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); + } + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +} // namespace ml + +} // namespace o2 \ No newline at end of file diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index ad8e53309beee..0efed3ad4c76c 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -63,7 +63,6 @@ set(SRCS Merger/GPUTPCGlobalDebugSortKernels.cxx Merger/GPUTPCGMPhysicalTrackModel.cxx Merger/GPUTPCGMPolynomialFieldManager.cxx - ML/onnx_interface.cxx DataTypes/GPUTRDTrack.cxx TRDTracking/GPUTRDTracker.cxx TRDTracking/GPUTRDTrackletWord.cxx @@ -313,7 +312,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") PUBLIC_LINK_LIBRARIES O2::GPUCommon O2::ReconstructionDataFormats O2::TPCFastTransformation - ONNXRuntime::ONNXRuntime + O2::ML PRIVATE_LINK_LIBRARIES O2::DataFormatsTPC SOURCES ${SRCS_DATATYPE_HEADERS}) target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAVE_O2HEADERS) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 31d46b928a33f..24c1ea6a6e2ce 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -294,16 +294,26 @@ AddOption(printSettings, bool, false, "", 0, "Print all settings when initializi AddVariable(eventDisplay, GPUCA_NAMESPACE::gpu::GPUDisplayFrontendInterface*, nullptr) AddSubConfig(GPUSettingsProcessingRTC, rtc) AddSubConfig(GPUSettingsProcessingParam, param) -AddOption(applyNNclusterizer, int, 0, "", 0, "(Bool, default = 0), if the neural network clsuterizer should be used.") -AddOption(nnClusterizerVerbosity, int, 1, "", 0, "0: No messages; 1: Warnings; 2: Warnings + major debugs; >3: All debugs") +AddOption(applyNNclusterizer, int, 0, "", 0, "(bool, default = 0), if the neural network clusterizer should be used.") +AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Specify inference device (cpu (default), rocm, cuda)") 
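+// NOTE: a minimal sketch of how the nnInference* options above can feed o2::ml::OrtModel; the exact
+// wiring in the chain is an assumption here, but the map keys are the ones parsed by OrtModel::reset():
+//   std::unordered_map<std::string, std::string> opts{
+//     {"model-path", GetProcessingSettings().nnClassificationPath},
+//     {"device", GetProcessingSettings().nnInferenceDevice},
+//     {"device-id", std::to_string(GetProcessingSettings().nnInferenceDeviceId)},
+//     {"allocate-device-memory", std::to_string(GetProcessingSettings().nnInferenceAllocateDevMem)},
+//     {"intra-op-num-threads", std::to_string(GetProcessingSettings().nnInferenceThreadsPerNN)},
+//     {"enable-optimizations", std::to_string(GetProcessingSettings().nnInferenceEnableOrtOptimization)},
+//     {"enable-profiling", std::to_string(GetProcessingSettings().nnInferenceOrtProfiling)}};
+//   o2::ml::OrtModel model(opts); // equivalently: model.init(opts)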
+AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Specify inference device id")
+AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0), if the device memory should be allocated for inference")
+AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16
+AddOption(nnInferenceThreadsPerNN, int, 0, "", 0, "Number of threads used to evaluate one neural network")
+AddOption(nnInferenceEnableOrtOptimization, unsigned int, 1, "", 0, "Enables graph optimizations in ONNX Runtime. Can be greater than 1!")
+AddOption(nnInferenceOrtProfiling, int, 0, "", 0, "Enables profiling of model execution in ONNX Runtime")
+AddOption(nnInferenceOrtProfilingPath, std::string, ".", "", 0, "If nnInferenceOrtProfiling is set, the path to store the profiling data")
+AddOption(nnInferenceVerbosity, int, 1, "", 0, "0: No messages; 1: Warnings; 2: Warnings + major debugs; >3: All debugs")
+AddOption(nnClusterizerAddIndexData, int, 1, "", 0, "If normalized index data (sector, row, pad) should be appended to the input")
+AddOption(nnClusterizerSizeInputRow, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)")
+AddOption(nnClusterizerSizeInputPad, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)")
+AddOption(nnClusterizerSizeInputTime, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)")
+AddOption(nnClusterizerUseCFregression, int, 0, "", 0, "(bool, default = false) If true, use the regression from the native clusterizer and not the NN")
+AddOption(nnClusterizerBatchedMode, unsigned int, 1, "", 0, "(int, default = 1) If >1, the NN is evaluated on batched input of size specified in this variable")
 AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path")
-AddOption(nnRegressionPath, std::string, "./network_reg.onnx", "", 0, "The regression network path")
-AddOption(nnClassThreshold, float, 0.16, "", 0, "The cutoff at which clusters will be accepted / rejected.")
-AddOption(nnSigmoidTrafoThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as acitvation function in the last layer or not).")
-AddOption(nnAddIndexData, int, 1, "", 0, "If normalized index data (sector, row, pad), should be appended to the input")
-AddOption(nnSizeInputRow, int, 3, "", 0, "Size of the input to the NN (currently calcualted as (length-1)/2")
-AddOption(nnSizeInputPad, int, 3, "", 0, "Size of the input to the NN (currently calcualted as (length-1)/2")
-AddOption(nnSizeInputTime, int, 3, "", 0, "Size of the input to the NN (currently calcualted as (length-1)/2")
+AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.")
+AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path")
+AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as activation function in the last layer or not).")
 AddHelp("help", 'h')
 EndConfig()
 #endif // __OPENCL__
diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx
index f413598c13f59..528c683944ef1 100644
--- a/GPU/GPUTracking/Global/GPUChainTracking.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx
@@ -742,7 +742,7 @@ int32_t GPUChainTracking::RunChain()
       return 1;
     }
   } else if (mIOPtrs.tpcPackedDigits || mIOPtrs.tpcZS) {
-    if (runRecoStep(RecoStep::TPCClusterFinding, &GPUChainTracking::RunTPCClusterizer, false)) { // FIXME: This enables the neural network clusterization -> Need to actually set this as configurable
+    if (runRecoStep(RecoStep::TPCClusterFinding, &GPUChainTracking::RunTPCClusterizer, false)) {
       return 1;
     }
   }
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index e69c3d15c6fc2..eafd50a72424f 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -841,7 +841,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
         runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::updatePeaks>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
       } else {
-        // FIXME: This needs to be removed when I actually apply the NN! For now its onyl to make the code work
+        // FIXME: This potentially needs to be removed when I actually apply the NN. For now it's only to make the code work
         runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
         runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::updatePeaks>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
       }
@@ -875,16 +875,60 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
       if(GetProcessingSettings().applyNNclusterizer){
-        clusterer.model_class.init(GetProcessingSettings().nnClassificationPath, 1, 1, GetProcessingSettings().nnClusterizerVerbosity);
-        clusterer.model_reg.init(GetProcessingSettings().nnRegressionPath, 1, 1, GetProcessingSettings().nnClusterizerVerbosity);
-        clusterer.nnSizeInputRow = GetProcessingSettings().nnSizeInputRow;
-        clusterer.nnSizeInputPad = GetProcessingSettings().nnSizeInputPad;
-        clusterer.nnSizeInputTime = GetProcessingSettings().nnSizeInputTime;
-        clusterer.nnAddIndexData = GetProcessingSettings().nnAddIndexData;
+        // Settings for the clusterizer
+        clusterer.nnClusterizerUseCFregression = GetProcessingSettings().nnClusterizerUseCFregression;
+        clusterer.nnClusterizerSizeInputRow = GetProcessingSettings().nnClusterizerSizeInputRow;
+        clusterer.nnClusterizerSizeInputPad = GetProcessingSettings().nnClusterizerSizeInputPad;
+        clusterer.nnClusterizerSizeInputTime = GetProcessingSettings().nnClusterizerSizeInputTime;
+        clusterer.nnClusterizerAddIndexData = GetProcessingSettings().nnClusterizerAddIndexData;
+        clusterer.nnClusterizerElementSize = ((2*clusterer.nnClusterizerSizeInputRow + 1) * (2*clusterer.nnClusterizerSizeInputPad + 1) * (2*clusterer.nnClusterizerSizeInputTime + 1)) + (clusterer.nnClusterizerAddIndexData ? 3 : 0);
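[Editorial sketch, not part of the patch: a quick sanity check of the input-size bookkeeping above. The constants mirror the defaults in GPUSettingsList.h; nothing else is assumed.]

#include <iostream>

int main()
{
  int sizeRow = 3, sizePad = 3, sizeTime = 3; // defaults of nnClusterizerSizeInput{Row,Pad,Time}
  bool addIndexData = true;                   // default of nnClusterizerAddIndexData
  // Same formula as the nnClusterizerElementSize line above:
  int elementSize = (2 * sizeRow + 1) * (2 * sizePad + 1) * (2 * sizeTime + 1) + (addIndexData ? 3 : 0);
  std::cout << elementSize << std::endl;      // 7*7*7 + 3 = 346 input values per cluster candidate
  return 0;
}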
+        clusterer.nnClusterizerBatchedMode = GetProcessingSettings().nnClusterizerBatchedMode;
+        clusterer.nnClusterizerVerbosity = GetProcessingSettings().nnInferenceVerbosity;
+
+        // Settings for the NN evaluation
         clusterer.nnClassThreshold = GetProcessingSettings().nnClassThreshold;
-        clusterer.nnSigmoidTrafoThreshold = GetProcessingSettings().nnSigmoidTrafoThreshold;
-        clusterer.nnClusterizerVerbosity = GetProcessingSettings().nnClusterizerVerbosity;
-        runKernel<GPUTPCNNClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0);
+        clusterer.nnSigmoidTrafoClassThreshold = GetProcessingSettings().nnSigmoidTrafoClassThreshold;
+
+        // Settings for the neural network evaluation
+        clusterer.OrtOptions = {
+          {"model-path", GetProcessingSettings().nnClassificationPath},
+          {"device", GetProcessingSettings().nnInferenceDevice},
+          {"device-id", std::to_string(GetProcessingSettings().nnInferenceDeviceId)},
+          {"allocate-device-memory", std::to_string(GetProcessingSettings().nnInferenceAllocateDevMem)},
+          {"dtype", GetProcessingSettings().nnInferenceDtype},
+          {"intra-op-num-threads", std::to_string(GetProcessingSettings().nnInferenceThreadsPerNN)},
+          {"enable-optimizations", std::to_string(GetProcessingSettings().nnInferenceEnableOrtOptimization)},
+          {"enable-profiling", std::to_string(GetProcessingSettings().nnInferenceOrtProfiling)},
+          {"profiling-output-path", GetProcessingSettings().nnInferenceOrtProfilingPath},
+          {"logging-level", std::to_string(GetProcessingSettings().nnInferenceVerbosity)}
+        };
+        clusterer.model_class.init(clusterer.OrtOptions);
+        if(!clusterer.nnClusterizerUseCFregression){
+          std::vector<std::string> reg_model_paths = o2::utils::Str::tokenize(GetProcessingSettings().nnRegressionPath, ':');
+          if(clusterer.model_class.getNumOutputNodes()[0][1] == 1){
+            clusterer.OrtOptions["model-path"] = reg_model_paths[0];
+            clusterer.model_reg_1.init(clusterer.OrtOptions);
+          } else {
+            if(reg_model_paths.size() == 1){
+              clusterer.OrtOptions["model-path"] = reg_model_paths[0];
+              clusterer.model_reg_1.init(clusterer.OrtOptions);
+            } else {
+              clusterer.OrtOptions["model-path"] = reg_model_paths[0];
+              clusterer.model_reg_1.init(clusterer.OrtOptions);
+              clusterer.OrtOptions["model-path"] = reg_model_paths[1];
+              clusterer.model_reg_2.init(clusterer.OrtOptions);
+            }
+          }
+        } else {
+          runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}});
+          DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
+        }
+
+        if(clusterer.nnSigmoidTrafoClassThreshold){
+          // Inverse sigmoid transformation
+          clusterer.nnClassThreshold = (float)std::log(clusterer.nnClassThreshold/(1.f-clusterer.nnClassThreshold));
+        }
+        runKernel<GPUTPCNNClusterizer>({GetGrid(std::ceil(clusterer.mPmemory->counters.nClusters / 
(float)clusterer.nnClusterizerBatchedMode), lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); } } + if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("Sector %02d Fragment %02d Lane %d: Found clusters: digits %u peaks %u clusters %u", iSlice, fragment.index, lane, (int32_t)clusterer.mPmemory->counters.nPositions, (int32_t)clusterer.mPmemory->counters.nPeaks, (int32_t)clusterer.mPmemory->counters.nClusters); } diff --git a/GPU/GPUTracking/ML/onnx_interface.cxx b/GPU/GPUTracking/ML/onnx_interface.cxx deleted file mode 100644 index 9bb5137ec63dd..0000000000000 --- a/GPU/GPUTracking/ML/onnx_interface.cxx +++ /dev/null @@ -1,235 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. - -/// -/// \file model.cxx -/// -/// \author Christian Sonnabend -/// -/// \brief A general-purpose class with functions for ONNX model applications -/// - -// ONNX includes -#include "ML/onnx_interface.h" - -namespace o2 -{ - -namespace ml -{ - -std::string OnnxModel::printShape(const std::vector& v) -{ - std::stringstream ss(""); - for (size_t i = 0; i < v.size() - 1; i++) - ss << v[i] << "x"; - ss << v[v.size() - 1]; - return ss.str(); -} - -void OnnxModel::init(std::string localPath, bool enableOptimizations, int threads, int verbosity) -{ - - if(verbosity > 1){ - LOG(info) << "--- ONNX-ML model ---"; - LOG(info) << "Taking model from: " << localPath; - } - modelPath = localPath; - activeThreads = threads; - -#if __has_include() -#else - mMemoryInfo = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault); -#endif - - /// Enableing optimizations - if(threads != 0){ - // sessionOptions.SetInterOpNumThreads(1); - if(threads == 1){ - sessionOptions.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); - } - else{ - sessionOptions.SetExecutionMode(ExecutionMode::ORT_PARALLEL); - sessionOptions.SetIntraOpNumThreads(threads); - } - } - if (enableOptimizations) { - // sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED); - sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); - // uint32_t coreml_flags = 0; - // coreml_flags |= COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE; - // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CoreML(sessionOptions, coreml_flags)); - } - - mEnv = std::make_shared(ORT_LOGGING_LEVEL_WARNING, "onnx-model"); - #if __has_include() - mSession = std::make_shared(*mEnv, modelPath, sessionOptions); - mInputNames = mSession->GetInputNames(); - mInputShapes = mSession->GetInputShapes(); - mOutputNames = mSession->GetOutputNames(); - mOutputShapes = mSession->GetOutputShapes(); - #else - mSession = std::make_shared(*mEnv, modelPath.c_str(), sessionOptions); - Ort::AllocatorWithDefaultOptions tmpAllocator; - for (size_t i = 0; i < mSession->GetInputCount(); ++i) { - mInputNames.push_back(mSession->GetInputNameAllocated(i, tmpAllocator).get()); - } - for (size_t i = 0; i < mSession->GetInputCount(); ++i) { - 
mInputShapes.emplace_back(mSession->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } - for (size_t i = 0; i < mSession->GetOutputCount(); ++i) { - mOutputNames.push_back(mSession->GetOutputNameAllocated(i, tmpAllocator).get()); - } - for (size_t i = 0; i < mSession->GetOutputCount(); ++i) { - mOutputShapes.emplace_back(mSession->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } - #endif - - if(verbosity > 1){ - LOG(info) << "Input Nodes:"; - for (size_t i = 0; i < mInputNames.size(); i++) { - LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); - } - - LOG(info) << "Output Nodes:"; - for (size_t i = 0; i < mOutputNames.size(); i++) { - LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); - } - LOG(info) << "--- Model initialized! ---"; - } -} - -// float* OnnxModel::inference(std::vector input, int device_id) -// { - -// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sessionOptions, device_id)); - -// try { -// auto outputTensors = mSession->Run(mInputNames, input, mOutputNames); -// float* outputValues = outputTensors[0].GetTensorMutableData(); -// return outputValues; -// } catch (const Ort::Exception& exception) { -// LOG(error) << "Error running model inference: " << exception.what(); -// } -// return nullptr; -// } - -// float* OnnxModel::inference(std::vector input, int device_id) -// { -// -// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sessionOptions, device_id)); -// -// int64_t size = input.size(); -// assert(size % mInputShapes[0][1] == 0); -// std::vector inputShape{size / mInputShapes[0][1], mInputShapes[0][1]}; -// std::vector inputTensors; -// inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), size, inputShape)); -// try { -// auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); -// float* outputValues = outputTensors[0].GetTensorMutableData(); -// return outputValues; -// } catch (const Ort::Exception& exception) { -// LOG(error) << "Error running model inference: " << exception.what(); -// } -// return nullptr; -// } - -template -float* OnnxModel::inference(T input, unsigned int size) -{ - std::vector inputShape = mInputShapes[0]; - inputShape[0] = size; - std::vector inputTensors; - size_t mem_size = 1; - for(auto elem : inputShape){ - mem_size*=elem; - } -#if __has_include() - inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); - try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return outputValues; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } -#else - std::vector tmpInputs; - std::vector tmpOutputs; - inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), 1)); - try { - auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return outputValues; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } -#endif - // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; - return nullptr; -} - 
-template -std::vector OnnxModel::inference_vector(T input, unsigned int size) -{ - std::vector inputShape = mInputShapes[0]; - inputShape[0] = size; - std::vector inputTensors; - // std::vector outputValues; - size_t mem_size = 1; - for(auto elem : inputShape){ - mem_size*=elem; - } -#if __has_include() - inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); - try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } -#else - std::vector tmpInputs; - std::vector tmpOutputs; - for (unsigned int i = 0; i < mInputNames.size(); i++) { - tmpInputs.emplace_back(mInputNames[i].c_str()); - } - for (unsigned int i = 0; i < mOutputNames.size(); i++) { - tmpOutputs.emplace_back(mOutputNames[i].c_str()); - } - inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); - try { - auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } -#endif - return std::vector{}; -} - -void OnnxModel::setActiveThreads(int threads) -{ - activeThreads = threads; -} - -template float* OnnxModel::inference(std::vector, unsigned int); -template std::vector OnnxModel::inference_vector(std::vector, unsigned int); - -} // namespace gpu - -} // namespace GPUCA_NAMESPACE \ No newline at end of file diff --git a/GPU/GPUTracking/ML/onnx_interface.h b/GPU/GPUTracking/ML/onnx_interface.h deleted file mode 100644 index 17c45f439dc63..0000000000000 --- a/GPU/GPUTracking/ML/onnx_interface.h +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. 
- -/// -/// \file model.h -/// -/// \author Christian Sonnabend -/// -/// \brief A general-purpose class for ONNX models -/// - -#ifndef GPU_ML_ONNX_INTERFACE_H -#define GPU_ML_ONNX_INTERFACE_H - -// C++ and system includes -#if __has_include() -#include -#else -#include -#endif -#include -#include -#include -#include -#include - -// O2 includes -#include "Framework/Logger.h" - -namespace o2 -{ - -namespace ml -{ - -class OnnxModel -{ - - public: - OnnxModel(OrtAllocatorType allocatorType = OrtDeviceAllocator, OrtMemType memoryType = OrtMemTypeCPU) : mMemoryInfo(Ort::MemoryInfo::CreateCpu(allocatorType, memoryType)) {}; - virtual ~OnnxModel() = default; - - // Inferencing - void init(std::string, bool = false, int = 0, int = 0); - // float* inference(std::vector, int = 0); - // float* inference(std::vector, int = 0); - template float* inference(T input, unsigned int size); - template std::vector inference_vector(T input, unsigned int size); - - // Reset session - #if __has_include() - void resetSession() { mSession.reset(new Ort::Experimental::Session{*mEnv, modelPath, sessionOptions}); }; - #else - void resetSession() { mSession.reset(new Ort::Session{*mEnv, modelPath.c_str(), sessionOptions}); }; - #endif - - // Getters & Setters - Ort::SessionOptions* getSessionOptions() { return &sessionOptions; } // For optimizations in post - #if __has_include() - std::shared_ptr getSession() { return mSession; } - #else - std::shared_ptr getSession() { return mSession; } - #endif - std::vector> getNumInputNodes() const { return mInputShapes; } - std::vector> getNumOutputNodes() const { return mOutputShapes; } - void setActiveThreads(int); - - private: - // Environment variables for the ONNX runtime - std::shared_ptr mEnv = nullptr; - std::shared_ptr mSession = nullptr; ///< ONNX session - Ort::MemoryInfo mMemoryInfo; - Ort::SessionOptions sessionOptions; - - // Input & Output specifications of the loaded network - std::vector mInputNames; - std::vector> mInputShapes; - std::vector mOutputNames; - std::vector> mOutputShapes; - - // Environment settings - std::string modelPath; - int activeThreads = 0; - - // Internal function for printing the shape of tensors - std::string printShape(const std::vector&); -}; - -} // namespace gpu - -} // namespace GPUCA_NAMESPACE - -#endif // GPU_ML_ONNX_INTERFACE_H \ No newline at end of file diff --git a/GPU/GPUTracking/TPCClusterFinder/ChargePos.h b/GPU/GPUTracking/TPCClusterFinder/ChargePos.h index f5ca9dbedd5ac..c2ee542f65434 100644 --- a/GPU/GPUTracking/TPCClusterFinder/ChargePos.h +++ b/GPU/GPUTracking/TPCClusterFinder/ChargePos.h @@ -47,6 +47,7 @@ struct ChargePos { GPUdi() tpccf::Row row() const { return gpad / TPC_PADS_PER_ROW_PADDED; } GPUdi() tpccf::Pad pad() const { return gpad % TPC_PADS_PER_ROW_PADDED - GPUCF_PADDING_PAD; } GPUdi() tpccf::TPCFragmentTime time() const { return timePadded - GPUCF_PADDING_TIME; } + GPUdi() tpccf::TPCFragmentTime globalTime() const { return timePadded; } private: // Maps the position of a pad given as row and index in that row to a unique diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index 10b52ca05da71..130453e833911 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -19,7 +19,8 @@ #include "GPUProcessor.h" #include "GPUDataTypes.h" #include "CfFragment.h" -#include "ML/onnx_interface.h" +#include "ML/ort_interface.h" +#include "ML/3rdparty/GPUORTFloat16.h" using 
namespace o2::ml;

@@ -144,16 +145,20 @@ class GPUTPCClusterFinder : public GPUProcessor
   int16_t mZSOffsetId = -1;
   int16_t mOutputId = -1;

-  int nnSizeInputRow = 3;
-  int nnSizeInputPad = 3;
-  int nnSizeInputTime = 3;
-  bool nnAddIndexData = true;
+  int nnClusterizerSizeInputRow = 3;
+  int nnClusterizerSizeInputPad = 3;
+  int nnClusterizerSizeInputTime = 3;
+  int nnClusterizerElementSize = -1;
+  bool nnClusterizerAddIndexData = true;
   float nnClassThreshold = 0.16;
-  bool nnSigmoidTrafoThreshold = 1;
-  int nnClusterizerVerbosity = 1;
-
-  OnnxModel model_class, model_reg;
-
+  bool nnSigmoidTrafoClassThreshold = 1;
+  int nnClusterizerUseCFregression = 0;
+  int nnClusterizerBatchedMode = 1;
+  int nnClusterizerVerbosity = 0;
+
+  std::unordered_map<std::string, std::string> OrtOptions;
+  OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters
+
 #ifndef GPUCA_GPUCODE
   void DumpDigits(std::ostream& out);
   void DumpChargeMap(std::ostream& out, std::string_view);
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
index 98f7cdee72b0c..e6cf745ce3101 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
@@ -34,76 +34,63 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlo
   tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;

-  GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, clusterer.nnSizeInputRow, clusterer.nnSizeInputPad, clusterer.nnSizeInputTime, clusterer.nnAddIndexData, clusterer.nnClassThreshold, clusterer.nnSigmoidTrafoThreshold, clusterer.nnClusterizerVerbosity);
-
+  if(clusterer.OrtOptions["dtype"].find("32") != std::string::npos){
+    GPUTPCNNClusterizer::nn_clusterizer<float>(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
+  } else if(clusterer.OrtOptions["dtype"].find("16") != std::string::npos) {
+    GPUTPCNNClusterizer::nn_clusterizer<OrtDataType::Float16_t>(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
+  } else {
+    LOG(fatal) << "Unsupported data type for neural network clusterizer!";
+  }
   // tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
   //
   // GPUTPCNNClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
 }
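[Editorial sketch, not part of the patch: the dispatch above keys on substrings of the configured nnInferenceDtype option. Assuming the only valid values are "fp32" and "fp16" (as documented in GPUSettingsList.h), the mapping is:]

#include <iostream>
#include <string>

int main()
{
  // Same substring logic as in Thread<0> above.
  for (const std::string dtype : {"fp32", "fp16", "int8"}) {
    if (dtype.find("32") != std::string::npos) {
      std::cout << dtype << " -> nn_clusterizer<float>\n";
    } else if (dtype.find("16") != std::string::npos) {
      std::cout << dtype << " -> nn_clusterizer<OrtDataType::Float16_t>\n";
    } else {
      std::cout << dtype << " -> fatal: unsupported dtype\n";
    }
  }
  return 0;
}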
-// GPUd() void GPUTPCNNClusterizer::exec(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char onlyMC)
-// {
-//   Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
-//   CPU_ONLY(
-//     MCLabelAccumulator labelAcc(clusterer));
-//
-//   tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
-//
-//   std::string path_class = "", path_reg = "";
-//
-//   clusterer.model_class.init(path_class, 1, 0);
-//   clusterer.model_reg.init(path_reg, 1, 0);
-//
-//   GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, true, 0.16, true);
-// }
+int GPUTPCNNClusterizer::padOffset(int row_ref, int row_current, const GPUTPCGeometry& geo)
+{
+  return (int)((geo.NPads(row_current) - geo.NPads(row_ref)) / 2);
+}

-int GPUTPCNNClusterizer::padOffset(int row_ref, int row_current)
+int GPUTPCNNClusterizer::rowOffset(int row, int global_shift)
 {
-  std::vector<int> pad_row_max{
-    65, 65, 65, 67, 67, 67, 69, 69, 69, 71, 71, 71, 73, 73, 73, 73, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 81, 81, 81, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 91, 91, 91, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 99, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 79, 81, 81, 81, 83, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 101, 101, 101, 103, 103, 103, 105, 109, 109, 111, 111, 111, 113, 113, 113, 115, 115, 115, 117, 117, 117, 117, 117, 119, 119, 121, 121, 123, 123, 123, 125, 125, 127, 127, 127, 129, 129, 131, 131, 131, 133, 133, 135, 135, 137, 137
-  };
-  return (int)((pad_row_max[row_ref] - pad_row_max[row_current]) / 2);
+  return (row > 62 ? global_shift : 0);
 }
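[Editorial sketch, not part of the patch: what the two helpers above compute. The pad counts below are invented for illustration; the real values come from GPUTPCGeometry::NPads.]

#include <iostream>

static int nPadsFake(int row) { return (row <= 62) ? 66 + (row / 8) * 2 : 76 + ((row - 63) / 10) * 2; } // made-up pad plane

static int padOffset(int row_ref, int row_current) { return (nPadsFake(row_current) - nPadsFake(row_ref)) / 2; }
static int rowOffset(int row, int global_shift) { return (row > 62 ? global_shift : 0); }

int main()
{
  // padOffset shifts the window by half the pad-count difference, so the
  // window stays centered when it spans rows of different widths.
  std::cout << padOffset(40, 45) << "\n";
  // rowOffset accounts for the artificial gap rows inserted between IROC and OROC:
  std::cout << rowOffset(70, 3) << "\n"; // OROC row: shifted by global_shift
  std::cout << rowOffset(10, 3) << "\n"; // IROC row: no shift
  return 0;
}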
 // ---------------------------------

-bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift)
+bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const GPUTPCGeometry& geo)
 {
-  std::vector<int> pad_row_max{
-    65, 65, 65, 67, 67, 67, 69, 69, 69, 71, 71, 71, 73, 73, 73, 73, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 81, 81, 81, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 91, 91, 91, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 99, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 79, 81, 81, 81, 83, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 101, 101, 101, 103, 103, 103, 105, 109, 109, 111, 111, 111, 113, 113, 113, 115, 115, 115, 117, 117, 117, 117, 117, 119, 119, 121, 121, 123, 123, 123, 125, 125, 127, 127, 127, 129, 129, 131, 131, 131, 133, 133, 135, 135, 137, 137
-  };
-  if (row < 0 || pad < 0) {
+  if (pad < 0 || row < 0) { // Faster short-circuit
     return true;
   } else if (row <= 62) {
-    // if (pad < (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] - pad_row_max[row]) / 2 || pad > (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] + pad_row_max[row]) / 2) {
+    // if (pad < (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] - geo.NPads(row)) / 2 || pad > (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] + geo.NPads(row)) / 2) {
    //   return true;
    // } else {
    //   return false;
    // }
-    if (pad < 0 || pad > pad_row_max[row]) {
+    if (pad < 0 || pad > geo.NPads(row)) {
      return true;
    } else {
      return false;
    }
-  } else if (row <= 62 + global_shift) {
+  } else if (row <= 62 + global_shift) { // to account for the gap between IROC and OROC. Charge will be set to -1 in order to signal boundary to the neural network
    return true;
  } else if (row <= o2::tpc::constants::MAXGLOBALPADROW-1 + global_shift) {
-    //if (pad < (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] - pad_row_max[row - global_shift]) / 2 || pad > (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] + pad_row_max[row - global_shift]) / 2) {
+    //if (pad < (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] - geo.NPads(row)- global_shift]) / 2 || pad > (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] + geo.NPads(row)- global_shift]) / 2) {
    // return true;
    //} else {
    // return false;
    //}
-    if (pad < 0 || pad > pad_row_max[row]) {
+    if (pad < 0 || pad > geo.NPads(row)) {
      return true;
    } else {
      return false;
    }
-  } else if (row > o2::tpc::constants::MAXGLOBALPADROW-1 + global_shift) {
-    return true;
  } else {
-    return false;
+    return true;
  }
 }

+template <class T>
 GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int iBlock, int iThread, processorType& clusterer,
                                                 const CfFragment& fragment,
@@ -116,104 +103,321 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i
                                                 uint maxClusterPerRow,
                                                 uint* clusterInRow,
                                                 tpc::ClusterNative* clusterByRow,
-                                                uint* clusterPosInRow,
-                                                int in_row, int in_pad, int in_time, bool add_index_data, float class_threshold, bool sigmoid_transform, int verbosity){
+                                                uint* clusterPosInRow){

-  std::vector<float> input_data(((2*in_row + 1) * (2*in_pad + 1) * (2*in_time + 1) + (add_index_data ? 3 : 0)), -1.f);
-  float classification_threshold = class_threshold;
-  if(sigmoid_transform){
-    classification_threshold = (float)std::log(class_threshold/(1.f-class_threshold));
-  }
-
-  uint idx = get_global_id(0);
-  uint cls = CAMath::Min(idx, clusternum - 1);
+  uint glo_idx = get_global_id(0) * clusterer.nnClusterizerBatchedMode;
+  if(glo_idx >= clusternum){
+    return;
+  }

-  // For certain configurations dummy work items are added, so the total
-  // number of work items is dividable by 64.
-  // These dummy items also compute the last cluster but discard the result.
-
-  ChargePos peak = clusterer.mPfilteredPeakPositions[cls];
-  int row = peak.row(), pad = peak.pad(), time = peak.time();
-  float central_charge = chargeMap[peak].unpack();
-  CPU_ONLY(labelAcc->collect(peak, central_charge));
-  // unsigned int glo_idx = cls * ((2*in_row + 1) + (2*in_pad + 1) * (2*in_time + 1));
-  unsigned int write_idx = 0;
-  for(int r = -in_row; r <= in_row; r++){
-    for(int p = -in_pad; p <= in_pad; p++){
-      for(int t = -in_time; t <= in_time; t++){
-        int offset = GPUTPCNNClusterizer::padOffset(row, row + r);
-        if(GPUTPCNNClusterizer::isBoundary(row + r, pad + p, in_row)){
-          continue;
-        } else {
-          // unsigned int loc_idx = (row + r) * (2*in_pad + 1) * (2*in_time + 1) + (pad + p) * (2*in_time + 1) + (time + t);
-          ChargePos tmp_pos(row + r, pad + p + offset, time + t);
-          input_data[write_idx] = (chargeMap[tmp_pos].unpack() / central_charge);
-          write_idx++;
+  std::vector<float> central_charges(clusterer.nnClusterizerBatchedMode, -1.f);
+  std::vector<T> input_data(clusterer.nnClusterizerElementSize * clusterer.nnClusterizerBatchedMode, (T)-1.f);
+  std::vector<ChargePos> peak_positions(clusterer.nnClusterizerBatchedMode);
+  unsigned int write_idx = 0;
+
+  for(int batch_counter = 0; batch_counter < clusterer.nnClusterizerBatchedMode; batch_counter++){
+
+    uint cls = CAMath::Min(glo_idx + batch_counter, clusternum - 1);
+
+    ChargePos peak = clusterer.mPfilteredPeakPositions[cls];
+    int row = peak.row(), pad = peak.pad(), time = peak.time();
+    float central_charge = chargeMap[peak].unpack();
+
+    peak_positions[batch_counter] = peak;
+    central_charges[batch_counter] = central_charge;
+
+    // unsigned int batch_offset = batch_counter * clusterer.nnClusterizerElementSize;
+    for(int r = -clusterer.nnClusterizerSizeInputRow; r <= clusterer.nnClusterizerSizeInputRow; r++){
+      bool push_mc_label = (r == 0);
+      int pad_offset = GPUTPCNNClusterizer::padOffset(row, row + r, clusterer.Param().tpcGeometry);
+      int row_offset = GPUTPCNNClusterizer::rowOffset(row, clusterer.nnClusterizerSizeInputRow);
+      for(int p = -clusterer.nnClusterizerSizeInputPad; p <= clusterer.nnClusterizerSizeInputPad; p++){
+        push_mc_label &= (std::abs(p) < 2); // Use inner 3x3 (pad-time) window: |p| <= 1
+        bool is_boundary = GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p + pad_offset, clusterer.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry);
+        for(int t = -clusterer.nnClusterizerSizeInputTime; t <= clusterer.nnClusterizerSizeInputTime; t++){
+          push_mc_label &= (std::abs(t) < 2); // Use inner 3x3 (pad-time) window: |t| <= 1
+          if(!is_boundary){
+            ChargePos tmp_pos(row + r, pad + p + pad_offset, time + t);
+            input_data[write_idx] = (T)(chargeMap[tmp_pos].unpack() / central_charge);
+            if(push_mc_label){
+              ChargePos tmp_pos_mc(row, pad + p, time + t);
+              CPU_ONLY(labelAcc->collect(tmp_pos, chargeMap[tmp_pos_mc].unpack()));
+            }
+          }
+          write_idx++;
         }
      }
    }
-  //     if(idx == 100){
-  //       LOG(info) << "[" << input_data[write_idx-7] << ", " << input_data[write_idx-6] << ", " << input_data[write_idx-5] << ", " << input_data[write_idx-4] << ", " << input_data[write_idx-3] << ", " << input_data[write_idx-2] << ", " << input_data[write_idx-1] << "]";
-  //     }
+    if(clusterer.nnClusterizerAddIndexData){
+      input_data[write_idx] = (T)(clusterer.mISlice / 36.f);
+      input_data[write_idx + 1] = (T)(row / 152.f);
+      input_data[write_idx + 2] = (T)((float)pad / clusterer.Param().tpcGeometry.NPads(row));
+      write_idx+=3;
+      // if(idx == 100){
+      //   LOG(info) << "[" << input_data[input_data.size()-3] << ", " << input_data[input_data.size()-2] << ", " << input_data[input_data.size()-1] << "]";
+      // }
+    }
+  }
-  }
-  if(add_index_data){
-    input_data[input_data.size()-3] = 1;
-    input_data[input_data.size()-2] = (float)peak.row() / 152.f;
-    input_data[input_data.size()-1] = (float)peak.pad() / 138.f;
-    // if(idx == 100){
-    //   LOG(info) << "[" << input_data[input_data.size()-3] << ", " << input_data[input_data.size()-2] << ", " << input_data[input_data.size()-1] << "]";
-    // }
-  }

-  std::vector<float> out_class = clusterer.model_class.inference_vector(input_data, 1);
-  std::vector<float> out_reg = clusterer.model_reg.inference_vector(input_data, 1);
-  int num_outputs = clusterer.model_reg.getNumOutputNodes()[0][1];
+  std::vector<int> index_class_2;
+  std::vector<T> out_class = clusterer.model_class.inference<T, T>(input_data);
+  // LOG(info) << "input_data.size(): " << input_data.size() << "; write_idx: " << write_idx << "; out_class.size(): " << out_class.size();
+  int num_output_classes = clusterer.model_class.getNumOutputNodes()[0][1];
+
+  if(num_output_classes > 1){
+    std::vector<T> tmp_out_class(clusterer.nnClusterizerBatchedMode);
+    for(int cls_idx = 0; cls_idx < clusterer.nnClusterizerBatchedMode; cls_idx++){
+      auto elem_iterator = out_class.begin() + (unsigned int)(cls_idx*num_output_classes);
+      tmp_out_class[cls_idx] = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator+num_output_classes)) - 1; // -1 since a 2-class classifier will have 3 outputs: classes 0, 1, 2
+      if(tmp_out_class[cls_idx] > 1){
+        index_class_2.push_back(cls_idx);
+      }
+    }
+    out_class = tmp_out_class;
+  }
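[Editorial sketch, not part of the patch: why the inverse-sigmoid transformation of nnClassThreshold (applied in GPUChainTrackingClusterizer.cxx above) lets the raw network output be compared directly against the threshold. Values below are made up.]

#include <cmath>
#include <iostream>
#include <vector>

int main()
{
  // If the network ends in a sigmoid, then sigmoid(x) > t  <=>  x > log(t / (1 - t)),
  // so the sigmoid never has to be evaluated per candidate.
  float t = 0.5f;                                     // configured nnClassThreshold
  float logit_cut = std::log(t / (1.f - t));          // = 0 for t = 0.5
  std::vector<float> raw_outputs{-1.2f, 0.3f, 2.0f};  // pre-sigmoid outputs (made up)
  for (float x : raw_outputs) {
    std::cout << x << (x > logit_cut ? " -> accept\n" : " -> reject\n");
  }
  return 0;
}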
" << classification_threshold << ")"; - LOG(info) << "Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4]; - } + if(!clusterer.nnClusterizerUseCFregression) { + + std::vector out_reg = clusterer.model_reg_1.inference(input_data), tmp_out_reg_2; + if(index_class_2.size() > 0){ + std::vector tmp_in_reg_2(index_class_2.size() * clusterer.nnClusterizerElementSize); + int fill_counter = 0; + for(int cls_idx : index_class_2){ + int from_idx = cls_idx*clusterer.nnClusterizerElementSize, to_idx = fill_counter * clusterer.nnClusterizerElementSize; + for(int reg_idx = 0; reg_idx < clusterer.nnClusterizerElementSize; reg_idx++){ + tmp_in_reg_2[to_idx + reg_idx] = input_data[from_idx + reg_idx]; + } + fill_counter++; + } + tmp_out_reg_2 = clusterer.model_reg_2.inference(input_data); + } + + input_data.clear(); - if(out_class[0] > classification_threshold){ - ClusterAccumulator pc; - pc.setFull(central_charge * out_reg[4], peak.pad() + out_reg[0], out_reg[2], fragment.start + peak.time() + out_reg[1], out_reg[3], 0, 0); - tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param()); - if ((verbosity > 0) && rejectCluster) { - LOG(warning) << "Cluster rejected!"; - if (clusterPosInRow) { - clusterPosInRow[idx] = maxClusterPerRow; + if((clusterer.nnClusterizerVerbosity >= 4) && glo_idx == 0){ + LOG(info) << "[CF] Classification model: " << out_class[0] << " (>? " << clusterer.nnClassThreshold << ")"; + LOG(info) << "[CF] Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4]; } - return; - } - uint rowIndex = 0; - if (clusterByRow != nullptr) { - rowIndex = sortIntoBuckets( - clusterer, - myCluster, - peak.row(), - maxClusterPerRow, - clusterInRow, - clusterByRow); - if (clusterPosInRow != nullptr) { - clusterPosInRow[idx] = rowIndex; + int num_outputs_1 = clusterer.model_reg_1.getNumOutputNodes()[0][1], num_outputs_2 = 0, counter_class_2_idcs = 0; + if(num_output_classes > 1){ + num_outputs_2 = clusterer.model_reg_2.getNumOutputNodes()[0][1]; } - } else if (clusterPosInRow) { - rowIndex = clusterPosInRow[idx]; - } - CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, maxClusterPerRow)); - } else { - if (clusterPosInRow) { - clusterPosInRow[idx] = maxClusterPerRow; - } - return; - } - if((verbosity > 4) && idx == 100){ - LOG(info) << "Clusterization done!"; - } + for(int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) { + + if (glo_idx + element >= clusternum) { + return; + } + int model_output_index = element*num_outputs_1; + if(out_class[element] > clusterer.nnClassThreshold) { + if((num_output_classes == 1) || ((num_output_classes > 1) && (out_class[element] < 2))) { + // CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); + ClusterAccumulator pc; + + ClusterAccumulator dummy_pc; + CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); + + // Dummy build to push MC labels + buildCluster( + calib, + chargeMap, + peak_positions[element], + smem.posBcast, + smem.buf, + smem.innerAboveThreshold, + &dummy_pc, + labelAcc); + + if (fragment.isOverlap(peak_positions[element].time())) { + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + + pc.setFull(central_charges[element] * out_reg[model_output_index + 4], peak_positions[element].pad() + out_reg[model_output_index + 0], out_reg[model_output_index 
+    for(int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) {
+
+      if (glo_idx + element >= clusternum) {
+        return;
+      }
+      int model_output_index = element*num_outputs_1;
+      if(out_class[element] > clusterer.nnClassThreshold) {
+        if((num_output_classes == 1) || ((num_output_classes > 1) && (out_class[element] < 2))) {
+          // CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+          ClusterAccumulator pc;
+
+          ClusterAccumulator dummy_pc;
+          CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+
+          // Dummy build to push MC labels
+          buildCluster(
+            calib,
+            chargeMap,
+            peak_positions[element],
+            smem.posBcast,
+            smem.buf,
+            smem.innerAboveThreshold,
+            &dummy_pc,
+            labelAcc);
+
+          if (fragment.isOverlap(peak_positions[element].time())) {
+            if (clusterPosInRow) {
+              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+            }
+            continue;
+          }
+
+          pc.setFull(central_charges[element] * out_reg[model_output_index + 4], peak_positions[element].pad() + out_reg[model_output_index + 0], out_reg[model_output_index + 2], fragment.start + peak_positions[element].time() + out_reg[model_output_index + 1], out_reg[model_output_index + 3], 0, 0);
+          // LOG(info) << "Example: " << num_outputs_1 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3];
+
+          tpc::ClusterNative myCluster;
+          bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param());
+          if (rejectCluster) {
+            if(clusterer.nnClusterizerVerbosity > 3){
+              LOG(warning) << "[CF] Cluster rejected!";
+            }
+            if (clusterPosInRow) {
+              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+            }
+            continue;
+          }
+
+          uint rowIndex = 0;
+          if (clusterByRow != nullptr) {
+            rowIndex = sortIntoBuckets(
+              clusterer,
+              myCluster,
+              peak_positions[element].row(),
+              maxClusterPerRow,
+              clusterInRow,
+              clusterByRow);
+            if (clusterPosInRow != nullptr) {
+              clusterPosInRow[glo_idx + element] = rowIndex;
+            }
+          } else if (clusterPosInRow) {
+            rowIndex = clusterPosInRow[glo_idx + element];
+          }
+          CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow));
+        } else {
+          model_output_index = index_class_2[counter_class_2_idcs]*num_outputs_2;
+          counter_class_2_idcs++;
+
+          // Cluster 1
+          CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+          ClusterAccumulator pc;
+
+          if (fragment.isOverlap(peak_positions[element].time())) {
+            if (clusterPosInRow) {
+              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+            }
+            continue;
+          }
+
+          pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 8], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 0], tmp_out_reg_2[model_output_index + 4], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 2], tmp_out_reg_2[model_output_index + 6], 0, 0);
+          // LOG(info) << "Example: " << num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3];
+
+          tpc::ClusterNative myCluster;
+          bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param());
+          if (rejectCluster) {
+            if(clusterer.nnClusterizerVerbosity > 3){
+              LOG(warning) << "[CF] Cluster rejected!";
+            }
+            if (clusterPosInRow) {
+              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+            }
+            continue;
+          }
+
+          uint rowIndex = 0;
+          if (clusterByRow != nullptr) {
+            rowIndex = sortIntoBuckets(
+              clusterer,
+              myCluster,
+              peak_positions[element].row(),
+              maxClusterPerRow,
+              clusterInRow,
+              clusterByRow);
+            if (clusterPosInRow != nullptr) {
+              clusterPosInRow[glo_idx + element] = rowIndex;
+            }
+          } else if (clusterPosInRow) {
+            rowIndex = clusterPosInRow[glo_idx + element];
+          }
+          CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow));
+
+          // Cluster 2
+          CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+          pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 9], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 1], tmp_out_reg_2[model_output_index + 5], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 3], tmp_out_reg_2[model_output_index + 7], 0, 0);
+          // LOG(info) << "Example: " << 
num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; + rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); + if (rejectCluster) { + if(clusterer.nnClusterizerVerbosity > 3){ + LOG(warning) << "[CF] Cluster rejected!"; + } + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + + rowIndex = 0; + if (clusterByRow != nullptr) { + rowIndex = sortIntoBuckets( + clusterer, + myCluster, + peak_positions[element].row(), + maxClusterPerRow, + clusterInRow, + clusterByRow); + if (clusterPosInRow != nullptr) { + clusterPosInRow[glo_idx + element] = rowIndex; + } + } else if (clusterPosInRow) { + rowIndex = clusterPosInRow[glo_idx + element]; + } + CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); + } + } + } + + } else { + + input_data.clear(); + for(int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) { + if (glo_idx + element >= clusternum) { + return; + } + + if(out_class[element] > clusterer.nnClassThreshold) { + + ClusterAccumulator pc; + CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); + + buildCluster( + calib, + chargeMap, + peak_positions[element], + smem.posBcast, + smem.buf, + smem.innerAboveThreshold, + &pc, + labelAcc); + + if (fragment.isOverlap(peak_positions[element].time())) { + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + pc.finalize(peak_positions[element], central_charges[element], fragment.start, clusterer.Param().tpcGeometry); + + tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); + + if (rejectCluster) { + if(clusterer.nnClusterizerVerbosity > 3){ + LOG(warning) << "[CF] Cluster rejected!"; + } + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + + uint rowIndex = 0; + if (clusterByRow != nullptr) { + rowIndex = sortIntoBuckets( + clusterer, + myCluster, + peak_positions[element].row(), + maxClusterPerRow, + clusterInRow, + clusterByRow); + if (clusterPosInRow != nullptr) { + clusterPosInRow[glo_idx + element] = rowIndex; + } + } else if (clusterPosInRow) { + rowIndex = clusterPosInRow[glo_idx + element]; + } + + CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); + } + } + } + + if(clusterer.nnClusterizerVerbosity > 4){ + LOG(info) << "[CF] Clusterization done!"; + } } @@ -449,4 +653,4 @@ GPUd() uint GPUTPCNNClusterizer::sortIntoBuckets(processorType& clusterer, const CAMath::AtomicExch(&elemsInBucket[row], maxElemsPerBucket); } return index; -} +} \ No newline at end of file diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 7fbf5a806a916..42104ae2099d3 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -62,8 +62,11 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate static GPUd() void computeClustersImpl(int, int, int, int, processorType&, const CfFragment&, GPUSharedMemory&, const Array2D&, const ChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint, uint, uint*, 
tpc::ClusterNative*, uint*); static GPUd() void exec(int, int, int, int, GPUSharedMemory&, processorType&, char); - static int padOffset(int, int); - static bool isBoundary(int, int, int); + static int padOffset(int, int, const GPUTPCGeometry&); + static int rowOffset(int, int); + static bool isBoundary(int, int, int, const GPUTPCGeometry&); + + template static GPUd() void nn_clusterizer(int, int, int, int, processorType&, const CfFragment&, @@ -76,8 +79,7 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate uint, uint*, tpc::ClusterNative*, - uint*, - int = 3, int = 3, int = 3, bool = 1, float = 0.16, bool = true, int = 1); + uint*); private: // --------------------------------- @@ -93,4 +95,4 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate } // namespace GPUCA_NAMESPACE::gpu -#endif +#endif \ No newline at end of file From 06737fd8d044a75d4e6da947a3ae6792c7ae42af Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 18 Oct 2024 09:16:01 +0200 Subject: [PATCH 12/23] Fixing uchar -> uint8_t --- GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h | 6 +++--- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx | 8 ++++---- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h index 534cc44513286..d308b8bd6efa7 100644 --- a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h +++ b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h @@ -43,7 +43,7 @@ class ClusterAccumulator GPUd() void finalize(const ChargePos&, tpccf::Charge, tpccf::TPCTime, const GPUTPCGeometry&); GPUd() bool toNative(const ChargePos&, tpccf::Charge, tpc::ClusterNative&, const GPUParam&) const; - GPUd() void setFull(float qtot, float padMean, float padSigma, float timeMean, float timeSigma, uchar splitInTime, uchar splitInPad){ + GPUd() void setFull(float qtot, float padMean, float padSigma, float timeMean, float timeSigma, uint8_t splitInTime, uint8_t splitInPad){ mQtot = qtot; mPadMean = padMean; mPadSigma = padSigma; @@ -57,8 +57,8 @@ class ClusterAccumulator GPUd() void setPadSigma(float padSigma) { mPadSigma = padSigma; } GPUd() void setTimeMean(float timeMean) { mTimeMean = timeMean; } GPUd() void setTimeSigma(float timeSigma) { mTimeSigma = timeSigma; } - GPUd() void setSplitInTime(uchar splitInTime) { mSplitInTime = splitInTime; } - GPUd() void setSplitInPad(uchar splitInPad) { mSplitInPad = splitInPad; } + GPUd() void setSplitInTime(uint8_t splitInTime) { mSplitInTime = splitInTime; } + GPUd() void setSplitInPad(uint8_t splitInPad) { mSplitInPad = splitInPad; } private: float mQtot = 0; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index e6cf745ce3101..f5e094a3c363e 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -505,9 +505,9 @@ GPUdii() void GPUTPCNNClusterizer::updateClusterInner( const ChargePos& pos, ClusterAccumulator* cluster, MCLabelAccumulator* labelAcc, - uchar* innerAboveThreshold) + uint8_t* innerAboveThreshold) { - uchar aboveThreshold = 0; + uint8_t aboveThreshold = 0; GPUCA_UNROLL(U(), U()) for (ushort i = 0; i < N; i++) { @@ -520,7 +520,7 @@ GPUdii() void GPUTPCNNClusterizer::updateClusterInner( CPU_ONLY( labelAcc->collect(pos.delta(d), q)); - aboveThreshold |= (uchar(q > calib.tpc.cfInnerThreshold) << i); + 
aboveThreshold |= (uint8_t(q > calib.tpc.cfInnerThreshold) << i); } innerAboveThreshold[lid] = aboveThreshold; @@ -558,7 +558,7 @@ GPUdii() void GPUTPCNNClusterizer::buildCluster( ChargePos pos, ChargePos* posBcast, PackedCharge* buf, - uchar* innerAboveThreshold, + uint8_t* innerAboveThreshold, ClusterAccumulator* myCluster, MCLabelAccumulator* labelAcc) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 42104ae2099d3..51a5c29022421 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -40,7 +40,7 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate struct GPUSharedMemory { ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_BUILD_N]; - uchar innerAboveThreshold[SCRATCH_PAD_WORK_GROUP_SIZE]; + uint8_t innerAboveThreshold[SCRATCH_PAD_WORK_GROUP_SIZE]; }; #ifdef GPUCA_HAVE_O2HEADERS @@ -84,11 +84,11 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate private: // --------------------------------- - static GPUd() void updateClusterInner(const GPUSettingsRec&, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uchar*); + static GPUd() void updateClusterInner(const GPUSettingsRec&, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uint8_t*); static GPUd() void updateClusterOuter(ushort, ushort, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*); - static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D&, ChargePos, ChargePos*, PackedCharge*, uchar*, ClusterAccumulator*, MCLabelAccumulator*); + static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D&, ChargePos, ChargePos*, PackedCharge*, uint8_t*, ClusterAccumulator*, MCLabelAccumulator*); static GPUd() uint sortIntoBuckets(processorType&, const tpc::ClusterNative&, uint, uint, uint*, tpc::ClusterNative*); }; From b14844990173a00a66d9e2ad62185232ab3992d6 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 18 Oct 2024 09:55:31 +0200 Subject: [PATCH 13/23] Adding utils header --- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 1 + 1 file changed, 1 insertion(+) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index eafd50a72424f..0f22a7472feac 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -37,6 +37,7 @@ #endif #include "utils/strtag.h" +#include #ifndef GPUCA_NO_VC #include From 534da50f248210cff92acdeac763f4f74a2de30e Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 21 Oct 2024 09:40:43 +0200 Subject: [PATCH 14/23] Updating kernels.cmake to uint8_t --- GPU/GPUTracking/kernels.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index 5b5aed94a7472..b6490c0c5b4c6 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -116,7 +116,7 @@ o2_gpu_add_kernel("GPUTPCCFPeakFinder" "= TPCCLUS o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression" "= TPCCLUSTERFINDER" LB single) o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks" "= TPCCLUSTERFINDER" LB single) o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB single) 
-o2_gpu_add_kernel("GPUTPCNNClusterizer" "= TPCCLUSTERFINDER" LB single char onlyMC) +o2_gpu_add_kernel("GPUTPCNNClusterizer" "= TPCCLUSTERFINDER" LB single int8_t onlyMC) o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUSTERFINDER" LB single int8_t onlyMC) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets" "= TPCCLUSTERFINDER" NO single) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, flatten" "= TPCCLUSTERFINDER" NO single GPUTPCLinearLabels* out) From bb2cb6e48d12f71fb634b1429bf284db23bb97ee Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Mon, 21 Oct 2024 07:41:20 +0000 Subject: [PATCH 15/23] Please consider the following formatting changes --- Common/ML/include/ML/ort_interface.h | 76 ++- Common/ML/src/ort_interface.cxx | 88 +-- .../Global/GPUChainTrackingClusterizer.cxx | 23 +- .../TPCClusterFinder/ClusterAccumulator.h | 3 +- .../TPCClusterFinder/GPUTPCClusterFinder.h | 2 +- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 539 +++++++++--------- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 26 +- 7 files changed, 385 insertions(+), 372 deletions(-) diff --git a/Common/ML/include/ML/ort_interface.h b/Common/ML/include/ML/ort_interface.h index a365860db3279..2fe9a44a0623c 100644 --- a/Common/ML/include/ML/ort_interface.h +++ b/Common/ML/include/ML/ort_interface.h @@ -35,60 +35,58 @@ namespace ml class OrtModel { - public: - // Constructor - OrtModel() = default; - OrtModel(std::unordered_map optionsMap){ reset(optionsMap); } - void init(std::unordered_map optionsMap){ reset(optionsMap); } - void reset(std::unordered_map); + public: + // Constructor + OrtModel() = default; + OrtModel(std::unordered_map optionsMap) { reset(optionsMap); } + void init(std::unordered_map optionsMap) { reset(optionsMap); } + void reset(std::unordered_map); - virtual ~OrtModel() = default; + virtual ~OrtModel() = default; - // Conversion - template - std::vector v2v(std::vector&, bool = true); + // Conversion + template + std::vector v2v(std::vector&, bool = true); - // Inferencing - template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h - std::vector inference(std::vector&); + // Inferencing + template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h + std::vector inference(std::vector&); - template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h - std::vector inference(std::vector>&); + template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h + std::vector inference(std::vector>&); - // template // class I is the input data type, e.g. float, class T the throughput data type and class O is the output data type - // std::vector inference(std::vector&); + // template // class I is the input data type, e.g. 
float, class T the throughput data type and class O is the output data type + // std::vector inference(std::vector&); - // Reset session - void resetSession(); + // Reset session + void resetSession(); - std::vector> getNumInputNodes() const { return mInputShapes; } - std::vector> getNumOutputNodes() const { return mOutputShapes; } - std::vector getInputNames() const { return mInputNames; } - std::vector getOutputNames() const { return mOutputNames; } + std::vector> getNumInputNodes() const { return mInputShapes; } + std::vector> getNumOutputNodes() const { return mOutputShapes; } + std::vector getInputNames() const { return mInputNames; } + std::vector getOutputNames() const { return mOutputNames; } - void setActiveThreads(int threads) { intraOpNumThreads = threads; } + void setActiveThreads(int threads) { intraOpNumThreads = threads; } - private: + private: + // ORT variables -> need to be hidden as Pimpl + struct OrtVariables; + OrtVariables* pImplOrt; - // ORT variables -> need to be hidden as Pimpl - struct OrtVariables; - OrtVariables* pImplOrt; + // Input & Output specifications of the loaded network + std::vector inputNamesChar, outputNamesChar; + std::vector mInputNames, mOutputNames; + std::vector> mInputShapes, mOutputShapes; - // Input & Output specifications of the loaded network - std::vector inputNamesChar, outputNamesChar; - std::vector mInputNames, mOutputNames; - std::vector> mInputShapes, mOutputShapes; - - // Environment settings - std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda - int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; - - std::string printShape(const std::vector&); + // Environment settings + std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda + int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; + std::string printShape(const std::vector&); }; } // namespace ml -} // namespace ml +} // namespace o2 #endif // O2_ML_ORT_INTERFACE_H \ No newline at end of file diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx index 84a06ce1da068..8ebe0588b4a2b 100644 --- a/Common/ML/src/ort_interface.cxx +++ b/Common/ML/src/ort_interface.cxx @@ -25,7 +25,7 @@ namespace o2 namespace ml { -struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file +struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file // ORT runtime objects Ort::RunOptions runOptions; std::shared_ptr env = nullptr; @@ -35,12 +35,13 @@ struct OrtModel::OrtVariables { // The actual implementation is hidden in the . Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); }; -void OrtModel::reset(std::unordered_map optionsMap){ +void OrtModel::reset(std::unordered_map optionsMap) +{ pImplOrt = new OrtVariables(); // Load from options map - if(!optionsMap.contains("model-path")){ + if (!optionsMap.contains("model-path")) { LOG(fatal) << "(ORT) Model path cannot be empty!"; } modelPath = optionsMap["model-path"]; @@ -48,42 +49,42 @@ void OrtModel::reset(std::unordered_map optionsMap){ dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); deviceId = (optionsMap.contains("device-id") ? 
std::stoi(optionsMap["device-id"]) : 0); allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); - intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); + intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 0); enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); std::string dev_mem_str = "Hip"; #ifdef ORT_ROCM_BUILD - if(device == "ROCM") { + if (device == "ROCM") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) ROCM execution provider set"; } #endif #ifdef ORT_MIGRAPHX_BUILD - if(device == "MIGRAPHX") { + if (device == "MIGRAPHX") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) MIGraphX execution provider set"; } #endif #ifdef ORT_CUDA_BUILD - if(device == "CUDA") { + if (device == "CUDA") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) CUDA execution provider set"; dev_mem_str = "Cuda"; } #endif - if(allocateDeviceMemory){ + if (allocateDeviceMemory) { pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); LOG(info) << "(ORT) Memory info set to on-device memory"; } - if(device == "CPU") { + if (device == "CPU") { (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); - if(intraOpNumThreads > 1){ + if (intraOpNumThreads > 1) { (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); - } else if(intraOpNumThreads == 1){ + } else if (intraOpNumThreads == 1) { (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); } LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " threads"; @@ -92,8 +93,8 @@ void OrtModel::reset(std::unordered_map optionsMap){ (pImplOrt->sessionOptions).DisableMemPattern(); (pImplOrt->sessionOptions).DisableCpuMemArena(); - if(enableProfiling){ - if(optionsMap.contains("profiling-output-path")){ + if (enableProfiling) { + if (optionsMap.contains("profiling-output-path")) { (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); } else { LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. 
Disabling profiling for now."; @@ -109,27 +110,27 @@ void OrtModel::reset(std::unordered_map optionsMap){ (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); + mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); } for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); } for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); + mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); } for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); } inputNamesChar.resize(mInputNames.size(), nullptr); std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), - [&](const std::string& str) { return str.c_str(); }); + [&](const std::string& str) { return str.c_str(); }); outputNamesChar.resize(mOutputNames.size(), nullptr); std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), - [&](const std::string& str) { return str.c_str(); }); + [&](const std::string& str) { return str.c_str(); }); // Print names - if(loggingLevel > 1) { + if (loggingLevel > 1) { LOG(info) << "Input Nodes:"; for (size_t i = 0; i < mInputNames.size(); i++) { LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); @@ -142,24 +143,28 @@ void OrtModel::reset(std::unordered_map optionsMap){ } } -void OrtModel::resetSession() { +void OrtModel::resetSession() +{ (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); } -template -std::vector OrtModel::v2v(std::vector& input, bool clearInput) { - if constexpr (std::is_same_v){ +template +std::vector OrtModel::v2v(std::vector& input, bool clearInput) +{ + if constexpr (std::is_same_v) { return input; } else { std::vector output(input.size()); std::transform(std::begin(input), std::end(input), std::begin(output), [](I f) { return O(f); }); - if(clearInput) input.clear(); + if (clearInput) + input.clear(); return output; } } -template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -std::vector OrtModel::inference(std::vector& input){ +template // class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(input)).data(), input.size(), inputShape.data(), inputShape.size())); @@ -171,10 +176,11 @@ std::vector OrtModel::inference(std::vector& input){ return outputValuesVec; } -template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -std::vector OrtModel::inference(std::vector>& input){ +template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector>& input) +{ std::vector inputTensor; - for(auto i : input){ + for (auto i : input) { std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(i)).data(), i.size(), inputShape.data(), inputShape.size())); } @@ -195,7 +201,9 @@ std::string OrtModel::printShape(const std::vector& v) return ss.str(); } -template <> std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); @@ -207,7 +215,9 @@ template <> std::vector OrtModel::inference(std::vector std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); @@ -219,7 +229,9 @@ template <> std::vector OrtModel::inference std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); @@ -231,7 +243,9 @@ template <> std::vector OrtModel::inference std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); @@ -243,9 +257,11 @@ template <> std::vector OrtModel::inference std::vector OrtModel::inference(std::vector>& input) { +template <> +std::vector OrtModel::inference(std::vector>& input) +{ std::vector inputTensor; - for(auto i : input){ + for (auto i : input) { std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), 
(int64_t)mInputShapes[0][1]}; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); } diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 0f22a7472feac..d8470fdc2bf10 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -838,7 +838,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (clusterer.mPmemory->counters.nPeaks == 0) { continue; } - if(!GetProcessingSettings().applyNNclusterizer){ + if (!GetProcessingSettings().applyNNclusterizer) { runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); } else { @@ -875,14 +875,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); - if(GetProcessingSettings().applyNNclusterizer){ + if (GetProcessingSettings().applyNNclusterizer) { // Settings for the clusterizer clusterer.nnClusterizerUseCFregression = GetProcessingSettings().nnClusterizerUseCFregression; clusterer.nnClusterizerSizeInputRow = GetProcessingSettings().nnClusterizerSizeInputRow; clusterer.nnClusterizerSizeInputPad = GetProcessingSettings().nnClusterizerSizeInputPad; clusterer.nnClusterizerSizeInputTime = GetProcessingSettings().nnClusterizerSizeInputTime; clusterer.nnClusterizerAddIndexData = GetProcessingSettings().nnClusterizerAddIndexData; - clusterer.nnClusterizerElementSize = ((2*clusterer.nnClusterizerSizeInputRow + 1) * (2*clusterer.nnClusterizerSizeInputPad + 1) * (2*clusterer.nnClusterizerSizeInputTime + 1)) + (clusterer.nnClusterizerAddIndexData ? 3 : 0); + clusterer.nnClusterizerElementSize = ((2 * clusterer.nnClusterizerSizeInputRow + 1) * (2 * clusterer.nnClusterizerSizeInputPad + 1) * (2 * clusterer.nnClusterizerSizeInputTime + 1)) + (clusterer.nnClusterizerAddIndexData ? 
3 : 0); clusterer.nnClusterizerBatchedMode = GetProcessingSettings().nnClusterizerBatchedMode; clusterer.nnClusterizerVerbosity = GetProcessingSettings().nnInferenceVerbosity; @@ -893,7 +893,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Settings for the neural network evaluation clusterer.OrtOptions = { {"model-path", GetProcessingSettings().nnClassificationPath}, - {"device", GetProcessingSettings().nnInferenceDevice}, + {"device", GetProcessingSettings().nnInferenceDevice}, {"device-id", std::to_string(GetProcessingSettings().nnInferenceDeviceId)}, {"allocate-device-memory", std::to_string(GetProcessingSettings().nnInferenceAllocateDevMem)}, {"dtype", GetProcessingSettings().nnInferenceDtype}, @@ -901,16 +901,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) {"enable-optimizations", std::to_string(GetProcessingSettings().nnInferenceEnableOrtOptimization)}, {"enable-profiling", std::to_string(GetProcessingSettings().nnInferenceOrtProfiling)}, {"profiling-output-path", GetProcessingSettings().nnInferenceOrtProfilingPath}, - {"logging-level", std::to_string(GetProcessingSettings().nnInferenceVerbosity)} - }; + {"logging-level", std::to_string(GetProcessingSettings().nnInferenceVerbosity)}}; clusterer.model_class.init(clusterer.OrtOptions); - if(!clusterer.nnClusterizerUseCFregression){ + if (!clusterer.nnClusterizerUseCFregression) { std::vector reg_model_paths = o2::utils::Str::tokenize(GetProcessingSettings().nnRegressionPath, ':'); - if(clusterer.model_class.getNumOutputNodes()[0][1] == 1){ + if (clusterer.model_class.getNumOutputNodes()[0][1] == 1) { clusterer.OrtOptions["model-path"] = reg_model_paths[0]; clusterer.model_reg_1.init(clusterer.OrtOptions); } else { - if(reg_model_paths.size() == 1){ + if (reg_model_paths.size() == 1) { clusterer.OrtOptions["model-path"] = reg_model_paths[0]; clusterer.model_reg_1.init(clusterer.OrtOptions); } else { @@ -925,9 +924,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); } - if(clusterer.nnSigmoidTrafoClassThreshold){ + if (clusterer.nnSigmoidTrafoClassThreshold) { // Inverse sigmoid transformation - clusterer.nnClassThreshold = (float)std::log(clusterer.nnClassThreshold/(1.f-clusterer.nnClassThreshold)); + clusterer.nnClassThreshold = (float)std::log(clusterer.nnClassThreshold / (1.f - clusterer.nnClassThreshold)); } runKernel({GetGrid(std::ceil(clusterer.mPmemory->counters.nClusters / (float)clusterer.nnClusterizerBatchedMode), lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0); } else { @@ -939,7 +938,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (doGPU) { SynchronizeStream(lane); } - if(!GetProcessingSettings().applyNNclusterizer){ + if (!GetProcessingSettings().applyNNclusterizer) { runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); } else { runKernel({GetGrid(std::ceil(clusterer.mPmemory->counters.nClusters / (float)clusterer.nnClusterizerBatchedMode), lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); diff --git a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h index d308b8bd6efa7..b7e535a107eac 100644 --- a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h +++ b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h @@ -43,7 +43,8 @@ class 
ClusterAccumulator
 GPUd() void finalize(const ChargePos&, tpccf::Charge, tpccf::TPCTime, const GPUTPCGeometry&);
 GPUd() bool toNative(const ChargePos&, tpccf::Charge, tpc::ClusterNative&, const GPUParam&) const;
- GPUd() void setFull(float qtot, float padMean, float padSigma, float timeMean, float timeSigma, uint8_t splitInTime, uint8_t splitInPad){
+ GPUd() void setFull(float qtot, float padMean, float padSigma, float timeMean, float timeSigma, uint8_t splitInTime, uint8_t splitInPad)
+ {
 mQtot = qtot;
 mPadMean = padMean;
 mPadSigma = padSigma;
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h
index 130453e833911..fd420357073e9 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h
@@ -158,7 +158,7 @@ class GPUTPCClusterFinder : public GPUProcessor
 std::unordered_map<std::string, std::string> OrtOptions;
 OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters
-
+
 #ifndef GPUCA_GPUCODE
 void DumpDigits(std::ostream& out);
 void DumpChargeMap(std::ostream& out, std::string_view);
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
index f5e094a3c363e..ba8fac2a397e9 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
@@ -34,15 +34,15 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlo
 tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
- if(clusterer.OrtOptions["dtype"].find("32") != std::string::npos){
+ if (clusterer.OrtOptions["dtype"].find("32") != std::string::npos) {
 GPUTPCNNClusterizer::nn_clusterizer<float>(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
- } else if(clusterer.OrtOptions["dtype"].find("16") != std::string::npos) {
+ } else if (clusterer.OrtOptions["dtype"].find("16") != std::string::npos) {
 GPUTPCNNClusterizer::nn_clusterizer<OrtDataType::Float16_t>(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
 } else {
 LOG(fatal) << "Unsupported data type for neural network clusterizer!";
 }
 // tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
-//
+ //
 // GPUTPCNNClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
 }
@@ -74,12 +74,12 @@ bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const G
 }
 } else if (row <= 62 + global_shift) { // to account for the gap between IROC and OROC.
Charge will be set to -1 in order to signal boundary to the neural network return true; - } else if (row <= o2::tpc::constants::MAXGLOBALPADROW-1 + global_shift) { - //if (pad < (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] - geo.NPads(row)- global_shift]) / 2 || pad > (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] + geo.NPads(row)- global_shift]) / 2) { - // return true; - //} else { - // return false; - //} + } else if (row <= o2::tpc::constants::MAXGLOBALPADROW - 1 + global_shift) { + // if (pad < (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] - geo.NPads(row)- global_shift]) / 2 || pad > (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] + geo.NPads(row)- global_shift]) / 2) { + // return true; + // } else { + // return false; + // } if (pad < 0 || pad > geo.NPads(row)) { return true; } else { @@ -92,277 +92,135 @@ bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const G template GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int iBlock, int iThread, - processorType& clusterer, - const CfFragment& fragment, - GPUSharedMemory& smem, - const Array2D& chargeMap, - const ChargePos* filteredPeakPositions, - const GPUSettingsRec& calib, - MCLabelAccumulator* labelAcc, - uint clusternum, - uint maxClusterPerRow, - uint* clusterInRow, - tpc::ClusterNative* clusterByRow, - uint* clusterPosInRow){ - - uint glo_idx = get_global_id(0) * clusterer.nnClusterizerBatchedMode; - if(glo_idx >= clusternum){ - return; - } + processorType& clusterer, + const CfFragment& fragment, + GPUSharedMemory& smem, + const Array2D& chargeMap, + const ChargePos* filteredPeakPositions, + const GPUSettingsRec& calib, + MCLabelAccumulator* labelAcc, + uint clusternum, + uint maxClusterPerRow, + uint* clusterInRow, + tpc::ClusterNative* clusterByRow, + uint* clusterPosInRow) +{ + + uint glo_idx = get_global_id(0) * clusterer.nnClusterizerBatchedMode; + if (glo_idx >= clusternum) { + return; + } - std::vector central_charges(clusterer.nnClusterizerBatchedMode, -1.f); - std::vector input_data(clusterer.nnClusterizerElementSize * clusterer.nnClusterizerBatchedMode, (T)-1.f); - std::vector peak_positions(clusterer.nnClusterizerBatchedMode); - unsigned int write_idx = 0; - - for(int batch_counter = 0; batch_counter < clusterer.nnClusterizerBatchedMode; batch_counter++){ - - uint cls = CAMath::Min(glo_idx + batch_counter, clusternum - 1); - - ChargePos peak = clusterer.mPfilteredPeakPositions[cls]; - int row = peak.row(), pad = peak.pad(), time = peak.time(); - float central_charge = chargeMap[peak].unpack(); - - peak_positions[batch_counter] = peak; - central_charges[batch_counter] = central_charge; - - // unsigned int batch_offset = batch_counter * clusterer.nnClusterizerElementSize; - for(int r = -clusterer.nnClusterizerSizeInputRow; r <= clusterer.nnClusterizerSizeInputRow; r++){ - bool push_mc_label = (r == 0); - int pad_offset = GPUTPCNNClusterizer::padOffset(row, row + r, clusterer.Param().tpcGeometry); - int row_offset = GPUTPCNNClusterizer::rowOffset(row, clusterer.nnClusterizerSizeInputRow); - for(int p = -clusterer.nnClusterizerSizeInputPad; p <= clusterer.nnClusterizerSizeInputPad; p++){ - push_mc_label &= (std::abs(p) < 2); // Use inner 5x5 window - bool is_boundary = GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p + pad_offset, clusterer.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry); - for(int t = -clusterer.nnClusterizerSizeInputTime; t <= clusterer.nnClusterizerSizeInputTime; t++){ - push_mc_label &= (std::abs(t) < 2); // Use 
inner 5x5 window - if(!is_boundary){ - ChargePos tmp_pos(row + r, pad + p + pad_offset, time + t); - input_data[write_idx] = (T)(chargeMap[tmp_pos].unpack() / central_charge); - if(push_mc_label){ - ChargePos tmp_pos_mc(row, pad + p, time + t); - CPU_ONLY(labelAcc->collect(tmp_pos, chargeMap[tmp_pos_mc].unpack())); - } + std::vector central_charges(clusterer.nnClusterizerBatchedMode, -1.f); + std::vector input_data(clusterer.nnClusterizerElementSize * clusterer.nnClusterizerBatchedMode, (T)-1.f); + std::vector peak_positions(clusterer.nnClusterizerBatchedMode); + unsigned int write_idx = 0; + + for (int batch_counter = 0; batch_counter < clusterer.nnClusterizerBatchedMode; batch_counter++) { + + uint cls = CAMath::Min(glo_idx + batch_counter, clusternum - 1); + + ChargePos peak = clusterer.mPfilteredPeakPositions[cls]; + int row = peak.row(), pad = peak.pad(), time = peak.time(); + float central_charge = chargeMap[peak].unpack(); + + peak_positions[batch_counter] = peak; + central_charges[batch_counter] = central_charge; + + // unsigned int batch_offset = batch_counter * clusterer.nnClusterizerElementSize; + for (int r = -clusterer.nnClusterizerSizeInputRow; r <= clusterer.nnClusterizerSizeInputRow; r++) { + bool push_mc_label = (r == 0); + int pad_offset = GPUTPCNNClusterizer::padOffset(row, row + r, clusterer.Param().tpcGeometry); + int row_offset = GPUTPCNNClusterizer::rowOffset(row, clusterer.nnClusterizerSizeInputRow); + for (int p = -clusterer.nnClusterizerSizeInputPad; p <= clusterer.nnClusterizerSizeInputPad; p++) { + push_mc_label &= (std::abs(p) < 2); // Use inner 5x5 window + bool is_boundary = GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p + pad_offset, clusterer.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry); + for (int t = -clusterer.nnClusterizerSizeInputTime; t <= clusterer.nnClusterizerSizeInputTime; t++) { + push_mc_label &= (std::abs(t) < 2); // Use inner 5x5 window + if (!is_boundary) { + ChargePos tmp_pos(row + r, pad + p + pad_offset, time + t); + input_data[write_idx] = (T)(chargeMap[tmp_pos].unpack() / central_charge); + if (push_mc_label) { + ChargePos tmp_pos_mc(row, pad + p, time + t); + CPU_ONLY(labelAcc->collect(tmp_pos, chargeMap[tmp_pos_mc].unpack())); } - write_idx++; } + write_idx++; } } - if(clusterer.nnClusterizerAddIndexData){ - input_data[write_idx] = (T)(clusterer.mISlice / 36.f); - input_data[write_idx + 1] = (T)(row / 152.f); - input_data[write_idx + 2] = (T)((float)pad / clusterer.Param().tpcGeometry.NPads(row)); - write_idx+=3; - // if(idx == 100){ - // LOG(info) << "[" << input_data[input_data.size()-3] << ", " << input_data[input_data.size()-2] << ", " << input_data[input_data.size()-1] << "]"; - // } - } } + if (clusterer.nnClusterizerAddIndexData) { + input_data[write_idx] = (T)(clusterer.mISlice / 36.f); + input_data[write_idx + 1] = (T)(row / 152.f); + input_data[write_idx + 2] = (T)((float)pad / clusterer.Param().tpcGeometry.NPads(row)); + write_idx += 3; + // if(idx == 100){ + // LOG(info) << "[" << input_data[input_data.size()-3] << ", " << input_data[input_data.size()-2] << ", " << input_data[input_data.size()-1] << "]"; + // } + } + } - std::vector index_class_2; - std::vector out_class = clusterer.model_class.inference(input_data); - // LOG(info) << "input_data.size(): " << input_data.size() << "; write_idx: " << write_idx << "; out_class.size(): " << out_class.size(); - int num_output_classes = clusterer.model_class.getNumOutputNodes()[0][1]; - - if(num_output_classes > 1){ - std::vector 
tmp_out_class(clusterer.nnClusterizerBatchedMode); - for(int cls_idx = 0; cls_idx < clusterer.nnClusterizerBatchedMode; cls_idx++){ - auto elem_iterator = out_class.begin() + (unsigned int)(cls_idx*num_output_classes); - tmp_out_class[cls_idx] = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator+num_output_classes)) - 1; // -1 since 2-class classifier will have 3 outputs: classes 0, 1, 2 - if(tmp_out_class[cls_idx] > 1){ - index_class_2.push_back(cls_idx); - } + std::vector index_class_2; + std::vector out_class = clusterer.model_class.inference(input_data); + // LOG(info) << "input_data.size(): " << input_data.size() << "; write_idx: " << write_idx << "; out_class.size(): " << out_class.size(); + int num_output_classes = clusterer.model_class.getNumOutputNodes()[0][1]; + + if (num_output_classes > 1) { + std::vector tmp_out_class(clusterer.nnClusterizerBatchedMode); + for (int cls_idx = 0; cls_idx < clusterer.nnClusterizerBatchedMode; cls_idx++) { + auto elem_iterator = out_class.begin() + (unsigned int)(cls_idx * num_output_classes); + tmp_out_class[cls_idx] = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + num_output_classes)) - 1; // -1 since 2-class classifier will have 3 outputs: classes 0, 1, 2 + if (tmp_out_class[cls_idx] > 1) { + index_class_2.push_back(cls_idx); } - out_class = tmp_out_class; } + out_class = tmp_out_class; + } - if(!clusterer.nnClusterizerUseCFregression) { + if (!clusterer.nnClusterizerUseCFregression) { - std::vector out_reg = clusterer.model_reg_1.inference(input_data), tmp_out_reg_2; - if(index_class_2.size() > 0){ - std::vector tmp_in_reg_2(index_class_2.size() * clusterer.nnClusterizerElementSize); - int fill_counter = 0; - for(int cls_idx : index_class_2){ - int from_idx = cls_idx*clusterer.nnClusterizerElementSize, to_idx = fill_counter * clusterer.nnClusterizerElementSize; - for(int reg_idx = 0; reg_idx < clusterer.nnClusterizerElementSize; reg_idx++){ - tmp_in_reg_2[to_idx + reg_idx] = input_data[from_idx + reg_idx]; - } - fill_counter++; + std::vector out_reg = clusterer.model_reg_1.inference(input_data), tmp_out_reg_2; + if (index_class_2.size() > 0) { + std::vector tmp_in_reg_2(index_class_2.size() * clusterer.nnClusterizerElementSize); + int fill_counter = 0; + for (int cls_idx : index_class_2) { + int from_idx = cls_idx * clusterer.nnClusterizerElementSize, to_idx = fill_counter * clusterer.nnClusterizerElementSize; + for (int reg_idx = 0; reg_idx < clusterer.nnClusterizerElementSize; reg_idx++) { + tmp_in_reg_2[to_idx + reg_idx] = input_data[from_idx + reg_idx]; } - tmp_out_reg_2 = clusterer.model_reg_2.inference(input_data); - } - - input_data.clear(); - - if((clusterer.nnClusterizerVerbosity >= 4) && glo_idx == 0){ - LOG(info) << "[CF] Classification model: " << out_class[0] << " (>? 
" << clusterer.nnClassThreshold << ")"; - LOG(info) << "[CF] Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4]; - } - - int num_outputs_1 = clusterer.model_reg_1.getNumOutputNodes()[0][1], num_outputs_2 = 0, counter_class_2_idcs = 0; - if(num_output_classes > 1){ - num_outputs_2 = clusterer.model_reg_2.getNumOutputNodes()[0][1]; + fill_counter++; } + tmp_out_reg_2 = clusterer.model_reg_2.inference(input_data); + } - for(int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) { - - if (glo_idx + element >= clusternum) { - return; - } - - int model_output_index = element*num_outputs_1; - if(out_class[element] > clusterer.nnClassThreshold) { - if((num_output_classes == 1) || ((num_output_classes > 1) && (out_class[element] < 2))) { - // CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); - ClusterAccumulator pc; - - ClusterAccumulator dummy_pc; - CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); - - // Dummy build to push MC labels - buildCluster( - calib, - chargeMap, - peak_positions[element], - smem.posBcast, - smem.buf, - smem.innerAboveThreshold, - &dummy_pc, - labelAcc); - - if (fragment.isOverlap(peak_positions[element].time())) { - if (clusterPosInRow) { - clusterPosInRow[glo_idx + element] = maxClusterPerRow; - } - continue; - } + input_data.clear(); - pc.setFull(central_charges[element] * out_reg[model_output_index + 4], peak_positions[element].pad() + out_reg[model_output_index + 0], out_reg[model_output_index + 2], fragment.start + peak_positions[element].time() + out_reg[model_output_index + 1], out_reg[model_output_index + 3], 0, 0); - // LOG(info) << "Example: " << num_outputs_1 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; - - tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); - if (rejectCluster) { - if(clusterer.nnClusterizerVerbosity > 3){ - LOG(warning) << "[CF] Cluster rejected!"; - } - if (clusterPosInRow) { - clusterPosInRow[glo_idx + element] = maxClusterPerRow; - } - continue; - } - - uint rowIndex = 0; - if (clusterByRow != nullptr) { - rowIndex = sortIntoBuckets( - clusterer, - myCluster, - peak_positions[element].row(), - maxClusterPerRow, - clusterInRow, - clusterByRow); - if (clusterPosInRow != nullptr) { - clusterPosInRow[glo_idx + element] = rowIndex; - } - } else if (clusterPosInRow) { - rowIndex = clusterPosInRow[glo_idx + element]; - } - CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); - } else { - model_output_index = index_class_2[counter_class_2_idcs]*num_outputs_2; - counter_class_2_idcs++; - - // Cluster 1 - CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); - ClusterAccumulator pc; - - if (fragment.isOverlap(peak_positions[element].time())) { - if (clusterPosInRow) { - clusterPosInRow[glo_idx + element] = maxClusterPerRow; - } - continue; - } + if ((clusterer.nnClusterizerVerbosity >= 4) && glo_idx == 0) { + LOG(info) << "[CF] Classification model: " << out_class[0] << " (>? 
" << clusterer.nnClassThreshold << ")"; + LOG(info) << "[CF] Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4]; + } - pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 8], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 4], tmp_out_reg_2[model_output_index + 2], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 2], tmp_out_reg_2[model_output_index + 6], 0, 0); - // LOG(info) << "Example: " << num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; - - tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); - if (rejectCluster) { - if(clusterer.nnClusterizerVerbosity > 3){ - LOG(warning) << "[CF] Cluster rejected!"; - } - if (clusterPosInRow) { - clusterPosInRow[glo_idx + element] = maxClusterPerRow; - } - continue; - } + int num_outputs_1 = clusterer.model_reg_1.getNumOutputNodes()[0][1], num_outputs_2 = 0, counter_class_2_idcs = 0; + if (num_output_classes > 1) { + num_outputs_2 = clusterer.model_reg_2.getNumOutputNodes()[0][1]; + } - uint rowIndex = 0; - if (clusterByRow != nullptr) { - rowIndex = sortIntoBuckets( - clusterer, - myCluster, - peak_positions[element].row(), - maxClusterPerRow, - clusterInRow, - clusterByRow); - if (clusterPosInRow != nullptr) { - clusterPosInRow[glo_idx + element] = rowIndex; - } - } else if (clusterPosInRow) { - rowIndex = clusterPosInRow[glo_idx + element]; - } - CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); - - // Cluster 2 - CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); - pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 9], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 1], tmp_out_reg_2[model_output_index + 5], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 3], tmp_out_reg_2[model_output_index + 7], 0, 0); - // LOG(info) << "Example: " << num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; - rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); - if (rejectCluster) { - if(clusterer.nnClusterizerVerbosity > 3){ - LOG(warning) << "[CF] Cluster rejected!"; - } - if (clusterPosInRow) { - clusterPosInRow[glo_idx + element] = maxClusterPerRow; - } - continue; - } + for (int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) { - rowIndex = 0; - if (clusterByRow != nullptr) { - rowIndex = sortIntoBuckets( - clusterer, - myCluster, - peak_positions[element].row(), - maxClusterPerRow, - clusterInRow, - clusterByRow); - if (clusterPosInRow != nullptr) { - clusterPosInRow[glo_idx + element] = rowIndex; - } - } else if (clusterPosInRow) { - rowIndex = clusterPosInRow[glo_idx + element]; - } - CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); - } - } + if (glo_idx + element >= clusternum) { + return; } - } else { - - input_data.clear(); - for(int element = 0; element 
< clusterer.nnClusterizerBatchedMode; element++) { - if (glo_idx + element >= clusternum) { - return; - } - - if(out_class[element] > clusterer.nnClassThreshold) { - + int model_output_index = element * num_outputs_1; + if (out_class[element] > clusterer.nnClassThreshold) { + if ((num_output_classes == 1) || ((num_output_classes > 1) && (out_class[element] < 2))) { + // CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); ClusterAccumulator pc; + + ClusterAccumulator dummy_pc; CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); + // Dummy build to push MC labels buildCluster( calib, chargeMap, @@ -370,7 +228,7 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i smem.posBcast, smem.buf, smem.innerAboveThreshold, - &pc, + &dummy_pc, labelAcc); if (fragment.isOverlap(peak_positions[element].time())) { @@ -379,20 +237,67 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i } continue; } - pc.finalize(peak_positions[element], central_charges[element], fragment.start, clusterer.Param().tpcGeometry); + + pc.setFull(central_charges[element] * out_reg[model_output_index + 4], peak_positions[element].pad() + out_reg[model_output_index + 0], out_reg[model_output_index + 2], fragment.start + peak_positions[element].time() + out_reg[model_output_index + 1], out_reg[model_output_index + 3], 0, 0); + // LOG(info) << "Example: " << num_outputs_1 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; tpc::ClusterNative myCluster; bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); + if (rejectCluster) { + if (clusterer.nnClusterizerVerbosity > 3) { + LOG(warning) << "[CF] Cluster rejected!"; + } + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + + uint rowIndex = 0; + if (clusterByRow != nullptr) { + rowIndex = sortIntoBuckets( + clusterer, + myCluster, + peak_positions[element].row(), + maxClusterPerRow, + clusterInRow, + clusterByRow); + if (clusterPosInRow != nullptr) { + clusterPosInRow[glo_idx + element] = rowIndex; + } + } else if (clusterPosInRow) { + rowIndex = clusterPosInRow[glo_idx + element]; + } + CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); + } else { + model_output_index = index_class_2[counter_class_2_idcs] * num_outputs_2; + counter_class_2_idcs++; + // Cluster 1 + CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); + ClusterAccumulator pc; + + if (fragment.isOverlap(peak_positions[element].time())) { + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + + pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 8], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 4], tmp_out_reg_2[model_output_index + 2], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 2], tmp_out_reg_2[model_output_index + 6], 0, 0); + // LOG(info) << "Example: " << num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; + 
+ tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); if (rejectCluster) { - if(clusterer.nnClusterizerVerbosity > 3){ - LOG(warning) << "[CF] Cluster rejected!"; - } - if (clusterPosInRow) { - clusterPosInRow[glo_idx + element] = maxClusterPerRow; - } - continue; + if (clusterer.nnClusterizerVerbosity > 3) { + LOG(warning) << "[CF] Cluster rejected!"; + } + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; } + continue; + } uint rowIndex = 0; if (clusterByRow != nullptr) { @@ -409,18 +314,112 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i } else if (clusterPosInRow) { rowIndex = clusterPosInRow[glo_idx + element]; } + CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); + // Cluster 2 + CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); + pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 9], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 1], tmp_out_reg_2[model_output_index + 5], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 3], tmp_out_reg_2[model_output_index + 7], 0, 0); + // LOG(info) << "Example: " << num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; + rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); + if (rejectCluster) { + if (clusterer.nnClusterizerVerbosity > 3) { + LOG(warning) << "[CF] Cluster rejected!"; + } + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + + rowIndex = 0; + if (clusterByRow != nullptr) { + rowIndex = sortIntoBuckets( + clusterer, + myCluster, + peak_positions[element].row(), + maxClusterPerRow, + clusterInRow, + clusterByRow); + if (clusterPosInRow != nullptr) { + clusterPosInRow[glo_idx + element] = rowIndex; + } + } else if (clusterPosInRow) { + rowIndex = clusterPosInRow[glo_idx + element]; + } CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); } } } - if(clusterer.nnClusterizerVerbosity > 4){ - LOG(info) << "[CF] Clusterization done!"; - } -} + } else { + + input_data.clear(); + for (int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) { + if (glo_idx + element >= clusternum) { + return; + } + + if (out_class[element] > clusterer.nnClassThreshold) { + ClusterAccumulator pc; + CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); + buildCluster( + calib, + chargeMap, + peak_positions[element], + smem.posBcast, + smem.buf, + smem.innerAboveThreshold, + &pc, + labelAcc); + + if (fragment.isOverlap(peak_positions[element].time())) { + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + pc.finalize(peak_positions[element], central_charges[element], fragment.start, clusterer.Param().tpcGeometry); + + tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); + + if (rejectCluster) { + if (clusterer.nnClusterizerVerbosity > 3) { + LOG(warning) << "[CF] Cluster rejected!"; + } + if (clusterPosInRow) { + clusterPosInRow[glo_idx 
+ element] = maxClusterPerRow; + } + continue; + } + + uint rowIndex = 0; + if (clusterByRow != nullptr) { + rowIndex = sortIntoBuckets( + clusterer, + myCluster, + peak_positions[element].row(), + maxClusterPerRow, + clusterInRow, + clusterByRow); + if (clusterPosInRow != nullptr) { + clusterPosInRow[glo_idx + element] = rowIndex; + } + } else if (clusterPosInRow) { + rowIndex = clusterPosInRow[glo_idx + element]; + } + + CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); + } + } + } + + if (clusterer.nnClusterizerVerbosity > 4) { + LOG(info) << "[CF] Clusterization done!"; + } +} GPUdii() void GPUTPCNNClusterizer::computeClustersImpl(int nBlocks, int nThreads, int iBlock, int iThread, processorType& clusterer, diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 51a5c29022421..98d979d28cf15 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -66,20 +66,20 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate static int rowOffset(int, int); static bool isBoundary(int, int, int, const GPUTPCGeometry&); - template + template static GPUd() void nn_clusterizer(int, int, int, int, - processorType&, - const CfFragment&, - GPUSharedMemory&, - const Array2D&, - const ChargePos*, - const GPUSettingsRec&, - MCLabelAccumulator*, - uint, - uint, - uint*, - tpc::ClusterNative*, - uint*); + processorType&, + const CfFragment&, + GPUSharedMemory&, + const Array2D&, + const ChargePos*, + const GPUSettingsRec&, + MCLabelAccumulator*, + uint, + uint, + uint*, + tpc::ClusterNative*, + uint*); private: // --------------------------------- From 25093b33e1472d21a14e6396aa1d9fe1953d6b1b Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 18 Nov 2024 12:50:31 +0100 Subject: [PATCH 16/23] Adding an ONNX CPU library in the O2 framework --- Common/CMakeLists.txt | 1 + Common/ML/CMakeLists.txt | 15 + Common/ML/include/ML/3rdparty/GPUORTFloat16.h | 867 ++++++++++++++++++ Common/ML/include/ML/ort_interface.h | 94 ++ Common/ML/src/ort_interface.cxx | 262 ++++++ 5 files changed, 1239 insertions(+) create mode 100644 Common/ML/CMakeLists.txt create mode 100644 Common/ML/include/ML/3rdparty/GPUORTFloat16.h create mode 100644 Common/ML/include/ML/ort_interface.h create mode 100644 Common/ML/src/ort_interface.cxx diff --git a/Common/CMakeLists.txt b/Common/CMakeLists.txt index f435e269575aa..0b92758e45f43 100644 --- a/Common/CMakeLists.txt +++ b/Common/CMakeLists.txt @@ -16,5 +16,6 @@ add_subdirectory(Types) add_subdirectory(Utils) add_subdirectory(SimConfig) add_subdirectory(DCAFitter) +add_subdirectory(ML) o2_data_file(COPY maps DESTINATION Common) diff --git a/Common/ML/CMakeLists.txt b/Common/ML/CMakeLists.txt new file mode 100644 index 0000000000000..954d29d6e2793 --- /dev/null +++ b/Common/ML/CMakeLists.txt @@ -0,0 +1,15 @@ +# Copyright 2019-2020 CERN and copyright holders of ALICE O2. +# See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +# All rights not expressly granted are reserved. +# +# This software is distributed under the terms of the GNU General Public +# License v3 (GPL Version 3), copied verbatim in the file "COPYING". +# +# In applying this license CERN does not waive the privileges and immunities +# granted to it by virtue of its status as an Intergovernmental Organization +# or submit itself to any jurisdiction. 
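+# Linking sketch (hypothetical consumer target, shown for illustration only): a library
+# that needs the ORT wrapper can pull it in through the exported O2::ML alias, e.g.
+#   o2_add_library(MyInferenceLib
+#                  SOURCES src/MyInference.cxx
+#                  PRIVATE_LINK_LIBRARIES O2::ML)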
+ +o2_add_library(ML + SOURCES src/ort_interface.cxx + TARGETVARNAME targetName + PRIVATE_LINK_LIBRARIES O2::Framework ONNXRuntime::ONNXRuntime) \ No newline at end of file diff --git a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h new file mode 100644 index 0000000000000..db65328409d3c --- /dev/null +++ b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h @@ -0,0 +1,867 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +// This code was created from: +// - https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_float16.h +// - https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_cxx_api.h + +#include +#include +#include +#include + +namespace o2 +{ + +namespace OrtDataType +{ + +namespace detail +{ + +enum class endian { +#if defined(_WIN32) + little = 0, + big = 1, + native = little, +#elif defined(__GNUC__) || defined(__clang__) + little = __ORDER_LITTLE_ENDIAN__, + big = __ORDER_BIG_ENDIAN__, + native = __BYTE_ORDER__, +#else +#error OrtDataType::detail::endian is not implemented in this environment. +#endif +}; + +static_assert( + endian::native == endian::little || endian::native == endian::big, + "Only little-endian or big-endian native byte orders are supported."); + +} // namespace detail + +/// +/// Shared implementation between public and internal classes. CRTP pattern. +/// +template +struct Float16Impl { + protected: + /// + /// Converts from float to uint16_t float16 representation + /// + /// + /// + constexpr static uint16_t ToUint16Impl(float v) noexcept; + + /// + /// Converts float16 to float + /// + /// float representation of float16 value + float ToFloatImpl() const noexcept; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + uint16_t AbsImpl() const noexcept + { + return static_cast(val & ~kSignMask); + } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + uint16_t NegateImpl() const noexcept + { + return IsNaN() ? val : static_cast(val ^ kSignMask); + } + + public: + // uint16_t special values + static constexpr uint16_t kSignMask = 0x8000U; + static constexpr uint16_t kBiasedExponentMask = 0x7C00U; + static constexpr uint16_t kPositiveInfinityBits = 0x7C00U; + static constexpr uint16_t kNegativeInfinityBits = 0xFC00U; + static constexpr uint16_t kPositiveQNaNBits = 0x7E00U; + static constexpr uint16_t kNegativeQNaNBits = 0xFE00U; + static constexpr uint16_t kEpsilonBits = 0x4170U; + static constexpr uint16_t kMinValueBits = 0xFBFFU; // Minimum normal number + static constexpr uint16_t kMaxValueBits = 0x7BFFU; // Largest normal number + static constexpr uint16_t kOneBits = 0x3C00U; + static constexpr uint16_t kMinusOneBits = 0xBC00U; + + uint16_t val{0}; + + Float16Impl() = default; + + /// + /// Checks if the value is negative + /// + /// true if negative + bool IsNegative() const noexcept + { + return static_cast(val) < 0; + } + + /// + /// Tests if the value is NaN + /// + /// true if NaN + bool IsNaN() const noexcept + { + return AbsImpl() > kPositiveInfinityBits; + } + + /// + /// Tests if the value is finite + /// + /// true if finite + bool IsFinite() const noexcept + { + return AbsImpl() < kPositiveInfinityBits; + } + + /// + /// Tests if the value represents positive infinity. 
+ /// + /// true if positive infinity + bool IsPositiveInfinity() const noexcept + { + return val == kPositiveInfinityBits; + } + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + bool IsNegativeInfinity() const noexcept + { + return val == kNegativeInfinityBits; + } + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + bool IsInfinity() const noexcept + { + return AbsImpl() == kPositiveInfinityBits; + } + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. + bool IsNaNOrZero() const noexcept + { + auto abs = AbsImpl(); + return (abs == 0 || abs > kPositiveInfinityBits); + } + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). + /// + /// True if so + bool IsNormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) != 0); // is not subnormal (has a non-zero exponent) + } + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + bool IsSubnormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) == 0); // is subnormal (has a zero exponent) + } + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + Derived Abs() const noexcept { return Derived::FromBits(AbsImpl()); } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + Derived Negate() const noexcept { return Derived::FromBits(NegateImpl()); } + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + static bool AreZero(const Float16Impl& lhs, const Float16Impl& rhs) noexcept + { + return static_cast((lhs.val | rhs.val) & ~kSignMask) == 0; + } + + bool operator==(const Float16Impl& rhs) const noexcept + { + if (IsNaN() || rhs.IsNaN()) { + // IEEE defines that NaN is not equal to anything, including itself. + return false; + } + return val == rhs.val; + } + + bool operator!=(const Float16Impl& rhs) const noexcept { return !(*this == rhs); } + + bool operator<(const Float16Impl& rhs) const noexcept + { + if (IsNaN() || rhs.IsNaN()) { + // IEEE defines that NaN is unordered with respect to everything, including itself. + return false; + } + + const bool left_is_negative = IsNegative(); + if (left_is_negative != rhs.IsNegative()) { + // When the signs of left and right differ, we know that left is less than right if it is + // the negative value. The exception to this is if both values are zero, in which case IEEE + // says they should be equal, even if the signs differ. + return left_is_negative && !AreZero(*this, rhs); + } + return (val != rhs.val) && ((val < rhs.val) ^ left_is_negative); + } +}; + +// The following Float16_t conversions are based on the code from +// Eigen library. + +// The conversion routines are Copyright (c) Fabian Giesen, 2016. +// The original license follows: +// +// Copyright (c) Fabian Giesen, 2016 +// All rights reserved. 
+// Redistribution and use in source and binary forms, with or without +// modification, are permitted. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +namespace detail +{ +union float32_bits { + unsigned int u; + float f; +}; +}; // namespace detail + +template +inline constexpr uint16_t Float16Impl::ToUint16Impl(float v) noexcept +{ + detail::float32_bits f{}; + f.f = v; + + constexpr detail::float32_bits f32infty = {255 << 23}; + constexpr detail::float32_bits f16max = {(127 + 16) << 23}; + constexpr detail::float32_bits denorm_magic = {((127 - 15) + (23 - 10) + 1) << 23}; + constexpr unsigned int sign_mask = 0x80000000u; + uint16_t val = static_cast(0x0u); + + unsigned int sign = f.u & sign_mask; + f.u ^= sign; + + // NOTE all the integer compares in this function can be safely + // compiled into signed compares since all operands are below + // 0x80000000. Important if you want fast straight SSE2 code + // (since there's no unsigned PCMPGTD). + + if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) + val = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf + } else { // (De)normalized number or zero + if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero + // use a magic value to align our 10 mantissa bits at the bottom of + // the float. as long as FP addition is round-to-nearest-even this + // just works. + f.f += denorm_magic.f; + + // and one integer subtract of the bias later, we have our final float! + val = static_cast(f.u - denorm_magic.u); + } else { + unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd + + // update exponent, rounding bias part 1 + // Equivalent to `f.u += ((unsigned int)(15 - 127) << 23) + 0xfff`, but + // without arithmetic overflow. + f.u += 0xc8000fffU; + // rounding bias part 2 + f.u += mant_odd; + // take the bits! + val = static_cast(f.u >> 13); + } + } + + val |= static_cast(sign >> 16); + return val; +} + +template +inline float Float16Impl::ToFloatImpl() const noexcept +{ + constexpr detail::float32_bits magic = {113 << 23}; + constexpr unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift + detail::float32_bits o{}; + + o.u = (val & 0x7fff) << 13; // exponent/mantissa bits + unsigned int exp = shifted_exp & o.u; // just the exponent + o.u += (127 - 15) << 23; // exponent adjust + + // handle exponent special cases + if (exp == shifted_exp) { // Inf/NaN? + o.u += (128 - 16) << 23; // extra exp adjust + } else if (exp == 0) { // Zero/Denormal? 
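+    // Editorial gloss (behavior unchanged; this restates the Eigen/fgiesen trick used
+    // here): 'magic' is 113 << 23, i.e. 2^-14 as a float. A subnormal half stores a
+    // zero exponent, so after the unconditional (127 - 15) exponent shift above,
+    // bumping the exponent field by one and subtracting 2^-14 in float arithmetic
+    // lets the FPU renormalize the mantissa for us.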
+ o.u += 1 << 23; // extra exp adjust + o.f -= magic.f; // re-normalize + } + + // Attempt to workaround the Internal Compiler Error on ARM64 + // for bitwise | operator, including std::bitset +#if (defined _MSC_VER) && (defined _M_ARM || defined _M_ARM64 || defined _M_ARM64EC) + if (IsNegative()) { + return -o.f; + } +#else + // original code: + o.u |= (val & 0x8000U) << 16U; // sign bit +#endif + return o.f; +} + +/// Shared implementation between public and internal classes. CRTP pattern. +template +struct BFloat16Impl { + protected: + /// + /// Converts from float to uint16_t float16 representation + /// + /// + /// + static uint16_t ToUint16Impl(float v) noexcept; + + /// + /// Converts bfloat16 to float + /// + /// float representation of bfloat16 value + float ToFloatImpl() const noexcept; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + uint16_t AbsImpl() const noexcept + { + return static_cast(val & ~kSignMask); + } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + uint16_t NegateImpl() const noexcept + { + return IsNaN() ? val : static_cast(val ^ kSignMask); + } + + public: + // uint16_t special values + static constexpr uint16_t kSignMask = 0x8000U; + static constexpr uint16_t kBiasedExponentMask = 0x7F80U; + static constexpr uint16_t kPositiveInfinityBits = 0x7F80U; + static constexpr uint16_t kNegativeInfinityBits = 0xFF80U; + static constexpr uint16_t kPositiveQNaNBits = 0x7FC1U; + static constexpr uint16_t kNegativeQNaNBits = 0xFFC1U; + static constexpr uint16_t kSignaling_NaNBits = 0x7F80U; + static constexpr uint16_t kEpsilonBits = 0x0080U; + static constexpr uint16_t kMinValueBits = 0xFF7FU; + static constexpr uint16_t kMaxValueBits = 0x7F7FU; + static constexpr uint16_t kRoundToNearest = 0x7FFFU; + static constexpr uint16_t kOneBits = 0x3F80U; + static constexpr uint16_t kMinusOneBits = 0xBF80U; + + uint16_t val{0}; + + BFloat16Impl() = default; + + /// + /// Checks if the value is negative + /// + /// true if negative + bool IsNegative() const noexcept + { + return static_cast(val) < 0; + } + + /// + /// Tests if the value is NaN + /// + /// true if NaN + bool IsNaN() const noexcept + { + return AbsImpl() > kPositiveInfinityBits; + } + + /// + /// Tests if the value is finite + /// + /// true if finite + bool IsFinite() const noexcept + { + return AbsImpl() < kPositiveInfinityBits; + } + + /// + /// Tests if the value represents positive infinity. + /// + /// true if positive infinity + bool IsPositiveInfinity() const noexcept + { + return val == kPositiveInfinityBits; + } + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + bool IsNegativeInfinity() const noexcept + { + return val == kNegativeInfinityBits; + } + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + bool IsInfinity() const noexcept + { + return AbsImpl() == kPositiveInfinityBits; + } + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. + bool IsNaNOrZero() const noexcept + { + auto abs = AbsImpl(); + return (abs == 0 || abs > kPositiveInfinityBits); + } + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). 
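+  /// A value is normal when its biased exponent is neither all zeros nor all ones.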
+ /// + /// True if so + bool IsNormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) != 0); // is not subnormal (has a non-zero exponent) + } + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + bool IsSubnormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) == 0); // is subnormal (has a zero exponent) + } + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + Derived Abs() const noexcept { return Derived::FromBits(AbsImpl()); } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + Derived Negate() const noexcept { return Derived::FromBits(NegateImpl()); } + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + static bool AreZero(const BFloat16Impl& lhs, const BFloat16Impl& rhs) noexcept + { + // IEEE defines that positive and negative zero are equal, this gives us a quick equality check + // for two values by or'ing the private bits together and stripping the sign. They are both zero, + // and therefore equivalent, if the resulting value is still zero. + return static_cast((lhs.val | rhs.val) & ~kSignMask) == 0; + } +}; + +template +inline uint16_t BFloat16Impl::ToUint16Impl(float v) noexcept +{ + uint16_t result; + if (std::isnan(v)) { + result = kPositiveQNaNBits; + } else { + auto get_msb_half = [](float fl) { + uint16_t result; +#ifdef __cpp_if_constexpr + if constexpr (detail::endian::native == detail::endian::little) +#else + if (detail::endian::native == detail::endian::little) +#endif + { + std::memcpy(&result, reinterpret_cast(&fl) + sizeof(uint16_t), sizeof(uint16_t)); + } else { + std::memcpy(&result, &fl, sizeof(uint16_t)); + } + return result; + }; + + uint16_t upper_bits = get_msb_half(v); + union { + uint32_t U32; + float F32; + }; + F32 = v; + U32 += (upper_bits & 1) + kRoundToNearest; + result = get_msb_half(F32); + } + return result; +} + +template +inline float BFloat16Impl::ToFloatImpl() const noexcept +{ + if (IsNaN()) { + return std::numeric_limits::quiet_NaN(); + } + float result; + char* const first = reinterpret_cast(&result); + char* const second = first + sizeof(uint16_t); +#ifdef __cpp_if_constexpr + if constexpr (detail::endian::native == detail::endian::little) +#else + if (detail::endian::native == detail::endian::little) +#endif + { + std::memset(first, 0, sizeof(uint16_t)); + std::memcpy(second, &val, sizeof(uint16_t)); + } else { + std::memcpy(first, &val, sizeof(uint16_t)); + std::memset(second, 0, sizeof(uint16_t)); + } + return result; +} + +/** \brief IEEE 754 half-precision floating point data type + * + * \details This struct is used for converting float to float16 and back + * so the user could feed inputs and fetch outputs using these type. + * + * The size of the structure should align with uint16_t and one can freely cast + * uint16_t buffers to/from Ort::Float16_t to feed and retrieve data. 
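+ *
+ * (Editor's note, illustrative addition) The bit pattern can also be set and
+ * read back directly, without going through a float conversion:
+ *
+ * \code{.unparsed}
+ * Ort::Float16_t one = Ort::Float16_t::FromBits(0x3C00); // 0x3C00 is 1.0 in IEEE fp16
+ * float f = one.ToFloat();                               // f == 1.0f
+ * \endcode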
+ *
+ * \code{.unparsed}
+ * // This example demonstrates conversion from float to float16
+ * constexpr float values[] = {1.f, 2.f, 3.f, 4.f, 5.f};
+ * std::vector<Ort::Float16_t> fp16_values;
+ * fp16_values.reserve(std::size(values));
+ * std::transform(std::begin(values), std::end(values), std::back_inserter(fp16_values),
+ *                [](float value) { return Ort::Float16_t(value); });
+ *
+ * \endcode
+ */
+struct Float16_t : OrtDataType::Float16Impl<Float16_t> {
+ private:
+  ///
+  /// Constructor from a 16-bit representation of a float16 value
+  /// No conversion is done here.
+  ///
+  /// 16-bit representation
+  constexpr explicit Float16_t(uint16_t v) noexcept { val = v; }
+
+ public:
+  using Base = OrtDataType::Float16Impl<Float16_t>;
+
+  ///
+  /// Default constructor
+  ///
+  Float16_t() = default;
+
+  ///
+  /// Explicit conversion to uint16_t representation of float16.
+  ///
+  /// uint16_t bit representation of float16
+  /// new instance of Float16_t
+  constexpr static Float16_t FromBits(uint16_t v) noexcept { return Float16_t(v); }
+
+  ///
+  /// Constructor from float. Float is converted into float16 16-bit representation.
+  ///
+  /// float value
+  explicit Float16_t(float v) noexcept { val = Base::ToUint16Impl(v); }
+
+  ///
+  /// Converts float16 to float
+  ///
+  /// float representation of float16 value
+  float ToFloat() const noexcept { return Base::ToFloatImpl(); }
+
+  ///
+  /// Checks if the value is negative
+  ///
+  /// true if negative
+  using Base::IsNegative;
+
+  ///
+  /// Tests if the value is NaN
+  ///
+  /// true if NaN
+  using Base::IsNaN;
+
+  ///
+  /// Tests if the value is finite
+  ///
+  /// true if finite
+  using Base::IsFinite;
+
+  ///
+  /// Tests if the value represents positive infinity.
+  ///
+  /// true if positive infinity
+  using Base::IsPositiveInfinity;
+
+  ///
+  /// Tests if the value represents negative infinity
+  ///
+  /// true if negative infinity
+  using Base::IsNegativeInfinity;
+
+  ///
+  /// Tests if the value is either positive or negative infinity.
+  ///
+  /// True if absolute value is infinity
+  using Base::IsInfinity;
+
+  ///
+  /// Tests if the value is NaN or zero. Useful for comparisons.
+  ///
+  /// True if NaN or zero.
+  using Base::IsNaNOrZero;
+
+  ///
+  /// Tests if the value is normal (not zero, subnormal, infinite, or NaN).
+  ///
+  /// True if so
+  using Base::IsNormal;
+
+  ///
+  /// Tests if the value is subnormal (denormal).
+  ///
+  /// True if so
+  using Base::IsSubnormal;
+
+  ///
+  /// Creates an instance that represents absolute value.
+  ///
+  /// Absolute value
+  using Base::Abs;
+
+  ///
+  /// Creates a new instance with the sign flipped.
+  ///
+  /// Flipped sign instance
+  using Base::Negate;
+
+  ///
+  /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check
+  /// for two values by or'ing the private bits together and stripping the sign. They are both zero,
+  /// and therefore equivalent, if the resulting value is still zero.
+  ///
+  /// first value
+  /// second value
+  /// True if both arguments represent zero
+  using Base::AreZero;
+
+  ///
+  /// User defined conversion operator. Converts Float16_t to float.
+  ///
+  explicit operator float() const noexcept { return ToFloat(); }
+
+  using Base::operator==;
+  using Base::operator!=;
+  using Base::operator<;
+};
+
+static_assert(sizeof(Float16_t) == sizeof(uint16_t), "Sizes must match");
+
+/** \brief bfloat16 (Brain Floating Point) data type
+ *
+ * \details This struct is used for converting float to bfloat16 and back
+ * so the user can feed inputs and fetch outputs using this type.
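+ *
+ * (Editor's note) bfloat16 keeps the full 8-bit exponent of float32 and only
+ * 7 explicit mantissa bits, so the conversion below is essentially a
+ * round-to-nearest-even copy of the upper 16 bits of the float32 bit pattern.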
+ *
+ * The size of the structure should align with uint16_t and one can freely cast
+ * uint16_t buffers to/from Ort::BFloat16_t to feed and retrieve data.
+ *
+ * \code{.unparsed}
+ * // This example demonstrates conversion from float to bfloat16
+ * constexpr float values[] = {1.f, 2.f, 3.f, 4.f, 5.f};
+ * std::vector<Ort::BFloat16_t> bfp16_values;
+ * bfp16_values.reserve(std::size(values));
+ * std::transform(std::begin(values), std::end(values), std::back_inserter(bfp16_values),
+ *                [](float value) { return Ort::BFloat16_t(value); });
+ *
+ * \endcode
+ */
+struct BFloat16_t : OrtDataType::BFloat16Impl<BFloat16_t> {
+ private:
+  ///
+  /// Constructor from a uint16_t representation of bfloat16
+  /// used in FromBits() to avoid an overload resolution issue with the
+  /// constructor from float.
+  /// No conversion is done.
+  ///
+  /// 16-bit bfloat16 value
+  constexpr explicit BFloat16_t(uint16_t v) noexcept { val = v; }
+
+ public:
+  using Base = OrtDataType::BFloat16Impl<BFloat16_t>;
+
+  BFloat16_t() = default;
+
+  ///
+  /// Explicit conversion to uint16_t representation of bfloat16.
+  ///
+  /// uint16_t bit representation of bfloat16
+  /// new instance of BFloat16_t
+  static constexpr BFloat16_t FromBits(uint16_t v) noexcept { return BFloat16_t(v); }
+
+  ///
+  /// Constructor from float. Float is converted into bfloat16 16-bit representation.
+  ///
+  /// float value
+  explicit BFloat16_t(float v) noexcept { val = Base::ToUint16Impl(v); }
+
+  ///
+  /// Converts bfloat16 to float
+  ///
+  /// float representation of bfloat16 value
+  float ToFloat() const noexcept { return Base::ToFloatImpl(); }
+
+  ///
+  /// Checks if the value is negative
+  ///
+  /// true if negative
+  using Base::IsNegative;
+
+  ///
+  /// Tests if the value is NaN
+  ///
+  /// true if NaN
+  using Base::IsNaN;
+
+  ///
+  /// Tests if the value is finite
+  ///
+  /// true if finite
+  using Base::IsFinite;
+
+  ///
+  /// Tests if the value represents positive infinity.
+  ///
+  /// true if positive infinity
+  using Base::IsPositiveInfinity;
+
+  ///
+  /// Tests if the value represents negative infinity
+  ///
+  /// true if negative infinity
+  using Base::IsNegativeInfinity;
+
+  ///
+  /// Tests if the value is either positive or negative infinity.
+  ///
+  /// True if absolute value is infinity
+  using Base::IsInfinity;
+
+  ///
+  /// Tests if the value is NaN or zero. Useful for comparisons.
+  ///
+  /// True if NaN or zero.
+  using Base::IsNaNOrZero;
+
+  ///
+  /// Tests if the value is normal (not zero, subnormal, infinite, or NaN).
+  ///
+  /// True if so
+  using Base::IsNormal;
+
+  ///
+  /// Tests if the value is subnormal (denormal).
+  ///
+  /// True if so
+  using Base::IsSubnormal;
+
+  ///
+  /// Creates an instance that represents absolute value.
+  ///
+  /// Absolute value
+  using Base::Abs;
+
+  ///
+  /// Creates a new instance with the sign flipped.
+  ///
+  /// Flipped sign instance
+  using Base::Negate;
+
+  ///
+  /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check
+  /// for two values by or'ing the private bits together and stripping the sign. They are both zero,
+  /// and therefore equivalent, if the resulting value is still zero.
+  ///
+  /// first value
+  /// second value
+  /// True if both arguments represent zero
+  using Base::AreZero;
+
+  ///
+  /// User defined conversion operator. Converts BFloat16_t to float.
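+  /// The conversion is exact: every bfloat16 value is exactly representable as a float.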
+ /// + explicit operator float() const noexcept { return ToFloat(); } + + // We do not have an inherited impl for the below operators + // as the internal class implements them a little differently + bool operator==(const BFloat16_t& rhs) const noexcept; + bool operator!=(const BFloat16_t& rhs) const noexcept { return !(*this == rhs); } + bool operator<(const BFloat16_t& rhs) const noexcept; +}; + +static_assert(sizeof(BFloat16_t) == sizeof(uint16_t), "Sizes must match"); + +} // namespace OrtDataType + +} // namespace o2 \ No newline at end of file diff --git a/Common/ML/include/ML/ort_interface.h b/Common/ML/include/ML/ort_interface.h new file mode 100644 index 0000000000000..a365860db3279 --- /dev/null +++ b/Common/ML/include/ML/ort_interface.h @@ -0,0 +1,94 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file ort_interface.h +/// \author Christian Sonnabend +/// \brief A header library for loading ONNX models and inferencing them on CPU and GPU + +#ifndef O2_ML_ONNX_INTERFACE_H +#define O2_ML_ONNX_INTERFACE_H + +// C++ and system includes +#include +#include +#include +#include +#include + +// O2 includes +#include "Framework/Logger.h" + +namespace o2 +{ + +namespace ml +{ + +class OrtModel +{ + + public: + // Constructor + OrtModel() = default; + OrtModel(std::unordered_map optionsMap){ reset(optionsMap); } + void init(std::unordered_map optionsMap){ reset(optionsMap); } + void reset(std::unordered_map); + + virtual ~OrtModel() = default; + + // Conversion + template + std::vector v2v(std::vector&, bool = true); + + // Inferencing + template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h + std::vector inference(std::vector&); + + template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h + std::vector inference(std::vector>&); + + // template // class I is the input data type, e.g. 
float, class T the throughput data type and class O is the output data type + // std::vector inference(std::vector&); + + // Reset session + void resetSession(); + + std::vector> getNumInputNodes() const { return mInputShapes; } + std::vector> getNumOutputNodes() const { return mOutputShapes; } + std::vector getInputNames() const { return mInputNames; } + std::vector getOutputNames() const { return mOutputNames; } + + void setActiveThreads(int threads) { intraOpNumThreads = threads; } + + private: + + // ORT variables -> need to be hidden as Pimpl + struct OrtVariables; + OrtVariables* pImplOrt; + + // Input & Output specifications of the loaded network + std::vector inputNamesChar, outputNamesChar; + std::vector mInputNames, mOutputNames; + std::vector> mInputShapes, mOutputShapes; + + // Environment settings + std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda + int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; + + std::string printShape(const std::vector&); + +}; + +} // namespace ml + +} // namespace ml + +#endif // O2_ML_ORT_INTERFACE_H \ No newline at end of file diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx new file mode 100644 index 0000000000000..84a06ce1da068 --- /dev/null +++ b/Common/ML/src/ort_interface.cxx @@ -0,0 +1,262 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file ort_interface.cxx +/// \author Christian Sonnabend +/// \brief A header library for loading ONNX models and inferencing them on CPU and GPU + +#include "ML/ort_interface.h" +#include "ML/3rdparty/GPUORTFloat16.h" + +// ONNX includes +#include + +namespace o2 +{ + +namespace ml +{ + +struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file + // ORT runtime objects + Ort::RunOptions runOptions; + std::shared_ptr env = nullptr; + std::shared_ptr session = nullptr; ///< ONNX session + Ort::SessionOptions sessionOptions; + Ort::AllocatorWithDefaultOptions allocator; + Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); +}; + +void OrtModel::reset(std::unordered_map optionsMap){ + + pImplOrt = new OrtVariables(); + + // Load from options map + if(!optionsMap.contains("model-path")){ + LOG(fatal) << "(ORT) Model path cannot be empty!"; + } + modelPath = optionsMap["model-path"]; + device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU"); + dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); + deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); + allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); + intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); + loggingLevel = (optionsMap.contains("logging-level") ? 
std::stoi(optionsMap["logging-level"]) : 0); + enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); + enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); + + std::string dev_mem_str = "Hip"; +#ifdef ORT_ROCM_BUILD + if(device == "ROCM") { + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId)); + LOG(info) << "(ORT) ROCM execution provider set"; + } +#endif +#ifdef ORT_MIGRAPHX_BUILD + if(device == "MIGRAPHX") { + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId)); + LOG(info) << "(ORT) MIGraphX execution provider set"; + } +#endif +#ifdef ORT_CUDA_BUILD + if(device == "CUDA") { + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId)); + LOG(info) << "(ORT) CUDA execution provider set"; + dev_mem_str = "Cuda"; + } +#endif + + if(allocateDeviceMemory){ + pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); + LOG(info) << "(ORT) Memory info set to on-device memory"; + } + + if(device == "CPU") { + (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); + if(intraOpNumThreads > 1){ + (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); + } else if(intraOpNumThreads == 1){ + (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + } + LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " threads"; + } + + (pImplOrt->sessionOptions).DisableMemPattern(); + (pImplOrt->sessionOptions).DisableCpuMemArena(); + + if(enableProfiling){ + if(optionsMap.contains("profiling-output-path")){ + (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); + } else { + LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. Disabling profiling for now."; + (pImplOrt->sessionOptions).DisableProfiling(); + } + } else { + (pImplOrt->sessionOptions).DisableProfiling(); + } + (pImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); + (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); + + pImplOrt->env = std::make_shared(OrtLoggingLevel(loggingLevel), (optionsMap["onnx-environment-name"].empty() ? 
"onnx_model_inference" : optionsMap["onnx-environment-name"].c_str())); + (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); + + for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { + mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { + mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { + mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { + mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + + inputNamesChar.resize(mInputNames.size(), nullptr); + std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), + [&](const std::string& str) { return str.c_str(); }); + outputNamesChar.resize(mOutputNames.size(), nullptr); + std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), + [&](const std::string& str) { return str.c_str(); }); + + // Print names + if(loggingLevel > 1) { + LOG(info) << "Input Nodes:"; + for (size_t i = 0; i < mInputNames.size(); i++) { + LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); + } + + LOG(info) << "Output Nodes:"; + for (size_t i = 0; i < mOutputNames.size(); i++) { + LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); + } + } +} + +void OrtModel::resetSession() { + (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); +} + +template +std::vector OrtModel::v2v(std::vector& input, bool clearInput) { + if constexpr (std::is_same_v){ + return input; + } else { + std::vector output(input.size()); + std::transform(std::begin(input), std::end(input), std::begin(output), [](I f) { return O(f); }); + if(clearInput) input.clear(); + return output; + } +} + +template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector& input){ + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(input)).data(), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template // class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector>& input){ + std::vector inputTensor; + for(auto i : input){ + std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(i)).data(), i.size(), inputShape.data(), inputShape.size())); + } + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +std::string OrtModel::printShape(const std::vector& v) +{ + std::stringstream ss(""); + for (size_t i = 0; i < v.size() - 1; i++) + ss << v[i] << "x"; + ss << v[v.size() - 1]; + return ss.str(); +} + +template <> std::vector OrtModel::inference(std::vector& input) { + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + float* outputValues = outputTensors[0].template GetTensorMutableData(); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template <> std::vector OrtModel::inference(std::vector& input) { + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + float* outputValues = outputTensors[0].template GetTensorMutableData(); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template <> std::vector OrtModel::inference(std::vector& input) { + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template <> std::vector OrtModel::inference(std::vector& input) { + 
std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template <> std::vector OrtModel::inference(std::vector>& input) { + std::vector inputTensor; + for(auto i : input){ + std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); + } + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +} // namespace ml + +} // namespace o2 \ No newline at end of file From 9232328476bbafb06cc660c2f122d81b67da9d73 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Mon, 18 Nov 2024 18:48:18 +0000 Subject: [PATCH 17/23] Please consider the following formatting changes --- Common/ML/include/ML/ort_interface.h | 76 ++++++++++++------------ Common/ML/src/ort_interface.cxx | 88 ++++++++++++++++------------ 2 files changed, 89 insertions(+), 75 deletions(-) diff --git a/Common/ML/include/ML/ort_interface.h b/Common/ML/include/ML/ort_interface.h index a365860db3279..2fe9a44a0623c 100644 --- a/Common/ML/include/ML/ort_interface.h +++ b/Common/ML/include/ML/ort_interface.h @@ -35,60 +35,58 @@ namespace ml class OrtModel { - public: - // Constructor - OrtModel() = default; - OrtModel(std::unordered_map optionsMap){ reset(optionsMap); } - void init(std::unordered_map optionsMap){ reset(optionsMap); } - void reset(std::unordered_map); + public: + // Constructor + OrtModel() = default; + OrtModel(std::unordered_map optionsMap) { reset(optionsMap); } + void init(std::unordered_map optionsMap) { reset(optionsMap); } + void reset(std::unordered_map); - virtual ~OrtModel() = default; + virtual ~OrtModel() = default; - // Conversion - template - std::vector v2v(std::vector&, bool = true); + // Conversion + template + std::vector v2v(std::vector&, bool = true); - // Inferencing - template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h - std::vector inference(std::vector&); + // Inferencing + template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h + std::vector inference(std::vector&); - template // class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h - std::vector inference(std::vector>&); + template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h + std::vector inference(std::vector>&); - // template // class I is the input data type, e.g. float, class T the throughput data type and class O is the output data type - // std::vector inference(std::vector&); + // template // class I is the input data type, e.g. float, class T the throughput data type and class O is the output data type + // std::vector inference(std::vector&); - // Reset session - void resetSession(); + // Reset session + void resetSession(); - std::vector> getNumInputNodes() const { return mInputShapes; } - std::vector> getNumOutputNodes() const { return mOutputShapes; } - std::vector getInputNames() const { return mInputNames; } - std::vector getOutputNames() const { return mOutputNames; } + std::vector> getNumInputNodes() const { return mInputShapes; } + std::vector> getNumOutputNodes() const { return mOutputShapes; } + std::vector getInputNames() const { return mInputNames; } + std::vector getOutputNames() const { return mOutputNames; } - void setActiveThreads(int threads) { intraOpNumThreads = threads; } + void setActiveThreads(int threads) { intraOpNumThreads = threads; } - private: + private: + // ORT variables -> need to be hidden as Pimpl + struct OrtVariables; + OrtVariables* pImplOrt; - // ORT variables -> need to be hidden as Pimpl - struct OrtVariables; - OrtVariables* pImplOrt; + // Input & Output specifications of the loaded network + std::vector inputNamesChar, outputNamesChar; + std::vector mInputNames, mOutputNames; + std::vector> mInputShapes, mOutputShapes; - // Input & Output specifications of the loaded network - std::vector inputNamesChar, outputNamesChar; - std::vector mInputNames, mOutputNames; - std::vector> mInputShapes, mOutputShapes; - - // Environment settings - std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda - int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; - - std::string printShape(const std::vector&); + // Environment settings + std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda + int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; + std::string printShape(const std::vector&); }; } // namespace ml -} // namespace ml +} // namespace o2 #endif // O2_ML_ORT_INTERFACE_H \ No newline at end of file diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx index 84a06ce1da068..8ebe0588b4a2b 100644 --- a/Common/ML/src/ort_interface.cxx +++ b/Common/ML/src/ort_interface.cxx @@ -25,7 +25,7 @@ namespace o2 namespace ml { -struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file +struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file // ORT runtime objects Ort::RunOptions runOptions; std::shared_ptr env = nullptr; @@ -35,12 +35,13 @@ struct OrtModel::OrtVariables { // The actual implementation is hidden in the . 
Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); }; -void OrtModel::reset(std::unordered_map optionsMap){ +void OrtModel::reset(std::unordered_map optionsMap) +{ pImplOrt = new OrtVariables(); // Load from options map - if(!optionsMap.contains("model-path")){ + if (!optionsMap.contains("model-path")) { LOG(fatal) << "(ORT) Model path cannot be empty!"; } modelPath = optionsMap["model-path"]; @@ -48,42 +49,42 @@ void OrtModel::reset(std::unordered_map optionsMap){ dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); - intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); + intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 0); enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); std::string dev_mem_str = "Hip"; #ifdef ORT_ROCM_BUILD - if(device == "ROCM") { + if (device == "ROCM") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) ROCM execution provider set"; } #endif #ifdef ORT_MIGRAPHX_BUILD - if(device == "MIGRAPHX") { + if (device == "MIGRAPHX") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) MIGraphX execution provider set"; } #endif #ifdef ORT_CUDA_BUILD - if(device == "CUDA") { + if (device == "CUDA") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) CUDA execution provider set"; dev_mem_str = "Cuda"; } #endif - if(allocateDeviceMemory){ + if (allocateDeviceMemory) { pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); LOG(info) << "(ORT) Memory info set to on-device memory"; } - if(device == "CPU") { + if (device == "CPU") { (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); - if(intraOpNumThreads > 1){ + if (intraOpNumThreads > 1) { (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); - } else if(intraOpNumThreads == 1){ + } else if (intraOpNumThreads == 1) { (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); } LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " threads"; @@ -92,8 +93,8 @@ void OrtModel::reset(std::unordered_map optionsMap){ (pImplOrt->sessionOptions).DisableMemPattern(); (pImplOrt->sessionOptions).DisableCpuMemArena(); - if(enableProfiling){ - if(optionsMap.contains("profiling-output-path")){ + if (enableProfiling) { + if (optionsMap.contains("profiling-output-path")) { (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); } else { LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. 
Disabling profiling for now."; @@ -109,27 +110,27 @@ void OrtModel::reset(std::unordered_map optionsMap){ (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); + mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); } for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); } for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); + mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); } for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); } inputNamesChar.resize(mInputNames.size(), nullptr); std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), - [&](const std::string& str) { return str.c_str(); }); + [&](const std::string& str) { return str.c_str(); }); outputNamesChar.resize(mOutputNames.size(), nullptr); std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), - [&](const std::string& str) { return str.c_str(); }); + [&](const std::string& str) { return str.c_str(); }); // Print names - if(loggingLevel > 1) { + if (loggingLevel > 1) { LOG(info) << "Input Nodes:"; for (size_t i = 0; i < mInputNames.size(); i++) { LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); @@ -142,24 +143,28 @@ void OrtModel::reset(std::unordered_map optionsMap){ } } -void OrtModel::resetSession() { +void OrtModel::resetSession() +{ (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); } -template -std::vector OrtModel::v2v(std::vector& input, bool clearInput) { - if constexpr (std::is_same_v){ +template +std::vector OrtModel::v2v(std::vector& input, bool clearInput) +{ + if constexpr (std::is_same_v) { return input; } else { std::vector output(input.size()); std::transform(std::begin(input), std::end(input), std::begin(output), [](I f) { return O(f); }); - if(clearInput) input.clear(); + if (clearInput) + input.clear(); return output; } } -template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -std::vector OrtModel::inference(std::vector& input){ +template // class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(input)).data(), input.size(), inputShape.data(), inputShape.size())); @@ -171,10 +176,11 @@ std::vector OrtModel::inference(std::vector& input){ return outputValuesVec; } -template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -std::vector OrtModel::inference(std::vector>& input){ +template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector>& input) +{ std::vector inputTensor; - for(auto i : input){ + for (auto i : input) { std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(i)).data(), i.size(), inputShape.data(), inputShape.size())); } @@ -195,7 +201,9 @@ std::string OrtModel::printShape(const std::vector& v) return ss.str(); } -template <> std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); @@ -207,7 +215,9 @@ template <> std::vector OrtModel::inference(std::vector std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); @@ -219,7 +229,9 @@ template <> std::vector OrtModel::inference std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); @@ -231,7 +243,9 @@ template <> std::vector OrtModel::inference std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); @@ -243,9 +257,11 @@ template <> std::vector OrtModel::inference std::vector OrtModel::inference(std::vector>& input) { +template <> +std::vector OrtModel::inference(std::vector>& input) +{ std::vector inputTensor; - for(auto i : input){ + for (auto i : input) { std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), 
(int64_t)mInputShapes[0][1]}; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); } From 7251c5cfb30266479d3f8d7df38c733ba65add77 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 19 Nov 2024 09:23:26 +0100 Subject: [PATCH 18/23] Fixing macOS build issues with calling O*.data() --- Common/ML/src/ort_interface.cxx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx index 8ebe0588b4a2b..222dab55e6e6b 100644 --- a/Common/ML/src/ort_interface.cxx +++ b/Common/ML/src/ort_interface.cxx @@ -167,7 +167,7 @@ std::vector OrtModel::inference(std::vector& input) { std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(input)).data(), input.size(), inputShape.data(), inputShape.size())); + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); // input.clear(); auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); @@ -182,7 +182,7 @@ std::vector OrtModel::inference(std::vector>& input) std::vector inputTensor; for (auto i : input) { std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(i)).data(), i.size(), inputShape.data(), inputShape.size())); + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); } // input.clear(); auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); From d0f4dd8271a880c3152cc4e7ae511bb8439aa466 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 19 Nov 2024 20:40:17 +0100 Subject: [PATCH 19/23] Fixing compiler issues and char -> uint8_t --- Common/ML/src/ort_interface.cxx | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx index 222dab55e6e6b..cf60a3369613a 100644 --- a/Common/ML/src/ort_interface.cxx +++ b/Common/ML/src/ort_interface.cxx @@ -107,7 +107,7 @@ void OrtModel::reset(std::unordered_map optionsMap) (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); pImplOrt->env = std::make_shared(OrtLoggingLevel(loggingLevel), (optionsMap["onnx-environment-name"].empty() ? 
"onnx_model_inference" : optionsMap["onnx-environment-name"].c_str())); - (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); + pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); @@ -145,7 +145,7 @@ void OrtModel::reset(std::unordered_map optionsMap) void OrtModel::resetSession() { - (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); + (pImplOrt->session).reset(std::make_shared{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); } template @@ -156,8 +156,9 @@ std::vector OrtModel::v2v(std::vector& input, bool clearInput) } else { std::vector output(input.size()); std::transform(std::begin(input), std::end(input), std::begin(output), [](I f) { return O(f); }); - if (clearInput) + if (clearInput) { input.clear(); + } return output; } } @@ -195,8 +196,9 @@ std::vector OrtModel::inference(std::vector>& input) std::string OrtModel::printShape(const std::vector& v) { std::stringstream ss(""); - for (size_t i = 0; i < v.size() - 1; i++) + for (size_t i = 0; i < v.size() - 1; i++) { ss << v[i] << "x"; + } ss << v[v.size() - 1]; return ss.str(); } From 7859ab25223ec10c475bbbfa4c6b2da09dfcc609 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 19 Nov 2024 21:09:14 +0100 Subject: [PATCH 20/23] Fixing curly braces --- Common/ML/src/ort_interface.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx index cf60a3369613a..feeebe99fa6fa 100644 --- a/Common/ML/src/ort_interface.cxx +++ b/Common/ML/src/ort_interface.cxx @@ -145,7 +145,7 @@ void OrtModel::reset(std::unordered_map optionsMap) void OrtModel::resetSession() { - (pImplOrt->session).reset(std::make_shared{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); + (pImplOrt->session).reset(std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions)); } template From c6cb3e6f2992f9328185c360c1590a412f401575 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 19 Nov 2024 22:29:48 +0100 Subject: [PATCH 21/23] Fixing std::make_shared --- Common/ML/src/ort_interface.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx index feeebe99fa6fa..160fdbadf84e4 100644 --- a/Common/ML/src/ort_interface.cxx +++ b/Common/ML/src/ort_interface.cxx @@ -145,7 +145,7 @@ void OrtModel::reset(std::unordered_map optionsMap) void OrtModel::resetSession() { - (pImplOrt->session).reset(std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions)); + pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); } template From 40bc4371920d9f7b51469d58135d7ee742ea5606 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 20 Nov 2024 10:38:11 +0100 Subject: [PATCH 22/23] Changing order for --- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index bccff6328cb1d..c528f65c3924f 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ 
b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -12,6 +12,8 @@ /// \file GPUChainTrackingClusterizer.cxx /// \author David Rohr +#include + #include "GPUChainTracking.h" #include "GPUChainTrackingDefs.h" #include "GPULogging.h" @@ -37,7 +39,6 @@ #endif #include "utils/strtag.h" -#include #ifndef GPUCA_NO_VC #include From 52b033f0c9594fc5238c986037c3dc9645a04841 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 17 Dec 2024 22:46:16 +0100 Subject: [PATCH 23/23] Bug-fixing file name --- GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index fd420357073e9..af5315ddae4ac 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -19,7 +19,7 @@ #include "GPUProcessor.h" #include "GPUDataTypes.h" #include "CfFragment.h" -#include "ML/ort_interface.h" +#include "ML/OrtInterface.h" #include "ML/3rdparty/GPUORTFloat16.h" using namespace o2::ml;
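
Editor's note (not part of the patch series): none of the patches above shows a
call site for OrtModel, so a minimal usage sketch follows. Its assumptions are
flagged explicitly: the option keys are the ones parsed in OrtModel::reset()
("model-path", "device", "intra-op-num-threads", "logging-level"), the network
has a single [N, F] input tensor and a single output, "model.onnx" is a
placeholder path, and the float-to-float specialization of inference() is used.

    #include "ML/OrtInterface.h" // named ML/ort_interface.h before PATCH 23
    #include <vector>

    using namespace o2::ml;

    int main()
    {
      // Options are passed as strings and parsed in OrtModel::reset();
      // note that reset() matches the upper-case device spelling "CPU".
      OrtModel model({{"model-path", "model.onnx"},  // placeholder path
                      {"device", "CPU"},
                      {"intra-op-num-threads", "1"}, // sequential execution mode
                      {"logging-level", "2"}});      // > 1 also prints the I/O shapes

      // One input row of zeros; F is the second dimension of the first input shape.
      std::vector<float> input(model.getNumInputNodes()[0][1], 0.f);

      // float in, float out; other specializations cover OrtDataType::Float16_t.
      std::vector<float> output = model.inference<float, float>(input);
      return output.empty() ? 1 : 0;
    }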