From bb2cb6e48d12f71fb634b1429bf284db23bb97ee Mon Sep 17 00:00:00 2001
From: ALICE Action Bot
Date: Mon, 21 Oct 2024 07:41:20 +0000
Subject: [PATCH] Please consider the following formatting changes

---
 Common/ML/include/ML/ort_interface.h          |  76 ++-
 Common/ML/src/ort_interface.cxx               |  88 +--
 .../Global/GPUChainTrackingClusterizer.cxx    |  23 +-
 .../TPCClusterFinder/ClusterAccumulator.h     |   3 +-
 .../TPCClusterFinder/GPUTPCClusterFinder.h    |   2 +-
 .../TPCClusterFinder/GPUTPCNNClusterizer.cxx  | 539 +++++++++---------
 .../TPCClusterFinder/GPUTPCNNClusterizer.h    |  26 +-
 7 files changed, 385 insertions(+), 372 deletions(-)

diff --git a/Common/ML/include/ML/ort_interface.h b/Common/ML/include/ML/ort_interface.h
index a365860db3279..2fe9a44a0623c 100644
--- a/Common/ML/include/ML/ort_interface.h
+++ b/Common/ML/include/ML/ort_interface.h
@@ -35,60 +35,58 @@ namespace ml
 
 class OrtModel
 {
-  public:
-    // Constructor
-    OrtModel() = default;
-    OrtModel(std::unordered_map<std::string, std::string> optionsMap){ reset(optionsMap); }
-    void init(std::unordered_map<std::string, std::string> optionsMap){ reset(optionsMap); }
-    void reset(std::unordered_map<std::string, std::string>);
+ public:
+  // Constructor
+  OrtModel() = default;
+  OrtModel(std::unordered_map<std::string, std::string> optionsMap) { reset(optionsMap); }
+  void init(std::unordered_map<std::string, std::string> optionsMap) { reset(optionsMap); }
+  void reset(std::unordered_map<std::string, std::string>);
 
-    virtual ~OrtModel() = default;
+  virtual ~OrtModel() = default;
 
-    // Conversion
-    template <class I, class O>
-    std::vector<O> v2v(std::vector<I>&, bool = true);
+  // Conversion
+  template <class I, class O>
+  std::vector<O> v2v(std::vector<I>&, bool = true);
 
-    // Inferencing
-    template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h
-    std::vector<O> inference(std::vector<I>&);
+  // Inferencing
+  template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h
+  std::vector<O> inference(std::vector<I>&);
 
-    template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
-    std::vector<O> inference(std::vector<std::vector<I>>&);
+  template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
+  std::vector<O> inference(std::vector<std::vector<I>>&);
 
-    // template <class I, class T, class O> // class I is the input data type, e.g. float, class T the throughput data type and class O is the output data type
-    // std::vector<O> inference(std::vector<I>&);
+  // template <class I, class T, class O> // class I is the input data type, e.g. float, class T the throughput data type and class O is the output data type
+  // std::vector<O> inference(std::vector<I>&);
 
-    // Reset session
-    void resetSession();
+  // Reset session
+  void resetSession();
 
-    std::vector<std::vector<int64_t>> getNumInputNodes() const { return mInputShapes; }
-    std::vector<std::vector<int64_t>> getNumOutputNodes() const { return mOutputShapes; }
-    std::vector<std::string> getInputNames() const { return mInputNames; }
-    std::vector<std::string> getOutputNames() const { return mOutputNames; }
+  std::vector<std::vector<int64_t>> getNumInputNodes() const { return mInputShapes; }
+  std::vector<std::vector<int64_t>> getNumOutputNodes() const { return mOutputShapes; }
+  std::vector<std::string> getInputNames() const { return mInputNames; }
+  std::vector<std::string> getOutputNames() const { return mOutputNames; }
 
-    void setActiveThreads(int threads) { intraOpNumThreads = threads; }
+  void setActiveThreads(int threads) { intraOpNumThreads = threads; }
 
-  private:
+ private:
+  // ORT variables -> need to be hidden as Pimpl
+  struct OrtVariables;
+  OrtVariables* pImplOrt;
 
-    // ORT variables -> need to be hidden as Pimpl
-    struct OrtVariables;
-    OrtVariables* pImplOrt;
+  // Input & Output specifications of the loaded network
+  std::vector<const char*> inputNamesChar, outputNamesChar;
+  std::vector<std::string> mInputNames, mOutputNames;
+  std::vector<std::vector<int64_t>> mInputShapes, mOutputShapes;
 
-    // Input & Output specifications of the loaded network
-    std::vector<const char*> inputNamesChar, outputNamesChar;
-    std::vector<std::string> mInputNames, mOutputNames;
-    std::vector<std::vector<int64_t>> mInputShapes, mOutputShapes;
-
-    // Environment settings
-    std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda
-    int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0;
-
-    std::string printShape(const std::vector<int64_t>&);
+  // Environment settings
+  std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda
+  int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0;
+  std::string printShape(const std::vector<int64_t>&);
 };
 
 } // namespace ml
-} // namespace ml
+} // namespace o2
 
 #endif // O2_ML_ORT_INTERFACE_H
\ No newline at end of file
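
For orientation, a minimal usage sketch of the interface above (not part of the patch; the model path is a placeholder, and only option keys actually parsed by OrtModel::reset() in the next file are used):

  #include "ML/ort_interface.h"
  #include <vector>

  int main()
  {
    // Option keys mirror those read in OrtModel::reset(); values are illustrative.
    o2::ml::OrtModel model({{"model-path", "net.onnx"},
                            {"device", "CPU"},
                            {"intra-op-num-threads", "1"},
                            {"dtype", "float"}});
    // One input row of the width the network reports for its first input node.
    std::vector<float> input(static_cast<size_t>(model.getNumInputNodes()[0][1]), 0.5f);
    std::vector<float> output = model.inference<float, float>(input);
    return output.empty();
  }
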
diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx
index 84a06ce1da068..8ebe0588b4a2b 100644
--- a/Common/ML/src/ort_interface.cxx
+++ b/Common/ML/src/ort_interface.cxx
@@ -25,7 +25,7 @@ namespace o2
 namespace ml
 {
 
-struct OrtModel::OrtVariables {  // The actual implementation is hidden in the .cxx file
+struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file
   // ORT runtime objects
   Ort::RunOptions runOptions;
   std::shared_ptr<Ort::Env> env = nullptr;
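
OrtVariables is the usual pimpl idiom: the ONNX Runtime types stay out of the public header. A generic sketch of the pattern (illustrative names, not from the patch):

  // widget.h - public header, no heavy third-party includes
  class Widget
  {
   public:
    Widget(); // constructor and destructor are defined in widget.cxx,
    ~Widget(); // where Impl is complete
    void doWork();

   private:
    struct Impl; // defined only in widget.cxx, next to the heavy headers
    Impl* mImpl; // raw owning pointer, as in OrtModel::pImplOrt
  };
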
@@ -35,12 +35,13 @@ struct OrtModel::OrtVariables { // The actual implementation is hidden in the .
   Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault);
 };
 
-void OrtModel::reset(std::unordered_map<std::string, std::string> optionsMap){
+void OrtModel::reset(std::unordered_map<std::string, std::string> optionsMap)
+{
 
   pImplOrt = new OrtVariables();
 
   // Load from options map
-  if(!optionsMap.contains("model-path")){
+  if (!optionsMap.contains("model-path")) {
     LOG(fatal) << "(ORT) Model path cannot be empty!";
   }
   modelPath = optionsMap["model-path"];
@@ -48,42 +49,42 @@ void OrtModel::reset(std::unordered_map<std::string, std::string> optionsMap){
   dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float");
   deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0);
   allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0);
-  intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); 
+  intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0);
   loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 0);
   enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0);
   enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0);
 
   std::string dev_mem_str = "Hip";
 #ifdef ORT_ROCM_BUILD
-  if(device == "ROCM") {
+  if (device == "ROCM") {
     Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId));
     LOG(info) << "(ORT) ROCM execution provider set";
   }
 #endif
 #ifdef ORT_MIGRAPHX_BUILD
-  if(device == "MIGRAPHX") {
+  if (device == "MIGRAPHX") {
     Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId));
     LOG(info) << "(ORT) MIGraphX execution provider set";
   }
 #endif
 #ifdef ORT_CUDA_BUILD
-  if(device == "CUDA") {
+  if (device == "CUDA") {
     Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId));
     LOG(info) << "(ORT) CUDA execution provider set";
     dev_mem_str = "Cuda";
   }
 #endif
 
-  if(allocateDeviceMemory){
+  if (allocateDeviceMemory) {
     pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault);
     LOG(info) << "(ORT) Memory info set to on-device memory";
   }
 
-  if(device == "CPU") {
+  if (device == "CPU") {
     (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads);
-    if(intraOpNumThreads > 1){
+    if (intraOpNumThreads > 1) {
       (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL);
-    } else if(intraOpNumThreads == 1){
+    } else if (intraOpNumThreads == 1) {
       (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);
     }
     LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " threads";
   }
@@ -92,8 +93,8 @@ void OrtModel::reset(std::unordered_map<std::string, std::string> optionsMap){
   (pImplOrt->sessionOptions).DisableMemPattern();
   (pImplOrt->sessionOptions).DisableCpuMemArena();
 
-  if(enableProfiling){
-    if(optionsMap.contains("profiling-output-path")){
+  if (enableProfiling) {
+    if (optionsMap.contains("profiling-output-path")) {
       (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str());
     } else {
       LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. Disabling profiling for now.";
@@ -109,27 +110,27 @@ void OrtModel::reset(std::unordered_map<std::string, std::string> optionsMap){
 
   (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions});
 
   for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) {
-   mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get());
+    mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get());
   }
   for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) {
-   mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape());
+    mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape());
   }
   for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) {
-   mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get());
+    mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get());
   }
   for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) {
-   mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape());
+    mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape());
   }
 
   inputNamesChar.resize(mInputNames.size(), nullptr);
   std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar),
-                [&](const std::string& str) { return str.c_str(); });
+                 [&](const std::string& str) { return str.c_str(); });
   outputNamesChar.resize(mOutputNames.size(), nullptr);
   std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar),
-                [&](const std::string& str) { return str.c_str(); });
+                 [&](const std::string& str) { return str.c_str(); });
 
   // Print names
-  if(loggingLevel > 1) {
+  if (loggingLevel > 1) {
     LOG(info) << "Input Nodes:";
     for (size_t i = 0; i < mInputNames.size(); i++) {
       LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]);
@@ -142,24 +143,28 @@ void OrtModel::reset(std::unordered_map<std::string, std::string> optionsMap){
   }
 }
 
-void OrtModel::resetSession() {
+void OrtModel::resetSession()
+{
   (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions});
 }
 
-template <class I, class O>
-std::vector<O> OrtModel::v2v(std::vector<I>& input, bool clearInput) {
-  if constexpr (std::is_same_v<I, O>){
+template <class I, class O>
+std::vector<O> OrtModel::v2v(std::vector<I>& input, bool clearInput)
+{
+  if constexpr (std::is_same_v<I, O>) {
     return input;
   } else {
     std::vector<O> output(input.size());
     std::transform(std::begin(input), std::end(input), std::begin(output), [](I f) { return O(f); });
-    if(clearInput) input.clear();
+    if (clearInput)
+      input.clear();
    return output;
   }
 }
 
-template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
-std::vector<O> OrtModel::inference(std::vector<I>& input){
+template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
+std::vector<O> OrtModel::inference(std::vector<I>& input)
+{
   std::vector<int64_t> inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
   std::vector<Ort::Value> inputTensor;
   inputTensor.emplace_back(Ort::Value::CreateTensor<O>(pImplOrt->memoryInfo, (reinterpret_cast<std::vector<O>&>(input)).data(), input.size(), inputShape.data(), inputShape.size()));
@@ -171,10 +176,11 @@ std::vector<O> OrtModel::inference(std::vector<I>& input){
   return outputValuesVec;
 }
 
-template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
-std::vector<O> OrtModel::inference(std::vector<std::vector<I>>& input){
+template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
+std::vector<O> OrtModel::inference(std::vector<std::vector<I>>& input)
+{
   std::vector<Ort::Value> inputTensor;
-  for(auto i : input){
+  for (auto i : input) {
     std::vector<int64_t> inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
     inputTensor.emplace_back(Ort::Value::CreateTensor<O>(pImplOrt->memoryInfo, (reinterpret_cast<std::vector<O>&>(i)).data(), i.size(), inputShape.data(), inputShape.size()));
   }
@@ -195,7 +201,9 @@ std::string OrtModel::printShape(const std::vector<int64_t>& v)
   return ss.str();
 }
 
-template <> std::vector<float> OrtModel::inference<float, float>(std::vector<float>& input) {
+template <>
+std::vector<float> OrtModel::inference<float, float>(std::vector<float>& input)
+{
   std::vector<int64_t> inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
   std::vector<Ort::Value> inputTensor;
   inputTensor.emplace_back(Ort::Value::CreateTensor<float>(pImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size()));
@@ -207,7 +215,9 @@ template <> std::vector<float> OrtModel::inference<float, float>(std::vector<fl
   return outputValuesVec;
 }
 
-template <> std::vector<float> OrtModel::inference<OrtDataType::Float16_t, float>(std::vector<OrtDataType::Float16_t>& input) {
+template <>
+std::vector<float> OrtModel::inference<OrtDataType::Float16_t, float>(std::vector<OrtDataType::Float16_t>& input)
+{
   std::vector<int64_t> inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
   std::vector<Ort::Value> inputTensor;
   inputTensor.emplace_back(Ort::Value::CreateTensor<Ort::Float16_t>(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(input.data()), input.size(), inputShape.data(), inputShape.size()));
@@ -219,7 +229,9 @@ template <> std::vector<float> OrtModel::inference<OrtDataType::Float16_t, floa
   return outputValuesVec;
 }
 
-template <> std::vector<OrtDataType::Float16_t> OrtModel::inference<OrtDataType::Float16_t, OrtDataType::Float16_t>(std::vector<OrtDataType::Float16_t>& input) {
+template <>
+std::vector<OrtDataType::Float16_t> OrtModel::inference<OrtDataType::Float16_t, OrtDataType::Float16_t>(std::vector<OrtDataType::Float16_t>& input)
+{
   std::vector<int64_t> inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
   std::vector<Ort::Value> inputTensor;
   inputTensor.emplace_back(Ort::Value::CreateTensor<Ort::Float16_t>(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(input.data()), input.size(), inputShape.data(), inputShape.size()));
@@ -231,7 +243,9 @@ template <> std::vector<OrtDataType::Float16_t> OrtModel::inference<OrtDataType
   return outputValuesVec;
 }
 
-template <> std::vector<OrtDataType::Float16_t> OrtModel::inference<float, OrtDataType::Float16_t>(std::vector<float>& input) {
+template <>
+std::vector<OrtDataType::Float16_t> OrtModel::inference<float, OrtDataType::Float16_t>(std::vector<float>& input)
+{
   std::vector<int64_t> inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
   std::vector<Ort::Value> inputTensor;
   inputTensor.emplace_back(Ort::Value::CreateTensor<Ort::Float16_t>(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(input.data()), input.size(), inputShape.data(), inputShape.size()));
@@ -243,9 +257,11 @@ template <> std::vector<OrtDataType::Float16_t> OrtModel::inference<float, OrtD
   return outputValuesVec;
 }
 
-template <> std::vector<float> OrtModel::inference<OrtDataType::Float16_t, float>(std::vector<std::vector<OrtDataType::Float16_t>>& input) {
+template <>
+std::vector<float> OrtModel::inference<OrtDataType::Float16_t, float>(std::vector<std::vector<OrtDataType::Float16_t>>& input)
+{
   std::vector<Ort::Value> inputTensor;
-  for(auto i : input){
+  for (auto i : input) {
     std::vector<int64_t> inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
     inputTensor.emplace_back(Ort::Value::CreateTensor<Ort::Float16_t>(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(i.data()), i.size(), inputShape.data(), inputShape.size()));
   }
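
The inference overloads above derive the dynamic batch dimension from the flat input length; a standalone sketch of that shape arithmetic (widths are illustrative, not from the patch):

  #include <cstdint>
  #include <cstdio>
  #include <vector>

  int main()
  {
    // The ONNX model reports (-1, width): dynamic batch, fixed feature width.
    int64_t width = 346;                     // e.g. one flattened charge window per candidate
    std::vector<float> flat(5 * width, 0.f); // five candidates packed back to back
    // Same arithmetic as in OrtModel::inference: the batch size is implied.
    std::vector<int64_t> inputShape{(int64_t)(flat.size() / width), width};
    std::printf("batch = %lld, width = %lld\n", (long long)inputShape[0], (long long)inputShape[1]);
    return 0;
  }
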
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index 0f22a7472feac..d8470fdc2bf10 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -838,7 +838,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       if (clusterer.mPmemory->counters.nPeaks == 0) {
         continue;
       }
-      if(!GetProcessingSettings().applyNNclusterizer){
+      if (!GetProcessingSettings().applyNNclusterizer) {
        runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
        runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::updatePeaks>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
       } else {
@@ -875,14 +875,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}});
       DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
 
-      if(GetProcessingSettings().applyNNclusterizer){
+      if (GetProcessingSettings().applyNNclusterizer) {
         // Settings for the clusterizer
         clusterer.nnClusterizerUseCFregression = GetProcessingSettings().nnClusterizerUseCFregression;
         clusterer.nnClusterizerSizeInputRow = GetProcessingSettings().nnClusterizerSizeInputRow;
         clusterer.nnClusterizerSizeInputPad = GetProcessingSettings().nnClusterizerSizeInputPad;
         clusterer.nnClusterizerSizeInputTime = GetProcessingSettings().nnClusterizerSizeInputTime;
         clusterer.nnClusterizerAddIndexData = GetProcessingSettings().nnClusterizerAddIndexData;
-        clusterer.nnClusterizerElementSize = ((2*clusterer.nnClusterizerSizeInputRow + 1) * (2*clusterer.nnClusterizerSizeInputPad + 1) * (2*clusterer.nnClusterizerSizeInputTime + 1)) + (clusterer.nnClusterizerAddIndexData ? 3 : 0);
+        clusterer.nnClusterizerElementSize = ((2 * clusterer.nnClusterizerSizeInputRow + 1) * (2 * clusterer.nnClusterizerSizeInputPad + 1) * (2 * clusterer.nnClusterizerSizeInputTime + 1)) + (clusterer.nnClusterizerAddIndexData ? 3 : 0);
         clusterer.nnClusterizerBatchedMode = GetProcessingSettings().nnClusterizerBatchedMode;
         clusterer.nnClusterizerVerbosity = GetProcessingSettings().nnInferenceVerbosity;
@@ -893,7 +893,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         // Settings for the neural network evaluation
         clusterer.OrtOptions = {
           {"model-path", GetProcessingSettings().nnClassificationPath},
-          {"device", GetProcessingSettings().nnInferenceDevice}, 
+          {"device", GetProcessingSettings().nnInferenceDevice},
           {"device-id", std::to_string(GetProcessingSettings().nnInferenceDeviceId)},
           {"allocate-device-memory", std::to_string(GetProcessingSettings().nnInferenceAllocateDevMem)},
           {"dtype", GetProcessingSettings().nnInferenceDtype},
@@ -901,16 +901,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
           {"enable-optimizations", std::to_string(GetProcessingSettings().nnInferenceEnableOrtOptimization)},
           {"enable-profiling", std::to_string(GetProcessingSettings().nnInferenceOrtProfiling)},
           {"profiling-output-path", GetProcessingSettings().nnInferenceOrtProfilingPath},
-          {"logging-level", std::to_string(GetProcessingSettings().nnInferenceVerbosity)}
-        };
+          {"logging-level", std::to_string(GetProcessingSettings().nnInferenceVerbosity)}};
         clusterer.model_class.init(clusterer.OrtOptions);
 
-        if(!clusterer.nnClusterizerUseCFregression){
+        if (!clusterer.nnClusterizerUseCFregression) {
           std::vector<std::string> reg_model_paths = o2::utils::Str::tokenize(GetProcessingSettings().nnRegressionPath, ':');
-          if(clusterer.model_class.getNumOutputNodes()[0][1] == 1){
+          if (clusterer.model_class.getNumOutputNodes()[0][1] == 1) {
             clusterer.OrtOptions["model-path"] = reg_model_paths[0];
             clusterer.model_reg_1.init(clusterer.OrtOptions);
           } else {
-            if(reg_model_paths.size() == 1){
+            if (reg_model_paths.size() == 1) {
               clusterer.OrtOptions["model-path"] = reg_model_paths[0];
               clusterer.model_reg_1.init(clusterer.OrtOptions);
             } else {
@@ -925,9 +924,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
           DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
         }
 
-        if(clusterer.nnSigmoidTrafoClassThreshold){
+        if (clusterer.nnSigmoidTrafoClassThreshold) {
           // Inverse sigmoid transformation
-          clusterer.nnClassThreshold = (float)std::log(clusterer.nnClassThreshold/(1.f-clusterer.nnClassThreshold));
+          clusterer.nnClassThreshold = (float)std::log(clusterer.nnClassThreshold / (1.f - clusterer.nnClassThreshold));
         }
         runKernel<GPUTPCNNClusterizer>({GetGrid(std::ceil(clusterer.mPmemory->counters.nClusters / (float)clusterer.nnClusterizerBatchedMode), lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0);
       } else {
@@ -939,7 +938,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       if (doGPU) {
         SynchronizeStream(lane);
       }
-      if(!GetProcessingSettings().applyNNclusterizer){
+      if (!GetProcessingSettings().applyNNclusterizer) {
        runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1);
       } else {
        runKernel<GPUTPCNNClusterizer>({GetGrid(std::ceil(clusterer.mPmemory->counters.nClusters / (float)clusterer.nnClusterizerBatchedMode), lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1);
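
The inverse-sigmoid (logit) transformation above lets the configured probability cut be compared directly against the network's raw output, so no sigmoid is evaluated per candidate. A standalone sketch of the identity it relies on (illustrative, not part of the patch):

  #include <cassert>
  #include <cmath>

  int main()
  {
    float threshold = 0.85f; // probability-space cut, as a user would configure it
    // logit(threshold): the equivalent cut in raw-output (pre-sigmoid) space
    float logitCut = std::log(threshold / (1.f - threshold));
    // For any raw network output x: sigmoid(x) > threshold  <=>  x > logitCut,
    // because the sigmoid is strictly monotonic.
    float x = 2.0f;
    assert((1.f / (1.f + std::exp(-x)) > threshold) == (x > logitCut));
    return 0;
  }
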
diff --git a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h
index d308b8bd6efa7..b7e535a107eac 100644
--- a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h
+++ b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h
@@ -43,7 +43,8 @@ class ClusterAccumulator
   GPUd() void finalize(const ChargePos&, tpccf::Charge, tpccf::TPCTime, const GPUTPCGeometry&);
   GPUd() bool toNative(const ChargePos&, tpccf::Charge, tpc::ClusterNative&, const GPUParam&) const;
 
-  GPUd() void setFull(float qtot, float padMean, float padSigma, float timeMean, float timeSigma, uint8_t splitInTime, uint8_t splitInPad){
+  GPUd() void setFull(float qtot, float padMean, float padSigma, float timeMean, float timeSigma, uint8_t splitInTime, uint8_t splitInPad)
+  {
     mQtot = qtot;
     mPadMean = padMean;
     mPadSigma = padSigma;
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h
index 130453e833911..fd420357073e9 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h
@@ -158,7 +158,7 @@ class GPUTPCClusterFinder : public GPUProcessor
 
   std::unordered_map<std::string, std::string> OrtOptions;
   OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters
-  
+
 #ifndef GPUCA_GPUCODE
   void DumpDigits(std::ostream& out);
   void DumpChargeMap(std::ostream& out, std::string_view);
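
For orientation in the clusterizer diff that follows: each candidate peak is flattened into one fixed-width network input row. A sketch of the size bookkeeping, mirroring nnClusterizerElementSize above (variable names are illustrative):

  #include <cstdio>

  int main()
  {
    // Half-widths of the charge window around a peak (example values).
    int sizeRow = 3, sizePad = 3, sizeTime = 3;
    bool addIndexData = true; // append (sector, row, pad) metadata
    // One network input row: the full (2r+1) x (2p+1) x (2t+1) charge window,
    // plus 3 optional index entries - the same formula as in the patch.
    int elementSize = (2 * sizeRow + 1) * (2 * sizePad + 1) * (2 * sizeTime + 1) + (addIndexData ? 3 : 0);
    std::printf("%d values per candidate\n", elementSize); // 7*7*7 + 3 = 346
    return 0;
  }
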
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
index f5e094a3c363e..ba8fac2a397e9 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
@@ -34,15 +34,15 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlo
 
   tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
 
-  if(clusterer.OrtOptions["dtype"].find("32") != std::string::npos){
+  if (clusterer.OrtOptions["dtype"].find("32") != std::string::npos) {
     GPUTPCNNClusterizer::nn_clusterizer<float>(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
-  } else if(clusterer.OrtOptions["dtype"].find("16") != std::string::npos) {
+  } else if (clusterer.OrtOptions["dtype"].find("16") != std::string::npos) {
     GPUTPCNNClusterizer::nn_clusterizer<OrtDataType::Float16_t>(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
   } else {
     LOG(fatal) << "Unsupported data type for neural network clusterizer!";
   }
 
   // tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
-// 
+  //
   // GPUTPCNNClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
 }
@@ -74,12 +74,12 @@ bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const G
     }
   } else if (row <= 62 + global_shift) { // to account for the gap between IROC and OROC. Charge will be set to -1 in order to signal boundary to the neural network
     return true;
-  } else if (row <= o2::tpc::constants::MAXGLOBALPADROW-1 + global_shift) {
-    //if (pad < (geo.NPads(o2::tpc::constants::MAXGLOBALPADROW-1) - geo.NPads(row - global_shift)) / 2 || pad > (geo.NPads(o2::tpc::constants::MAXGLOBALPADROW-1) + geo.NPads(row - global_shift)) / 2) {
-    //  return true;
-    //} else {
-    //  return false;
-    //}
+  } else if (row <= o2::tpc::constants::MAXGLOBALPADROW - 1 + global_shift) {
+    // if (pad < (geo.NPads(o2::tpc::constants::MAXGLOBALPADROW-1) - geo.NPads(row - global_shift)) / 2 || pad > (geo.NPads(o2::tpc::constants::MAXGLOBALPADROW-1) + geo.NPads(row - global_shift)) / 2) {
+    //   return true;
+    // } else {
+    //   return false;
+    // }
     if (pad < 0 || pad > geo.NPads(row)) {
       return true;
     } else {
@@ -92,277 +92,135 @@ bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const G
 
 template <class T>
 GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int iBlock, int iThread,
-                        processorType& clusterer,
-                        const CfFragment& fragment,
-                        GPUSharedMemory& smem,
-                        const Array2D<PackedCharge>& chargeMap,
-                        const ChargePos* filteredPeakPositions,
-                        const GPUSettingsRec& calib,
-                        MCLabelAccumulator* labelAcc,
-                        uint clusternum,
-                        uint maxClusterPerRow,
-                        uint* clusterInRow,
-                        tpc::ClusterNative* clusterByRow,
-                        uint* clusterPosInRow){
-
-    uint glo_idx = get_global_id(0) * clusterer.nnClusterizerBatchedMode;
-    if(glo_idx >= clusternum){
-        return;
-    }
+                                                processorType& clusterer,
+                                                const CfFragment& fragment,
+                                                GPUSharedMemory& smem,
+                                                const Array2D<PackedCharge>& chargeMap,
+                                                const ChargePos* filteredPeakPositions,
+                                                const GPUSettingsRec& calib,
+                                                MCLabelAccumulator* labelAcc,
+                                                uint clusternum,
+                                                uint maxClusterPerRow,
+                                                uint* clusterInRow,
+                                                tpc::ClusterNative* clusterByRow,
+                                                uint* clusterPosInRow)
+{
+
+  uint glo_idx = get_global_id(0) * clusterer.nnClusterizerBatchedMode;
+  if (glo_idx >= clusternum) {
+    return;
+  }
 
-    std::vector<float> central_charges(clusterer.nnClusterizerBatchedMode, -1.f);
-    std::vector<T> input_data(clusterer.nnClusterizerElementSize * clusterer.nnClusterizerBatchedMode, (T)-1.f);
-    std::vector<ChargePos> peak_positions(clusterer.nnClusterizerBatchedMode);
-    unsigned int write_idx = 0;
-
-    for(int batch_counter = 0; batch_counter < clusterer.nnClusterizerBatchedMode; batch_counter++){
-
-        uint cls = CAMath::Min(glo_idx + batch_counter, clusternum - 1);
-
-        ChargePos peak = clusterer.mPfilteredPeakPositions[cls];
-        int row = peak.row(), pad = peak.pad(), time = peak.time();
-        float central_charge = chargeMap[peak].unpack();
-
-        peak_positions[batch_counter] = peak;
-        central_charges[batch_counter] = central_charge;
-
-        // unsigned int batch_offset = batch_counter * clusterer.nnClusterizerElementSize;
-        for(int r = -clusterer.nnClusterizerSizeInputRow; r <= clusterer.nnClusterizerSizeInputRow; r++){
-            bool push_mc_label = (r == 0);
-            int pad_offset = GPUTPCNNClusterizer::padOffset(row, row + r, clusterer.Param().tpcGeometry);
-            int row_offset = GPUTPCNNClusterizer::rowOffset(row, clusterer.nnClusterizerSizeInputRow);
-            for(int p = -clusterer.nnClusterizerSizeInputPad; p <= clusterer.nnClusterizerSizeInputPad; p++){
-                push_mc_label &= (std::abs(p) < 2); // Use inner 5x5 window
-                bool is_boundary = GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p + pad_offset, clusterer.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry);
-                for(int t = -clusterer.nnClusterizerSizeInputTime; t <= clusterer.nnClusterizerSizeInputTime; t++){
-                    push_mc_label &= (std::abs(t) < 2); // Use inner 5x5 window
-                    if(!is_boundary){
-                        ChargePos tmp_pos(row + r, pad + p + pad_offset, time + t);
-                        input_data[write_idx] = (T)(chargeMap[tmp_pos].unpack() / central_charge);
-                        if(push_mc_label){
-                            ChargePos tmp_pos_mc(row, pad + p, time + t);
-                            CPU_ONLY(labelAcc->collect(tmp_pos, chargeMap[tmp_pos_mc].unpack()));
-                        }
-                    }
-                    write_idx++;
-                }
-            }
-        }
-        if(clusterer.nnClusterizerAddIndexData){
-            input_data[write_idx] = (T)(clusterer.mISlice / 36.f);
-            input_data[write_idx + 1] = (T)(row / 152.f);
-            input_data[write_idx + 2] = (T)((float)pad / clusterer.Param().tpcGeometry.NPads(row));
-            write_idx+=3;
-            // if(idx == 100){
-            //   LOG(info) << "[" << input_data[input_data.size()-3] << ", " << input_data[input_data.size()-2] << ", " << input_data[input_data.size()-1] << "]";
-            // }
-        }
-    }
+  std::vector<float> central_charges(clusterer.nnClusterizerBatchedMode, -1.f);
+  std::vector<T> input_data(clusterer.nnClusterizerElementSize * clusterer.nnClusterizerBatchedMode, (T)-1.f);
+  std::vector<ChargePos> peak_positions(clusterer.nnClusterizerBatchedMode);
+  unsigned int write_idx = 0;
+
+  for (int batch_counter = 0; batch_counter < clusterer.nnClusterizerBatchedMode; batch_counter++) {
+
+    uint cls = CAMath::Min(glo_idx + batch_counter, clusternum - 1);
+
+    ChargePos peak = clusterer.mPfilteredPeakPositions[cls];
+    int row = peak.row(), pad = peak.pad(), time = peak.time();
+    float central_charge = chargeMap[peak].unpack();
+
+    peak_positions[batch_counter] = peak;
+    central_charges[batch_counter] = central_charge;
+
+    // unsigned int batch_offset = batch_counter * clusterer.nnClusterizerElementSize;
+    for (int r = -clusterer.nnClusterizerSizeInputRow; r <= clusterer.nnClusterizerSizeInputRow; r++) {
+      bool push_mc_label = (r == 0);
+      int pad_offset = GPUTPCNNClusterizer::padOffset(row, row + r, clusterer.Param().tpcGeometry);
+      int row_offset = GPUTPCNNClusterizer::rowOffset(row, clusterer.nnClusterizerSizeInputRow);
+      for (int p = -clusterer.nnClusterizerSizeInputPad; p <= clusterer.nnClusterizerSizeInputPad; p++) {
+        push_mc_label &= (std::abs(p) < 2); // Use inner 5x5 window
+        bool is_boundary = GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p + pad_offset, clusterer.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry);
+        for (int t = -clusterer.nnClusterizerSizeInputTime; t <= clusterer.nnClusterizerSizeInputTime; t++) {
+          push_mc_label &= (std::abs(t) < 2); // Use inner 5x5 window
+          if (!is_boundary) {
+            ChargePos tmp_pos(row + r, pad + p + pad_offset, time + t);
+            input_data[write_idx] = (T)(chargeMap[tmp_pos].unpack() / central_charge);
+            if (push_mc_label) {
+              ChargePos tmp_pos_mc(row, pad + p, time + t);
+              CPU_ONLY(labelAcc->collect(tmp_pos, chargeMap[tmp_pos_mc].unpack()));
+            }
+          }
+          write_idx++;
+        }
+      }
+    }
+    if (clusterer.nnClusterizerAddIndexData) {
+      input_data[write_idx] = (T)(clusterer.mISlice / 36.f);
+      input_data[write_idx + 1] = (T)(row / 152.f);
+      input_data[write_idx + 2] = (T)((float)pad / clusterer.Param().tpcGeometry.NPads(row));
+      write_idx += 3;
+      // if(idx == 100){
+      //   LOG(info) << "[" << input_data[input_data.size()-3] << ", " << input_data[input_data.size()-2] << ", " << input_data[input_data.size()-1] << "]";
+      // }
+    }
+  }
 
-    std::vector<int> index_class_2;
-    std::vector<float> out_class = clusterer.model_class.inference<T, float>(input_data);
-    // LOG(info) << "input_data.size(): " << input_data.size() << "; write_idx: " << write_idx << "; out_class.size(): " << out_class.size();
-    int num_output_classes = clusterer.model_class.getNumOutputNodes()[0][1];
-
-    if(num_output_classes > 1){
-        std::vector<float> tmp_out_class(clusterer.nnClusterizerBatchedMode);
-        for(int cls_idx = 0; cls_idx < clusterer.nnClusterizerBatchedMode; cls_idx++){
-            auto elem_iterator = out_class.begin() + (unsigned int)(cls_idx*num_output_classes);
-            tmp_out_class[cls_idx] = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator+num_output_classes)) - 1; // -1 since 2-class classifier will have 3 outputs: classes 0, 1, 2
-            if(tmp_out_class[cls_idx] > 1){
-                index_class_2.push_back(cls_idx);
-            }
-        }
-        out_class = tmp_out_class;
-    }
+  std::vector<int> index_class_2;
+  std::vector<float> out_class = clusterer.model_class.inference<T, float>(input_data);
+  // LOG(info) << "input_data.size(): " << input_data.size() << "; write_idx: " << write_idx << "; out_class.size(): " << out_class.size();
+  int num_output_classes = clusterer.model_class.getNumOutputNodes()[0][1];
+
+  if (num_output_classes > 1) {
+    std::vector<float> tmp_out_class(clusterer.nnClusterizerBatchedMode);
+    for (int cls_idx = 0; cls_idx < clusterer.nnClusterizerBatchedMode; cls_idx++) {
+      auto elem_iterator = out_class.begin() + (unsigned int)(cls_idx * num_output_classes);
+      tmp_out_class[cls_idx] = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + num_output_classes)) - 1; // -1 since 2-class classifier will have 3 outputs: classes 0, 1, 2
+      if (tmp_out_class[cls_idx] > 1) {
+        index_class_2.push_back(cls_idx);
+      }
+    }
+    out_class = tmp_out_class;
+  }
 
-    if(!clusterer.nnClusterizerUseCFregression) {
-
-        std::vector<float> out_reg = clusterer.model_reg_1.inference<T, float>(input_data), tmp_out_reg_2;
-        if(index_class_2.size() > 0){
-            std::vector<T> tmp_in_reg_2(index_class_2.size() * clusterer.nnClusterizerElementSize);
-            int fill_counter = 0;
-            for(int cls_idx : index_class_2){
-                int from_idx = cls_idx*clusterer.nnClusterizerElementSize, to_idx = fill_counter * clusterer.nnClusterizerElementSize;
-                for(int reg_idx = 0; reg_idx < clusterer.nnClusterizerElementSize; reg_idx++){
-                    tmp_in_reg_2[to_idx + reg_idx] = input_data[from_idx + reg_idx];
-                }
-                fill_counter++;
-            }
-            tmp_out_reg_2 = clusterer.model_reg_2.inference<T, float>(input_data);
-        }
-
-        input_data.clear();
-
-        if((clusterer.nnClusterizerVerbosity >= 4) && glo_idx == 0){
-            LOG(info) << "[CF] Classification model: " << out_class[0] << " (>? " << clusterer.nnClassThreshold << ")";
-            LOG(info) << "[CF] Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4];
-        }
-
-        int num_outputs_1 = clusterer.model_reg_1.getNumOutputNodes()[0][1], num_outputs_2 = 0, counter_class_2_idcs = 0;
-        if(num_output_classes > 1){
-            num_outputs_2 = clusterer.model_reg_2.getNumOutputNodes()[0][1];
-        }
-
-        for(int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) {
-
-            if (glo_idx + element >= clusternum) {
-                return;
-            }
-
-            int model_output_index = element*num_outputs_1;
-            if(out_class[element] > clusterer.nnClassThreshold) {
-                if((num_output_classes == 1) || ((num_output_classes > 1) && (out_class[element] < 2))) {
-                    // CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
-                    ClusterAccumulator pc;
-
-                    ClusterAccumulator dummy_pc;
-                    CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
-
-                    // Dummy build to push MC labels
-                    buildCluster(
-                        calib,
-                        chargeMap,
-                        peak_positions[element],
-                        smem.posBcast,
-                        smem.buf,
-                        smem.innerAboveThreshold,
-                        &dummy_pc,
-                        labelAcc);
-
-                    if (fragment.isOverlap(peak_positions[element].time())) {
-                        if (clusterPosInRow) {
-                            clusterPosInRow[glo_idx + element] = maxClusterPerRow;
-                        }
-                        continue;
-                    }
-
-                    pc.setFull(central_charges[element] * out_reg[model_output_index + 4], peak_positions[element].pad() + out_reg[model_output_index + 0], out_reg[model_output_index + 2], fragment.start + peak_positions[element].time() + out_reg[model_output_index + 1], out_reg[model_output_index + 3], 0, 0);
-                    // LOG(info) << "Example: " << num_outputs_1 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3];
-
-                    tpc::ClusterNative myCluster;
-                    bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param());
-                    if (rejectCluster) {
-                        if(clusterer.nnClusterizerVerbosity > 3){
-                            LOG(warning) << "[CF] Cluster rejected!";
-                        }
-                        if (clusterPosInRow) {
-                            clusterPosInRow[glo_idx + element] = maxClusterPerRow;
-                        }
-                        continue;
-                    }
-
-                    uint rowIndex = 0;
-                    if (clusterByRow != nullptr) {
-                        rowIndex = sortIntoBuckets(
-                            clusterer,
-                            myCluster,
-                            peak_positions[element].row(),
-                            maxClusterPerRow,
-                            clusterInRow,
-                            clusterByRow);
-                        if (clusterPosInRow != nullptr) {
-                            clusterPosInRow[glo_idx + element] = rowIndex;
-                        }
-                    } else if (clusterPosInRow) {
-                        rowIndex = clusterPosInRow[glo_idx + element];
-                    }
-                    CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow));
-                } else {
-                    model_output_index = index_class_2[counter_class_2_idcs]*num_outputs_2;
-                    counter_class_2_idcs++;
-
-                    // Cluster 1
-                    CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
-                    ClusterAccumulator pc;
-
-                    if (fragment.isOverlap(peak_positions[element].time())) {
-                        if (clusterPosInRow) {
-                            clusterPosInRow[glo_idx + element] = maxClusterPerRow;
-                        }
-                        continue;
-                    }
-
-                    pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 8], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 4], tmp_out_reg_2[model_output_index + 2], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 2], tmp_out_reg_2[model_output_index + 6], 0, 0);
-                    // LOG(info) << "Example: " << num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3];
-
-                    tpc::ClusterNative myCluster;
-                    bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param());
-                    if (rejectCluster) {
-                        if(clusterer.nnClusterizerVerbosity > 3){
-                            LOG(warning) << "[CF] Cluster rejected!";
-                        }
-                        if (clusterPosInRow) {
-                            clusterPosInRow[glo_idx + element] = maxClusterPerRow;
-                        }
-                        continue;
-                    }
-
-                    uint rowIndex = 0;
-                    if (clusterByRow != nullptr) {
-                        rowIndex = sortIntoBuckets(
-                            clusterer,
-                            myCluster,
-                            peak_positions[element].row(),
-                            maxClusterPerRow,
-                            clusterInRow,
-                            clusterByRow);
-                        if (clusterPosInRow != nullptr) {
-                            clusterPosInRow[glo_idx + element] = rowIndex;
-                        }
-                    } else if (clusterPosInRow) {
-                        rowIndex = clusterPosInRow[glo_idx + element];
-                    }
-                    CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow));
-
-                    // Cluster 2
-                    CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
-                    pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 9], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 1], tmp_out_reg_2[model_output_index + 5], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 3], tmp_out_reg_2[model_output_index + 7], 0, 0);
-                    // LOG(info) << "Example: " << num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3];
-                    rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param());
-                    if (rejectCluster) {
-                        if(clusterer.nnClusterizerVerbosity > 3){
-                            LOG(warning) << "[CF] Cluster rejected!";
-                        }
-                        if (clusterPosInRow) {
-                            clusterPosInRow[glo_idx + element] = maxClusterPerRow;
-                        }
-                        continue;
-                    }
-
-                    rowIndex = 0;
-                    if (clusterByRow != nullptr) {
-                        rowIndex = sortIntoBuckets(
-                            clusterer,
-                            myCluster,
-                            peak_positions[element].row(),
-                            maxClusterPerRow,
-                            clusterInRow,
-                            clusterByRow);
-                        if (clusterPosInRow != nullptr) {
-                            clusterPosInRow[glo_idx + element] = rowIndex;
-                        }
-                    } else if (clusterPosInRow) {
-                        rowIndex = clusterPosInRow[glo_idx + element];
-                    }
-                    CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow));
-                }
-            }
-        }
-    } else {
-
-        input_data.clear();
-        for(int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) {
-            if (glo_idx + element >= clusternum) {
-                return;
-            }
-
-            if(out_class[element] > clusterer.nnClassThreshold) {
-
-                ClusterAccumulator pc;
-                CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+  if (!clusterer.nnClusterizerUseCFregression) {
+
+    std::vector<float> out_reg = clusterer.model_reg_1.inference<T, float>(input_data), tmp_out_reg_2;
+    if (index_class_2.size() > 0) {
+      std::vector<T> tmp_in_reg_2(index_class_2.size() * clusterer.nnClusterizerElementSize);
+      int fill_counter = 0;
+      for (int cls_idx : index_class_2) {
+        int from_idx = cls_idx * clusterer.nnClusterizerElementSize, to_idx = fill_counter * clusterer.nnClusterizerElementSize;
+        for (int reg_idx = 0; reg_idx < clusterer.nnClusterizerElementSize; reg_idx++) {
+          tmp_in_reg_2[to_idx + reg_idx] = input_data[from_idx + reg_idx];
+        }
+        fill_counter++;
+      }
+      tmp_out_reg_2 = clusterer.model_reg_2.inference<T, float>(input_data);
+    }
+
+    input_data.clear();
+
+    if ((clusterer.nnClusterizerVerbosity >= 4) && glo_idx == 0) {
+      LOG(info) << "[CF] Classification model: " << out_class[0] << " (>? " << clusterer.nnClassThreshold << ")";
+      LOG(info) << "[CF] Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4];
+    }
+
+    int num_outputs_1 = clusterer.model_reg_1.getNumOutputNodes()[0][1], num_outputs_2 = 0, counter_class_2_idcs = 0;
+    if (num_output_classes > 1) {
+      num_outputs_2 = clusterer.model_reg_2.getNumOutputNodes()[0][1];
+    }
+
+    for (int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) {
+
+      if (glo_idx + element >= clusternum) {
+        return;
+      }
+
+      int model_output_index = element * num_outputs_1;
+      if (out_class[element] > clusterer.nnClassThreshold) {
+        if ((num_output_classes == 1) || ((num_output_classes > 1) && (out_class[element] < 2))) {
+          // CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+          ClusterAccumulator pc;
+
+          ClusterAccumulator dummy_pc;
+          CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+
+          // Dummy build to push MC labels
           buildCluster(
             calib,
             chargeMap,
@@ -370,7 +228,7 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i
             peak_positions[element],
             smem.posBcast,
             smem.buf,
             smem.innerAboveThreshold,
-            &pc,
+            &dummy_pc,
             labelAcc);
 
           if (fragment.isOverlap(peak_positions[element].time())) {
@@ -379,20 +237,67 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i
             }
             continue;
           }
-          pc.finalize(peak_positions[element], central_charges[element], fragment.start, clusterer.Param().tpcGeometry);
+
+          pc.setFull(central_charges[element] * out_reg[model_output_index + 4], peak_positions[element].pad() + out_reg[model_output_index + 0], out_reg[model_output_index + 2], fragment.start + peak_positions[element].time() + out_reg[model_output_index + 1], out_reg[model_output_index + 3], 0, 0);
+          // LOG(info) << "Example: " << num_outputs_1 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3];
 
           tpc::ClusterNative myCluster;
           bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param());
+          if (rejectCluster) {
+            if (clusterer.nnClusterizerVerbosity > 3) {
+              LOG(warning) << "[CF] Cluster rejected!";
+            }
+            if (clusterPosInRow) {
+              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+            }
+            continue;
+          }
+
+          uint rowIndex = 0;
+          if (clusterByRow != nullptr) {
+            rowIndex = sortIntoBuckets(
+              clusterer,
+              myCluster,
+              peak_positions[element].row(),
+              maxClusterPerRow,
+              clusterInRow,
+              clusterByRow);
+            if (clusterPosInRow != nullptr) {
+              clusterPosInRow[glo_idx + element] = rowIndex;
+            }
+          } else if (clusterPosInRow) {
+            rowIndex = clusterPosInRow[glo_idx + element];
+          }
+          CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow));
+        } else {
+          model_output_index = index_class_2[counter_class_2_idcs] * num_outputs_2;
+          counter_class_2_idcs++;
+
+          // Cluster 1
+          CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+          ClusterAccumulator pc;
+
+          if (fragment.isOverlap(peak_positions[element].time())) {
+            if (clusterPosInRow) {
+              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+            }
+            continue;
+          }
+
+          pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 8], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 4], tmp_out_reg_2[model_output_index + 2], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 2], tmp_out_reg_2[model_output_index + 6], 0, 0);
+          // LOG(info) << "Example: " << num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3];
+
+          tpc::ClusterNative myCluster;
+          bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param());
           if (rejectCluster) {
-            if(clusterer.nnClusterizerVerbosity > 3){
-              LOG(warning) << "[CF] Cluster rejected!";
-            }
-            if (clusterPosInRow) {
-              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
-            }
-            continue;
+            if (clusterer.nnClusterizerVerbosity > 3) {
+              LOG(warning) << "[CF] Cluster rejected!";
+            }
+            if (clusterPosInRow) {
+              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
             }
+            continue;
+          }
 
           uint rowIndex = 0;
           if (clusterByRow != nullptr) {
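
Aside on the class decoding used in nn_clusterizer (std::distance over std::max_element, shifted by -1): a standalone sketch of that argmax step. The label interpretation follows the inline comment in the patch, not an official mapping:

  #include <algorithm>
  #include <cstdio>
  #include <vector>

  int main()
  {
    // One row of classifier scores per candidate, as the network emits them.
    std::vector<float> scores = {0.05f, 0.15f, 0.80f};
    // Argmax decoding as in the patch: index of the largest score, shifted by -1,
    // so a 3-output classifier maps to labels -1, 0, 1.
    int label = (int)std::distance(scores.begin(), std::max_element(scores.begin(), scores.end())) - 1;
    std::printf("decoded label: %d\n", label); // prints 1 here
    return 0;
  }
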
@@ -409,18 +314,112 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i
             rowIndex = sortIntoBuckets(
               clusterer,
               myCluster,
               peak_positions[element].row(),
               maxClusterPerRow,
               clusterInRow,
               clusterByRow);
             if (clusterPosInRow != nullptr) {
               clusterPosInRow[glo_idx + element] = rowIndex;
             }
           } else if (clusterPosInRow) {
             rowIndex = clusterPosInRow[glo_idx + element];
           }
+          CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow));
 
+          // Cluster 2
+          CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+          pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 9], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 1], tmp_out_reg_2[model_output_index + 5], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 3], tmp_out_reg_2[model_output_index + 7], 0, 0);
+          // LOG(info) << "Example: " << num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3];
+          rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param());
+          if (rejectCluster) {
+            if (clusterer.nnClusterizerVerbosity > 3) {
+              LOG(warning) << "[CF] Cluster rejected!";
+            }
+            if (clusterPosInRow) {
+              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+            }
+            continue;
+          }
+
+          rowIndex = 0;
+          if (clusterByRow != nullptr) {
+            rowIndex = sortIntoBuckets(
+              clusterer,
+              myCluster,
+              peak_positions[element].row(),
+              maxClusterPerRow,
+              clusterInRow,
+              clusterByRow);
+            if (clusterPosInRow != nullptr) {
+              clusterPosInRow[glo_idx + element] = rowIndex;
+            }
+          } else if (clusterPosInRow) {
+            rowIndex = clusterPosInRow[glo_idx + element];
+          }
           CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow));
         }
       }
     }
-    if(clusterer.nnClusterizerVerbosity > 4){
-        LOG(info) << "[CF] Clusterization done!";
-    }
-}
+  } else {
+
+    input_data.clear();
+    for (int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) {
+      if (glo_idx + element >= clusternum) {
+        return;
+      }
+
+      if (out_class[element] > clusterer.nnClassThreshold) {
+
+        ClusterAccumulator pc;
+        CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+        buildCluster(
+          calib,
+          chargeMap,
+          peak_positions[element],
+          smem.posBcast,
+          smem.buf,
+          smem.innerAboveThreshold,
+          &pc,
+          labelAcc);
+
+        if (fragment.isOverlap(peak_positions[element].time())) {
+          if (clusterPosInRow) {
+            clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+          }
+          continue;
+        }
+        pc.finalize(peak_positions[element], central_charges[element], fragment.start, clusterer.Param().tpcGeometry);
+
+        tpc::ClusterNative myCluster;
+        bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param());
+
+        if (rejectCluster) {
+          if (clusterer.nnClusterizerVerbosity > 3) {
+            LOG(warning) << "[CF] Cluster rejected!";
+          }
+          if (clusterPosInRow) {
+            clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+          }
+          continue;
+        }
+
+        uint rowIndex = 0;
+        if (clusterByRow != nullptr) {
+          rowIndex = sortIntoBuckets(
+            clusterer,
+            myCluster,
+            peak_positions[element].row(),
+            maxClusterPerRow,
+            clusterInRow,
+            clusterByRow);
+          if (clusterPosInRow != nullptr) {
+            clusterPosInRow[glo_idx + element] = rowIndex;
+          }
+        } else if (clusterPosInRow) {
+          rowIndex = clusterPosInRow[glo_idx + element];
+        }
+
+        CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow));
+      }
+    }
+  }
+
+  if (clusterer.nnClusterizerVerbosity > 4) {
+    LOG(info) << "[CF] Clusterization done!";
+  }
+}
 
 GPUdii() void GPUTPCNNClusterizer::computeClustersImpl(int nBlocks, int nThreads, int iBlock, int iThread,
                                                        processorType& clusterer,
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
index 51a5c29022421..98d979d28cf15 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
@@ -66,20 +66,20 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate
   static int rowOffset(int, int);
   static bool isBoundary(int, int, int, const GPUTPCGeometry&);
 
-  template <class T> 
+  template <class T>
   static GPUd() void nn_clusterizer(int, int, int, int,
-                  processorType&,
-                  const CfFragment&,
-                  GPUSharedMemory&,
-                  const Array2D<PackedCharge>&,
-                  const ChargePos*,
-                  const GPUSettingsRec&,
-                  MCLabelAccumulator*,
-                  uint,
-                  uint,
-                  uint*,
-                  tpc::ClusterNative*,
-                  uint*);
+                                    processorType&,
+                                    const CfFragment&,
+                                    GPUSharedMemory&,
+                                    const Array2D<PackedCharge>&,
+                                    const ChargePos*,
+                                    const GPUSettingsRec&,
+                                    MCLabelAccumulator*,
+                                    uint,
+                                    uint,
+                                    uint*,
+                                    tpc::ClusterNative*,
+                                    uint*);
 
  private:
  // ---------------------------------