diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt
index dd3480cae86bd..dc34ecebe78e9 100644
--- a/GPU/GPUTracking/CMakeLists.txt
+++ b/GPU/GPUTracking/CMakeLists.txt
@@ -197,6 +197,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2" OR GPUCA_CONFIG_O2_EXTENSIONS)
     TPCClusterFinder/GPUTPCCFChargeMapFiller.cxx
     TPCClusterFinder/GPUTPCCFPeakFinder.cxx
     TPCClusterFinder/GPUTPCCFNoiseSuppression.cxx
+    TPCClusterFinder/GPUTPCNNClusterizer.cxx
     TPCClusterFinder/GPUTPCCFClusterizer.cxx
     TPCClusterFinder/GPUTPCCFDeconvolution.cxx
     TPCClusterFinder/GPUTPCCFMCLabelFlattener.cxx
@@ -307,6 +308,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2")
                  O2::GPUCommon
                  O2::ReconstructionDataFormats
                  O2::TPCFastTransformation
+                 O2::ML
           PRIVATE_LINK_LIBRARIES O2::DataFormatsTPC
           SOURCES ${SRCS_DATATYPES})
   target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAVE_O2HEADERS)
diff --git a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
index 3852d37f6facf..fc8af23c810f8 100644
--- a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
+++ b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
@@ -81,6 +81,7 @@
   #define GPUCA_LB_GPUTPCCFNoiseSuppression 512
   #define GPUCA_LB_GPUTPCCFDeconvolution 512
   #define GPUCA_LB_GPUTPCCFClusterizer 448
+  #define GPUCA_LB_GPUTPCNNClusterizer 448
   #define GPUCA_LB_COMPRESSION_GATHER 1024
   #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5
   #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
@@ -147,6 +148,7 @@
   #define GPUCA_LB_GPUTPCCFNoiseSuppression 512
   #define GPUCA_LB_GPUTPCCFDeconvolution 512
   #define GPUCA_LB_GPUTPCCFClusterizer 512
+  #define GPUCA_LB_GPUTPCNNClusterizer 512
   #define GPUCA_LB_COMPRESSION_GATHER 1024
   #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5
   #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
@@ -213,6 +215,7 @@
   #define GPUCA_LB_GPUTPCCFNoiseSuppression 448
   #define GPUCA_LB_GPUTPCCFDeconvolution 384
   #define GPUCA_LB_GPUTPCCFClusterizer 448
+  #define GPUCA_LB_GPUTPCNNClusterizer 448
   #define GPUCA_LB_COMPRESSION_GATHER 1024
   #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4
   #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
@@ -489,6 +492,9 @@
   #ifndef GPUCA_LB_GPUTPCCFClusterizer
   #define GPUCA_LB_GPUTPCCFClusterizer 512
   #endif
+  #ifndef GPUCA_LB_GPUTPCNNClusterizer
+  #define GPUCA_LB_GPUTPCNNClusterizer 512
+  #endif
   #ifndef GPUCA_LB_GPUTrackingRefitKernel_mode0asGPU
   #define GPUCA_LB_GPUTrackingRefitKernel_mode0asGPU 256
   #endif
diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h
index 07cd320140909..eea8b3f0bbfe7 100644
--- a/GPU/GPUTracking/Definitions/GPUSettingsList.h
+++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -302,6 +302,26 @@ AddOption(printSettings, bool, false, "", 0, "Print all settings when initializi
 AddVariable(eventDisplay, GPUCA_NAMESPACE::gpu::GPUDisplayFrontendInterface*, nullptr)
 AddSubConfig(GPUSettingsProcessingRTC, rtc)
 AddSubConfig(GPUSettingsProcessingParam, param)
+AddOption(applyNNclusterizer, int, 0, "", 0, "(bool, default = 0) Whether the neural network clusterizer should be used")
+AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Inference device (cpu (default), rocm, cuda)")
+AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Inference device id")
+AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0) Whether device memory should be allocated for inference")
+AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Data type used for inference (fp32 (default), fp16)")
+AddOption(nnInferenceThreadsPerNN, int, 0, "", 0, "Number of threads used to evaluate one neural network")
+AddOption(nnInferenceEnableOrtOptimization, unsigned int, 1, "", 0, "Enables graph optimizations in ONNX Runtime; values greater than 1 select higher optimization levels")
+AddOption(nnInferenceOrtProfiling, int, 0, "", 0, "Enables profiling of model execution in ONNX Runtime")
+AddOption(nnInferenceOrtProfilingPath, std::string, ".", "", 0, "If nnInferenceOrtProfiling is set, the path where the profiling data is stored")
+AddOption(nnInferenceVerbosity, int, 1, "", 0, "0: no messages; 1: warnings; 2: warnings + major debugs; >=3: all debugs")
+AddOption(nnClusterizerAddIndexData, int, 1, "", 0, "Whether normalized index data (sector, row, pad) should be appended to the input")
+AddOption(nnClusterizerSizeInputRow, int, 3, "", 0, "Half-size of the NN input window in row direction (window length = 2 * size + 1)")
+AddOption(nnClusterizerSizeInputPad, int, 3, "", 0, "Half-size of the NN input window in pad direction (window length = 2 * size + 1)")
+AddOption(nnClusterizerSizeInputTime, int, 3, "", 0, "Half-size of the NN input window in time direction (window length = 2 * size + 1)")
+AddOption(nnClusterizerUseCFregression, int, 0, "", 0, "(bool, default = false) If true, use the regression of the native clusterizer instead of the NN")
+AddOption(nnClusterizerBatchedMode, unsigned int, 1, "", 0, "(int, default = 1) If >1, the NN is evaluated on batched input of the size given by this variable")
+AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "Path to the classification network")
+AddOption(nnClassThreshold, float, 0.5, "", 0, "Classification score above which clusters are accepted")
+AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "Path to the regression network")
+AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), the classification threshold is transformed by an inverse sigmoid function. This depends on whether the network was trained with a sigmoid as activation function in the last layer or not.")
 AddHelp("help", 'h')
 EndConfig()
 #endif // __OPENCL__
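For orientation, the per-candidate input-tensor size follows directly from the three window half-sizes plus the optional index triplet. A minimal standalone sketch in plain C++ of the nnClusterizerElementSize computation used in GPUChainTrackingClusterizer.cxx below, with the default half-sizes:

#include <cstdio>

// Window length per direction is 2 * halfSize + 1; the optional index data
// appends (sector, row, pad).
int elementSize(int rowHalf, int padHalf, int timeHalf, bool addIndexData)
{
  int window = (2 * rowHalf + 1) * (2 * padHalf + 1) * (2 * timeHalf + 1);
  return window + (addIndexData ? 3 : 0);
}

int main()
{
  std::printf("%d\n", elementSize(3, 3, 3, true)); // 7 * 7 * 7 + 3 = 346
  return 0;
}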
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index 4bc0ee4e91ff1..73462066f7746 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -12,6 +12,8 @@
 /// \file GPUChainTrackingClusterizer.cxx
 /// \author David Rohr
 
+#include <cmath>
+
 #include "GPUChainTracking.h"
 #include "GPUChainTrackingDefs.h"
 #include "GPULogging.h"
@@ -849,8 +851,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       if (clusterer.mPmemory->counters.nPeaks == 0) {
         continue;
       }
-      runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
-      runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::updatePeaks>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
+      if (!GetProcessingSettings().applyNNclusterizer) {
+        runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
+        runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::updatePeaks>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
+      } else {
+        // FIXME: This potentially needs to be removed once the NN is actually applied. For now it is only here to make the code work.
+        runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
+        runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::updatePeaks>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
+      }
       if (DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 3, clusterer, &GPUTPCClusterFinder::DumpSuppressedPeaks, *mDebugFile)) {
         clusterer.DumpPeakMap(*mDebugFile, "Suppressed Peaks");
       }
@@ -884,14 +892,76 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}});
       DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
 
-      runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSlice}}, 0);
+      if (GetProcessingSettings().applyNNclusterizer) {
+        // Settings for the clusterizer
+        clusterer.nnClusterizerUseCFregression = GetProcessingSettings().nnClusterizerUseCFregression;
+        clusterer.nnClusterizerSizeInputRow = GetProcessingSettings().nnClusterizerSizeInputRow;
+        clusterer.nnClusterizerSizeInputPad = GetProcessingSettings().nnClusterizerSizeInputPad;
+        clusterer.nnClusterizerSizeInputTime = GetProcessingSettings().nnClusterizerSizeInputTime;
+        clusterer.nnClusterizerAddIndexData = GetProcessingSettings().nnClusterizerAddIndexData;
+        clusterer.nnClusterizerElementSize = ((2 * clusterer.nnClusterizerSizeInputRow + 1) * (2 * clusterer.nnClusterizerSizeInputPad + 1) * (2 * clusterer.nnClusterizerSizeInputTime + 1)) + (clusterer.nnClusterizerAddIndexData ? 3 : 0);
+        clusterer.nnClusterizerBatchedMode = GetProcessingSettings().nnClusterizerBatchedMode;
+        clusterer.nnClusterizerVerbosity = GetProcessingSettings().nnInferenceVerbosity;
+
+        // Settings for the NN evaluation
+        clusterer.nnClassThreshold = GetProcessingSettings().nnClassThreshold;
+        clusterer.nnSigmoidTrafoClassThreshold = GetProcessingSettings().nnSigmoidTrafoClassThreshold;
+
+        // Settings for the neural network evaluation
+        clusterer.OrtOptions = {
+          {"model-path", GetProcessingSettings().nnClassificationPath},
+          {"device", GetProcessingSettings().nnInferenceDevice},
+          {"device-id", std::to_string(GetProcessingSettings().nnInferenceDeviceId)},
+          {"allocate-device-memory", std::to_string(GetProcessingSettings().nnInferenceAllocateDevMem)},
+          {"dtype", GetProcessingSettings().nnInferenceDtype},
+          {"intra-op-num-threads", std::to_string(GetProcessingSettings().nnInferenceThreadsPerNN)},
+          {"enable-optimizations", std::to_string(GetProcessingSettings().nnInferenceEnableOrtOptimization)},
+          {"enable-profiling", std::to_string(GetProcessingSettings().nnInferenceOrtProfiling)},
+          {"profiling-output-path", GetProcessingSettings().nnInferenceOrtProfilingPath},
+          {"logging-level", std::to_string(GetProcessingSettings().nnInferenceVerbosity)}};
+        clusterer.model_class.init(clusterer.OrtOptions);
+        if (!clusterer.nnClusterizerUseCFregression) {
+          std::vector<std::string> reg_model_paths = o2::utils::Str::tokenize(GetProcessingSettings().nnRegressionPath, ':');
+          if (clusterer.model_class.getNumOutputNodes()[0][1] == 1) {
+            clusterer.OrtOptions["model-path"] = reg_model_paths[0];
+            clusterer.model_reg_1.init(clusterer.OrtOptions);
+          } else {
+            if (reg_model_paths.size() == 1) {
+              clusterer.OrtOptions["model-path"] = reg_model_paths[0];
+              clusterer.model_reg_1.init(clusterer.OrtOptions);
+            } else {
+              clusterer.OrtOptions["model-path"] = reg_model_paths[0];
+              clusterer.model_reg_1.init(clusterer.OrtOptions);
+              clusterer.OrtOptions["model-path"] = reg_model_paths[1];
+              clusterer.model_reg_2.init(clusterer.OrtOptions);
+            }
+          }
+        } else {
+          runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}});
+          DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
+        }
+
+        if (clusterer.nnSigmoidTrafoClassThreshold) {
+          // Inverse sigmoid transformation
+          clusterer.nnClassThreshold = (float)std::log(clusterer.nnClassThreshold / (1.f - clusterer.nnClassThreshold));
+        }
+        runKernel<GPUTPCNNClusterizer>({GetGrid(std::ceil(clusterer.mPmemory->counters.nClusters / (float)clusterer.nnClusterizerBatchedMode), lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0);
+      } else {
+        runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0);
+      }
+
       if (doGPU && propagateMCLabels) {
         TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane);
         if (doGPU) {
           SynchronizeStream(lane);
         }
-        runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1);
+        if (!GetProcessingSettings().applyNNclusterizer) {
+          runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1);
+        } else {
+          runKernel<GPUTPCNNClusterizer>({GetGrid(std::ceil(clusterer.mPmemory->counters.nClusters / (float)clusterer.nnClusterizerBatchedMode), lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1);
+        }
       }
+
       if (GetProcessingSettings().debugLevel >= 3) {
         GPUInfo("Sector %02d Fragment %02d Lane %d: Found clusters: digits %u peaks %u clusters %u", iSlice, fragment.index, lane, (int32_t)clusterer.mPmemory->counters.nPositions, (int32_t)clusterer.mPmemory->counters.nPeaks, (int32_t)clusterer.mPmemory->counters.nClusters);
       }
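The inverse-sigmoid transformation applied above avoids evaluating a sigmoid per candidate: for a probability cutoff p, sigmoid(x) > p is equivalent to x > log(p / (1 - p)), so the raw network output (logit) can be compared directly against the transformed threshold. A minimal standalone check:

#include <cassert>
#include <cmath>

int main()
{
  float p = 0.5f;                           // probability cutoff (nnClassThreshold)
  float logitCut = std::log(p / (1.f - p)); // inverse sigmoid; 0 for p = 0.5
  float x = 1.3f;                           // a raw classifier output (logit)
  // Comparing the logit against logitCut gives the same decision as
  // applying the sigmoid and comparing against p, without the exp().
  assert((x > logitCut) == (1.f / (1.f + std::exp(-x)) > p));
  return 0;
}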
diff --git a/GPU/GPUTracking/TPCClusterFinder/ChargePos.h b/GPU/GPUTracking/TPCClusterFinder/ChargePos.h
index f5ca9dbedd5ac..c2ee542f65434 100644
--- a/GPU/GPUTracking/TPCClusterFinder/ChargePos.h
+++ b/GPU/GPUTracking/TPCClusterFinder/ChargePos.h
@@ -47,6 +47,7 @@ struct ChargePos {
   GPUdi() tpccf::Row row() const { return gpad / TPC_PADS_PER_ROW_PADDED; }
   GPUdi() tpccf::Pad pad() const { return gpad % TPC_PADS_PER_ROW_PADDED - GPUCF_PADDING_PAD; }
   GPUdi() tpccf::TPCFragmentTime time() const { return timePadded - GPUCF_PADDING_TIME; }
+  GPUdi() tpccf::TPCFragmentTime globalTime() const { return timePadded; }
 
  private:
   // Maps the position of a pad given as row and index in that row to a unique
diff --git a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h
index c6a05c46a7642..b7e535a107eac 100644
--- a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h
+++ b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h
@@ -43,6 +43,24 @@ class ClusterAccumulator
   GPUd() void finalize(const ChargePos&, tpccf::Charge, tpccf::TPCTime, const GPUTPCGeometry&);
   GPUd() bool toNative(const ChargePos&, tpccf::Charge, tpc::ClusterNative&, const GPUParam&) const;
 
+  GPUd() void setFull(float qtot, float padMean, float padSigma, float timeMean, float timeSigma, uint8_t splitInTime, uint8_t splitInPad)
+  {
+    mQtot = qtot;
+    mPadMean = padMean;
+    mPadSigma = padSigma;
+    mTimeMean = timeMean;
+    mTimeSigma = timeSigma;
+    mSplitInTime = splitInTime;
+    mSplitInPad = splitInPad;
+  }
+  GPUd() void setQtot(float qtot) { mQtot = qtot; }
+  GPUd() void setPadMean(float padMean) { mPadMean = padMean; }
+  GPUd() void setPadSigma(float padSigma) { mPadSigma = padSigma; }
+  GPUd() void setTimeMean(float timeMean) { mTimeMean = timeMean; }
+  GPUd() void setTimeSigma(float timeSigma) { mTimeSigma = timeSigma; }
+  GPUd() void setSplitInTime(uint8_t splitInTime) { mSplitInTime = splitInTime; }
+  GPUd() void setSplitInPad(uint8_t splitInPad) { mSplitInPad = splitInPad; }
+
  private:
   float mQtot = 0;
   float mPadMean = 0;
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h
index d4838dda26fdd..af5315ddae4ac 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h
@@ -19,6 +19,10 @@
 #include "GPUProcessor.h"
 #include "GPUDataTypes.h"
 #include "CfFragment.h"
+#include "ML/OrtInterface.h"
+#include "ML/3rdparty/GPUORTFloat16.h"
+
+using namespace o2::ml;
 
 namespace o2
 {
@@ -141,6 +145,20 @@ class GPUTPCClusterFinder : public GPUProcessor
   int16_t mZSOffsetId = -1;
   int16_t mOutputId = -1;
 
+  int nnClusterizerSizeInputRow = 3;
+  int nnClusterizerSizeInputPad = 3;
+  int nnClusterizerSizeInputTime = 3;
+  int nnClusterizerElementSize = -1;
+  bool nnClusterizerAddIndexData = true;
+  float nnClassThreshold = 0.16;
+  bool nnSigmoidTrafoClassThreshold = 1;
+  int nnClusterizerUseCFregression = 0;
+  int nnClusterizerBatchedMode = 1;
+  int nnClusterizerVerbosity = 0;
+
+  std::unordered_map<std::string, std::string> OrtOptions;
+  OrtModel model_class, model_reg_1, model_reg_2; // model_reg_2 is used for splitting clusters
+
 #ifndef GPUCA_GPUCODE
   void DumpDigits(std::ostream& out);
   void DumpChargeMap(std::ostream& out, std::string_view);
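For context, the new members are driven through O2's OrtInterface wrapper. A sketch of the access pattern, mirroring how the chain code above uses it (option keys and call shapes are taken from this PR; the exact OrtInterface signatures may differ, and the model path is a placeholder):

#include <string>
#include <unordered_map>
#include <vector>
#include "ML/OrtInterface.h" // provides o2::ml::OrtModel

void ortModelSketch()
{
  std::unordered_map<std::string, std::string> opts = {
    {"model-path", "network_class.onnx"}, // placeholder file
    {"device", "CPU"},
    {"dtype", "fp32"}};
  o2::ml::OrtModel model;
  model.init(opts);

  std::vector<float> input(346, 0.f);              // one 7x7x7 window + 3 index values
  std::vector<float> out = model.inference(input); // output shape via getNumOutputNodes()
  (void)out;
}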
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
new file mode 100644
index 0000000000000..ba8fac2a397e9
--- /dev/null
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
@@ -0,0 +1,655 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file GPUTPCNNClusterizer.cxx
+/// \author Christian Sonnabend
+
+#include "GPUTPCNNClusterizer.h"
+
+#include "CfConsts.h"
+#include "CfUtils.h"
+#include "ClusterAccumulator.h"
+#if !defined(GPUCA_GPUCODE)
+#include "GPUHostDataTypes.h"
+#include "MCLabelAccumulator.h"
+#endif
+
+using namespace GPUCA_NAMESPACE::gpu;
+using namespace GPUCA_NAMESPACE::gpu::tpccf;
+
+template <>
+GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char onlyMC)
+{
+  Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
+  CPU_ONLY(
+    MCLabelAccumulator labelAcc(clusterer));
+
+  tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
+
+  if (clusterer.OrtOptions["dtype"].find("32") != std::string::npos) {
+    GPUTPCNNClusterizer::nn_clusterizer<float>(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
+  } else if (clusterer.OrtOptions["dtype"].find("16") != std::string::npos) {
+    GPUTPCNNClusterizer::nn_clusterizer<OrtDataType::Float16_t>(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
+  } else {
+    LOG(fatal) << "Unsupported data type for neural network clusterizer!";
+  }
+}
+
+int GPUTPCNNClusterizer::padOffset(int row_ref, int row_current, const GPUTPCGeometry& geo)
+{
+  return (int)((geo.NPads(row_current) - geo.NPads(row_ref)) / 2);
+}
+
+int GPUTPCNNClusterizer::rowOffset(int row, int global_shift)
+{
+  return (row > 62 ? global_shift : 0);
+}
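To illustrate the two helpers just defined: padOffset re-centers the input window when neighbouring rows have different pad counts, and rowOffset shifts rows above the IROC/OROC boundary (row 62) so the gap appears explicitly in the input. A toy check with hypothetical pad counts (illustration only, not the real TPC geometry):

#include <cassert>

int nPadsToy(int row) { return (row <= 62) ? 70 : 80; } // stand-in for geo.NPads(row)

int padOffsetToy(int rowRef, int rowCur) { return (nPadsToy(rowCur) - nPadsToy(rowRef)) / 2; }
int rowOffsetToy(int row, int globalShift) { return row > 62 ? globalShift : 0; }

int main()
{
  // Crossing from a 70-pad row to an 80-pad row shifts the window by 5 pads,
  // so the same pad index stays centered over the same local position.
  assert(padOffsetToy(62, 63) == 5);
  // With a global shift of 3, row 63 is addressed as row 66 in the shifted
  // coordinates; the rows in between read as boundary (charge -1).
  assert(rowOffsetToy(63, 3) == 3 && rowOffsetToy(62, 3) == 0);
  return 0;
}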
+
+// ---------------------------------
+bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const GPUTPCGeometry& geo)
+{
+  if (pad < 0 || row < 0) { // Faster short-circuit
+    return true;
+  } else if (row <= 62) {
+    if (pad < 0 || pad > geo.NPads(row)) {
+      return true;
+    } else {
+      return false;
+    }
+  } else if (row <= 62 + global_shift) { // Accounts for the gap between IROC and OROC. The charge is set to -1 to signal the boundary to the neural network.
+    return true;
+  } else if (row <= o2::tpc::constants::MAXGLOBALPADROW - 1 + global_shift) {
+    if (pad < 0 || pad > geo.NPads(row - global_shift)) { // Shifted rows are mapped back for the geometry lookup
+      return true;
+    } else {
+      return false;
+    }
+  } else {
+    return true;
+  }
+}
+
+template <class T>
+GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int iBlock, int iThread,
+                                                processorType& clusterer,
+                                                const CfFragment& fragment,
+                                                GPUSharedMemory& smem,
+                                                const Array2D<PackedCharge>& chargeMap,
+                                                const ChargePos* filteredPeakPositions,
+                                                const GPUSettingsRec& calib,
+                                                MCLabelAccumulator* labelAcc,
+                                                uint clusternum,
+                                                uint maxClusterPerRow,
+                                                uint* clusterInRow,
+                                                tpc::ClusterNative* clusterByRow,
+                                                uint* clusterPosInRow)
+{
+  uint glo_idx = get_global_id(0) * clusterer.nnClusterizerBatchedMode;
+  if (glo_idx >= clusternum) {
+    return;
+  }
+
+  std::vector<float> central_charges(clusterer.nnClusterizerBatchedMode, -1.f);
+  std::vector<T> input_data(clusterer.nnClusterizerElementSize * clusterer.nnClusterizerBatchedMode, (T)-1.f);
+  std::vector<ChargePos> peak_positions(clusterer.nnClusterizerBatchedMode);
+  unsigned int write_idx = 0;
+
+  for (int batch_counter = 0; batch_counter < clusterer.nnClusterizerBatchedMode; batch_counter++) {
+    uint cls = CAMath::Min(glo_idx + batch_counter, clusternum - 1);
+
+    ChargePos peak = clusterer.mPfilteredPeakPositions[cls];
+    int row = peak.row(), pad = peak.pad(), time = peak.time();
+    float central_charge = chargeMap[peak].unpack();
+
+    peak_positions[batch_counter] = peak;
+    central_charges[batch_counter] = central_charge;
+
+    for (int r = -clusterer.nnClusterizerSizeInputRow; r <= clusterer.nnClusterizerSizeInputRow; r++) {
+      int pad_offset = GPUTPCNNClusterizer::padOffset(row, row + r, clusterer.Param().tpcGeometry);
+      int row_offset = GPUTPCNNClusterizer::rowOffset(row, clusterer.nnClusterizerSizeInputRow);
+      for (int p = -clusterer.nnClusterizerSizeInputPad; p <= clusterer.nnClusterizerSizeInputPad; p++) {
+        bool is_boundary = GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p + pad_offset, clusterer.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry);
+        for (int t = -clusterer.nnClusterizerSizeInputTime; t <= clusterer.nnClusterizerSizeInputTime; t++) {
+          bool push_mc_label = (r == 0) && (std::abs(p) < 2) && (std::abs(t) < 2); // Collect MC labels only from the inner 5x5 (pad x time) window
+          if (!is_boundary) {
+            ChargePos tmp_pos(row + r, pad + p + pad_offset, time + t);
+            input_data[write_idx] = (T)(chargeMap[tmp_pos].unpack() / central_charge);
+            if (push_mc_label) {
+              ChargePos tmp_pos_mc(row, pad + p, time + t);
+              CPU_ONLY(labelAcc->collect(tmp_pos, chargeMap[tmp_pos_mc].unpack()));
+            }
+          }
+          write_idx++;
+        }
+      }
+    }
+    if (clusterer.nnClusterizerAddIndexData) {
+      input_data[write_idx] = (T)(clusterer.mISlice / 36.f);
+      input_data[write_idx + 1] = (T)(row / 152.f);
+      input_data[write_idx + 2] = (T)((float)pad / clusterer.Param().tpcGeometry.NPads(row));
+      write_idx += 3;
+    }
+  }
+
+  std::vector<int> index_class_2;
+  std::vector<float> out_class = clusterer.model_class.inference(input_data);
+  int num_output_classes = clusterer.model_class.getNumOutputNodes()[0][1];
+
+  if (num_output_classes > 1) {
+    std::vector<float> tmp_out_class(clusterer.nnClusterizerBatchedMode);
+    for (int cls_idx = 0; cls_idx < clusterer.nnClusterizerBatchedMode; cls_idx++) {
+      auto elem_iterator = out_class.begin() + (unsigned int)(cls_idx * num_output_classes);
+      tmp_out_class[cls_idx] = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + num_output_classes)) - 1; // -1 since a 2-class classifier has 3 outputs: classes 0, 1, 2
+      if (tmp_out_class[cls_idx] > 1) {
+        index_class_2.push_back(cls_idx);
+      }
+    }
+    out_class = tmp_out_class;
+  }
+
+  if (!clusterer.nnClusterizerUseCFregression) {
+
+    std::vector<float> out_reg = clusterer.model_reg_1.inference(input_data), tmp_out_reg_2;
+    if (index_class_2.size() > 0) {
+      // Gather the inputs of all candidates classified as two overlapping clusters
+      std::vector<T> tmp_in_reg_2(index_class_2.size() * clusterer.nnClusterizerElementSize);
+      int fill_counter = 0;
+      for (int cls_idx : index_class_2) {
+        int from_idx = cls_idx * clusterer.nnClusterizerElementSize, to_idx = fill_counter * clusterer.nnClusterizerElementSize;
+        for (int reg_idx = 0; reg_idx < clusterer.nnClusterizerElementSize; reg_idx++) {
+          tmp_in_reg_2[to_idx + reg_idx] = input_data[from_idx + reg_idx];
+        }
+        fill_counter++;
+      }
+      tmp_out_reg_2 = clusterer.model_reg_2.inference(tmp_in_reg_2);
+    }
+
+    input_data.clear();
+
+    if ((clusterer.nnClusterizerVerbosity >= 4) && glo_idx == 0) {
+      LOG(info) << "[CF] Classification model: " << out_class[0] << " (>? " << clusterer.nnClassThreshold << ")";
+      LOG(info) << "[CF] Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4];
+    }
+
+    int num_outputs_1 = clusterer.model_reg_1.getNumOutputNodes()[0][1], num_outputs_2 = 0, counter_class_2_idcs = 0;
+    if (num_output_classes > 1) {
+      num_outputs_2 = clusterer.model_reg_2.getNumOutputNodes()[0][1];
+    }
+
+    for (int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) {
+
+      if (glo_idx + element >= clusternum) {
+        return;
+      }
+
+      int model_output_index = element * num_outputs_1;
+      if (out_class[element] > clusterer.nnClassThreshold) {
+        if ((num_output_classes == 1) || ((num_output_classes > 1) && (out_class[element] < 2))) {
+          ClusterAccumulator pc;
+          ClusterAccumulator dummy_pc;
+          CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+
+          // Dummy build to push MC labels
+          buildCluster(
+            calib,
+            chargeMap,
+            peak_positions[element],
+            smem.posBcast,
+            smem.buf,
+            smem.innerAboveThreshold,
+            &dummy_pc,
+            labelAcc);
+
+          if (fragment.isOverlap(peak_positions[element].time())) {
+            if (clusterPosInRow) {
+              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+            }
+            continue;
+          }
+
+          // Single-cluster head: qtot ratio [4], pad offset [0], sigma pad [2], time offset [1], sigma time [3]
+          pc.setFull(central_charges[element] * out_reg[model_output_index + 4], peak_positions[element].pad() + out_reg[model_output_index + 0], out_reg[model_output_index + 2], fragment.start + peak_positions[element].time() + out_reg[model_output_index + 1], out_reg[model_output_index + 3], 0, 0);
+
+          tpc::ClusterNative myCluster;
+          bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param());
+          if (rejectCluster) {
+            if (clusterer.nnClusterizerVerbosity > 3) {
+              LOG(warning) << "[CF] Cluster rejected!";
+            }
+            if (clusterPosInRow) {
+              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+            }
+            continue;
+          }
+
+          uint rowIndex = 0;
+          if (clusterByRow != nullptr) {
+            rowIndex = sortIntoBuckets(
+              clusterer,
+              myCluster,
+              peak_positions[element].row(),
+              maxClusterPerRow,
+              clusterInRow,
+              clusterByRow);
+            if (clusterPosInRow != nullptr) {
+              clusterPosInRow[glo_idx + element] = rowIndex;
+            }
+          } else if (clusterPosInRow) {
+            rowIndex = clusterPosInRow[glo_idx + element];
+          }
+          CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow));
+        } else {
+          model_output_index = index_class_2[counter_class_2_idcs] * num_outputs_2;
+          counter_class_2_idcs++;
+
+          // Cluster 1. Two-cluster head: outputs grouped by property, clusters interleaved:
+          // pad offsets [0..1], time offsets [2..3], sigma pads [4..5], sigma times [6..7], qtot ratios [8..9]
+          CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+          ClusterAccumulator pc;
+
+          if (fragment.isOverlap(peak_positions[element].time())) {
+            if (clusterPosInRow) {
+              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+            }
+            continue;
+          }
+
+          pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 8], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 0], tmp_out_reg_2[model_output_index + 4], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 2], tmp_out_reg_2[model_output_index + 6], 0, 0);
+
+          tpc::ClusterNative myCluster;
+          bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param());
+          if (rejectCluster) {
+            if (clusterer.nnClusterizerVerbosity > 3) {
+              LOG(warning) << "[CF] Cluster rejected!";
+            }
+            if (clusterPosInRow) {
+              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+            }
+            continue;
+          }
+
+          uint rowIndex = 0;
+          if (clusterByRow != nullptr) {
+            rowIndex = sortIntoBuckets(
+              clusterer,
+              myCluster,
+              peak_positions[element].row(),
+              maxClusterPerRow,
+              clusterInRow,
+              clusterByRow);
+            if (clusterPosInRow != nullptr) {
+              clusterPosInRow[glo_idx + element] = rowIndex;
+            }
+          } else if (clusterPosInRow) {
+            rowIndex = clusterPosInRow[glo_idx + element];
+          }
+          CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow));
+
+          // Cluster 2
+          CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+          pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 9], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 1], tmp_out_reg_2[model_output_index + 5], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 3], tmp_out_reg_2[model_output_index + 7], 0, 0);
+          rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param());
+          if (rejectCluster) {
+            if (clusterer.nnClusterizerVerbosity > 3) {
+              LOG(warning) << "[CF] Cluster rejected!";
+            }
+            if (clusterPosInRow) {
+              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+            }
+            continue;
+          }
+
+          rowIndex = 0;
+          if (clusterByRow != nullptr) {
+            rowIndex = sortIntoBuckets(
+              clusterer,
+              myCluster,
+              peak_positions[element].row(),
+              maxClusterPerRow,
+              clusterInRow,
+              clusterByRow);
+            if (clusterPosInRow != nullptr) {
+              clusterPosInRow[glo_idx + element] = rowIndex;
+            }
+          } else if (clusterPosInRow) {
+            rowIndex = clusterPosInRow[glo_idx + element];
+          }
+          CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow));
+        }
+      }
+    }
+
+  } else {
+
+    input_data.clear();
+    for (int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) {
+      if (glo_idx + element >= clusternum) {
+        return;
+      }
+
+      if (out_class[element] > clusterer.nnClassThreshold) {
+
+        ClusterAccumulator pc;
+        CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+
+        buildCluster(
+          calib,
+          chargeMap,
+          peak_positions[element],
+          smem.posBcast,
+          smem.buf,
+          smem.innerAboveThreshold,
+          &pc,
+          labelAcc);
+
+        if (fragment.isOverlap(peak_positions[element].time())) {
+          if (clusterPosInRow) {
+            clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+          }
+          continue;
+        }
+        pc.finalize(peak_positions[element], central_charges[element], fragment.start, clusterer.Param().tpcGeometry);
+
+        tpc::ClusterNative myCluster;
+        bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param());
+
+        if (rejectCluster) {
+          if (clusterer.nnClusterizerVerbosity > 3) {
+            LOG(warning) << "[CF] Cluster rejected!";
+          }
+          if (clusterPosInRow) {
+            clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+          }
+          continue;
+        }
+
+        uint rowIndex = 0;
+        if (clusterByRow != nullptr) {
+          rowIndex = sortIntoBuckets(
+            clusterer,
+            myCluster,
+            peak_positions[element].row(),
+            maxClusterPerRow,
+            clusterInRow,
+            clusterByRow);
+          if (clusterPosInRow != nullptr) {
+            clusterPosInRow[glo_idx + element] = rowIndex;
+          }
+        } else if (clusterPosInRow) {
+          rowIndex = clusterPosInRow[glo_idx + element];
+        }
+
+        CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow));
+      }
+    }
+  }
+
+  if (clusterer.nnClusterizerVerbosity > 4) {
+    LOG(info) << "[CF] Clusterization done!";
+  }
+}
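To summarize the indexing used in nn_clusterizer above: the single-cluster regression head is read as five values per candidate, all relative to the peak position; the two-cluster head is read as ten, grouped by property with the two clusters interleaved. A small sketch of the single-cluster unpacking (names are illustrative; the indices are those of the setFull calls above):

#include <vector>

struct NNRegOutput {
  float padOffset;  // out_reg[i + 0], added to the peak pad
  float timeOffset; // out_reg[i + 1], added to fragment.start + peak time
  float sigmaPad;   // out_reg[i + 2]
  float sigmaTime;  // out_reg[i + 3]
  float qtotRatio;  // out_reg[i + 4], multiplied by the central charge
};

NNRegOutput unpackSingle(const std::vector<float>& outReg, int element, int numOutputs)
{
  int i = element * numOutputs; // numOutputs == 5 for the single-cluster head
  return {outReg[i + 0], outReg[i + 1], outReg[i + 2], outReg[i + 3], outReg[i + 4]};
}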
+
+GPUdii() void GPUTPCNNClusterizer::computeClustersImpl(int nBlocks, int nThreads, int iBlock, int iThread,
+                                                       processorType& clusterer,
+                                                       const CfFragment& fragment,
+                                                       GPUSharedMemory& smem,
+                                                       const Array2D<PackedCharge>& chargeMap,
+                                                       const ChargePos* filteredPeakPositions,
+                                                       const GPUSettingsRec& calib,
+                                                       MCLabelAccumulator* labelAcc,
+                                                       uint clusternum,
+                                                       uint maxClusterPerRow,
+                                                       uint* clusterInRow,
+                                                       tpc::ClusterNative* clusterByRow,
+                                                       uint* clusterPosInRow)
+{
+  uint idx = get_global_id(0);
+
+  // For certain configurations dummy work items are added, so the total
+  // number of work items is divisible by 64.
+  // These dummy items also compute the last cluster but discard the result.
+  ChargePos pos = filteredPeakPositions[CAMath::Min(idx, clusternum - 1)];
+  Charge charge = chargeMap[pos].unpack();
+
+  ClusterAccumulator pc;
+  CPU_ONLY(labelAcc->collect(pos, charge));
+
+  buildCluster(
+    calib,
+    chargeMap,
+    pos,
+    smem.posBcast,
+    smem.buf,
+    smem.innerAboveThreshold,
+    &pc,
+    labelAcc);
+
+  if (idx >= clusternum) {
+    return;
+  }
+  if (fragment.isOverlap(pos.time())) {
+    if (clusterPosInRow) {
+      clusterPosInRow[idx] = maxClusterPerRow;
+    }
+    return;
+  }
+  pc.finalize(pos, charge, fragment.start, clusterer.Param().tpcGeometry);
+
+  tpc::ClusterNative myCluster;
+  bool rejectCluster = !pc.toNative(pos, charge, myCluster, clusterer.Param());
+
+  if (rejectCluster) {
+    if (clusterPosInRow) {
+      clusterPosInRow[idx] = maxClusterPerRow;
+    }
+    return;
+  }
+
+  uint rowIndex = 0;
+  if (clusterByRow != nullptr) {
+    rowIndex = sortIntoBuckets(
+      clusterer,
+      myCluster,
+      pos.row(),
+      maxClusterPerRow,
+      clusterInRow,
+      clusterByRow);
+    if (clusterPosInRow != nullptr) {
+      clusterPosInRow[idx] = rowIndex;
+    }
+  } else if (clusterPosInRow) {
+    rowIndex = clusterPosInRow[idx];
+  }
+
+  CPU_ONLY(labelAcc->commit(pos.row(), rowIndex, maxClusterPerRow));
+}
+
+GPUdii() void GPUTPCNNClusterizer::updateClusterInner(
+  const GPUSettingsRec& calib,
+  ushort lid,
+  ushort N,
+  const PackedCharge* buf,
+  const ChargePos& pos,
+  ClusterAccumulator* cluster,
+  MCLabelAccumulator* labelAcc,
+  uint8_t* innerAboveThreshold)
+{
+  uint8_t aboveThreshold = 0;
+
+  GPUCA_UNROLL(U(), U())
+  for (ushort i = 0; i < N; i++) {
+    Delta2 d = cfconsts::InnerNeighbors[i];
+
+    PackedCharge p = buf[N * lid + i];
+
+    Charge q = cluster->updateInner(p, d);
+
+    CPU_ONLY(
+      labelAcc->collect(pos.delta(d), q));
+
+    aboveThreshold |= (uint8_t(q > calib.tpc.cfInnerThreshold) << i);
+  }
+
+  innerAboveThreshold[lid] = aboveThreshold;
+
+  GPUbarrier();
+}
+
+GPUdii() void GPUTPCNNClusterizer::updateClusterOuter(
+  ushort lid,
+  ushort N,
+  ushort M,
+  ushort offset,
+  const PackedCharge* buf,
+  const ChargePos& pos,
+  ClusterAccumulator* cluster,
+  MCLabelAccumulator* labelAcc)
+{
+  GPUCA_UNROLL(U(), U())
+  for (ushort i = offset; i < M + offset; i++) {
+    PackedCharge p = buf[N * lid + i];
+
+    Delta2 d = cfconsts::OuterNeighbors[i];
+
+    Charge q = cluster->updateOuter(p, d);
+    static_cast<void>(q); // Avoid unused variable warning on GPU.
+
+    CPU_ONLY(
+      labelAcc->collect(pos.delta(d), q));
+  }
+}
+
+GPUdii() void GPUTPCNNClusterizer::buildCluster(
+  const GPUSettingsRec& calib,
+  const Array2D<PackedCharge>& chargeMap,
+  ChargePos pos,
+  ChargePos* posBcast,
+  PackedCharge* buf,
+  uint8_t* innerAboveThreshold,
+  ClusterAccumulator* myCluster,
+  MCLabelAccumulator* labelAcc)
+{
+  ushort ll = get_local_id(0);
+
+  posBcast[ll] = pos;
+  GPUbarrier();
+
+  CfUtils::blockLoad(
+    chargeMap,
+    SCRATCH_PAD_WORK_GROUP_SIZE,
+    SCRATCH_PAD_WORK_GROUP_SIZE,
+    ll,
+    0,
+    8,
+    cfconsts::InnerNeighbors,
+    posBcast,
+    buf);
+  updateClusterInner(
+    calib,
+    ll,
+    8,
+    buf,
+    pos,
+    myCluster,
+    labelAcc,
+    innerAboveThreshold);
+
+  ushort wgSizeHalf = (SCRATCH_PAD_WORK_GROUP_SIZE + 1) / 2;
+
+  bool inGroup1 = ll < wgSizeHalf;
+
+  ushort llhalf = (inGroup1) ? ll : (ll - wgSizeHalf);
+
+  CfUtils::condBlockLoad(
+    chargeMap,
+    wgSizeHalf,
+    SCRATCH_PAD_WORK_GROUP_SIZE,
+    ll,
+    0,
+    16,
+    cfconsts::OuterNeighbors,
+    posBcast,
+    innerAboveThreshold,
+    buf);
+
+  if (inGroup1) {
+    updateClusterOuter(
+      llhalf,
+      16,
+      16,
+      0,
+      buf,
+      pos,
+      myCluster,
+      labelAcc);
+  }
+
+#if defined(GPUCA_GPUCODE)
+  CfUtils::condBlockLoad<PackedCharge, true>(
+    chargeMap,
+    wgSizeHalf,
+    SCRATCH_PAD_WORK_GROUP_SIZE,
+    ll,
+    0,
+    16,
+    cfconsts::OuterNeighbors,
+    posBcast + wgSizeHalf,
+    innerAboveThreshold + wgSizeHalf,
+    buf);
+  if (!inGroup1) {
+    updateClusterOuter(
+      llhalf,
+      16,
+      16,
+      0,
+      buf,
+      pos,
+      myCluster,
+      labelAcc);
+  }
+#endif
+}
+
+GPUd() uint GPUTPCNNClusterizer::sortIntoBuckets(processorType& clusterer, const tpc::ClusterNative& cluster, uint row, uint maxElemsPerBucket, uint* elemsInBucket, tpc::ClusterNative* buckets)
+{
+  uint index = CAMath::AtomicAdd(&elemsInBucket[row], 1u);
+  if (index < maxElemsPerBucket) {
+    buckets[maxElemsPerBucket * row + index] = cluster;
+  } else {
+    clusterer.raiseError(GPUErrors::ERROR_CF_ROW_CLUSTER_OVERFLOW, clusterer.mISlice * 1000 + row, index, maxElemsPerBucket);
+    CAMath::AtomicExch(&elemsInBucket[row], maxElemsPerBucket);
+  }
+  return index;
+}
\ No newline at end of file
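sortIntoBuckets at the end of the file above reserves one output slot per row with an atomic increment and clamps the counter back when a row overflows, so the counter cannot grow without bound while the error is reported. A single-threaded sketch of the same bookkeeping, with std::atomic standing in for CAMath::AtomicAdd/AtomicExch:

#include <atomic>
#include <cassert>
#include <cstdint>

uint32_t reserveSlot(std::atomic<uint32_t>& elemsInRow, uint32_t maxElemsPerBucket)
{
  uint32_t index = elemsInRow.fetch_add(1); // reserve a slot
  if (index >= maxElemsPerBucket) {
    elemsInRow.store(maxElemsPerBucket); // clamp on overflow; the caller raises the error
  }
  return index; // the bucket is only written when index < maxElemsPerBucket
}

int main()
{
  std::atomic<uint32_t> n{0};
  assert(reserveSlot(n, 2) == 0 && reserveSlot(n, 2) == 1);
  assert(reserveSlot(n, 2) >= 2 && n.load() == 2); // overflow clamped
  return 0;
}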
+ +/// \file GPUTPCNNClusterizer.h +/// \author Christian Sonnabend + +#ifndef O2_GPU_NN_CLUSTERIZER_H +#define O2_GPU_NN_CLUSTERIZER_H + +#include "clusterFinderDefs.h" +#include "GPUGeneralKernels.h" +#include "GPUConstantMem.h" +#include "GPUTPCClusterFinder.h" +#include "Array2D.h" +#include "PackedCharge.h" + +namespace o2::tpc +{ +struct ClusterNative; +} // namespace o2::tpc + +namespace GPUCA_NAMESPACE::gpu +{ + +class ClusterAccumulator; +class MCLabelAccumulator; + +class GPUTPCNNClusterizer : public GPUKernelTemplate +{ + public: + static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizer); + struct GPUSharedMemory { + ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; + PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_BUILD_N]; + uint8_t innerAboveThreshold[SCRATCH_PAD_WORK_GROUP_SIZE]; + }; + +#ifdef GPUCA_HAVE_O2HEADERS + typedef GPUTPCClusterFinder processorType; + GPUhdi() static processorType* Processor(GPUConstantMem& processors) + { + return processors.tpcClusterer; + } +#endif + + GPUhdi() CONSTEXPR static GPUDataTypes::RecoStep GetRecoStep() + { + return GPUDataTypes::RecoStep::TPCClusterFinding; + } + + template + GPUd() static void Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char); + + static GPUd() void computeClustersImpl(int, int, int, int, processorType&, const CfFragment&, GPUSharedMemory&, const Array2D&, const ChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint, uint, uint*, tpc::ClusterNative*, uint*); + + static GPUd() void exec(int, int, int, int, GPUSharedMemory&, processorType&, char); + static int padOffset(int, int, const GPUTPCGeometry&); + static int rowOffset(int, int); + static bool isBoundary(int, int, int, const GPUTPCGeometry&); + + template + static GPUd() void nn_clusterizer(int, int, int, int, + processorType&, + const CfFragment&, + GPUSharedMemory&, + const Array2D&, + const ChargePos*, + const GPUSettingsRec&, + MCLabelAccumulator*, + uint, + uint, + uint*, + tpc::ClusterNative*, + uint*); + + private: + // --------------------------------- + + static GPUd() void updateClusterInner(const GPUSettingsRec&, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uint8_t*); + + static GPUd() void updateClusterOuter(ushort, ushort, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*); + + static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D&, ChargePos, ChargePos*, PackedCharge*, uint8_t*, ClusterAccumulator*, MCLabelAccumulator*); + + static GPUd() uint sortIntoBuckets(processorType&, const tpc::ClusterNative&, uint, uint, uint*, tpc::ClusterNative*); +}; + +} // namespace GPUCA_NAMESPACE::gpu + +#endif \ No newline at end of file diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index f028c6990f267..f5d94562cf05d 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -118,6 +118,7 @@ o2_gpu_add_kernel("GPUTPCCFPeakFinder" "= TPCCLUS o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression" "= TPCCLUSTERFINDER" LB single) o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks" "= TPCCLUSTERFINDER" LB single) o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB single) +o2_gpu_add_kernel("GPUTPCNNClusterizer" "= TPCCLUSTERFINDER" LB single int8_t onlyMC) o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUSTERFINDER" LB single int8_t onlyMC) 
o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets" "= TPCCLUSTERFINDER" NO single) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, flatten" "= TPCCLUSTERFINDER" NO single GPUTPCLinearLabels* out)