AliceO2Group · ChSonnabend · May 16, 2024 · May 24, 2024 · May 27, 2024 · May 27, 2024
@@ -197,6 +197,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2" OR GPUCA_CONFIG_O2_EXTENSIONS)
         TPCClusterFinder/GPUTPCCFChargeMapFiller.cxx
         TPCClusterFinder/GPUTPCCFPeakFinder.cxx
         TPCClusterFinder/GPUTPCCFNoiseSuppression.cxx
+        TPCClusterFinder/GPUTPCNNClusterizer.cxx
         TPCClusterFinder/GPUTPCCFClusterizer.cxx
         TPCClusterFinder/GPUTPCCFDeconvolution.cxx
         TPCClusterFinder/GPUTPCCFMCLabelFlattener.cxx
@@ -307,6 +308,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2")
                                        O2::GPUCommon
                                        O2::ReconstructionDataFormats
                                        O2::TPCFastTransformation
+                                       O2::ML
                  PRIVATE_LINK_LIBRARIES O2::DataFormatsTPC
                  SOURCES ${SRCS_DATATYPES})
   target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAVE_O2HEADERS)

@@ -81,6 +81,7 @@
   #define GPUCA_LB_GPUTPCCFNoiseSuppression 512
   #define GPUCA_LB_GPUTPCCFDeconvolution 512
   #define GPUCA_LB_GPUTPCCFClusterizer 448
+  #define GPUCA_LB_GPUTPCNNClusterizer 448
   #define GPUCA_LB_COMPRESSION_GATHER 1024
   #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5
   #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
@@ -147,6 +148,7 @@
   #define GPUCA_LB_GPUTPCCFNoiseSuppression 512
   #define GPUCA_LB_GPUTPCCFDeconvolution 512
   #define GPUCA_LB_GPUTPCCFClusterizer 512
+  #define GPUCA_LB_GPUTPCNNClusterizer 512
   #define GPUCA_LB_COMPRESSION_GATHER 1024
   #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5
   #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
@@ -213,6 +215,7 @@
   #define GPUCA_LB_GPUTPCCFNoiseSuppression 448
   #define GPUCA_LB_GPUTPCCFDeconvolution 384
   #define GPUCA_LB_GPUTPCCFClusterizer 448
+  #define GPUCA_LB_GPUTPCNNClusterizer 448
   #define GPUCA_LB_COMPRESSION_GATHER 1024
   #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4
   #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
@@ -489,6 +492,9 @@
   #ifndef GPUCA_LB_GPUTPCCFClusterizer
     #define GPUCA_LB_GPUTPCCFClusterizer 512
   #endif
+  #ifndef GPUCA_LB_GPUTPCNNClusterizer
+    #define GPUCA_LB_GPUTPCNNClusterizer 512
+  #endif
   #ifndef GPUCA_LB_GPUTrackingRefitKernel_mode0asGPU
     #define GPUCA_LB_GPUTrackingRefitKernel_mode0asGPU 256
   #endif

@@ -302,6 +302,26 @@ AddOption(printSettings, bool, false, "", 0, "Print all settings when initializi
 AddVariable(eventDisplay, GPUCA_NAMESPACE::gpu::GPUDisplayFrontendInterface*, nullptr)
 AddSubConfig(GPUSettingsProcessingRTC, rtc)
 AddSubConfig(GPUSettingsProcessingParam, param)
+// Settings of the neural-network TPC clusterizer and of the ONNX Runtime inference behind it
+AddOption(applyNNclusterizer, int, 0, "", 0, "(bool, default = 0), if the neural network clusterizer should be used.")
+AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Specify inference device (cpu (default), rocm, cuda)")
+AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Specify inference device id")
+AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0), if the device memory should be allocated for inference")
+AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16
+AddOption(nnInferenceThreadsPerNN, int, 0, "", 0, "Number of threads used to evaluate one neural network")
+AddOption(nnInferenceEnableOrtOptimization, unsigned int, 1, "", 0, "Enables graph optimizations in ONNX Runtime. Can be greater than 1!")
+AddOption(nnInferenceOrtProfiling, int, 0, "", 0, "Enables profiling of model execution in ONNX Runtime")
+AddOption(nnInferenceOrtProfilingPath, std::string, ".", "", 0, "If nnInferenceOrtProfiling is set, the path to store the profiling data")
+AddOption(nnInferenceVerbosity, int, 1, "", 0, "0: No messages; 1: Warnings; 2: Warnings + major debugs; >3: All debugs")
+AddOption(nnClusterizerAddIndexData, int, 1, "", 0, "If normalized index data (sector, row, pad) should be appended to the input")
+AddOption(nnClusterizerSizeInputRow, int, 3, "", 0, "Size of the input to the NN in row direction (currently calculated as (length-1)/2)")
+AddOption(nnClusterizerSizeInputPad, int, 3, "", 0, "Size of the input to the NN in pad direction (currently calculated as (length-1)/2)")
+AddOption(nnClusterizerSizeInputTime, int, 3, "", 0, "Size of the input to the NN in time direction (currently calculated as (length-1)/2)")
+AddOption(nnClusterizerUseCFregression, int, 0, "", 0, "(bool, default = false) If true, use the regression from the native clusterizer and not the NN")
+AddOption(nnClusterizerBatchedMode, unsigned int, 1, "", 0, "(int, default = 1) If >1, the NN is evaluated on batched input of size specified in this variable")
+AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path")
+AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.")
+AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path")
+AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as activation function in the last layer or not).")
 AddHelp("help", 'h')
 EndConfig()
 #endif // __OPENCL__

@@ -12,6 +12,8 @@
 /// \file GPUChainTrackingClusterizer.cxx
 /// \author David Rohr
 
+#include <CommonUtils/StringUtils.h>
+
 #include "GPUChainTracking.h"
 #include "GPUChainTrackingDefs.h"
 #include "GPULogging.h"
@@ -849,8 +851,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         if (clusterer.mPmemory->counters.nPeaks == 0) {
           continue;
         }
-        runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
-        runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::updatePeaks>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
+        if (!GetProcessingSettings().applyNNclusterizer) {
+          runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
+          runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::updatePeaks>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
+        } else {
+          // FIXME: Both branches are currently identical. The noise suppression potentially needs to be removed once the NN is actually applied; for now it is only kept to make the code work.
+          runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
+          runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::updatePeaks>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
+        }
         if (DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 3, clusterer, &GPUTPCClusterFinder::DumpSuppressedPeaks, *mDebugFile)) {
           clusterer.DumpPeakMap(*mDebugFile, "Suppressed Peaks");
         }
@@ -884,14 +892,76 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}});
         DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
 
-        runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSlice}}, 0);
+        // Select between the neural-network clusterizer and the classic cluster finder.
+        if (GetProcessingSettings().applyNNclusterizer) {
+          const auto& nnCfg = GetProcessingSettings();
+
+          // Copy the clusterizer settings onto the clusterer object used by the kernel
+          clusterer.nnClusterizerUseCFregression = nnCfg.nnClusterizerUseCFregression;
+          clusterer.nnClusterizerSizeInputRow = nnCfg.nnClusterizerSizeInputRow;
+          clusterer.nnClusterizerSizeInputPad = nnCfg.nnClusterizerSizeInputPad;
+          clusterer.nnClusterizerSizeInputTime = nnCfg.nnClusterizerSizeInputTime;
+          clusterer.nnClusterizerAddIndexData = nnCfg.nnClusterizerAddIndexData;
+          // Flattened input length per element: (2*row+1) * (2*pad+1) * (2*time+1) values, plus 3 if index data is appended
+          clusterer.nnClusterizerElementSize = ((2 * clusterer.nnClusterizerSizeInputRow + 1) * (2 * clusterer.nnClusterizerSizeInputPad + 1) * (2 * clusterer.nnClusterizerSizeInputTime + 1)) + (clusterer.nnClusterizerAddIndexData ? 3 : 0);
+          clusterer.nnClusterizerBatchedMode = nnCfg.nnClusterizerBatchedMode;
+          clusterer.nnClusterizerVerbosity = nnCfg.nnInferenceVerbosity;
+
+          // Classification threshold settings
+          clusterer.nnClassThreshold = nnCfg.nnClassThreshold;
+          clusterer.nnSigmoidTrafoClassThreshold = nnCfg.nnSigmoidTrafoClassThreshold;
+
+          // Options handed to the model wrappers; "model-path" is overwritten below for each regression network.
+          // NOTE(review): this code sits inside the enclosing processing loop, so the models are initialised again on every pass - consider initialising them once.
+          clusterer.OrtOptions = {
+            {"model-path", nnCfg.nnClassificationPath},
+            {"device", nnCfg.nnInferenceDevice},
+            {"device-id", std::to_string(nnCfg.nnInferenceDeviceId)},
+            {"allocate-device-memory", std::to_string(nnCfg.nnInferenceAllocateDevMem)},
+            {"dtype", nnCfg.nnInferenceDtype},
+            {"intra-op-num-threads", std::to_string(nnCfg.nnInferenceThreadsPerNN)},
+            {"enable-optimizations", std::to_string(nnCfg.nnInferenceEnableOrtOptimization)},
+            {"enable-profiling", std::to_string(nnCfg.nnInferenceOrtProfiling)},
+            {"profiling-output-path", nnCfg.nnInferenceOrtProfilingPath},
+            {"logging-level", std::to_string(nnCfg.nnInferenceVerbosity)}};
+          clusterer.model_class.init(clusterer.OrtOptions);
+          if (!clusterer.nnClusterizerUseCFregression) {
+            // nnRegressionPath holds one path, or two paths separated by ':'.
+            // NOTE(review): reg_model_paths[0] is accessed unchecked; presumably an empty nnRegressionPath yields an empty vector - confirm and reject it at configuration time.
+            const bool singleClassOutput = (clusterer.model_class.getNumOutputNodes()[0][1] == 1);
+            std::vector<std::string> reg_model_paths = o2::utils::Str::tokenize(nnCfg.nnRegressionPath, ':');
+            clusterer.OrtOptions["model-path"] = reg_model_paths[0];
+            clusterer.model_reg_1.init(clusterer.OrtOptions);
+            // The second regression network is only loaded if the classifier output dimension is not 1 and more than one path was given
+            if (!singleClassOutput && reg_model_paths.size() != 1) {
+              clusterer.OrtOptions["model-path"] = reg_model_paths[1];
+              clusterer.model_reg_2.init(clusterer.OrtOptions);
+            }
+          } else {
+            // NOTE(review): GPUTPCCFDeconvolution already ran unconditionally just before this block; confirm that a second pass over the charge map is intended.
+            runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}});
+            DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
+          }
+
+          if (clusterer.nnSigmoidTrafoClassThreshold) {
+            // Inverse sigmoid transformation: log(t / (1 - t))
+            clusterer.nnClassThreshold = static_cast<float>(std::log(clusterer.nnClassThreshold / (1.f - clusterer.nnClassThreshold)));
+          }
+          // One work item per batch of nnClusterizerBatchedMode clusters; the NN clusterizer is launched on the CPU
+          runKernel<GPUTPCNNClusterizer>({GetGrid(std::ceil(clusterer.mPmemory->counters.nClusters / static_cast<float>(clusterer.nnClusterizerBatchedMode)), lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0);
+        } else {
+          // Classic clusterizer: same launch as before the NN option was introduced (no forced CPU device type)
+          runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSlice}}, 0);
+        }
+
         if (doGPU && propagateMCLabels) {
           TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane);
           if (doGPU) {
             SynchronizeStream(lane);
           }
-          runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1);
+          if (!GetProcessingSettings().applyNNclusterizer) {
+            runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1);
+          } else {
+            runKernel<GPUTPCNNClusterizer>({GetGrid(std::ceil(clusterer.mPmemory->counters.nClusters / (float)clusterer.nnClusterizerBatchedMode), lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1);
+          }
         }
+
         if (GetProcessingSettings().debugLevel >= 3) {
           GPUInfo("Sector %02d Fragment %02d Lane %d: Found clusters: digits %u peaks %u clusters %u", iSlice, fragment.index, lane, (int32_t)clusterer.mPmemory->counters.nPositions, (int32_t)clusterer.mPmemory->counters.nPeaks, (int32_t)clusterer.mPmemory->counters.nClusters);
         }

@@ -47,6 +47,7 @@ struct ChargePos {
   GPUdi() tpccf::Row row() const { return gpad / TPC_PADS_PER_ROW_PADDED; }
   GPUdi() tpccf::Pad pad() const { return gpad % TPC_PADS_PER_ROW_PADDED - GPUCF_PADDING_PAD; }
   GPUdi() tpccf::TPCFragmentTime time() const { return timePadded - GPUCF_PADDING_TIME; }
+  GPUdi() tpccf::TPCFragmentTime globalTime() const { return timePadded; } // Returns the stored padded time as is; unlike time(), GPUCF_PADDING_TIME is not subtracted
 
  private:
   // Maps the position of a pad given as row and index in that row to a unique

@@ -43,6 +43,24 @@ class ClusterAccumulator
   GPUd() void finalize(const ChargePos&, tpccf::Charge, tpccf::TPCTime, const GPUTPCGeometry&);
   GPUd() bool toNative(const ChargePos&, tpccf::Charge, tpc::ClusterNative&, const GPUParam&) const;
 
+  /// Overwrites all cluster properties held by this accumulator in one call.
+  /// Values are stored as given; nothing is accumulated or normalised here.
+  GPUd() void setFull(float qtot, float padMean, float padSigma, float timeMean, float timeSigma, uint8_t splitInTime, uint8_t splitInPad)
+  {
+    // Pad direction
+    mPadMean = padMean;
+    mPadSigma = padSigma;
+    mSplitInPad = splitInPad;
+    // Time direction
+    mTimeMean = timeMean;
+    mTimeSigma = timeSigma;
+    mSplitInTime = splitInTime;
+    // Total charge
+    mQtot = qtot;
+  }
+  GPUd() void setQtot(float qtot) { mQtot = qtot; } // Overwrites mQtot with the given value
+  GPUd() void setPadMean(float padMean) { mPadMean = padMean; } // Overwrites mPadMean with the given value
+  GPUd() void setPadSigma(float padSigma) { mPadSigma = padSigma; } // Overwrites mPadSigma with the given value
+  GPUd() void setTimeMean(float timeMean) { mTimeMean = timeMean; } // Overwrites mTimeMean with the given value
+  GPUd() void setTimeSigma(float timeSigma) { mTimeSigma = timeSigma; } // Overwrites mTimeSigma with the given value
+  GPUd() void setSplitInTime(uint8_t splitInTime) { mSplitInTime = splitInTime; } // Overwrites mSplitInTime with the given value
+  GPUd() void setSplitInPad(uint8_t splitInPad) { mSplitInPad = splitInPad; } // Overwrites mSplitInPad with the given value
+
  private:
   float mQtot = 0;
   float mPadMean = 0;

@@ -19,6 +19,10 @@
 #include "GPUProcessor.h"
 #include "GPUDataTypes.h"
 #include "CfFragment.h"
+#include "ML/OrtInterface.h"
+#include "ML/3rdparty/GPUORTFloat16.h"
+
+using namespace o2::ml;
 
 namespace o2
 {
@@ -141,6 +145,20 @@ class GPUTPCClusterFinder : public GPUProcessor
   int16_t mZSOffsetId = -1;
   int16_t mOutputId = -1;
 
+  // Configuration of the neural-network clusterizer. The initialisers are fallbacks only:
+  // RunTPCClusterizer overwrites these members from the processing settings when applyNNclusterizer is set.
+  int nnClusterizerSizeInputRow = 3;
+  int nnClusterizerSizeInputPad = 3;
+  int nnClusterizerSizeInputTime = 3;
+  int nnClusterizerElementSize = -1; // Placeholder until computed from the three input sizes
+  bool nnClusterizerAddIndexData = true;
+  float nnClassThreshold = 0.16f;
+  bool nnSigmoidTrafoClassThreshold = true;
+  int nnClusterizerUseCFregression = 0;
+  int nnClusterizerBatchedMode = 1;
+  int nnClusterizerVerbosity = 0;
+
+  // Key/value options passed to OrtModel::init
+  std::unordered_map<std::string, std::string> OrtOptions;
+  // Classification network and up to two regression networks (for splitting clusters).
+  // Fully qualified so the declarations do not depend on the header-scope using-directive.
+  o2::ml::OrtModel model_class;
+  o2::ml::OrtModel model_reg_1;
+  o2::ml::OrtModel model_reg_2;
+
+
 #ifndef GPUCA_GPUCODE
   void DumpDigits(std::ostream& out);
   void DumpChargeMap(std::ostream& out, std::string_view);