diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt
index dd3480cae86bd..dc34ecebe78e9 100644
--- a/GPU/GPUTracking/CMakeLists.txt
+++ b/GPU/GPUTracking/CMakeLists.txt
@@ -197,6 +197,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2" OR GPUCA_CONFIG_O2_EXTENSIONS)
     TPCClusterFinder/GPUTPCCFChargeMapFiller.cxx
     TPCClusterFinder/GPUTPCCFPeakFinder.cxx
     TPCClusterFinder/GPUTPCCFNoiseSuppression.cxx
+    TPCClusterFinder/GPUTPCNNClusterizer.cxx
     TPCClusterFinder/GPUTPCCFClusterizer.cxx
     TPCClusterFinder/GPUTPCCFDeconvolution.cxx
     TPCClusterFinder/GPUTPCCFMCLabelFlattener.cxx
@@ -307,6 +308,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2")
                  O2::GPUCommon
                  O2::ReconstructionDataFormats
                  O2::TPCFastTransformation
+                 O2::ML
           PRIVATE_LINK_LIBRARIES O2::DataFormatsTPC
           SOURCES ${SRCS_DATATYPES})
   target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAVE_O2HEADERS)
diff --git a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
index 3852d37f6facf..fc8af23c810f8 100644
--- a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
+++ b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
@@ -81,6 +81,7 @@
   #define GPUCA_LB_GPUTPCCFNoiseSuppression 512
   #define GPUCA_LB_GPUTPCCFDeconvolution 512
   #define GPUCA_LB_GPUTPCCFClusterizer 448
+  #define GPUCA_LB_GPUTPCNNClusterizer 448
   #define GPUCA_LB_COMPRESSION_GATHER 1024
   #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5
   #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
@@ -147,6 +148,7 @@
   #define GPUCA_LB_GPUTPCCFNoiseSuppression 512
   #define GPUCA_LB_GPUTPCCFDeconvolution 512
   #define GPUCA_LB_GPUTPCCFClusterizer 512
+  #define GPUCA_LB_GPUTPCNNClusterizer 512
   #define GPUCA_LB_COMPRESSION_GATHER 1024
   #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5
   #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
@@ -213,6 +215,7 @@
   #define GPUCA_LB_GPUTPCCFNoiseSuppression 448
   #define GPUCA_LB_GPUTPCCFDeconvolution 384
   #define GPUCA_LB_GPUTPCCFClusterizer 448
+  #define GPUCA_LB_GPUTPCNNClusterizer 448
   #define GPUCA_LB_COMPRESSION_GATHER 1024
   #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4
   #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
@@ -489,6 +492,9 @@
   #ifndef GPUCA_LB_GPUTPCCFClusterizer
   #define GPUCA_LB_GPUTPCCFClusterizer 512
   #endif
+  #ifndef GPUCA_LB_GPUTPCNNClusterizer
+  #define GPUCA_LB_GPUTPCNNClusterizer 512
+  #endif
   #ifndef GPUCA_LB_GPUTrackingRefitKernel_mode0asGPU
   #define GPUCA_LB_GPUTrackingRefitKernel_mode0asGPU 256
   #endif
diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h
index 07cd320140909..eea8b3f0bbfe7 100644
--- a/GPU/GPUTracking/Definitions/GPUSettingsList.h
+++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -302,6 +302,26 @@ AddOption(printSettings, bool, false, "", 0, "Print all settings when initializi
 AddVariable(eventDisplay, GPUCA_NAMESPACE::gpu::GPUDisplayFrontendInterface*, nullptr)
 AddSubConfig(GPUSettingsProcessingRTC, rtc)
 AddSubConfig(GPUSettingsProcessingParam, param)
+AddOption(applyNNclusterizer, int, 0, "", 0, "(bool, default = 0) Whether the neural network clusterizer should be used")
+AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Inference device (cpu (default), rocm, cuda)")
+AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Inference device id")
+AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0) Whether device memory should be allocated for inference")
+AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Data type used for inference (fp32 (default), fp16)")
+AddOption(nnInferenceThreadsPerNN, int, 0, "", 0, "Number of threads used to evaluate one neural network")
+AddOption(nnInferenceEnableOrtOptimization, unsigned int, 1, "", 0, "Enables graph optimizations in ONNX Runtime; values greater than 1 select higher optimization levels")
+AddOption(nnInferenceOrtProfiling, int, 0, "", 0, "Enables profiling of model execution in ONNX Runtime")
+AddOption(nnInferenceOrtProfilingPath, std::string, ".", "", 0, "If nnInferenceOrtProfiling is set, the path where the profiling data is stored")
+AddOption(nnInferenceVerbosity, int, 1, "", 0, "0: no messages; 1: warnings; 2: warnings + major debugs; >=3: all debugs")
+AddOption(nnClusterizerAddIndexData, int, 1, "", 0, "Whether normalized index data (sector, row, pad) should be appended to the input")
+AddOption(nnClusterizerSizeInputRow, int, 3, "", 0, "Half-size of the NN input window in row direction (window length = 2 * size + 1)")
+AddOption(nnClusterizerSizeInputPad, int, 3, "", 0, "Half-size of the NN input window in pad direction (window length = 2 * size + 1)")
+AddOption(nnClusterizerSizeInputTime, int, 3, "", 0, "Half-size of the NN input window in time direction (window length = 2 * size + 1)")
+AddOption(nnClusterizerUseCFregression, int, 0, "", 0, "(bool, default = false) If true, use the regression of the native clusterizer instead of the NN")
+AddOption(nnClusterizerBatchedMode, unsigned int, 1, "", 0, "(int, default = 1) If >1, the NN is evaluated on batched input of the size given by this variable")
+AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "Path to the classification network")
+AddOption(nnClassThreshold, float, 0.5, "", 0, "Classification score above which clusters are accepted")
+AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "Path to the regression network")
+AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), the classification threshold is transformed by an inverse sigmoid function. This depends on whether the network was trained with a sigmoid as activation function in the last layer or not.")
 AddHelp("help", 'h')
 EndConfig()
 #endif // __OPENCL__
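For orientation, the per-candidate input-tensor size follows directly from the three window half-sizes plus the optional index triplet. A minimal standalone sketch in plain C++ of the nnClusterizerElementSize computation used in GPUChainTrackingClusterizer.cxx below, with the default half-sizes:

#include <cstdio>

// Window length per direction is 2 * halfSize + 1; the optional index data
// appends (sector, row, pad).
int elementSize(int rowHalf, int padHalf, int timeHalf, bool addIndexData)
{
  int window = (2 * rowHalf + 1) * (2 * padHalf + 1) * (2 * timeHalf + 1);
  return window + (addIndexData ? 3 : 0);
}

int main()
{
  std::printf("%d\n", elementSize(3, 3, 3, true)); // 7 * 7 * 7 + 3 = 346
  return 0;
}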
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index 4bc0ee4e91ff1..73462066f7746 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -12,6 +12,8 @@
 /// \file GPUChainTrackingClusterizer.cxx
 /// \author David Rohr
 
+#include <cmath>
+
 #include "GPUChainTracking.h"
 #include "GPUChainTrackingDefs.h"
 #include "GPULogging.h"
@@ -849,8 +851,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       if (clusterer.mPmemory->counters.nPeaks == 0) {
         continue;
       }
-      runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
-      runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::updatePeaks>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
+      if (!GetProcessingSettings().applyNNclusterizer) {
+        runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
+        runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::updatePeaks>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
+      } else {
+        // FIXME: This potentially needs to be removed once the NN is actually applied. For now it is only here to make the code work.
+        runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
+        runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::updatePeaks>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
+      }
       if (DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 3, clusterer, &GPUTPCClusterFinder::DumpSuppressedPeaks, *mDebugFile)) {
         clusterer.DumpPeakMap(*mDebugFile, "Suppressed Peaks");
       }
@@ -884,14 +892,76 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}});
       DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
 
-      runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSlice}}, 0);
+      if (GetProcessingSettings().applyNNclusterizer) {
+        // Settings for the clusterizer
+        clusterer.nnClusterizerUseCFregression = GetProcessingSettings().nnClusterizerUseCFregression;
+        clusterer.nnClusterizerSizeInputRow = GetProcessingSettings().nnClusterizerSizeInputRow;
+        clusterer.nnClusterizerSizeInputPad = GetProcessingSettings().nnClusterizerSizeInputPad;
+        clusterer.nnClusterizerSizeInputTime = GetProcessingSettings().nnClusterizerSizeInputTime;
+        clusterer.nnClusterizerAddIndexData = GetProcessingSettings().nnClusterizerAddIndexData;
+        clusterer.nnClusterizerElementSize = ((2 * clusterer.nnClusterizerSizeInputRow + 1) * (2 * clusterer.nnClusterizerSizeInputPad + 1) * (2 * clusterer.nnClusterizerSizeInputTime + 1)) + (clusterer.nnClusterizerAddIndexData ? 3 : 0);
+        clusterer.nnClusterizerBatchedMode = GetProcessingSettings().nnClusterizerBatchedMode;
+        clusterer.nnClusterizerVerbosity = GetProcessingSettings().nnInferenceVerbosity;
+
+        // Settings for the NN evaluation
+        clusterer.nnClassThreshold = GetProcessingSettings().nnClassThreshold;
+        clusterer.nnSigmoidTrafoClassThreshold = GetProcessingSettings().nnSigmoidTrafoClassThreshold;
+
+        // Settings for the neural network evaluation
+        clusterer.OrtOptions = {
+          {"model-path", GetProcessingSettings().nnClassificationPath},
+          {"device", GetProcessingSettings().nnInferenceDevice},
+          {"device-id", std::to_string(GetProcessingSettings().nnInferenceDeviceId)},
+          {"allocate-device-memory", std::to_string(GetProcessingSettings().nnInferenceAllocateDevMem)},
+          {"dtype", GetProcessingSettings().nnInferenceDtype},
+          {"intra-op-num-threads", std::to_string(GetProcessingSettings().nnInferenceThreadsPerNN)},
+          {"enable-optimizations", std::to_string(GetProcessingSettings().nnInferenceEnableOrtOptimization)},
+          {"enable-profiling", std::to_string(GetProcessingSettings().nnInferenceOrtProfiling)},
+          {"profiling-output-path", GetProcessingSettings().nnInferenceOrtProfilingPath},
+          {"logging-level", std::to_string(GetProcessingSettings().nnInferenceVerbosity)}};
+        clusterer.model_class.init(clusterer.OrtOptions);
+        if (!clusterer.nnClusterizerUseCFregression) {
+          std::vector<std::string> reg_model_paths = o2::utils::Str::tokenize(GetProcessingSettings().nnRegressionPath, ':');
+          if (clusterer.model_class.getNumOutputNodes()[0][1] == 1) {
+            clusterer.OrtOptions["model-path"] = reg_model_paths[0];
+            clusterer.model_reg_1.init(clusterer.OrtOptions);
+          } else {
+            if (reg_model_paths.size() == 1) {
+              clusterer.OrtOptions["model-path"] = reg_model_paths[0];
+              clusterer.model_reg_1.init(clusterer.OrtOptions);
+            } else {
+              clusterer.OrtOptions["model-path"] = reg_model_paths[0];
+              clusterer.model_reg_1.init(clusterer.OrtOptions);
+              clusterer.OrtOptions["model-path"] = reg_model_paths[1];
+              clusterer.model_reg_2.init(clusterer.OrtOptions);
+            }
+          }
+        } else {
+          runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}});
+          DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
+        }
+
+        if (clusterer.nnSigmoidTrafoClassThreshold) {
+          // Inverse sigmoid transformation
+          clusterer.nnClassThreshold = (float)std::log(clusterer.nnClassThreshold / (1.f - clusterer.nnClassThreshold));
+        }
+        runKernel<GPUTPCNNClusterizer>({GetGrid(std::ceil(clusterer.mPmemory->counters.nClusters / (float)clusterer.nnClusterizerBatchedMode), lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0);
+      } else {
+        runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0);
+      }
+
       if (doGPU && propagateMCLabels) {
         TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane);
         if (doGPU) {
           SynchronizeStream(lane);
         }
-        runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1);
+        if (!GetProcessingSettings().applyNNclusterizer) {
+          runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1);
+        } else {
+          runKernel<GPUTPCNNClusterizer>({GetGrid(std::ceil(clusterer.mPmemory->counters.nClusters / (float)clusterer.nnClusterizerBatchedMode), lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1);
+        }
       }
+
       if (GetProcessingSettings().debugLevel >= 3) {
         GPUInfo("Sector %02d Fragment %02d Lane %d: Found clusters: digits %u peaks %u clusters %u", iSlice, fragment.index, lane, (int32_t)clusterer.mPmemory->counters.nPositions, (int32_t)clusterer.mPmemory->counters.nPeaks, (int32_t)clusterer.mPmemory->counters.nClusters);
       }
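The inverse-sigmoid transformation applied above avoids evaluating a sigmoid per candidate: for a probability cutoff p, sigmoid(x) > p is equivalent to x > log(p / (1 - p)), so the raw network output (logit) can be compared directly against the transformed threshold. A minimal standalone check:

#include <cassert>
#include <cmath>

int main()
{
  float p = 0.5f;                           // probability cutoff (nnClassThreshold)
  float logitCut = std::log(p / (1.f - p)); // inverse sigmoid; 0 for p = 0.5
  float x = 1.3f;                           // a raw classifier output (logit)
  // Comparing the logit against logitCut gives the same decision as
  // applying the sigmoid and comparing against p, without the exp().
  assert((x > logitCut) == (1.f / (1.f + std::exp(-x)) > p));
  return 0;
}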
diff --git a/GPU/GPUTracking/TPCClusterFinder/ChargePos.h b/GPU/GPUTracking/TPCClusterFinder/ChargePos.h
index f5ca9dbedd5ac..c2ee542f65434 100644
--- a/GPU/GPUTracking/TPCClusterFinder/ChargePos.h
+++ b/GPU/GPUTracking/TPCClusterFinder/ChargePos.h
@@ -47,6 +47,7 @@ struct ChargePos {
   GPUdi() tpccf::Row row() const { return gpad / TPC_PADS_PER_ROW_PADDED; }
   GPUdi() tpccf::Pad pad() const { return gpad % TPC_PADS_PER_ROW_PADDED - GPUCF_PADDING_PAD; }
   GPUdi() tpccf::TPCFragmentTime time() const { return timePadded - GPUCF_PADDING_TIME; }
+  GPUdi() tpccf::TPCFragmentTime globalTime() const { return timePadded; }
 
  private:
   // Maps the position of a pad given as row and index in that row to a unique
diff --git a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h
index c6a05c46a7642..b7e535a107eac 100644
--- a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h
+++ b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h
@@ -43,6 +43,24 @@ class ClusterAccumulator
   GPUd() void finalize(const ChargePos&, tpccf::Charge, tpccf::TPCTime, const GPUTPCGeometry&);
   GPUd() bool toNative(const ChargePos&, tpccf::Charge, tpc::ClusterNative&, const GPUParam&) const;
 
+  GPUd() void setFull(float qtot, float padMean, float padSigma, float timeMean, float timeSigma, uint8_t splitInTime, uint8_t splitInPad)
+  {
+    mQtot = qtot;
+    mPadMean = padMean;
+    mPadSigma = padSigma;
+    mTimeMean = timeMean;
+    mTimeSigma = timeSigma;
+    mSplitInTime = splitInTime;
+    mSplitInPad = splitInPad;
+  }
+  GPUd() void setQtot(float qtot) { mQtot = qtot; }
+  GPUd() void setPadMean(float padMean) { mPadMean = padMean; }
+  GPUd() void setPadSigma(float padSigma) { mPadSigma = padSigma; }
+  GPUd() void setTimeMean(float timeMean) { mTimeMean = timeMean; }
+  GPUd() void setTimeSigma(float timeSigma) { mTimeSigma = timeSigma; }
+  GPUd() void setSplitInTime(uint8_t splitInTime) { mSplitInTime = splitInTime; }
+  GPUd() void setSplitInPad(uint8_t splitInPad) { mSplitInPad = splitInPad; }
+
  private:
   float mQtot = 0;
   float mPadMean = 0;
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h
index d4838dda26fdd..af5315ddae4ac 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h
@@ -19,6 +19,10 @@
 #include "GPUProcessor.h"
 #include "GPUDataTypes.h"
 #include "CfFragment.h"
+#include "ML/OrtInterface.h"
+#include "ML/3rdparty/GPUORTFloat16.h"
+
+using namespace o2::ml;
 
 namespace o2
 {
@@ -141,6 +145,20 @@ class GPUTPCClusterFinder : public GPUProcessor
   int16_t mZSOffsetId = -1;
   int16_t mOutputId = -1;
 
+  int nnClusterizerSizeInputRow = 3;
+  int nnClusterizerSizeInputPad = 3;
+  int nnClusterizerSizeInputTime = 3;
+  int nnClusterizerElementSize = -1;
+  bool nnClusterizerAddIndexData = true;
+  float nnClassThreshold = 0.16;
+  bool nnSigmoidTrafoClassThreshold = 1;
+  int nnClusterizerUseCFregression = 0;
+  int nnClusterizerBatchedMode = 1;
+  int nnClusterizerVerbosity = 0;
+
+  std::unordered_map<std::string, std::string> OrtOptions;
+  OrtModel model_class, model_reg_1, model_reg_2; // model_reg_2 is used for splitting clusters
+
 #ifndef GPUCA_GPUCODE
   void DumpDigits(std::ostream& out);
   void DumpChargeMap(std::ostream& out, std::string_view);
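For context, the new members are driven through O2's OrtInterface wrapper. A sketch of the access pattern, mirroring how the chain code above uses it (option keys and call shapes are taken from this PR; the exact OrtInterface signatures may differ, and the model path is a placeholder):

#include <string>
#include <unordered_map>
#include <vector>
#include "ML/OrtInterface.h" // provides o2::ml::OrtModel

void ortModelSketch()
{
  std::unordered_map<std::string, std::string> opts = {
    {"model-path", "network_class.onnx"}, // placeholder file
    {"device", "CPU"},
    {"dtype", "fp32"}};
  o2::ml::OrtModel model;
  model.init(opts);

  std::vector<float> input(346, 0.f);              // one 7x7x7 window + 3 index values
  std::vector<float> out = model.inference(input); // output shape via getNumOutputNodes()
  (void)out;
}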
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
new file mode 100644
index 0000000000000..ba8fac2a397e9
--- /dev/null
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
@@ -0,0 +1,655 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file GPUTPCNNClusterizer.cxx
+/// \author Christian Sonnabend
+
+#include "GPUTPCNNClusterizer.h"
+
+#include "CfConsts.h"
+#include "CfUtils.h"
+#include "ClusterAccumulator.h"
+#if !defined(GPUCA_GPUCODE)
+#include "GPUHostDataTypes.h"
+#include "MCLabelAccumulator.h"
+#endif
+
+using namespace GPUCA_NAMESPACE::gpu;
+using namespace GPUCA_NAMESPACE::gpu::tpccf;
+
+template <>
+GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char onlyMC)
+{
+  Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
+  CPU_ONLY(
+    MCLabelAccumulator labelAcc(clusterer));
+
+  tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
+
+  if (clusterer.OrtOptions["dtype"].find("32") != std::string::npos) {
+    GPUTPCNNClusterizer::nn_clusterizer<float>(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
+  } else if (clusterer.OrtOptions["dtype"].find("16") != std::string::npos) {
+    GPUTPCNNClusterizer::nn_clusterizer<OrtDataType::Float16_t>(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
+  } else {
+    LOG(fatal) << "Unsupported data type for neural network clusterizer!";
+  }
+}
+
+int GPUTPCNNClusterizer::padOffset(int row_ref, int row_current, const GPUTPCGeometry& geo)
+{
+  return (int)((geo.NPads(row_current) - geo.NPads(row_ref)) / 2);
+}
+
+int GPUTPCNNClusterizer::rowOffset(int row, int global_shift)
+{
+  return (row > 62 ? global_shift : 0);
+}
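To illustrate the two helpers just defined: padOffset re-centers the input window when neighbouring rows have different pad counts, and rowOffset shifts rows above the IROC/OROC boundary (row 62) so the gap appears explicitly in the input. A toy check with hypothetical pad counts (illustration only, not the real TPC geometry):

#include <cassert>

int nPadsToy(int row) { return (row <= 62) ? 70 : 80; } // stand-in for geo.NPads(row)

int padOffsetToy(int rowRef, int rowCur) { return (nPadsToy(rowCur) - nPadsToy(rowRef)) / 2; }
int rowOffsetToy(int row, int globalShift) { return row > 62 ? globalShift : 0; }

int main()
{
  // Crossing from a 70-pad row to an 80-pad row shifts the window by 5 pads,
  // so the same pad index stays centered over the same local position.
  assert(padOffsetToy(62, 63) == 5);
  // With a global shift of 3, row 63 is addressed as row 66 in the shifted
  // coordinates; the rows in between read as boundary (charge -1).
  assert(rowOffsetToy(63, 3) == 3 && rowOffsetToy(62, 3) == 0);
  return 0;
}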
+
+// ---------------------------------
+bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const GPUTPCGeometry& geo)
+{
+  if (pad < 0 || row < 0) { // Faster short-circuit
+    return true;
+  } else if (row <= 62) {
+    if (pad < 0 || pad > geo.NPads(row)) {
+      return true;
+    } else {
+      return false;
+    }
+  } else if (row <= 62 + global_shift) { // Accounts for the gap between IROC and OROC. The charge is set to -1 to signal the boundary to the neural network.
+    return true;
+  } else if (row <= o2::tpc::constants::MAXGLOBALPADROW - 1 + global_shift) {
+    if (pad < 0 || pad > geo.NPads(row - global_shift)) { // Shifted rows are mapped back for the geometry lookup
+      return true;
+    } else {
+      return false;
+    }
+  } else {
+    return true;
+  }
+}
+
+template <class T>
+GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int iBlock, int iThread,
+                                                processorType& clusterer,
+                                                const CfFragment& fragment,
+                                                GPUSharedMemory& smem,
+                                                const Array2D<PackedCharge>& chargeMap,
+                                                const ChargePos* filteredPeakPositions,
+                                                const GPUSettingsRec& calib,
+                                                MCLabelAccumulator* labelAcc,
+                                                uint clusternum,
+                                                uint maxClusterPerRow,
+                                                uint* clusterInRow,
+                                                tpc::ClusterNative* clusterByRow,
+                                                uint* clusterPosInRow)
+{
+  uint glo_idx = get_global_id(0) * clusterer.nnClusterizerBatchedMode;
+  if (glo_idx >= clusternum) {
+    return;
+  }
+
+  std::vector<float> central_charges(clusterer.nnClusterizerBatchedMode, -1.f);
+  std::vector<T> input_data(clusterer.nnClusterizerElementSize * clusterer.nnClusterizerBatchedMode, (T)-1.f);
+  std::vector<ChargePos> peak_positions(clusterer.nnClusterizerBatchedMode);
+  unsigned int write_idx = 0;
+
+  for (int batch_counter = 0; batch_counter < clusterer.nnClusterizerBatchedMode; batch_counter++) {
+    uint cls = CAMath::Min(glo_idx + batch_counter, clusternum - 1);
+
+    ChargePos peak = clusterer.mPfilteredPeakPositions[cls];
+    int row = peak.row(), pad = peak.pad(), time = peak.time();
+    float central_charge = chargeMap[peak].unpack();
+
+    peak_positions[batch_counter] = peak;
+    central_charges[batch_counter] = central_charge;
+
+    for (int r = -clusterer.nnClusterizerSizeInputRow; r <= clusterer.nnClusterizerSizeInputRow; r++) {
+      int pad_offset = GPUTPCNNClusterizer::padOffset(row, row + r, clusterer.Param().tpcGeometry);
+      int row_offset = GPUTPCNNClusterizer::rowOffset(row, clusterer.nnClusterizerSizeInputRow);
+      for (int p = -clusterer.nnClusterizerSizeInputPad; p <= clusterer.nnClusterizerSizeInputPad; p++) {
+        bool is_boundary = GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p + pad_offset, clusterer.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry);
+        for (int t = -clusterer.nnClusterizerSizeInputTime; t <= clusterer.nnClusterizerSizeInputTime; t++) {
+          bool push_mc_label = (r == 0) && (std::abs(p) < 2) && (std::abs(t) < 2); // Collect MC labels only from the inner 5x5 (pad x time) window
+          if (!is_boundary) {
+            ChargePos tmp_pos(row + r, pad + p + pad_offset, time + t);
+            input_data[write_idx] = (T)(chargeMap[tmp_pos].unpack() / central_charge);
+            if (push_mc_label) {
+              ChargePos tmp_pos_mc(row, pad + p, time + t);
+              CPU_ONLY(labelAcc->collect(tmp_pos, chargeMap[tmp_pos_mc].unpack()));
+            }
+          }
+          write_idx++;
+        }
+      }
+    }
+    if (clusterer.nnClusterizerAddIndexData) {
+      input_data[write_idx] = (T)(clusterer.mISlice / 36.f);
+      input_data[write_idx + 1] = (T)(row / 152.f);
+      input_data[write_idx + 2] = (T)((float)pad / clusterer.Param().tpcGeometry.NPads(row));
+      write_idx += 3;
+    }
+  }
+
+  std::vector<int> index_class_2;
+  std::vector<float> out_class = clusterer.model_class.inference(input_data);
+  int num_output_classes = clusterer.model_class.getNumOutputNodes()[0][1];
+
+  if (num_output_classes > 1) {
+    std::vector<float> tmp_out_class(clusterer.nnClusterizerBatchedMode);
+    for (int cls_idx = 0; cls_idx < clusterer.nnClusterizerBatchedMode; cls_idx++) {
+      auto elem_iterator = out_class.begin() + (unsigned int)(cls_idx * num_output_classes);
+      tmp_out_class[cls_idx] = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + num_output_classes)) - 1; // -1 since a 2-class classifier has 3 outputs: classes 0, 1, 2
+      if (tmp_out_class[cls_idx] > 1) {
+        index_class_2.push_back(cls_idx);
+      }
+    }
+    out_class = tmp_out_class;
+  }
+
+  if (!clusterer.nnClusterizerUseCFregression) {
+
+    std::vector<float> out_reg = clusterer.model_reg_1.inference(input_data), tmp_out_reg_2;
+    if (index_class_2.size() > 0) {
+      // Gather the inputs of all candidates classified as two overlapping clusters
+      std::vector<T> tmp_in_reg_2(index_class_2.size() * clusterer.nnClusterizerElementSize);
+      int fill_counter = 0;
+      for (int cls_idx : index_class_2) {
+        int from_idx = cls_idx * clusterer.nnClusterizerElementSize, to_idx = fill_counter * clusterer.nnClusterizerElementSize;
+        for (int reg_idx = 0; reg_idx < clusterer.nnClusterizerElementSize; reg_idx++) {
+          tmp_in_reg_2[to_idx + reg_idx] = input_data[from_idx + reg_idx];
+        }
+        fill_counter++;
+      }
+      tmp_out_reg_2 = clusterer.model_reg_2.inference(tmp_in_reg_2);
+    }
+
+    input_data.clear();
+
+    if ((clusterer.nnClusterizerVerbosity >= 4) && glo_idx == 0) {
+      LOG(info) << "[CF] Classification model: " << out_class[0] << " (>? " << clusterer.nnClassThreshold << ")";
+      LOG(info) << "[CF] Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4];
+    }
+
+    int num_outputs_1 = clusterer.model_reg_1.getNumOutputNodes()[0][1], num_outputs_2 = 0, counter_class_2_idcs = 0;
+    if (num_output_classes > 1) {
+      num_outputs_2 = clusterer.model_reg_2.getNumOutputNodes()[0][1];
+    }
+
+    for (int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) {
+
+      if (glo_idx + element >= clusternum) {
+        return;
+      }
+
+      int model_output_index = element * num_outputs_1;
+      if (out_class[element] > clusterer.nnClassThreshold) {
+        if ((num_output_classes == 1) || ((num_output_classes > 1) && (out_class[element] < 2))) {
+          ClusterAccumulator pc;
+          ClusterAccumulator dummy_pc;
+          CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+
+          // Dummy build to push MC labels
+          buildCluster(
+            calib,
+            chargeMap,
+            peak_positions[element],
+            smem.posBcast,
+            smem.buf,
+            smem.innerAboveThreshold,
+            &dummy_pc,
+            labelAcc);
+
+          if (fragment.isOverlap(peak_positions[element].time())) {
+            if (clusterPosInRow) {
+              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+            }
+            continue;
+          }
+
+          // Single-cluster head: qtot ratio [4], pad offset [0], sigma pad [2], time offset [1], sigma time [3]
+          pc.setFull(central_charges[element] * out_reg[model_output_index + 4], peak_positions[element].pad() + out_reg[model_output_index + 0], out_reg[model_output_index + 2], fragment.start + peak_positions[element].time() + out_reg[model_output_index + 1], out_reg[model_output_index + 3], 0, 0);
+
+          tpc::ClusterNative myCluster;
+          bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param());
+          if (rejectCluster) {
+            if (clusterer.nnClusterizerVerbosity > 3) {
+              LOG(warning) << "[CF] Cluster rejected!";
+            }
+            if (clusterPosInRow) {
+              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+            }
+            continue;
+          }
+
+          uint rowIndex = 0;
+          if (clusterByRow != nullptr) {
+            rowIndex = sortIntoBuckets(
+              clusterer,
+              myCluster,
+              peak_positions[element].row(),
+              maxClusterPerRow,
+              clusterInRow,
+              clusterByRow);
+            if (clusterPosInRow != nullptr) {
+              clusterPosInRow[glo_idx + element] = rowIndex;
+            }
+          } else if (clusterPosInRow) {
+            rowIndex = clusterPosInRow[glo_idx + element];
+          }
+          CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow));
+        } else {
+          model_output_index = index_class_2[counter_class_2_idcs] * num_outputs_2;
+          counter_class_2_idcs++;
+
+          // Cluster 1. Two-cluster head: outputs grouped by property, clusters interleaved:
+          // pad offsets [0..1], time offsets [2..3], sigma pads [4..5], sigma times [6..7], qtot ratios [8..9]
+          CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+          ClusterAccumulator pc;
+
+          if (fragment.isOverlap(peak_positions[element].time())) {
+            if (clusterPosInRow) {
+              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+            }
+            continue;
+          }
+
+          pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 8], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 0], tmp_out_reg_2[model_output_index + 4], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 2], tmp_out_reg_2[model_output_index + 6], 0, 0);
+
+          tpc::ClusterNative myCluster;
+          bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param());
+          if (rejectCluster) {
+            if (clusterer.nnClusterizerVerbosity > 3) {
+              LOG(warning) << "[CF] Cluster rejected!";
+            }
+            if (clusterPosInRow) {
+              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+            }
+            continue;
+          }
+
+          uint rowIndex = 0;
+          if (clusterByRow != nullptr) {
+            rowIndex = sortIntoBuckets(
+              clusterer,
+              myCluster,
+              peak_positions[element].row(),
+              maxClusterPerRow,
+              clusterInRow,
+              clusterByRow);
+            if (clusterPosInRow != nullptr) {
+              clusterPosInRow[glo_idx + element] = rowIndex;
+            }
+          } else if (clusterPosInRow) {
+            rowIndex = clusterPosInRow[glo_idx + element];
+          }
+          CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow));
+
+          // Cluster 2
+          CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+          pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 9], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 1], tmp_out_reg_2[model_output_index + 5], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 3], tmp_out_reg_2[model_output_index + 7], 0, 0);
+          rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param());
+          if (rejectCluster) {
+            if (clusterer.nnClusterizerVerbosity > 3) {
+              LOG(warning) << "[CF] Cluster rejected!";
+            }
+            if (clusterPosInRow) {
+              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+            }
+            continue;
+          }
+
+          rowIndex = 0;
+          if (clusterByRow != nullptr) {
+            rowIndex = sortIntoBuckets(
+              clusterer,
+              myCluster,
+              peak_positions[element].row(),
+              maxClusterPerRow,
+              clusterInRow,
+              clusterByRow);
+            if (clusterPosInRow != nullptr) {
+              clusterPosInRow[glo_idx + element] = rowIndex;
+            }
+          } else if (clusterPosInRow) {
+            rowIndex = clusterPosInRow[glo_idx + element];
+          }
+          CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow));
+        }
+      }
+    }
+
+  } else {
+
+    input_data.clear();
+    for (int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) {
+      if (glo_idx + element >= clusternum) {
+        return;
+      }
+
+      if (out_class[element] > clusterer.nnClassThreshold) {
+
+        ClusterAccumulator pc;
+        CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+
+        buildCluster(
+          calib,
+          chargeMap,
+          peak_positions[element],
+          smem.posBcast,
+          smem.buf,
+          smem.innerAboveThreshold,
+          &pc,
+          labelAcc);
+
+        if (fragment.isOverlap(peak_positions[element].time())) {
+          if (clusterPosInRow) {
+            clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+          }
+          continue;
+        }
+        pc.finalize(peak_positions[element], central_charges[element], fragment.start, clusterer.Param().tpcGeometry);
+
+        tpc::ClusterNative myCluster;
+        bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param());
+
+        if (rejectCluster) {
+          if (clusterer.nnClusterizerVerbosity > 3) {
+            LOG(warning) << "[CF] Cluster rejected!";
+          }
+          if (clusterPosInRow) {
+            clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+          }
+          continue;
+        }
+
+        uint rowIndex = 0;
+        if (clusterByRow != nullptr) {
+          rowIndex = sortIntoBuckets(
+            clusterer,
+            myCluster,
+            peak_positions[element].row(),
+            maxClusterPerRow,
+            clusterInRow,
+            clusterByRow);
+          if (clusterPosInRow != nullptr) {
+            clusterPosInRow[glo_idx + element] = rowIndex;
+          }
+        } else if (clusterPosInRow) {
+          rowIndex = clusterPosInRow[glo_idx + element];
+        }
+
+        CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow));
+      }
+    }
+  }
+
+  if (clusterer.nnClusterizerVerbosity > 4) {
+    LOG(info) << "[CF] Clusterization done!";
+  }
+}
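To summarize the indexing used in nn_clusterizer above: the single-cluster regression head is read as five values per candidate, all relative to the peak position; the two-cluster head is read as ten, grouped by property with the two clusters interleaved. A small sketch of the single-cluster unpacking (names are illustrative; the indices are those of the setFull calls above):

#include <vector>

struct NNRegOutput {
  float padOffset;  // out_reg[i + 0], added to the peak pad
  float timeOffset; // out_reg[i + 1], added to fragment.start + peak time
  float sigmaPad;   // out_reg[i + 2]
  float sigmaTime;  // out_reg[i + 3]
  float qtotRatio;  // out_reg[i + 4], multiplied by the central charge
};

NNRegOutput unpackSingle(const std::vector<float>& outReg, int element, int numOutputs)
{
  int i = element * numOutputs; // numOutputs == 5 for the single-cluster head
  return {outReg[i + 0], outReg[i + 1], outReg[i + 2], outReg[i + 3], outReg[i + 4]};
}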
+
+GPUdii() void GPUTPCNNClusterizer::computeClustersImpl(int nBlocks, int nThreads, int iBlock, int iThread,
+                                                       processorType& clusterer,
+                                                       const CfFragment& fragment,
+                                                       GPUSharedMemory& smem,
+                                                       const Array2D<PackedCharge>& chargeMap,
+                                                       const ChargePos* filteredPeakPositions,
+                                                       const GPUSettingsRec& calib,
+                                                       MCLabelAccumulator* labelAcc,
+                                                       uint clusternum,
+                                                       uint maxClusterPerRow,
+                                                       uint* clusterInRow,
+                                                       tpc::ClusterNative* clusterByRow,
+                                                       uint* clusterPosInRow)
+{
+  uint idx = get_global_id(0);
+
+  // For certain configurations dummy work items are added, so the total
+  // number of work items is divisible by 64.
+  // These dummy items also compute the last cluster but discard the result.
+  ChargePos pos = filteredPeakPositions[CAMath::Min(idx, clusternum - 1)];
+  Charge charge = chargeMap[pos].unpack();
+
+  ClusterAccumulator pc;
+  CPU_ONLY(labelAcc->collect(pos, charge));
+
+  buildCluster(
+    calib,
+    chargeMap,
+    pos,
+    smem.posBcast,
+    smem.buf,
+    smem.innerAboveThreshold,
+    &pc,
+    labelAcc);
+
+  if (idx >= clusternum) {
+    return;
+  }
+  if (fragment.isOverlap(pos.time())) {
+    if (clusterPosInRow) {
+      clusterPosInRow[idx] = maxClusterPerRow;
+    }
+    return;
+  }
+  pc.finalize(pos, charge, fragment.start, clusterer.Param().tpcGeometry);
+
+  tpc::ClusterNative myCluster;
+  bool rejectCluster = !pc.toNative(pos, charge, myCluster, clusterer.Param());
+
+  if (rejectCluster) {
+    if (clusterPosInRow) {
+      clusterPosInRow[idx] = maxClusterPerRow;
+    }
+    return;
+  }
+
+  uint rowIndex = 0;
+  if (clusterByRow != nullptr) {
+    rowIndex = sortIntoBuckets(
+      clusterer,
+      myCluster,
+      pos.row(),
+      maxClusterPerRow,
+      clusterInRow,
+      clusterByRow);
+    if (clusterPosInRow != nullptr) {
+      clusterPosInRow[idx] = rowIndex;
+    }
+  } else if (clusterPosInRow) {
+    rowIndex = clusterPosInRow[idx];
+  }
+
+  CPU_ONLY(labelAcc->commit(pos.row(), rowIndex, maxClusterPerRow));
+}
+
+GPUdii() void GPUTPCNNClusterizer::updateClusterInner(
+  const GPUSettingsRec& calib,
+  ushort lid,
+  ushort N,
+  const PackedCharge* buf,
+  const ChargePos& pos,
+  ClusterAccumulator* cluster,
+  MCLabelAccumulator* labelAcc,
+  uint8_t* innerAboveThreshold)
+{
+  uint8_t aboveThreshold = 0;
+
+  GPUCA_UNROLL(U(), U())
+  for (ushort i = 0; i < N; i++) {
+    Delta2 d = cfconsts::InnerNeighbors[i];
+
+    PackedCharge p = buf[N * lid + i];
+
+    Charge q = cluster->updateInner(p, d);
+
+    CPU_ONLY(
+      labelAcc->collect(pos.delta(d), q));
+
+    aboveThreshold |= (uint8_t(q > calib.tpc.cfInnerThreshold) << i);
+  }
+
+  innerAboveThreshold[lid] = aboveThreshold;
+
+  GPUbarrier();
+}
+
+GPUdii() void GPUTPCNNClusterizer::updateClusterOuter(
+  ushort lid,
+  ushort N,
+  ushort M,
+  ushort offset,
+  const PackedCharge* buf,
+  const ChargePos& pos,
+  ClusterAccumulator* cluster,
+  MCLabelAccumulator* labelAcc)
+{
+  GPUCA_UNROLL(U(), U())
+  for (ushort i = offset; i < M + offset; i++) {
+    PackedCharge p = buf[N * lid + i];
+
+    Delta2 d = cfconsts::OuterNeighbors[i];
+
+    Charge q = cluster->updateOuter(p, d);
+    static_cast<void>(q); // Avoid unused variable warning on GPU.
+
+    CPU_ONLY(
+      labelAcc->collect(pos.delta(d), q));
+  }
+}
+
+GPUdii() void GPUTPCNNClusterizer::buildCluster(
+  const GPUSettingsRec& calib,
+  const Array2D<PackedCharge>& chargeMap,
+  ChargePos pos,
+  ChargePos* posBcast,
+  PackedCharge* buf,
+  uint8_t* innerAboveThreshold,
+  ClusterAccumulator* myCluster,
+  MCLabelAccumulator* labelAcc)
+{
+  ushort ll = get_local_id(0);
+
+  posBcast[ll] = pos;
+  GPUbarrier();
+
+  CfUtils::blockLoad(
+    chargeMap,
+    SCRATCH_PAD_WORK_GROUP_SIZE,
+    SCRATCH_PAD_WORK_GROUP_SIZE,
+    ll,
+    0,
+    8,
+    cfconsts::InnerNeighbors,
+    posBcast,
+    buf);
+  updateClusterInner(
+    calib,
+    ll,
+    8,
+    buf,
+    pos,
+    myCluster,
+    labelAcc,
+    innerAboveThreshold);
+
+  ushort wgSizeHalf = (SCRATCH_PAD_WORK_GROUP_SIZE + 1) / 2;
+
+  bool inGroup1 = ll < wgSizeHalf;
+
+  ushort llhalf = (inGroup1) ? ll : (ll - wgSizeHalf);
+
+  CfUtils::condBlockLoad(
+    chargeMap,
+    wgSizeHalf,
+    SCRATCH_PAD_WORK_GROUP_SIZE,
+    ll,
+    0,
+    16,
+    cfconsts::OuterNeighbors,
+    posBcast,
+    innerAboveThreshold,
+    buf);
+
+  if (inGroup1) {
+    updateClusterOuter(
+      llhalf,
+      16,
+      16,
+      0,
+      buf,
+      pos,
+      myCluster,
+      labelAcc);
+  }
+
+#if defined(GPUCA_GPUCODE)
+  CfUtils::condBlockLoad<PackedCharge, true>(
+    chargeMap,
+    wgSizeHalf,
+    SCRATCH_PAD_WORK_GROUP_SIZE,
+    ll,
+    0,
+    16,
+    cfconsts::OuterNeighbors,
+    posBcast + wgSizeHalf,
+    innerAboveThreshold + wgSizeHalf,
+    buf);
+  if (!inGroup1) {
+    updateClusterOuter(
+      llhalf,
+      16,
+      16,
+      0,
+      buf,
+      pos,
+      myCluster,
+      labelAcc);
+  }
+#endif
+}
+
+GPUd() uint GPUTPCNNClusterizer::sortIntoBuckets(processorType& clusterer, const tpc::ClusterNative& cluster, uint row, uint maxElemsPerBucket, uint* elemsInBucket, tpc::ClusterNative* buckets)
+{
+  uint index = CAMath::AtomicAdd(&elemsInBucket[row], 1u);
+  if (index < maxElemsPerBucket) {
+    buckets[maxElemsPerBucket * row + index] = cluster;
+  } else {
+    clusterer.raiseError(GPUErrors::ERROR_CF_ROW_CLUSTER_OVERFLOW, clusterer.mISlice * 1000 + row, index, maxElemsPerBucket);
+    CAMath::AtomicExch(&elemsInBucket[row], maxElemsPerBucket);
+  }
+  return index;
+}
\ No newline at end of file
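sortIntoBuckets at the end of the file above reserves one output slot per row with an atomic increment and clamps the counter back when a row overflows, so the counter cannot grow without bound while the error is reported. A single-threaded sketch of the same bookkeeping, with std::atomic standing in for CAMath::AtomicAdd/AtomicExch:

#include <atomic>
#include <cassert>
#include <cstdint>

uint32_t reserveSlot(std::atomic<uint32_t>& elemsInRow, uint32_t maxElemsPerBucket)
{
  uint32_t index = elemsInRow.fetch_add(1); // reserve a slot
  if (index >= maxElemsPerBucket) {
    elemsInRow.store(maxElemsPerBucket); // clamp on overflow; the caller raises the error
  }
  return index; // the bucket is only written when index < maxElemsPerBucket
}

int main()
{
  std::atomic<uint32_t> n{0};
  assert(reserveSlot(n, 2) == 0 && reserveSlot(n, 2) == 1);
  assert(reserveSlot(n, 2) >= 2 && n.load() == 2); // overflow clamped
  return 0;
}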
+ +/// \file GPUTPCNNClusterizer.h +/// \author Christian Sonnabend + +#ifndef O2_GPU_NN_CLUSTERIZER_H +#define O2_GPU_NN_CLUSTERIZER_H + +#include "clusterFinderDefs.h" +#include "GPUGeneralKernels.h" +#include "GPUConstantMem.h" +#include "GPUTPCClusterFinder.h" +#include "Array2D.h" +#include "PackedCharge.h" + +namespace o2::tpc +{ +struct ClusterNative; +} // namespace o2::tpc + +namespace GPUCA_NAMESPACE::gpu +{ + +class ClusterAccumulator; +class MCLabelAccumulator; + +class GPUTPCNNClusterizer : public GPUKernelTemplate +{ + public: + static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizer); + struct GPUSharedMemory { + ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; + PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_BUILD_N]; + uint8_t innerAboveThreshold[SCRATCH_PAD_WORK_GROUP_SIZE]; + }; + +#ifdef GPUCA_HAVE_O2HEADERS + typedef GPUTPCClusterFinder processorType; + GPUhdi() static processorType* Processor(GPUConstantMem& processors) + { + return processors.tpcClusterer; + } +#endif + + GPUhdi() CONSTEXPR static GPUDataTypes::RecoStep GetRecoStep() + { + return GPUDataTypes::RecoStep::TPCClusterFinding; + } + + template + GPUd() static void Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char); + + static GPUd() void computeClustersImpl(int, int, int, int, processorType&, const CfFragment&, GPUSharedMemory&, const Array2D&, const ChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint, uint, uint*, tpc::ClusterNative*, uint*); + + static GPUd() void exec(int, int, int, int, GPUSharedMemory&, processorType&, char); + static int padOffset(int, int, const GPUTPCGeometry&); + static int rowOffset(int, int); + static bool isBoundary(int, int, int, const GPUTPCGeometry&); + + template + static GPUd() void nn_clusterizer(int, int, int, int, + processorType&, + const CfFragment&, + GPUSharedMemory&, + const Array2D&, + const ChargePos*, + const GPUSettingsRec&, + MCLabelAccumulator*, + uint, + uint, + uint*, + tpc::ClusterNative*, + uint*); + + private: + // --------------------------------- + + static GPUd() void updateClusterInner(const GPUSettingsRec&, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uint8_t*); + + static GPUd() void updateClusterOuter(ushort, ushort, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*); + + static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D&, ChargePos, ChargePos*, PackedCharge*, uint8_t*, ClusterAccumulator*, MCLabelAccumulator*); + + static GPUd() uint sortIntoBuckets(processorType&, const tpc::ClusterNative&, uint, uint, uint*, tpc::ClusterNative*); +}; + +} // namespace GPUCA_NAMESPACE::gpu + +#endif \ No newline at end of file diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index f028c6990f267..f5d94562cf05d 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -118,6 +118,7 @@ o2_gpu_add_kernel("GPUTPCCFPeakFinder" "= TPCCLUS o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression" "= TPCCLUSTERFINDER" LB single) o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks" "= TPCCLUSTERFINDER" LB single) o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB single) +o2_gpu_add_kernel("GPUTPCNNClusterizer" "= TPCCLUSTERFINDER" LB single int8_t onlyMC) o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUSTERFINDER" LB single int8_t onlyMC) 
o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets" "= TPCCLUSTERFINDER" NO single) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, flatten" "= TPCCLUSTERFINDER" NO single GPUTPCLinearLabels* out)