From bb2cb6e48d12f71fb634b1429bf284db23bb97ee Mon Sep 17 00:00:00 2001
From: ALICE Action Bot
Date: Mon, 21 Oct 2024 07:41:20 +0000
Subject: [PATCH] Please consider the following formatting changes

---
 Common/ML/include/ML/ort_interface.h          |  76 ++-
 Common/ML/src/ort_interface.cxx               |  88 +--
 .../Global/GPUChainTrackingClusterizer.cxx    |  23 +-
 .../TPCClusterFinder/ClusterAccumulator.h     |   3 +-
 .../TPCClusterFinder/GPUTPCClusterFinder.h    |   2 +-
 .../TPCClusterFinder/GPUTPCNNClusterizer.cxx  | 539 +++++++++---------
 .../TPCClusterFinder/GPUTPCNNClusterizer.h    |  26 +-
 7 files changed, 385 insertions(+), 372 deletions(-)

diff --git a/Common/ML/include/ML/ort_interface.h b/Common/ML/include/ML/ort_interface.h
index a365860db3279..2fe9a44a0623c 100644
--- a/Common/ML/include/ML/ort_interface.h
+++ b/Common/ML/include/ML/ort_interface.h
@@ -35,60 +35,58 @@ namespace ml
 
 class OrtModel
 {
-  public:
-    // Constructor
-    OrtModel() = default;
-    OrtModel(std::unordered_map<std::string, std::string> optionsMap){ reset(optionsMap); }
-    void init(std::unordered_map<std::string, std::string> optionsMap){ reset(optionsMap); }
-    void reset(std::unordered_map<std::string, std::string>);
+ public:
+  // Constructor
+  OrtModel() = default;
+  OrtModel(std::unordered_map<std::string, std::string> optionsMap) { reset(optionsMap); }
+  void init(std::unordered_map<std::string, std::string> optionsMap) { reset(optionsMap); }
+  void reset(std::unordered_map<std::string, std::string>);
 
-    virtual ~OrtModel() = default;
+  virtual ~OrtModel() = default;
 
-    // Conversion
-    template <class I, class O>
-    std::vector<O> v2v(std::vector<I>&, bool = true);
+  // Conversion
+  template <class I, class O>
+  std::vector<O> v2v(std::vector<I>&, bool = true);
 
-    // Inferencing
-    template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h
-    std::vector<O> inference(std::vector<I>&);
+  // Inferencing
+  template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h
+  std::vector<O> inference(std::vector<I>&);
 
-    template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
-    std::vector<O> inference(std::vector<std::vector<I>>&);
+  template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
+  std::vector<O> inference(std::vector<std::vector<I>>&);
 
-    // template <class I, class T, class O> // class I is the input data type, e.g. float, class T the throughput data type and class O is the output data type
-    // std::vector<O> inference(std::vector<I>&);
+  // template <class I, class T, class O> // class I is the input data type, e.g. float, class T the throughput data type and class O is the output data type
+  // std::vector<O> inference(std::vector<I>&);
 
-    // Reset session
-    void resetSession();
+  // Reset session
+  void resetSession();
 
-    std::vector<std::vector<int64_t>> getNumInputNodes() const { return mInputShapes; }
-    std::vector<std::vector<int64_t>> getNumOutputNodes() const { return mOutputShapes; }
-    std::vector<std::string> getInputNames() const { return mInputNames; }
-    std::vector<std::string> getOutputNames() const { return mOutputNames; }
+  std::vector<std::vector<int64_t>> getNumInputNodes() const { return mInputShapes; }
+  std::vector<std::vector<int64_t>> getNumOutputNodes() const { return mOutputShapes; }
+  std::vector<std::string> getInputNames() const { return mInputNames; }
+  std::vector<std::string> getOutputNames() const { return mOutputNames; }
 
-    void setActiveThreads(int threads) { intraOpNumThreads = threads; }
+  void setActiveThreads(int threads) { intraOpNumThreads = threads; }
 
-  private:
+ private:
+  // ORT variables -> need to be hidden as Pimpl
+  struct OrtVariables;
+  OrtVariables* pImplOrt;
 
-    // ORT variables -> need to be hidden as Pimpl
-    struct OrtVariables;
-    OrtVariables* pImplOrt;
+  // Input & Output specifications of the loaded network
+  std::vector<const char*> inputNamesChar, outputNamesChar;
+  std::vector<std::string> mInputNames, mOutputNames;
+  std::vector<std::vector<int64_t>> mInputShapes, mOutputShapes;
 
-    // Input & Output specifications of the loaded network
-    std::vector<const char*> inputNamesChar, outputNamesChar;
-    std::vector<std::string> mInputNames, mOutputNames;
-    std::vector<std::vector<int64_t>> mInputShapes, mOutputShapes;
-
-    // Environment settings
-    std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda
-    int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0;
-
-    std::string printShape(const std::vector<int64_t>&);
+  // Environment settings
+  std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda
+  int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0;
+  std::string printShape(const std::vector<int64_t>&);
 };
 
 } // namespace ml
-} // namespace ml
+} // namespace o2
 
 #endif // O2_ML_ORT_INTERFACE_H
\ No newline at end of file
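
For orientation, a minimal usage sketch of the interface above (not part of the patch; the model path is a placeholder, and only option keys actually parsed by OrtModel::reset() in the next file are used):

  #include "ML/ort_interface.h"
  #include <vector>

  int main()
  {
    // Option keys mirror those read in OrtModel::reset(); values are illustrative.
    o2::ml::OrtModel model({{"model-path", "net.onnx"},
                            {"device", "CPU"},
                            {"intra-op-num-threads", "1"},
                            {"dtype", "float"}});
    // One input row of the width the network reports for its first input node.
    std::vector<float> input(static_cast<size_t>(model.getNumInputNodes()[0][1]), 0.5f);
    std::vector<float> output = model.inference<float, float>(input);
    return output.empty();
  }
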
diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx
index 84a06ce1da068..8ebe0588b4a2b 100644
--- a/Common/ML/src/ort_interface.cxx
+++ b/Common/ML/src/ort_interface.cxx
@@ -25,7 +25,7 @@ namespace o2
 namespace ml
 {
 
-struct OrtModel::OrtVariables {  // The actual implementation is hidden in the .cxx file
+struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file
   // ORT runtime objects
   Ort::RunOptions runOptions;
   std::shared_ptr<Ort::Env> env = nullptr;
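
OrtVariables is the usual pimpl idiom: the ONNX Runtime types stay out of the public header. A generic sketch of the pattern (illustrative names, not from the patch):

  // widget.h - public header, no heavy third-party includes
  class Widget
  {
   public:
    Widget(); // constructor and destructor are defined in widget.cxx,
    ~Widget(); // where Impl is complete
    void doWork();

   private:
    struct Impl; // defined only in widget.cxx, next to the heavy headers
    Impl* mImpl; // raw owning pointer, as in OrtModel::pImplOrt
  };
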
@@ -35,12 +35,13 @@ struct OrtModel::OrtVariables { // The actual implementation is hidden in the .
   Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault);
 };
 
-void OrtModel::reset(std::unordered_map<std::string, std::string> optionsMap){
+void OrtModel::reset(std::unordered_map<std::string, std::string> optionsMap)
+{
 
   pImplOrt = new OrtVariables();
 
   // Load from options map
-  if(!optionsMap.contains("model-path")){
+  if (!optionsMap.contains("model-path")) {
     LOG(fatal) << "(ORT) Model path cannot be empty!";
   }
   modelPath = optionsMap["model-path"];
@@ -48,42 +49,42 @@ void OrtModel::reset(std::unordered_map<std::string, std::string> optionsMap){
   dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float");
   deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0);
   allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0);
-  intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); 
+  intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0);
   loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 0);
   enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0);
   enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0);
 
   std::string dev_mem_str = "Hip";
 #ifdef ORT_ROCM_BUILD
-  if(device == "ROCM") {
+  if (device == "ROCM") {
     Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId));
     LOG(info) << "(ORT) ROCM execution provider set";
   }
 #endif
 #ifdef ORT_MIGRAPHX_BUILD
-  if(device == "MIGRAPHX") {
+  if (device == "MIGRAPHX") {
     Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId));
     LOG(info) << "(ORT) MIGraphX execution provider set";
   }
 #endif
 #ifdef ORT_CUDA_BUILD
-  if(device == "CUDA") {
+  if (device == "CUDA") {
     Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId));
     LOG(info) << "(ORT) CUDA execution provider set";
     dev_mem_str = "Cuda";
   }
 #endif
 
-  if(allocateDeviceMemory){
+  if (allocateDeviceMemory) {
     pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault);
     LOG(info) << "(ORT) Memory info set to on-device memory";
   }
 
-  if(device == "CPU") {
+  if (device == "CPU") {
     (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads);
-    if(intraOpNumThreads > 1){
+    if (intraOpNumThreads > 1) {
       (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL);
-    } else if(intraOpNumThreads == 1){
+    } else if (intraOpNumThreads == 1) {
       (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);
     }
     LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " threads";
   }
@@ -92,8 +93,8 @@ void OrtModel::reset(std::unordered_map<std::string, std::string> optionsMap){
   (pImplOrt->sessionOptions).DisableMemPattern();
   (pImplOrt->sessionOptions).DisableCpuMemArena();
 
-  if(enableProfiling){
-    if(optionsMap.contains("profiling-output-path")){
+  if (enableProfiling) {
+    if (optionsMap.contains("profiling-output-path")) {
       (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str());
     } else {
       LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. Disabling profiling for now.";
@@ -109,27 +110,27 @@ void OrtModel::reset(std::unordered_map<std::string, std::string> optionsMap){
 
   (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions});
 
   for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) {
-   mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get());
+    mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get());
   }
   for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) {
-   mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape());
+    mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape());
   }
   for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) {
-   mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get());
+    mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get());
   }
   for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) {
-   mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape());
+    mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape());
   }
 
   inputNamesChar.resize(mInputNames.size(), nullptr);
   std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar),
-                [&](const std::string& str) { return str.c_str(); });
+                 [&](const std::string& str) { return str.c_str(); });
   outputNamesChar.resize(mOutputNames.size(), nullptr);
   std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar),
-                [&](const std::string& str) { return str.c_str(); });
+                 [&](const std::string& str) { return str.c_str(); });
 
   // Print names
-  if(loggingLevel > 1) {
+  if (loggingLevel > 1) {
     LOG(info) << "Input Nodes:";
     for (size_t i = 0; i < mInputNames.size(); i++) {
       LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]);
@@ -142,24 +143,28 @@ void OrtModel::reset(std::unordered_map<std::string, std::string> optionsMap){
   }
 }
 
-void OrtModel::resetSession() {
+void OrtModel::resetSession()
+{
   (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions});
 }
 
-template <class I, class O>
-std::vector<O> OrtModel::v2v(std::vector<I>& input, bool clearInput) {
-  if constexpr (std::is_same_v<I, O>){
+template <class I, class O>
+std::vector<O> OrtModel::v2v(std::vector<I>& input, bool clearInput)
+{
+  if constexpr (std::is_same_v<I, O>) {
     return input;
   } else {
     std::vector<O> output(input.size());
     std::transform(std::begin(input), std::end(input), std::begin(output), [](I f) { return O(f); });
-    if(clearInput) input.clear();
+    if (clearInput)
+      input.clear();
    return output;
   }
 }
 
-template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
-std::vector<O> OrtModel::inference(std::vector<I>& input){
+template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
+std::vector<O> OrtModel::inference(std::vector<I>& input)
+{
   std::vector<int64_t> inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
   std::vector<Ort::Value> inputTensor;
   inputTensor.emplace_back(Ort::Value::CreateTensor<O>(pImplOrt->memoryInfo, (reinterpret_cast<std::vector<O>&>(input)).data(), input.size(), inputShape.data(), inputShape.size()));
@@ -171,10 +176,11 @@ std::vector<O> OrtModel::inference(std::vector<I>& input){
   return outputValuesVec;
 }
 
-template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
-std::vector<O> OrtModel::inference(std::vector<std::vector<I>>& input){
+template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
+std::vector<O> OrtModel::inference(std::vector<std::vector<I>>& input)
+{
   std::vector<Ort::Value> inputTensor;
-  for(auto i : input){
+  for (auto i : input) {
     std::vector<int64_t> inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
     inputTensor.emplace_back(Ort::Value::CreateTensor<O>(pImplOrt->memoryInfo, (reinterpret_cast<std::vector<O>&>(i)).data(), i.size(), inputShape.data(), inputShape.size()));
   }
@@ -195,7 +201,9 @@ std::string OrtModel::printShape(const std::vector<int64_t>& v)
   return ss.str();
 }
 
-template <> std::vector<float> OrtModel::inference<float, float>(std::vector<float>& input) {
+template <>
+std::vector<float> OrtModel::inference<float, float>(std::vector<float>& input)
+{
   std::vector<int64_t> inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
   std::vector<Ort::Value> inputTensor;
   inputTensor.emplace_back(Ort::Value::CreateTensor<float>(pImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size()));
@@ -207,7 +215,9 @@ template <> std::vector<float> OrtModel::inference<float, float>(std::vector<fl
   return outputValuesVec;
 }
 
-template <> std::vector<float> OrtModel::inference<OrtDataType::Float16_t, float>(std::vector<OrtDataType::Float16_t>& input) {
+template <>
+std::vector<float> OrtModel::inference<OrtDataType::Float16_t, float>(std::vector<OrtDataType::Float16_t>& input)
+{
   std::vector<int64_t> inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
   std::vector<Ort::Value> inputTensor;
   inputTensor.emplace_back(Ort::Value::CreateTensor<Ort::Float16_t>(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(input.data()), input.size(), inputShape.data(), inputShape.size()));
@@ -219,7 +229,9 @@ template <> std::vector<float> OrtModel::inference<OrtDataType::Float16_t, floa
   return outputValuesVec;
 }
 
-template <> std::vector<OrtDataType::Float16_t> OrtModel::inference<OrtDataType::Float16_t, OrtDataType::Float16_t>(std::vector<OrtDataType::Float16_t>& input) {
+template <>
+std::vector<OrtDataType::Float16_t> OrtModel::inference<OrtDataType::Float16_t, OrtDataType::Float16_t>(std::vector<OrtDataType::Float16_t>& input)
+{
   std::vector<int64_t> inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
   std::vector<Ort::Value> inputTensor;
   inputTensor.emplace_back(Ort::Value::CreateTensor<Ort::Float16_t>(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(input.data()), input.size(), inputShape.data(), inputShape.size()));
@@ -231,7 +243,9 @@ template <> std::vector<OrtDataType::Float16_t> OrtModel::inference<OrtDataType
   return outputValuesVec;
 }
 
-template <> std::vector<OrtDataType::Float16_t> OrtModel::inference<float, OrtDataType::Float16_t>(std::vector<float>& input) {
+template <>
+std::vector<OrtDataType::Float16_t> OrtModel::inference<float, OrtDataType::Float16_t>(std::vector<float>& input)
+{
   std::vector<int64_t> inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
   std::vector<Ort::Value> inputTensor;
   inputTensor.emplace_back(Ort::Value::CreateTensor<Ort::Float16_t>(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(input.data()), input.size(), inputShape.data(), inputShape.size()));
@@ -243,9 +257,11 @@ template <> std::vector<OrtDataType::Float16_t> OrtModel::inference<float, OrtD
   return outputValuesVec;
 }
 
-template <> std::vector<float> OrtModel::inference<OrtDataType::Float16_t, float>(std::vector<std::vector<OrtDataType::Float16_t>>& input) {
+template <>
+std::vector<float> OrtModel::inference<OrtDataType::Float16_t, float>(std::vector<std::vector<OrtDataType::Float16_t>>& input)
+{
   std::vector<Ort::Value> inputTensor;
-  for(auto i : input){
+  for (auto i : input) {
     std::vector<int64_t> inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
     inputTensor.emplace_back(Ort::Value::CreateTensor<Ort::Float16_t>(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(i.data()), i.size(), inputShape.data(), inputShape.size()));
   }
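
The inference overloads above derive the dynamic batch dimension from the flat input length; a standalone sketch of that shape arithmetic (widths are illustrative, not from the patch):

  #include <cstdint>
  #include <cstdio>
  #include <vector>

  int main()
  {
    // The ONNX model reports (-1, width): dynamic batch, fixed feature width.
    int64_t width = 346;                     // e.g. one flattened charge window per candidate
    std::vector<float> flat(5 * width, 0.f); // five candidates packed back to back
    // Same arithmetic as in OrtModel::inference: the batch size is implied.
    std::vector<int64_t> inputShape{(int64_t)(flat.size() / width), width};
    std::printf("batch = %lld, width = %lld\n", (long long)inputShape[0], (long long)inputShape[1]);
    return 0;
  }
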
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index 0f22a7472feac..d8470fdc2bf10 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -838,7 +838,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       if (clusterer.mPmemory->counters.nPeaks == 0) {
         continue;
       }
-      if(!GetProcessingSettings().applyNNclusterizer){
+      if (!GetProcessingSettings().applyNNclusterizer) {
        runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
        runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::updatePeaks>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
       } else {
@@ -875,14 +875,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}});
       DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
 
-      if(GetProcessingSettings().applyNNclusterizer){
+      if (GetProcessingSettings().applyNNclusterizer) {
         // Settings for the clusterizer
         clusterer.nnClusterizerUseCFregression = GetProcessingSettings().nnClusterizerUseCFregression;
         clusterer.nnClusterizerSizeInputRow = GetProcessingSettings().nnClusterizerSizeInputRow;
         clusterer.nnClusterizerSizeInputPad = GetProcessingSettings().nnClusterizerSizeInputPad;
         clusterer.nnClusterizerSizeInputTime = GetProcessingSettings().nnClusterizerSizeInputTime;
         clusterer.nnClusterizerAddIndexData = GetProcessingSettings().nnClusterizerAddIndexData;
-        clusterer.nnClusterizerElementSize = ((2*clusterer.nnClusterizerSizeInputRow + 1) * (2*clusterer.nnClusterizerSizeInputPad + 1) * (2*clusterer.nnClusterizerSizeInputTime + 1)) + (clusterer.nnClusterizerAddIndexData ? 3 : 0);
+        clusterer.nnClusterizerElementSize = ((2 * clusterer.nnClusterizerSizeInputRow + 1) * (2 * clusterer.nnClusterizerSizeInputPad + 1) * (2 * clusterer.nnClusterizerSizeInputTime + 1)) + (clusterer.nnClusterizerAddIndexData ? 3 : 0);
         clusterer.nnClusterizerBatchedMode = GetProcessingSettings().nnClusterizerBatchedMode;
         clusterer.nnClusterizerVerbosity = GetProcessingSettings().nnInferenceVerbosity;
@@ -893,7 +893,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         // Settings for the neural network evaluation
         clusterer.OrtOptions = {
           {"model-path", GetProcessingSettings().nnClassificationPath},
-          {"device", GetProcessingSettings().nnInferenceDevice}, 
+          {"device", GetProcessingSettings().nnInferenceDevice},
           {"device-id", std::to_string(GetProcessingSettings().nnInferenceDeviceId)},
           {"allocate-device-memory", std::to_string(GetProcessingSettings().nnInferenceAllocateDevMem)},
           {"dtype", GetProcessingSettings().nnInferenceDtype},
@@ -901,16 +901,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
           {"enable-optimizations", std::to_string(GetProcessingSettings().nnInferenceEnableOrtOptimization)},
           {"enable-profiling", std::to_string(GetProcessingSettings().nnInferenceOrtProfiling)},
           {"profiling-output-path", GetProcessingSettings().nnInferenceOrtProfilingPath},
-          {"logging-level", std::to_string(GetProcessingSettings().nnInferenceVerbosity)}
-        };
+          {"logging-level", std::to_string(GetProcessingSettings().nnInferenceVerbosity)}};
         clusterer.model_class.init(clusterer.OrtOptions);
 
-        if(!clusterer.nnClusterizerUseCFregression){
+        if (!clusterer.nnClusterizerUseCFregression) {
           std::vector<std::string> reg_model_paths = o2::utils::Str::tokenize(GetProcessingSettings().nnRegressionPath, ':');
-          if(clusterer.model_class.getNumOutputNodes()[0][1] == 1){
+          if (clusterer.model_class.getNumOutputNodes()[0][1] == 1) {
             clusterer.OrtOptions["model-path"] = reg_model_paths[0];
             clusterer.model_reg_1.init(clusterer.OrtOptions);
           } else {
-            if(reg_model_paths.size() == 1){
+            if (reg_model_paths.size() == 1) {
               clusterer.OrtOptions["model-path"] = reg_model_paths[0];
               clusterer.model_reg_1.init(clusterer.OrtOptions);
             } else {
@@ -925,9 +924,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
           DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
         }
 
-        if(clusterer.nnSigmoidTrafoClassThreshold){
+        if (clusterer.nnSigmoidTrafoClassThreshold) {
           // Inverse sigmoid transformation
-          clusterer.nnClassThreshold = (float)std::log(clusterer.nnClassThreshold/(1.f-clusterer.nnClassThreshold));
+          clusterer.nnClassThreshold = (float)std::log(clusterer.nnClassThreshold / (1.f - clusterer.nnClassThreshold));
         }
         runKernel<GPUTPCNNClusterizer>({GetGrid(std::ceil(clusterer.mPmemory->counters.nClusters / (float)clusterer.nnClusterizerBatchedMode), lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0);
       } else {
@@ -939,7 +938,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       if (doGPU) {
         SynchronizeStream(lane);
       }
-      if(!GetProcessingSettings().applyNNclusterizer){
+      if (!GetProcessingSettings().applyNNclusterizer) {
        runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1);
       } else {
        runKernel<GPUTPCNNClusterizer>({GetGrid(std::ceil(clusterer.mPmemory->counters.nClusters / (float)clusterer.nnClusterizerBatchedMode), lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1);
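
The inverse-sigmoid (logit) transformation above lets the configured probability cut be compared directly against the network's raw output, so no sigmoid is evaluated per candidate. A standalone sketch of the identity it relies on (illustrative, not part of the patch):

  #include <cassert>
  #include <cmath>

  int main()
  {
    float threshold = 0.85f; // probability-space cut, as a user would configure it
    // logit(threshold): the equivalent cut in raw-output (pre-sigmoid) space
    float logitCut = std::log(threshold / (1.f - threshold));
    // For any raw network output x: sigmoid(x) > threshold  <=>  x > logitCut,
    // because the sigmoid is strictly monotonic.
    float x = 2.0f;
    assert((1.f / (1.f + std::exp(-x)) > threshold) == (x > logitCut));
    return 0;
  }
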
diff --git a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h
index d308b8bd6efa7..b7e535a107eac 100644
--- a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h
+++ b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h
@@ -43,7 +43,8 @@ class ClusterAccumulator
   GPUd() void finalize(const ChargePos&, tpccf::Charge, tpccf::TPCTime, const GPUTPCGeometry&);
   GPUd() bool toNative(const ChargePos&, tpccf::Charge, tpc::ClusterNative&, const GPUParam&) const;
 
-  GPUd() void setFull(float qtot, float padMean, float padSigma, float timeMean, float timeSigma, uint8_t splitInTime, uint8_t splitInPad){
+  GPUd() void setFull(float qtot, float padMean, float padSigma, float timeMean, float timeSigma, uint8_t splitInTime, uint8_t splitInPad)
+  {
     mQtot = qtot;
     mPadMean = padMean;
     mPadSigma = padSigma;
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h
index 130453e833911..fd420357073e9 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h
@@ -158,7 +158,7 @@ class GPUTPCClusterFinder : public GPUProcessor
 
   std::unordered_map<std::string, std::string> OrtOptions;
   OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters
-  
+
 #ifndef GPUCA_GPUCODE
   void DumpDigits(std::ostream& out);
   void DumpChargeMap(std::ostream& out, std::string_view);
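
For orientation in the clusterizer diff that follows: each candidate peak is flattened into one fixed-width network input row. A sketch of the size bookkeeping, mirroring nnClusterizerElementSize above (variable names are illustrative):

  #include <cstdio>

  int main()
  {
    // Half-widths of the charge window around a peak (example values).
    int sizeRow = 3, sizePad = 3, sizeTime = 3;
    bool addIndexData = true; // append (sector, row, pad) metadata
    // One network input row: the full (2r+1) x (2p+1) x (2t+1) charge window,
    // plus 3 optional index entries - the same formula as in the patch.
    int elementSize = (2 * sizeRow + 1) * (2 * sizePad + 1) * (2 * sizeTime + 1) + (addIndexData ? 3 : 0);
    std::printf("%d values per candidate\n", elementSize); // 7*7*7 + 3 = 346
    return 0;
  }
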
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
index f5e094a3c363e..ba8fac2a397e9 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
@@ -34,15 +34,15 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlo
 
   tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
 
-  if(clusterer.OrtOptions["dtype"].find("32") != std::string::npos){
+  if (clusterer.OrtOptions["dtype"].find("32") != std::string::npos) {
     GPUTPCNNClusterizer::nn_clusterizer<float>(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
-  } else if(clusterer.OrtOptions["dtype"].find("16") != std::string::npos) {
+  } else if (clusterer.OrtOptions["dtype"].find("16") != std::string::npos) {
     GPUTPCNNClusterizer::nn_clusterizer<OrtDataType::Float16_t>(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
   } else {
     LOG(fatal) << "Unsupported data type for neural network clusterizer!";
   }
 
   // tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
-// 
+  //
   // GPUTPCNNClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
 }
@@ -74,12 +74,12 @@ bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const G
     }
   } else if (row <= 62 + global_shift) { // to account for the gap between IROC and OROC. Charge will be set to -1 in order to signal boundary to the neural network
     return true;
-  } else if (row <= o2::tpc::constants::MAXGLOBALPADROW-1 + global_shift) {
-    //if (pad < (geo.NPads(o2::tpc::constants::MAXGLOBALPADROW-1) - geo.NPads(row - global_shift)) / 2 || pad > (geo.NPads(o2::tpc::constants::MAXGLOBALPADROW-1) + geo.NPads(row - global_shift)) / 2) {
-    //  return true;
-    //} else {
-    //  return false;
-    //}
+  } else if (row <= o2::tpc::constants::MAXGLOBALPADROW - 1 + global_shift) {
+    // if (pad < (geo.NPads(o2::tpc::constants::MAXGLOBALPADROW-1) - geo.NPads(row - global_shift)) / 2 || pad > (geo.NPads(o2::tpc::constants::MAXGLOBALPADROW-1) + geo.NPads(row - global_shift)) / 2) {
+    //   return true;
+    // } else {
+    //   return false;
+    // }
     if (pad < 0 || pad > geo.NPads(row)) {
       return true;
     } else {
@@ -92,277 +92,135 @@ bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const G
 
 template <class T>
 GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int iBlock, int iThread,
-                        processorType& clusterer,
-                        const CfFragment& fragment,
-                        GPUSharedMemory& smem,
-                        const Array2D<PackedCharge>& chargeMap,
-                        const ChargePos* filteredPeakPositions,
-                        const GPUSettingsRec& calib,
-                        MCLabelAccumulator* labelAcc,
-                        uint clusternum,
-                        uint maxClusterPerRow,
-                        uint* clusterInRow,
-                        tpc::ClusterNative* clusterByRow,
-                        uint* clusterPosInRow){
-
-    uint glo_idx = get_global_id(0) * clusterer.nnClusterizerBatchedMode;
-    if(glo_idx >= clusternum){
-        return;
-    }
+                                                processorType& clusterer,
+                                                const CfFragment& fragment,
+                                                GPUSharedMemory& smem,
+                                                const Array2D<PackedCharge>& chargeMap,
+                                                const ChargePos* filteredPeakPositions,
+                                                const GPUSettingsRec& calib,
+                                                MCLabelAccumulator* labelAcc,
+                                                uint clusternum,
+                                                uint maxClusterPerRow,
+                                                uint* clusterInRow,
+                                                tpc::ClusterNative* clusterByRow,
+                                                uint* clusterPosInRow)
+{
+
+  uint glo_idx = get_global_id(0) * clusterer.nnClusterizerBatchedMode;
+  if (glo_idx >= clusternum) {
+    return;
+  }
 
-    std::vector<float> central_charges(clusterer.nnClusterizerBatchedMode, -1.f);
-    std::vector<T> input_data(clusterer.nnClusterizerElementSize * clusterer.nnClusterizerBatchedMode, (T)-1.f);
-    std::vector<ChargePos> peak_positions(clusterer.nnClusterizerBatchedMode);
-    unsigned int write_idx = 0;
-
-    for(int batch_counter = 0; batch_counter < clusterer.nnClusterizerBatchedMode; batch_counter++){
-
-        uint cls = CAMath::Min(glo_idx + batch_counter, clusternum - 1);
-
-        ChargePos peak = clusterer.mPfilteredPeakPositions[cls];
-        int row = peak.row(), pad = peak.pad(), time = peak.time();
-        float central_charge = chargeMap[peak].unpack();
-
-        peak_positions[batch_counter] = peak;
-        central_charges[batch_counter] = central_charge;
-
-        // unsigned int batch_offset = batch_counter * clusterer.nnClusterizerElementSize;
-        for(int r = -clusterer.nnClusterizerSizeInputRow; r <= clusterer.nnClusterizerSizeInputRow; r++){
-            bool push_mc_label = (r == 0);
-            int pad_offset = GPUTPCNNClusterizer::padOffset(row, row + r, clusterer.Param().tpcGeometry);
-            int row_offset = GPUTPCNNClusterizer::rowOffset(row, clusterer.nnClusterizerSizeInputRow);
-            for(int p = -clusterer.nnClusterizerSizeInputPad; p <= clusterer.nnClusterizerSizeInputPad; p++){
-                push_mc_label &= (std::abs(p) < 2); // Use inner 5x5 window
-                bool is_boundary = GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p + pad_offset, clusterer.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry);
-                for(int t = -clusterer.nnClusterizerSizeInputTime; t <= clusterer.nnClusterizerSizeInputTime; t++){
-                    push_mc_label &= (std::abs(t) < 2); // Use inner 5x5 window
-                    if(!is_boundary){
-                        ChargePos tmp_pos(row + r, pad + p + pad_offset, time + t);
-                        input_data[write_idx] = (T)(chargeMap[tmp_pos].unpack() / central_charge);
-                        if(push_mc_label){
-                            ChargePos tmp_pos_mc(row, pad + p, time + t);
-                            CPU_ONLY(labelAcc->collect(tmp_pos, chargeMap[tmp_pos_mc].unpack()));
-                        }
-                    }
-                    write_idx++;
-                }
-            }
-        }
-        if(clusterer.nnClusterizerAddIndexData){
-            input_data[write_idx] = (T)(clusterer.mISlice / 36.f);
-            input_data[write_idx + 1] = (T)(row / 152.f);
-            input_data[write_idx + 2] = (T)((float)pad / clusterer.Param().tpcGeometry.NPads(row));
-            write_idx+=3;
-            // if(idx == 100){
-            //   LOG(info) << "[" << input_data[input_data.size()-3] << ", " << input_data[input_data.size()-2] << ", " << input_data[input_data.size()-1] << "]";
-            // }
-        }
-    }
+  std::vector<float> central_charges(clusterer.nnClusterizerBatchedMode, -1.f);
+  std::vector<T> input_data(clusterer.nnClusterizerElementSize * clusterer.nnClusterizerBatchedMode, (T)-1.f);
+  std::vector<ChargePos> peak_positions(clusterer.nnClusterizerBatchedMode);
+  unsigned int write_idx = 0;
+
+  for (int batch_counter = 0; batch_counter < clusterer.nnClusterizerBatchedMode; batch_counter++) {
+
+    uint cls = CAMath::Min(glo_idx + batch_counter, clusternum - 1);
+
+    ChargePos peak = clusterer.mPfilteredPeakPositions[cls];
+    int row = peak.row(), pad = peak.pad(), time = peak.time();
+    float central_charge = chargeMap[peak].unpack();
+
+    peak_positions[batch_counter] = peak;
+    central_charges[batch_counter] = central_charge;
+
+    // unsigned int batch_offset = batch_counter * clusterer.nnClusterizerElementSize;
+    for (int r = -clusterer.nnClusterizerSizeInputRow; r <= clusterer.nnClusterizerSizeInputRow; r++) {
+      bool push_mc_label = (r == 0);
+      int pad_offset = GPUTPCNNClusterizer::padOffset(row, row + r, clusterer.Param().tpcGeometry);
+      int row_offset = GPUTPCNNClusterizer::rowOffset(row, clusterer.nnClusterizerSizeInputRow);
+      for (int p = -clusterer.nnClusterizerSizeInputPad; p <= clusterer.nnClusterizerSizeInputPad; p++) {
+        push_mc_label &= (std::abs(p) < 2); // Use inner 5x5 window
+        bool is_boundary = GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p + pad_offset, clusterer.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry);
+        for (int t = -clusterer.nnClusterizerSizeInputTime; t <= clusterer.nnClusterizerSizeInputTime; t++) {
+          push_mc_label &= (std::abs(t) < 2); // Use inner 5x5 window
+          if (!is_boundary) {
+            ChargePos tmp_pos(row + r, pad + p + pad_offset, time + t);
+            input_data[write_idx] = (T)(chargeMap[tmp_pos].unpack() / central_charge);
+            if (push_mc_label) {
+              ChargePos tmp_pos_mc(row, pad + p, time + t);
+              CPU_ONLY(labelAcc->collect(tmp_pos, chargeMap[tmp_pos_mc].unpack()));
+            }
+          }
+          write_idx++;
+        }
+      }
+    }
+    if (clusterer.nnClusterizerAddIndexData) {
+      input_data[write_idx] = (T)(clusterer.mISlice / 36.f);
+      input_data[write_idx + 1] = (T)(row / 152.f);
+      input_data[write_idx + 2] = (T)((float)pad / clusterer.Param().tpcGeometry.NPads(row));
+      write_idx += 3;
+      // if(idx == 100){
+      //   LOG(info) << "[" << input_data[input_data.size()-3] << ", " << input_data[input_data.size()-2] << ", " << input_data[input_data.size()-1] << "]";
+      // }
+    }
+  }
 
-    std::vector<int> index_class_2;
-    std::vector<float> out_class = clusterer.model_class.inference<T, float>(input_data);
-    // LOG(info) << "input_data.size(): " << input_data.size() << "; write_idx: " << write_idx << "; out_class.size(): " << out_class.size();
-    int num_output_classes = clusterer.model_class.getNumOutputNodes()[0][1];
-
-    if(num_output_classes > 1){
-        std::vector<float> tmp_out_class(clusterer.nnClusterizerBatchedMode);
-        for(int cls_idx = 0; cls_idx < clusterer.nnClusterizerBatchedMode; cls_idx++){
-            auto elem_iterator = out_class.begin() + (unsigned int)(cls_idx*num_output_classes);
-            tmp_out_class[cls_idx] = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator+num_output_classes)) - 1; // -1 since 2-class classifier will have 3 outputs: classes 0, 1, 2
-            if(tmp_out_class[cls_idx] > 1){
-                index_class_2.push_back(cls_idx);
-            }
-        }
-        out_class = tmp_out_class;
-    }
+  std::vector<int> index_class_2;
+  std::vector<float> out_class = clusterer.model_class.inference<T, float>(input_data);
+  // LOG(info) << "input_data.size(): " << input_data.size() << "; write_idx: " << write_idx << "; out_class.size(): " << out_class.size();
+  int num_output_classes = clusterer.model_class.getNumOutputNodes()[0][1];
+
+  if (num_output_classes > 1) {
+    std::vector<float> tmp_out_class(clusterer.nnClusterizerBatchedMode);
+    for (int cls_idx = 0; cls_idx < clusterer.nnClusterizerBatchedMode; cls_idx++) {
+      auto elem_iterator = out_class.begin() + (unsigned int)(cls_idx * num_output_classes);
+      tmp_out_class[cls_idx] = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + num_output_classes)) - 1; // -1 since 2-class classifier will have 3 outputs: classes 0, 1, 2
+      if (tmp_out_class[cls_idx] > 1) {
+        index_class_2.push_back(cls_idx);
+      }
+    }
+    out_class = tmp_out_class;
+  }
 
-    if(!clusterer.nnClusterizerUseCFregression) {
-
-        std::vector<float> out_reg = clusterer.model_reg_1.inference<T, float>(input_data), tmp_out_reg_2;
-        if(index_class_2.size() > 0){
-            std::vector<T> tmp_in_reg_2(index_class_2.size() * clusterer.nnClusterizerElementSize);
-            int fill_counter = 0;
-            for(int cls_idx : index_class_2){
-                int from_idx = cls_idx*clusterer.nnClusterizerElementSize, to_idx = fill_counter * clusterer.nnClusterizerElementSize;
-                for(int reg_idx = 0; reg_idx < clusterer.nnClusterizerElementSize; reg_idx++){
-                    tmp_in_reg_2[to_idx + reg_idx] = input_data[from_idx + reg_idx];
-                }
-                fill_counter++;
-            }
-            tmp_out_reg_2 = clusterer.model_reg_2.inference<T, float>(input_data);
-        }
-
-        input_data.clear();
-
-        if((clusterer.nnClusterizerVerbosity >= 4) && glo_idx == 0){
-            LOG(info) << "[CF] Classification model: " << out_class[0] << " (>? " << clusterer.nnClassThreshold << ")";
-            LOG(info) << "[CF] Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4];
-        }
-
-        int num_outputs_1 = clusterer.model_reg_1.getNumOutputNodes()[0][1], num_outputs_2 = 0, counter_class_2_idcs = 0;
-        if(num_output_classes > 1){
-            num_outputs_2 = clusterer.model_reg_2.getNumOutputNodes()[0][1];
-        }
-
-        for(int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) {
-
-            if (glo_idx + element >= clusternum) {
-                return;
-            }
-
-            int model_output_index = element*num_outputs_1;
-            if(out_class[element] > clusterer.nnClassThreshold) {
-                if((num_output_classes == 1) || ((num_output_classes > 1) && (out_class[element] < 2))) {
-                    // CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
-                    ClusterAccumulator pc;
-
-                    ClusterAccumulator dummy_pc;
-                    CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
-
-                    // Dummy build to push MC labels
-                    buildCluster(
-                        calib,
-                        chargeMap,
-                        peak_positions[element],
-                        smem.posBcast,
-                        smem.buf,
-                        smem.innerAboveThreshold,
-                        &dummy_pc,
-                        labelAcc);
-
-                    if (fragment.isOverlap(peak_positions[element].time())) {
-                        if (clusterPosInRow) {
-                            clusterPosInRow[glo_idx + element] = maxClusterPerRow;
-                        }
-                        continue;
-                    }
-
-                    pc.setFull(central_charges[element] * out_reg[model_output_index + 4], peak_positions[element].pad() + out_reg[model_output_index + 0], out_reg[model_output_index + 2], fragment.start + peak_positions[element].time() + out_reg[model_output_index + 1], out_reg[model_output_index + 3], 0, 0);
-                    // LOG(info) << "Example: " << num_outputs_1 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3];
-
-                    tpc::ClusterNative myCluster;
-                    bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param());
-                    if (rejectCluster) {
-                        if(clusterer.nnClusterizerVerbosity > 3){
-                            LOG(warning) << "[CF] Cluster rejected!";
-                        }
-                        if (clusterPosInRow) {
-                            clusterPosInRow[glo_idx + element] = maxClusterPerRow;
-                        }
-                        continue;
-                    }
-
-                    uint rowIndex = 0;
-                    if (clusterByRow != nullptr) {
-                        rowIndex = sortIntoBuckets(
-                            clusterer,
-                            myCluster,
-                            peak_positions[element].row(),
-                            maxClusterPerRow,
-                            clusterInRow,
-                            clusterByRow);
-                        if (clusterPosInRow != nullptr) {
-                            clusterPosInRow[glo_idx + element] = rowIndex;
-                        }
-                    } else if (clusterPosInRow) {
-                        rowIndex = clusterPosInRow[glo_idx + element];
-                    }
-                    CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow));
-                } else {
-                    model_output_index = index_class_2[counter_class_2_idcs]*num_outputs_2;
-                    counter_class_2_idcs++;
-
-                    // Cluster 1
-                    CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
-                    ClusterAccumulator pc;
-
-                    if (fragment.isOverlap(peak_positions[element].time())) {
-                        if (clusterPosInRow) {
-                            clusterPosInRow[glo_idx + element] = maxClusterPerRow;
-                        }
-                        continue;
-                    }
-
-                    pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 8], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 4], tmp_out_reg_2[model_output_index + 2], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 2], tmp_out_reg_2[model_output_index + 6], 0, 0);
-                    // LOG(info) << "Example: " << num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3];
-
-                    tpc::ClusterNative myCluster;
-                    bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param());
-                    if (rejectCluster) {
-                        if(clusterer.nnClusterizerVerbosity > 3){
-                            LOG(warning) << "[CF] Cluster rejected!";
-                        }
-                        if (clusterPosInRow) {
-                            clusterPosInRow[glo_idx + element] = maxClusterPerRow;
-                        }
-                        continue;
-                    }
-
-                    uint rowIndex = 0;
-                    if (clusterByRow != nullptr) {
-                        rowIndex = sortIntoBuckets(
-                            clusterer,
-                            myCluster,
-                            peak_positions[element].row(),
-                            maxClusterPerRow,
-                            clusterInRow,
-                            clusterByRow);
-                        if (clusterPosInRow != nullptr) {
-                            clusterPosInRow[glo_idx + element] = rowIndex;
-                        }
-                    } else if (clusterPosInRow) {
-                        rowIndex = clusterPosInRow[glo_idx + element];
-                    }
-                    CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow));
-
-                    // Cluster 2
-                    CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
-                    pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 9], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 1], tmp_out_reg_2[model_output_index + 5], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 3], tmp_out_reg_2[model_output_index + 7], 0, 0);
-                    // LOG(info) << "Example: " << num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3];
-                    rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param());
-                    if (rejectCluster) {
-                        if(clusterer.nnClusterizerVerbosity > 3){
-                            LOG(warning) << "[CF] Cluster rejected!";
-                        }
-                        if (clusterPosInRow) {
-                            clusterPosInRow[glo_idx + element] = maxClusterPerRow;
-                        }
-                        continue;
-                    }
-
-                    rowIndex = 0;
-                    if (clusterByRow != nullptr) {
-                        rowIndex = sortIntoBuckets(
-                            clusterer,
-                            myCluster,
-                            peak_positions[element].row(),
-                            maxClusterPerRow,
-                            clusterInRow,
-                            clusterByRow);
-                        if (clusterPosInRow != nullptr) {
-                            clusterPosInRow[glo_idx + element] = rowIndex;
-                        }
-                    } else if (clusterPosInRow) {
-                        rowIndex = clusterPosInRow[glo_idx + element];
-                    }
-                    CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow));
-                }
-            }
-        }
-    } else {
-
-        input_data.clear();
-        for(int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) {
-            if (glo_idx + element >= clusternum) {
-                return;
-            }
-
-            if(out_class[element] > clusterer.nnClassThreshold) {
-
-                ClusterAccumulator pc;
-                CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+  if (!clusterer.nnClusterizerUseCFregression) {
+
+    std::vector<float> out_reg = clusterer.model_reg_1.inference<T, float>(input_data), tmp_out_reg_2;
+    if (index_class_2.size() > 0) {
+      std::vector<T> tmp_in_reg_2(index_class_2.size() * clusterer.nnClusterizerElementSize);
+      int fill_counter = 0;
+      for (int cls_idx : index_class_2) {
+        int from_idx = cls_idx * clusterer.nnClusterizerElementSize, to_idx = fill_counter * clusterer.nnClusterizerElementSize;
+        for (int reg_idx = 0; reg_idx < clusterer.nnClusterizerElementSize; reg_idx++) {
+          tmp_in_reg_2[to_idx + reg_idx] = input_data[from_idx + reg_idx];
+        }
+        fill_counter++;
+      }
+      tmp_out_reg_2 = clusterer.model_reg_2.inference<T, float>(input_data);
+    }
+
+    input_data.clear();
+
+    if ((clusterer.nnClusterizerVerbosity >= 4) && glo_idx == 0) {
+      LOG(info) << "[CF] Classification model: " << out_class[0] << " (>? " << clusterer.nnClassThreshold << ")";
+      LOG(info) << "[CF] Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4];
+    }
+
+    int num_outputs_1 = clusterer.model_reg_1.getNumOutputNodes()[0][1], num_outputs_2 = 0, counter_class_2_idcs = 0;
+    if (num_output_classes > 1) {
+      num_outputs_2 = clusterer.model_reg_2.getNumOutputNodes()[0][1];
+    }
+
+    for (int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) {
+
+      if (glo_idx + element >= clusternum) {
+        return;
+      }
+
+      int model_output_index = element * num_outputs_1;
+      if (out_class[element] > clusterer.nnClassThreshold) {
+        if ((num_output_classes == 1) || ((num_output_classes > 1) && (out_class[element] < 2))) {
+          // CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+          ClusterAccumulator pc;
+
+          ClusterAccumulator dummy_pc;
+          CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+
+          // Dummy build to push MC labels
           buildCluster(
             calib,
             chargeMap,
@@ -370,7 +228,7 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i
             peak_positions[element],
             smem.posBcast,
             smem.buf,
             smem.innerAboveThreshold,
-            &pc,
+            &dummy_pc,
             labelAcc);
 
           if (fragment.isOverlap(peak_positions[element].time())) {
@@ -379,20 +237,67 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i
             }
             continue;
           }
-          pc.finalize(peak_positions[element], central_charges[element], fragment.start, clusterer.Param().tpcGeometry);
+
+          pc.setFull(central_charges[element] * out_reg[model_output_index + 4], peak_positions[element].pad() + out_reg[model_output_index + 0], out_reg[model_output_index + 2], fragment.start + peak_positions[element].time() + out_reg[model_output_index + 1], out_reg[model_output_index + 3], 0, 0);
+          // LOG(info) << "Example: " << num_outputs_1 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3];
 
           tpc::ClusterNative myCluster;
           bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param());
+          if (rejectCluster) {
+            if (clusterer.nnClusterizerVerbosity > 3) {
+              LOG(warning) << "[CF] Cluster rejected!";
+            }
+            if (clusterPosInRow) {
+              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+            }
+            continue;
+          }
+
+          uint rowIndex = 0;
+          if (clusterByRow != nullptr) {
+            rowIndex = sortIntoBuckets(
+              clusterer,
+              myCluster,
+              peak_positions[element].row(),
+              maxClusterPerRow,
+              clusterInRow,
+              clusterByRow);
+            if (clusterPosInRow != nullptr) {
+              clusterPosInRow[glo_idx + element] = rowIndex;
+            }
+          } else if (clusterPosInRow) {
+            rowIndex = clusterPosInRow[glo_idx + element];
+          }
+          CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow));
+        } else {
+          model_output_index = index_class_2[counter_class_2_idcs] * num_outputs_2;
+          counter_class_2_idcs++;
+
+          // Cluster 1
+          CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+          ClusterAccumulator pc;
+
+          if (fragment.isOverlap(peak_positions[element].time())) {
+            if (clusterPosInRow) {
+              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+            }
+            continue;
+          }
+
+          pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 8], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 4], tmp_out_reg_2[model_output_index + 2], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 2], tmp_out_reg_2[model_output_index + 6], 0, 0);
+          // LOG(info) << "Example: " << num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3];
+
+          tpc::ClusterNative myCluster;
+          bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param());
           if (rejectCluster) {
-            if(clusterer.nnClusterizerVerbosity > 3){
-              LOG(warning) << "[CF] Cluster rejected!";
-            }
-            if (clusterPosInRow) {
-              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
-            }
-            continue;
+            if (clusterer.nnClusterizerVerbosity > 3) {
+              LOG(warning) << "[CF] Cluster rejected!";
+            }
+            if (clusterPosInRow) {
+              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
             }
+            continue;
+          }
 
           uint rowIndex = 0;
           if (clusterByRow != nullptr) {
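
Aside on the class decoding used in nn_clusterizer (std::distance over std::max_element, shifted by -1): a standalone sketch of that argmax step. The label interpretation follows the inline comment in the patch, not an official mapping:

  #include <algorithm>
  #include <cstdio>
  #include <vector>

  int main()
  {
    // One row of classifier scores per candidate, as the network emits them.
    std::vector<float> scores = {0.05f, 0.15f, 0.80f};
    // Argmax decoding as in the patch: index of the largest score, shifted by -1,
    // so a 3-output classifier maps to labels -1, 0, 1.
    int label = (int)std::distance(scores.begin(), std::max_element(scores.begin(), scores.end())) - 1;
    std::printf("decoded label: %d\n", label); // prints 1 here
    return 0;
  }
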
@@ -409,18 +314,112 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i
             rowIndex = sortIntoBuckets(
               clusterer,
               myCluster,
               peak_positions[element].row(),
               maxClusterPerRow,
               clusterInRow,
               clusterByRow);
             if (clusterPosInRow != nullptr) {
               clusterPosInRow[glo_idx + element] = rowIndex;
             }
           } else if (clusterPosInRow) {
             rowIndex = clusterPosInRow[glo_idx + element];
           }
+          CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow));
 
+          // Cluster 2
+          CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+          pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 9], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 1], tmp_out_reg_2[model_output_index + 5], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 3], tmp_out_reg_2[model_output_index + 7], 0, 0);
+          // LOG(info) << "Example: " << num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3];
+          rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param());
+          if (rejectCluster) {
+            if (clusterer.nnClusterizerVerbosity > 3) {
+              LOG(warning) << "[CF] Cluster rejected!";
+            }
+            if (clusterPosInRow) {
+              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+            }
+            continue;
+          }
+
+          rowIndex = 0;
+          if (clusterByRow != nullptr) {
+            rowIndex = sortIntoBuckets(
+              clusterer,
+              myCluster,
+              peak_positions[element].row(),
+              maxClusterPerRow,
+              clusterInRow,
+              clusterByRow);
+            if (clusterPosInRow != nullptr) {
+              clusterPosInRow[glo_idx + element] = rowIndex;
+            }
+          } else if (clusterPosInRow) {
+            rowIndex = clusterPosInRow[glo_idx + element];
+          }
           CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow));
         }
       }
     }
-    if(clusterer.nnClusterizerVerbosity > 4){
-        LOG(info) << "[CF] Clusterization done!";
-    }
-}
+  } else {
+
+    input_data.clear();
+    for (int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) {
+      if (glo_idx + element >= clusternum) {
+        return;
+      }
+
+      if (out_class[element] > clusterer.nnClassThreshold) {
+
+        ClusterAccumulator pc;
+        CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+        buildCluster(
+          calib,
+          chargeMap,
+          peak_positions[element],
+          smem.posBcast,
+          smem.buf,
+          smem.innerAboveThreshold,
+          &pc,
+          labelAcc);
+
+        if (fragment.isOverlap(peak_positions[element].time())) {
+          if (clusterPosInRow) {
+            clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+          }
+          continue;
+        }
+        pc.finalize(peak_positions[element], central_charges[element], fragment.start, clusterer.Param().tpcGeometry);
+
+        tpc::ClusterNative myCluster;
+        bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param());
+
+        if (rejectCluster) {
+          if (clusterer.nnClusterizerVerbosity > 3) {
+            LOG(warning) << "[CF] Cluster rejected!";
+          }
+          if (clusterPosInRow) {
+            clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+          }
+          continue;
+        }
+
+        uint rowIndex = 0;
+        if (clusterByRow != nullptr) {
+          rowIndex = sortIntoBuckets(
+            clusterer,
+            myCluster,
+            peak_positions[element].row(),
+            maxClusterPerRow,
+            clusterInRow,
+            clusterByRow);
+          if (clusterPosInRow != nullptr) {
+            clusterPosInRow[glo_idx + element] = rowIndex;
+          }
+        } else if (clusterPosInRow) {
+          rowIndex = clusterPosInRow[glo_idx + element];
+        }
+
+        CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow));
+      }
+    }
+  }
+
+  if (clusterer.nnClusterizerVerbosity > 4) {
+    LOG(info) << "[CF] Clusterization done!";
+  }
+}
 
 GPUdii() void GPUTPCNNClusterizer::computeClustersImpl(int nBlocks, int nThreads, int iBlock, int iThread,
                                                        processorType& clusterer,
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
index 51a5c29022421..98d979d28cf15 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
@@ -66,20 +66,20 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate
   static int rowOffset(int, int);
   static bool isBoundary(int, int, int, const GPUTPCGeometry&);
 
-  template <class T> 
+  template <class T>
   static GPUd() void nn_clusterizer(int, int, int, int,
-                  processorType&,
-                  const CfFragment&,
-                  GPUSharedMemory&,
-                  const Array2D<PackedCharge>&,
-                  const ChargePos*,
-                  const GPUSettingsRec&,
-                  MCLabelAccumulator*,
-                  uint,
-                  uint,
-                  uint*,
-                  tpc::ClusterNative*,
-                  uint*);
+                                    processorType&,
+                                    const CfFragment&,
+                                    GPUSharedMemory&,
+                                    const Array2D<PackedCharge>&,
+                                    const ChargePos*,
+                                    const GPUSettingsRec&,
+                                    MCLabelAccumulator*,
+                                    uint,
+                                    uint,
+                                    uint*,
+                                    tpc::ClusterNative*,
+                                    uint*);
 
  private:
  // ---------------------------------