From d4dc46e6465e47ce8d8b3b4e74b35479638d17e2 Mon Sep 17 00:00:00 2001
From: Christian Sonnabend
Date: Thu, 16 May 2024 09:32:45 +0200
Subject: [PATCH 01/23] Copying kernels to implement NN clusterizer

---
 GPU/GPUTracking/Global/GPUChainTracking.cxx   |   2 +-
 GPU/GPUTracking/Global/GPUChainTracking.h     |   2 +-
 .../Global/GPUChainTrackingClusterizer.cxx    |  19 +-
 .../TPCClusterFinder/GPUTPCNNClusterizer.cxx  | 271 ++++++++++++++++++
 .../TPCClusterFinder/GPUTPCNNClusterizer.h    |  76 +++++
 5 files changed, 364 insertions(+), 6 deletions(-)
 create mode 100644 GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
 create mode 100644 GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h

diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx
index bd1cd9859cbd2..68615f47d05db 100644
--- a/GPU/GPUTracking/Global/GPUChainTracking.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx
@@ -722,7 +722,7 @@ int GPUChainTracking::RunChain()
       return 1;
     }
   } else if (mIOPtrs.tpcPackedDigits || mIOPtrs.tpcZS) {
-    if (runRecoStep(RecoStep::TPCClusterFinding, &GPUChainTracking::RunTPCClusterizer, false)) {
+    if (runRecoStep(RecoStep::TPCClusterFinding, &GPUChainTracking::RunTPCClusterizer, false, true)) { // FIXME: This enables the neural network clusterization -> Need to actually make this configurable
      return 1;
     }
   }

diff --git a/GPU/GPUTracking/Global/GPUChainTracking.h b/GPU/GPUTracking/Global/GPUChainTracking.h
index 89f2ecd10f65f..032ad0524ccff 100644
--- a/GPU/GPUTracking/Global/GPUChainTracking.h
+++ b/GPU/GPUTracking/Global/GPUChainTracking.h
@@ -161,7 +161,7 @@ class GPUChainTracking : public GPUChain, GPUReconstructionHelpers::helperDelegate
   void SetQAFromForeignChain(GPUChainTracking* chain) { mQAFromForeignChain = chain; }

   // Processing functions
-  int RunTPCClusterizer(bool synchronizeOutput = true);
+  int RunTPCClusterizer(bool synchronizeOutput = true, bool applyNNclusterizer = false);
   int ForwardTPCDigits();
   int RunTPCTrackingSlices();
   int RunTPCTrackingMerger(bool synchronizeOutput = true);

diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index 29bbf34b46135..7b2c5539439be 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -566,7 +566,7 @@ int GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers)
 #endif

 // TODO: Clusterizer not working with OCL1 (Clusterizer on CPU, Tracking on GPU)
-int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
+int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput, bool applyNNclusterizer)
 {
   if (param().rec.fwdTPCDigitsAsClusters) {
     return ForwardTPCDigits();
@@ -835,8 +835,14 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       if (clusterer.mPmemory->counters.nPeaks == 0) {
         continue;
       }
-      runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
-      runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::updatePeaks>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
+      if (!applyNNclusterizer) {
+        runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
+        runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::updatePeaks>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
+      } else {
+        // FIXME: This needs to be removed when I actually apply the NN! For now it's only here to make the code work
+        runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
+        runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::updatePeaks>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
+      }
       DoDebugAndDump(RecoStep::TPCClusterFinding, 262144, clusterer, &GPUTPCClusterFinder::DumpSuppressedPeaks, *mDebugFile);

       RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 1, doGPU, lane);
@@ -870,7 +876,12 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       if (doGPU) {
         SynchronizeStream(lane);
       }
-      runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1);
+      if (!applyNNclusterizer) {
+        runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1);
+      } else {
+        // FIXME: Here I need to apply the neural network
+        runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1);
+      }
     }
     if (GetProcessingSettings().debugLevel >= 3) {
       GPUInfo("Sector %02d Fragment %02d Lane %d: Found clusters: digits %u peaks %u clusters %u", iSlice, fragment.index, lane, (int)clusterer.mPmemory->counters.nPositions, (int)clusterer.mPmemory->counters.nPeaks, (int)clusterer.mPmemory->counters.nClusters);

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
new file mode 100644
index 0000000000000..3097d3adecb3d
--- /dev/null
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
@@ -0,0 +1,271 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file GPUTPCNNClusterizer.cxx
+/// \author Christian Sonnabend
+
+#include "GPUTPCNNClusterizer.h"
+
+#include "CfConsts.h"
+#include "CfUtils.h"
+#include "ClusterAccumulator.h"
+#if !defined(GPUCA_GPUCODE)
+#include "GPUHostDataTypes.h"
+#include "MCLabelAccumulator.h"
+#endif
+
+using namespace GPUCA_NAMESPACE::gpu;
+using namespace GPUCA_NAMESPACE::gpu::tpccf;
+
+template <>
+GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char onlyMC)
+{
+  Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
+  CPU_ONLY(
+    MCLabelAccumulator labelAcc(clusterer));
+
+  tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
+
+  GPUTPCNNClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
+}
+
+GPUdii() void GPUTPCNNClusterizer::computeClustersImpl(int nBlocks, int nThreads, int iBlock, int iThread,
+                                                       processorType& clusterer,
+                                                       const CfFragment& fragment,
+                                                       GPUSharedMemory& smem,
+                                                       const Array2D<PackedCharge>& chargeMap,
+                                                       const ChargePos* filteredPeakPositions,
+                                                       const GPUSettingsRec& calib,
+                                                       MCLabelAccumulator* labelAcc,
+                                                       uint clusternum,
+                                                       uint maxClusterPerRow,
+                                                       uint* clusterInRow,
+                                                       tpc::ClusterNative* clusterByRow,
+                                                       uint* clusterPosInRow)
+{
+  uint idx = get_global_id(0);
+
+  // For certain configurations dummy work items are added, so the total
+  // number of work items is divisible by 64.
+  // These dummy items also compute the last cluster but discard the result.
+  ChargePos pos = filteredPeakPositions[CAMath::Min(idx, clusternum - 1)];
+  Charge charge = chargeMap[pos].unpack();
+
+  ClusterAccumulator pc;
+  CPU_ONLY(labelAcc->collect(pos, charge));
+
+  buildCluster(
+    calib,
+    chargeMap,
+    pos,
+    smem.posBcast,
+    smem.buf,
+    smem.innerAboveThreshold,
+    &pc,
+    labelAcc);
+
+  if (idx >= clusternum) {
+    return;
+  }
+  if (fragment.isOverlap(pos.time())) {
+    if (clusterPosInRow) {
+      clusterPosInRow[idx] = maxClusterPerRow;
+    }
+    return;
+  }
+  pc.finalize(pos, charge, fragment.start, clusterer.Param().tpcGeometry);
+
+  tpc::ClusterNative myCluster;
+  bool rejectCluster = !pc.toNative(pos, charge, myCluster, clusterer.Param());
+
+  if (rejectCluster) {
+    if (clusterPosInRow) {
+      clusterPosInRow[idx] = maxClusterPerRow;
+    }
+    return;
+  }
+
+  uint rowIndex = 0;
+  if (clusterByRow != nullptr) {
+    rowIndex = sortIntoBuckets(
+      clusterer,
+      myCluster,
+      pos.row(),
+      maxClusterPerRow,
+      clusterInRow,
+      clusterByRow);
+    if (clusterPosInRow != nullptr) {
+      clusterPosInRow[idx] = rowIndex;
+    }
+  } else if (clusterPosInRow) {
+    rowIndex = clusterPosInRow[idx];
+  }
+
+  CPU_ONLY(labelAcc->commit(pos.row(), rowIndex, maxClusterPerRow));
+}
+
+GPUdii() void GPUTPCNNClusterizer::updateClusterInner(
+  const GPUSettingsRec& calib,
+  ushort lid,
+  ushort N,
+  const PackedCharge* buf,
+  const ChargePos& pos,
+  ClusterAccumulator* cluster,
+  MCLabelAccumulator* labelAcc,
+  uchar* innerAboveThreshold)
+{
+  uchar aboveThreshold = 0;
+
+  GPUCA_UNROLL(U(), U())
+  for (ushort i = 0; i < N; i++) {
+    Delta2 d = cfconsts::InnerNeighbors[i];
+
+    PackedCharge p = buf[N * lid + i];
+
+    Charge q = cluster->updateInner(p, d);
+
+    CPU_ONLY(
+      labelAcc->collect(pos.delta(d), q));
+
+    aboveThreshold |= (uchar(q > calib.tpc.cfInnerThreshold) << i);
+  }
+
+  innerAboveThreshold[lid] = aboveThreshold;
+
+  GPUbarrier();
+}
+
+GPUdii() void GPUTPCNNClusterizer::updateClusterOuter(
+  ushort lid,
+  ushort N,
+  ushort M,
+  ushort offset,
+  const PackedCharge* buf,
+  const ChargePos& pos,
+  ClusterAccumulator* cluster,
+  MCLabelAccumulator* labelAcc)
+{
+  GPUCA_UNROLL(U(), U())
+  for (ushort i = offset; i < M + offset; i++) {
+    PackedCharge p = buf[N * lid + i];
+
+    Delta2 d = cfconsts::OuterNeighbors[i];
+
+    Charge q = cluster->updateOuter(p, d);
+    static_cast<void>(q); // Avoid unused variable warning on GPU.
+ + CPU_ONLY( + labelAcc->collect(pos.delta(d), q)); + } +} + +GPUdii() void GPUTPCNNClusterizer::buildCluster( + const GPUSettingsRec& calib, + const Array2D& chargeMap, + ChargePos pos, + ChargePos* posBcast, + PackedCharge* buf, + uchar* innerAboveThreshold, + ClusterAccumulator* myCluster, + MCLabelAccumulator* labelAcc) +{ + ushort ll = get_local_id(0); + + posBcast[ll] = pos; + GPUbarrier(); + + CfUtils::blockLoad( + chargeMap, + SCRATCH_PAD_WORK_GROUP_SIZE, + SCRATCH_PAD_WORK_GROUP_SIZE, + ll, + 0, + 8, + cfconsts::InnerNeighbors, + posBcast, + buf); + updateClusterInner( + calib, + ll, + 8, + buf, + pos, + myCluster, + labelAcc, + innerAboveThreshold); + + ushort wgSizeHalf = (SCRATCH_PAD_WORK_GROUP_SIZE + 1) / 2; + + bool inGroup1 = ll < wgSizeHalf; + + ushort llhalf = (inGroup1) ? ll : (ll - wgSizeHalf); + + CfUtils::condBlockLoad( + chargeMap, + wgSizeHalf, + SCRATCH_PAD_WORK_GROUP_SIZE, + ll, + 0, + 16, + cfconsts::OuterNeighbors, + posBcast, + innerAboveThreshold, + buf); + + if (inGroup1) { + updateClusterOuter( + llhalf, + 16, + 16, + 0, + buf, + pos, + myCluster, + labelAcc); + } + +#if defined(GPUCA_GPUCODE) + CfUtils::condBlockLoad( + chargeMap, + wgSizeHalf, + SCRATCH_PAD_WORK_GROUP_SIZE, + ll, + 0, + 16, + cfconsts::OuterNeighbors, + posBcast + wgSizeHalf, + innerAboveThreshold + wgSizeHalf, + buf); + if (!inGroup1) { + updateClusterOuter( + llhalf, + 16, + 16, + 0, + buf, + pos, + myCluster, + labelAcc); + } +#endif +} + +GPUd() uint GPUTPCNNClusterizer::sortIntoBuckets(processorType& clusterer, const tpc::ClusterNative& cluster, uint row, uint maxElemsPerBucket, uint* elemsInBucket, tpc::ClusterNative* buckets) +{ + uint index = CAMath::AtomicAdd(&elemsInBucket[row], 1u); + if (index < maxElemsPerBucket) { + buckets[maxElemsPerBucket * row + index] = cluster; + } else { + clusterer.raiseError(GPUErrors::ERROR_CF_ROW_CLUSTER_OVERFLOW, clusterer.mISlice * 1000 + row, index, maxElemsPerBucket); + CAMath::AtomicExch(&elemsInBucket[row], maxElemsPerBucket); + } + return index; +} diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h new file mode 100644 index 0000000000000..f2b92c5f50d40 --- /dev/null +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -0,0 +1,76 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+ +/// \file GPUTPCNNClusterizer.h +/// \author Christian Sonnabend + +#ifndef O2_GPU_CLUSTERIZER_H +#define O2_GPU_CLUSTERIZER_H + +#include "clusterFinderDefs.h" +#include "GPUGeneralKernels.h" +#include "GPUConstantMem.h" +#include "GPUTPCClusterFinder.h" +#include "Array2D.h" +#include "PackedCharge.h" + +namespace o2::tpc +{ +struct ClusterNative; +} // namespace o2::tpc + +namespace GPUCA_NAMESPACE::gpu +{ + +class ClusterAccumulator; +class MCLabelAccumulator; + +class GPUTPCNNClusterizer : public GPUKernelTemplate +{ + public: + static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFClusterizer); + struct GPUSharedMemory { + ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; + PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_BUILD_N]; + uchar innerAboveThreshold[SCRATCH_PAD_WORK_GROUP_SIZE]; + }; + +#ifdef GPUCA_HAVE_O2HEADERS + typedef GPUTPCClusterFinder processorType; + GPUhdi() static processorType* Processor(GPUConstantMem& processors) + { + return processors.tpcClusterer; + } +#endif + + GPUhdi() CONSTEXPR static GPUDataTypes::RecoStep GetRecoStep() + { + return GPUDataTypes::RecoStep::TPCClusterFinding; + } + + template + GPUd() static void Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char); + + static GPUd() void computeClustersImpl(int, int, int, int, processorType&, const CfFragment&, GPUSharedMemory&, const Array2D&, const ChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint, uint, uint*, tpc::ClusterNative*, uint*); + + private: + static GPUd() void updateClusterInner(const GPUSettingsRec&, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uchar*); + + static GPUd() void updateClusterOuter(ushort, ushort, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*); + + static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D&, ChargePos, ChargePos*, PackedCharge*, uchar*, ClusterAccumulator*, MCLabelAccumulator*); + + static GPUd() uint sortIntoBuckets(processorType&, const tpc::ClusterNative&, uint, uint, uint*, tpc::ClusterNative*); +}; + +} // namespace GPUCA_NAMESPACE::gpu + +#endif From 05831efed4629001198fbc3b053c8bb41b2e13f7 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 27 May 2024 10:16:18 +0200 Subject: [PATCH 02/23] First version of clusterizer in GPU code --- Common/ML/CMakeLists.txt | 16 ++ Common/ML/include/ML/onnx_interface.h | 88 +++++++++ Common/ML/src/onnx_interface.cxx | 184 ++++++++++++++++++ GPU/GPUTracking/CMakeLists.txt | 3 + .../Global/GPUChainTrackingClusterizer.cxx | 5 +- GPU/GPUTracking/ML/onnx_interface.cxx | 184 ++++++++++++++++++ GPU/GPUTracking/ML/onnx_interface.h | 88 +++++++++ .../TPCClusterFinder/ClusterAccumulator.h | 17 ++ .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 133 +++++++++++++ .../TPCClusterFinder/GPUTPCNNClusterizer.h | 26 +++ 10 files changed, 743 insertions(+), 1 deletion(-) create mode 100644 Common/ML/CMakeLists.txt create mode 100644 Common/ML/include/ML/onnx_interface.h create mode 100644 Common/ML/src/onnx_interface.cxx create mode 100644 GPU/GPUTracking/ML/onnx_interface.cxx create mode 100644 GPU/GPUTracking/ML/onnx_interface.h diff --git a/Common/ML/CMakeLists.txt b/Common/ML/CMakeLists.txt new file mode 100644 index 0000000000000..60a07041da2e0 --- /dev/null +++ b/Common/ML/CMakeLists.txt @@ -0,0 +1,16 @@ +# Copyright 2019-2020 CERN and copyright holders of ALICE O2. 
+# See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+# All rights not expressly granted are reserved.
+#
+# This software is distributed under the terms of the GNU General Public
+# License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+#
+# In applying this license CERN does not waive the privileges and immunities
+# granted to it by virtue of its status as an Intergovernmental Organization
+# or submit itself to any jurisdiction.
+
+o2_add_library(ML
+               SOURCES src/onnx_interface.cxx
+               TARGETVARNAME targetName
+               PUBLIC_LINK_LIBRARIES O2::Framework ONNXRuntime::ONNXRuntime
+)
\ No newline at end of file
diff --git a/Common/ML/include/ML/onnx_interface.h b/Common/ML/include/ML/onnx_interface.h
new file mode 100644
index 0000000000000..506311c067351
--- /dev/null
+++ b/Common/ML/include/ML/onnx_interface.h
@@ -0,0 +1,88 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+///
+/// \file model.h
+///
+/// \author Christian Sonnabend
+///
+/// \brief A general-purpose class for ONNX models
+///
+
+#ifndef GPU_ML_ONNX_INTERFACE_H
+#define GPU_ML_ONNX_INTERFACE_H
+
+// C++ and system includes
+#include <onnxruntime/core/session/experimental_onnxruntime_cxx_api.h>
+#include <string>
+#include <vector>
+#include <sstream>
+#include <memory>
+#include <thread>
+
+// O2 includes
+#include "Framework/Logger.h"
+
+namespace o2
+{
+
+namespace ml
+{
+
+class OnnxModel
+{
+
+ public:
+  OnnxModel() = default;
+  ~OnnxModel() = default;
+
+  // Inferencing
+  void init(std::string, bool = false, int = 0);
+  // float* inference(std::vector<Ort::Value>, int = 0);
+  // float* inference(std::vector<float>, int = 0);
+  template <class T>
+  float* inference(T input, unsigned int size);
+  template <class T>
+  std::vector<float> inference_vector(T input, unsigned int size);
+
+  // Reset session
+  void resetSession() { mSession.reset(new Ort::Experimental::Session{*mEnv, modelPath, sessionOptions}); }
+
+  // Getters & Setters
+  Ort::SessionOptions* getSessionOptions() { return &sessionOptions; } // For optimizations in post
+  std::shared_ptr<Ort::Experimental::Session> getSession() { return mSession; }
+  std::vector<std::vector<int64_t>> getNumInputNodes() const { return mInputShapes; }
+  std::vector<std::vector<int64_t>> getNumOutputNodes() const { return mOutputShapes; }
+  void setActiveThreads(int);
+
+ private:
+  // Environment variables for the ONNX runtime
+  std::shared_ptr<Ort::Env> mEnv = nullptr;
+  std::shared_ptr<Ort::Experimental::Session> mSession = nullptr;
+  Ort::SessionOptions sessionOptions;
+
+  // Input & Output specifications of the loaded network
+  std::vector<std::string> mInputNames;
+  std::vector<std::vector<int64_t>> mInputShapes;
+  std::vector<std::string> mOutputNames;
+  std::vector<std::vector<int64_t>> mOutputShapes;
+
+  // Environment settings
+  std::string modelPath;
+  int activeThreads = 0;
+
+  // Internal function for printing the shape of tensors
+  std::string printShape(const std::vector<int64_t>&);
+};
+
+} // namespace ml
+
+} // namespace o2
+
+#endif // GPU_ML_ONNX_INTERFACE_H
\ No newline at end of file
diff --git a/Common/ML/src/onnx_interface.cxx b/Common/ML/src/onnx_interface.cxx
new file mode 100644
index 0000000000000..e7c952d6b8cdc
--- /dev/null
+++ b/Common/ML/src/onnx_interface.cxx
@@ -0,0 +1,184 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+///
+/// \file model.cxx
+///
+/// \author Christian Sonnabend
+///
+/// \brief A general-purpose class with functions for ONNX model applications
+///
+
+// ONNX includes
+#include "ML/onnx_interface.h"
+
+namespace o2
+{
+
+namespace ml
+{
+
+std::string OnnxModel::printShape(const std::vector<int64_t>& v)
+{
+  std::stringstream ss("");
+  for (size_t i = 0; i < v.size() - 1; i++)
+    ss << v[i] << "x";
+  ss << v[v.size() - 1];
+  return ss.str();
+}
+
+void OnnxModel::init(std::string localPath, bool enableOptimizations, int threads)
+{
+
+  LOG(info) << "--- ONNX-ML model ---";
+  LOG(info) << "Taking model from: " << localPath;
+  modelPath = localPath;
+  activeThreads = threads;
+
+  /// Enabling optimizations
+  if (threads != 0) {
+    // sessionOptions.SetInterOpNumThreads(1);
+    if (threads == 1) {
+      sessionOptions.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);
+    } else {
+      sessionOptions.SetExecutionMode(ExecutionMode::ORT_PARALLEL);
+      sessionOptions.SetIntraOpNumThreads(threads);
+    }
+  }
+  if (enableOptimizations) {
+    // sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
+    sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
+    // uint32_t coreml_flags = 0;
+    // coreml_flags |= COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE;
+    // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CoreML(sessionOptions, coreml_flags));
+  }
+
+  mEnv = std::make_shared<Ort::Env>(ORT_LOGGING_LEVEL_WARNING, "onnx-model");
+  mSession = std::make_shared<Ort::Experimental::Session>(*mEnv, modelPath, sessionOptions);
+
+  mInputNames = mSession->GetInputNames();
+  mInputShapes = mSession->GetInputShapes();
+  mOutputNames = mSession->GetOutputNames();
+  mOutputShapes = mSession->GetOutputShapes();
+
+  LOG(info) << "Input Nodes:";
+  for (size_t i = 0; i < mInputNames.size(); i++) {
+    LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]);
+  }
+
+  LOG(info) << "Output Nodes:";
+  for (size_t i = 0; i < mOutputNames.size(); i++) {
+    LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]);
+  }
+
+  LOG(info) << "--- Model initialized! ---";
+}
+
+// float* OnnxModel::inference(std::vector<Ort::Value> input, int device_id)
+// {
+
+//   // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sessionOptions, device_id));
+
+//   try {
+//     auto outputTensors = mSession->Run(mInputNames, input, mOutputNames);
+//     float* outputValues = outputTensors[0].GetTensorMutableData<float>();
+//     return outputValues;
+//   } catch (const Ort::Exception& exception) {
+//     LOG(error) << "Error running model inference: " << exception.what();
+//   }
+//   return nullptr;
+// }
+
+// float* OnnxModel::inference(std::vector<float> input, int device_id)
+// {
+//
+//   // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sessionOptions, device_id));
+//
+//   int64_t size = input.size();
+//   assert(size % mInputShapes[0][1] == 0);
+//   std::vector<int64_t> inputShape{size / mInputShapes[0][1], mInputShapes[0][1]};
+//   std::vector<Ort::Value> inputTensors;
+//   inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor<float>(input.data(), size, inputShape));
+//   try {
+//     auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames);
+//     float* outputValues = outputTensors[0].GetTensorMutableData<float>();
+//     return outputValues;
+//   } catch (const Ort::Exception& exception) {
+//     LOG(error) << "Error running model inference: " << exception.what();
+//   }
+//   return nullptr;
+// }
+
+template <class T>
+float* OnnxModel::inference(T input, unsigned int size)
+{
+
+  std::vector<int64_t> inputShape = mInputShapes[0];
+  inputShape[0] = size;
+  std::vector<Ort::Value> inputTensors;
+  size_t mem_size = 1;
+  for (auto elem : inputShape) {
+    mem_size *= elem;
+  }
+  inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor<float>(input.data(), mem_size, inputShape));
+  // LOG(info) << "Input tensors created, memory size: " << mem_size * sizeof(float) / 1e6 << "MB";
+  try {
+    auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames);
+    float* outputValues = outputTensors[0].GetTensorMutableData<float>();
+    return outputValues;
+  } catch (const Ort::Exception& exception) {
+    LOG(error) << "Error running model inference: " << exception.what();
+  }
+  return nullptr;
+}
+
+template <class T>
+std::vector<float> OnnxModel::inference_vector(T input, unsigned int size)
+{
+
+  std::vector<int64_t> inputShape = mInputShapes[0];
+  inputShape[0] = size;
+  std::vector<Ort::Value> inputTensors;
+  // std::vector<float> outputValues;
+  size_t mem_size = 1;
+  for (auto elem : inputShape) {
+    mem_size *= elem;
+  }
+  inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor<float>(input.data(), mem_size, inputShape));
+  // LOG(info) << "Input tensors created, memory size: " << mem_size * sizeof(float) / 1e6 << "MB";
+  try {
+    auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames);
+    float* outputValues = outputTensors[0].GetTensorMutableData<float>();
+    std::vector<float> outputVector{outputValues, outputValues + size * mOutputShapes[0][1]};
+    // for (int s = 0; s < size; s++) {
+    //   for (int o = 0; o < mOutputShapes[0][1]; o++) {
+    //     outputValues.push_back(tmp_output_values[s * (int)mOutputShapes[0][1] + o]);
+    //   }
+    // }
+    return outputVector;
+  } catch (const Ort::Exception& exception) {
+    LOG(error) << "Error running model inference: " << exception.what();
+  }
+  return std::vector<float>{};
+}
+
+void OnnxModel::setActiveThreads(int threads)
+{
+  activeThreads = threads;
+}
+
+template float* OnnxModel::inference(std::vector<float>, unsigned int);
+template std::vector<float> OnnxModel::inference_vector(std::vector<float>, unsigned int);
+
+} // namespace ml
+
+} // namespace o2
\ No newline at end of file
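A minimal usage sketch of the OnnxModel interface above (the model path is a
placeholder, and a 2-D [batch, features] input layer is assumed, which is what
the inference helpers above expect):

    #include "ML/onnx_interface.h"

    void runOnnxExample()
    {
      o2::ml::OnnxModel model;
      model.init("/path/to/net_onnx.onnx", true, 1); // optimizations on, single-threaded
      // One batch entry with as many values as the model's input layer has features
      std::vector<float> input(model.getNumInputNodes()[0][1], 0.f);
      // Returns size * output_features values, or an empty vector on failure
      std::vector<float> output = model.inference_vector(input, 1);
      LOG(info) << "Example inference produced " << output.size() << " values";
    }

diff --git a/GPU/GPUTracking/CMakeLists.txt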
b/GPU/GPUTracking/CMakeLists.txt index 6266d4962b88e..63abf760bf87a 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -62,6 +62,7 @@ set(SRCS Merger/GPUTPCGlobalDebugSortKernels.cxx Merger/GPUTPCGMPhysicalTrackModel.cxx Merger/GPUTPCGMPolynomialFieldManager.cxx + ML/onnx_interface.cxx DataTypes/GPUTRDTrack.cxx TRDTracking/GPUTRDTracker.cxx TRDTracking/GPUTRDTrackletWord.cxx @@ -195,6 +196,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2" OR CONFIG_O2_EXTENSIONS) TPCClusterFinder/GPUTPCCFPeakFinder.cxx TPCClusterFinder/GPUTPCCFNoiseSuppression.cxx TPCClusterFinder/GPUTPCCFClusterizer.cxx + TPCClusterFinder/GPUTPCNNClusterizer.cxx TPCClusterFinder/GPUTPCCFDeconvolution.cxx TPCClusterFinder/GPUTPCCFMCLabelFlattener.cxx TPCClusterFinder/GPUTPCCFDecodeZS.cxx @@ -306,6 +308,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") PUBLIC_LINK_LIBRARIES O2::GPUCommon O2::ReconstructionDataFormats O2::TPCFastTransformation + ONNXRuntime::ONNXRuntime PRIVATE_LINK_LIBRARIES O2::DataFormatsTPC SOURCES ${SRCS_DATATYPE_HEADERS}) target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAVE_O2HEADERS) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 7b2c5539439be..cca00ed3a1d02 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -18,6 +18,7 @@ #include "GPUO2DataTypes.h" #include "GPUMemorySizeScalers.h" #include "GPUTrackingInputProvider.h" +#include "GPUTPCNNClusterizer.h" #include #ifdef GPUCA_O2_LIB @@ -880,7 +881,9 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput, bool applyNNclus runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); } else { // FIXME: Here I need to apply the neural network - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + // runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + GPUCA_NAMESPACE::gpu::GPUTPCNNClusterizer nn_clus; + nn_clus.exec({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); } } if (GetProcessingSettings().debugLevel >= 3) { diff --git a/GPU/GPUTracking/ML/onnx_interface.cxx b/GPU/GPUTracking/ML/onnx_interface.cxx new file mode 100644 index 0000000000000..e7c952d6b8cdc --- /dev/null +++ b/GPU/GPUTracking/ML/onnx_interface.cxx @@ -0,0 +1,184 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+ +/// +/// \file model.cxx +/// +/// \author Christian Sonnabend +/// +/// \brief A general-purpose class with functions for ONNX model applications +/// + +// ONNX includes +#include "ML/onnx_interface.h" + +namespace o2 +{ + +namespace ml +{ + +std::string OnnxModel::printShape(const std::vector& v) +{ + std::stringstream ss(""); + for (size_t i = 0; i < v.size() - 1; i++) + ss << v[i] << "x"; + ss << v[v.size() - 1]; + return ss.str(); +} + +void OnnxModel::init(std::string localPath, bool enableOptimizations, int threads) +{ + + LOG(info) << "--- ONNX-ML model ---"; + LOG(info) << "Taking model from: " << localPath; + modelPath = localPath; + activeThreads = threads; + + /// Enableing optimizations + if(threads != 0){ + // sessionOptions.SetInterOpNumThreads(1); + if(threads == 1){ + sessionOptions.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + } + else{ + sessionOptions.SetExecutionMode(ExecutionMode::ORT_PARALLEL); + sessionOptions.SetIntraOpNumThreads(threads); + } + } + if (enableOptimizations) { + // sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED); + sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); + // uint32_t coreml_flags = 0; + // coreml_flags |= COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE; + // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CoreML(sessionOptions, coreml_flags)); + } + + mEnv = std::make_shared(ORT_LOGGING_LEVEL_WARNING, "onnx-model"); + mSession = std::make_shared(*mEnv, modelPath, sessionOptions); + + mInputNames = mSession->GetInputNames(); + mInputShapes = mSession->GetInputShapes(); + mOutputNames = mSession->GetOutputNames(); + mOutputShapes = mSession->GetOutputShapes(); + + LOG(info) << "Input Nodes:"; + for (size_t i = 0; i < mInputNames.size(); i++) { + LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); + } + + LOG(info) << "Output Nodes:"; + for (size_t i = 0; i < mOutputNames.size(); i++) { + LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); + } + + LOG(info) << "--- Model initialized! 
---"; +} + +// float* OnnxModel::inference(std::vector input, int device_id) +// { + +// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sessionOptions, device_id)); + +// try { +// auto outputTensors = mSession->Run(mInputNames, input, mOutputNames); +// float* outputValues = outputTensors[0].GetTensorMutableData(); +// return outputValues; +// } catch (const Ort::Exception& exception) { +// LOG(error) << "Error running model inference: " << exception.what(); +// } +// return nullptr; +// } + +// float* OnnxModel::inference(std::vector input, int device_id) +// { +// +// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sessionOptions, device_id)); +// +// int64_t size = input.size(); +// assert(size % mInputShapes[0][1] == 0); +// std::vector inputShape{size / mInputShapes[0][1], mInputShapes[0][1]}; +// std::vector inputTensors; +// inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), size, inputShape)); +// try { +// auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); +// float* outputValues = outputTensors[0].GetTensorMutableData(); +// return outputValues; +// } catch (const Ort::Exception& exception) { +// LOG(error) << "Error running model inference: " << exception.what(); +// } +// return nullptr; +// } + +template +float* OnnxModel::inference(T input, unsigned int size) +{ + + std::vector inputShape = mInputShapes[0]; + inputShape[0] = size; + std::vector inputTensors; + size_t mem_size = 1; + for(auto elem : inputShape){ + mem_size*=elem; + } + inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); + // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; + try { + auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return outputValues; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } + return nullptr; +} + +template +std::vector OnnxModel::inference_vector(T input, unsigned int size) +{ + + std::vector inputShape = mInputShapes[0]; + inputShape[0] = size; + std::vector inputTensors; + // std::vector outputValues; + size_t mem_size = 1; + for(auto elem : inputShape){ + mem_size*=elem; + } + inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); + // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; + try { + auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); + float* outputValues = outputTensors[0].GetTensorMutableData(); + std::vector outputVector{outputValues, outputValues + size * mOutputShapes[0][1]}; + // for(int s = 0; s < size; s++){ + // for(int o = 0; o < mOutputShapes[0][1]; o++){ + // outputValues.push_back(tmp_output_values[s*(int)mOutputShapes[0][1] + o]); + // } + // } + return outputVector; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } + return std::vector{}; +} + +void OnnxModel::setActiveThreads(int threads) +{ + activeThreads = threads; +} + +template float* OnnxModel::inference(std::vector, unsigned int); +template std::vector OnnxModel::inference_vector(std::vector, unsigned int); + +} // namespace gpu + +} // namespace GPUCA_NAMESPACE \ No newline at end of file diff --git a/GPU/GPUTracking/ML/onnx_interface.h 
b/GPU/GPUTracking/ML/onnx_interface.h new file mode 100644 index 0000000000000..506311c067351 --- /dev/null +++ b/GPU/GPUTracking/ML/onnx_interface.h @@ -0,0 +1,88 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// +/// \file model.h +/// +/// \author Christian Sonnabend +/// +/// \brief A general-purpose class for ONNX models +/// + +#ifndef GPU_ML_ONNX_INTERFACE_H +#define GPU_ML_ONNX_INTERFACE_H + +// C++ and system includes +#include +#include +#include +#include +#include +#include + +// O2 includes +#include "Framework/Logger.h" + +namespace o2 +{ + +namespace ml +{ + +class OnnxModel +{ + + public: + OnnxModel() = default; + ~OnnxModel() = default; + + // Inferencing + void init(std::string, bool = false, int = 0); + // float* inference(std::vector, int = 0); + // float* inference(std::vector, int = 0); + template float* inference(T input, unsigned int size); + template std::vector inference_vector(T input, unsigned int size); + + // Reset session + void resetSession() { mSession.reset(new Ort::Experimental::Session{*mEnv, modelPath, sessionOptions}); } + + // Getters & Setters + Ort::SessionOptions* getSessionOptions() { return &sessionOptions; } // For optimizations in post + std::shared_ptr getSession() { return mSession; } + std::vector> getNumInputNodes() const { return mInputShapes; } + std::vector> getNumOutputNodes() const { return mOutputShapes; } + void setActiveThreads(int); + + private: + // Environment variables for the ONNX runtime + std::shared_ptr mEnv = nullptr; + std::shared_ptr mSession = nullptr; + Ort::SessionOptions sessionOptions; + + // Input & Output specifications of the loaded network + std::vector mInputNames; + std::vector> mInputShapes; + std::vector mOutputNames; + std::vector> mOutputShapes; + + // Environment settings + std::string modelPath; + int activeThreads = 0; + + // Internal function for printing the shape of tensors + std::string printShape(const std::vector&); +}; + +} // namespace gpu + +} // namespace GPUCA_NAMESPACE + +#endif // GPU_ML_ONNX_INTERFACE_H \ No newline at end of file diff --git a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h index 3958f6d3aa137..344a0fae3995f 100644 --- a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h +++ b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h @@ -43,6 +43,23 @@ class ClusterAccumulator GPUd() void finalize(const ChargePos&, tpccf::Charge, tpccf::TPCTime, const GPUTPCGeometry&); GPUd() bool toNative(const ChargePos&, tpccf::Charge, tpc::ClusterNative&, const GPUParam&) const; + GPUd() void setFull(float qtot, float padMean, float padSigma, float timeMean, float timeSigma, uchar splitInTime, uchar splitInPad){ + mQtot = qtot; + mPadMean = padMean; + mPadSigma = padSigma; + mTimeMean = timeMean; + mTimeSigma = timeSigma; + mSplitInTime = splitInTime; + mSplitInPad = splitInPad; + } + GPUd() void setQtot(float qtot) { mQtot = qtot; } + GPUd() void setPadMean(float padMean) { mPadMean = padMean; } + GPUd() void 
setPadSigma(float padSigma) { mPadSigma = padSigma; }
+  GPUd() void setTimeMean(float timeMean) { mTimeMean = timeMean; }
+  GPUd() void setTimeSigma(float timeSigma) { mTimeSigma = timeSigma; }
+  GPUd() void setSplitInTime(uchar splitInTime) { mSplitInTime = splitInTime; }
+  GPUd() void setSplitInPad(uchar splitInPad) { mSplitInPad = splitInPad; }
+
  private:
   float mQtot = 0;
   float mPadMean = 0;

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
index 3097d3adecb3d..6c64c54ca5193 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
@@ -37,6 +37,139 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlo
   GPUTPCNNClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
 }
 
+void GPUTPCNNClusterizer::exec(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char onlyMC)
+{
+  Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
+  CPU_ONLY(
+    MCLabelAccumulator labelAcc(clusterer));
+
+  tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
+
+  OnnxModel model_class, model_reg;
+  std::string path_class = "", path_reg = "";
+
+  model_class.init(path_class, 1, 0);
+  model_reg.init(path_reg, 1, 0);
+
+  GPUTPCNNClusterizer::nn_clusterizer(model_class, model_reg, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, 1, 0.16, 1);
+}
+
+int GPUTPCNNClusterizer::padOffset(int row_ref, int row_current)
+{
+  return (int)((pad_row_max[row_ref] - pad_row_max[row_current]) / 2);
+}
+
+// ---------------------------------
+bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift)
+{
+  if (row < 0 || pad < 0) {
+    return true;
+  } else if (row <= 62) {
+    if (pad < (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] - pad_row_max[row]) / 2 || pad > (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] + pad_row_max[row]) / 2) {
+      return true;
+    } else {
+      return false;
+    }
+  } else if (row <= 62 + global_shift) {
+    return true;
+  } else if (row <= o2::tpc::constants::MAXGLOBALPADROW-1 + global_shift) {
+    if (pad < (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] - pad_row_max[row - global_shift]) / 2 || pad > (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] + pad_row_max[row - global_shift]) / 2) {
+      return true;
+    } else {
+      return false;
+    }
+  } else if (row > o2::tpc::constants::MAXGLOBALPADROW-1 + global_shift) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+void GPUTPCNNClusterizer::nn_clusterizer(OnnxModel model_class, OnnxModel model_reg,
+                                         processorType& clusterer,
+                                         const CfFragment& fragment,
+                                         GPUSharedMemory& smem,
+                                         const Array2D<PackedCharge>& chargeMap,
+                                         const ChargePos* filteredPeakPositions,
+                                         const GPUSettingsRec& calib,
+                                         MCLabelAccumulator* labelAcc,
+                                         uint clusternum,
+                                         uint maxClusterPerRow,
+                                         uint* clusterInRow,
+                                         tpc::ClusterNative* clusterByRow,
+                                         uint* clusterPosInRow,
+                                         int in_row, int in_pad, int in_time, bool add_index_data, float class_threshold, bool sigmoid_transform)
+{
+  std::vector<float> input_data(clusterer.mPmemory->counters.nClusters * (2 * in_row + 1) * (2 * in_pad + 1) * (2 * in_time + 1));
+  float classification_threshold = class_threshold;
+  if (sigmoid_transform) {
+    // Comparing the raw network output against logit(p) = log(p / (1 - p)) is
+    // equivalent to comparing sigmoid(output) against p, so the sigmoid layer
+    // can be skipped during inference.
+    classification_threshold = (float)std::log(class_threshold / (1.f - class_threshold));
+  }
+
+  for (uint cls = 0; cls < clusterer.mPmemory->counters.nClusters; cls++) {
+    ChargePos peak = clusterer.mPfilteredPeakPositions[cls];
+    int row = peak.row(), pad = peak.pad(), time = peak.time();
+    float central_charge = chargeMap[peak].unpack();
+    unsigned int glo_idx = cls * (2 * in_row + 1) * (2 * in_pad + 1) * (2 * in_time + 1);
+    for (int r = -in_row; r <= in_row; r++) {
+      for (int p = -in_pad; p <= in_pad; p++) {
+        for (int t = -in_time; t <= in_time; t++) {
+          int offset = padOffset(row, row + r);
+          if (isBoundary(row + r, pad + p + offset, in_row)) {
+            continue;
+          } else {
+            // Index the window relative to its own corner, not with absolute coordinates
+            unsigned int idx = glo_idx + (r + in_row) * (2 * in_pad + 1) * (2 * in_time + 1) + (p + in_pad) * (2 * in_time + 1) + (t + in_time);
+            ChargePos tmp_pos(row + r, pad + p + offset, time + t);
+            input_data[idx] = (chargeMap[tmp_pos].unpack() / central_charge);
+          }
+        }
+      }
+    }
+  }
+  std::vector<float> out_class = model_class.inference_vector(input_data, clusterer.mPmemory->counters.nClusters);
+  std::vector<float> out_reg = model_reg.inference_vector(input_data, clusterer.mPmemory->counters.nClusters);
+  int num_outputs = model_reg.getNumOutputNodes()[0][1];
+
+  for (uint cls = 0; cls < clusterer.mPmemory->counters.nClusters; cls++) {
+    if (out_class[cls] > classification_threshold) {
+      int idx = cls * num_outputs;
+      ChargePos peak = clusterer.mPfilteredPeakPositions[cls];
+      ClusterAccumulator pc;
+      // Regression outputs: pad offset, time offset, pad sigma, time sigma, qTot scaling
+      pc.setFull(chargeMap[peak].unpack() * out_reg[idx + 4], peak.pad() + out_reg[idx], out_reg[idx + 2], peak.time() + out_reg[idx + 1], out_reg[idx + 3], 0, 0);
+      tpc::ClusterNative myCluster;
+      bool rejectCluster = !pc.toNative(peak, chargeMap[peak].unpack(), myCluster, clusterer.Param());
+      if (rejectCluster) {
+        if (clusterPosInRow) {
+          clusterPosInRow[cls] = maxClusterPerRow;
+        }
+        continue;
+      }
+
+      uint rowIndex = 0;
+      if (clusterByRow != nullptr) {
+        rowIndex = sortIntoBuckets(
+          clusterer,
+          myCluster,
+          peak.row(),
+          maxClusterPerRow,
+          clusterInRow,
+          clusterByRow);
+        if (clusterPosInRow != nullptr) {
+          clusterPosInRow[cls] = rowIndex;
+        }
+      } else if (clusterPosInRow) {
+        rowIndex = clusterPosInRow[cls];
+      }
+
+      CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, maxClusterPerRow));
+    }
+  }
+}
+
+
 GPUdii() void GPUTPCNNClusterizer::computeClustersImpl(int nBlocks, int nThreads, int iBlock, int iThread,
                                                        processorType& clusterer,
                                                        const CfFragment& fragment,

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
index f2b92c5f50d40..56ffcbc842223 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
@@ -21,6 +21,9 @@
 #include "GPUTPCClusterFinder.h"
 #include "Array2D.h"
 #include "PackedCharge.h"
+#include "ML/onnx_interface.h"
+
+using namespace o2::ml;
 
 namespace o2::tpc
 {
@@ -61,7 +64,30 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate
 
   static GPUd() void computeClustersImpl(int, int, int, int, processorType&, const CfFragment&, GPUSharedMemory&, const Array2D<PackedCharge>&, const ChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint, uint, uint*, tpc::ClusterNative*, uint*);
 
+  void exec(int, int, int, int, GPUSharedMemory&, processorType&, char);
+  int padOffset(int, int);
+  bool isBoundary(int, int, int);
+  void nn_clusterizer(OnnxModel, OnnxModel,
+                      processorType&,
+                      const CfFragment&,
+                      GPUSharedMemory&,
+                      const Array2D<PackedCharge>&,
+                      const ChargePos*,
+                      const GPUSettingsRec&,
+                      MCLabelAccumulator*,
+                      uint,
+                      uint,
+                      uint*,
+                      tpc::ClusterNative*,
+                      uint*,
+                      int = 3, int = 3, int = 3, bool = true, float = 0.16f, bool = true);
+
  private:
+  // ---------------------------------
+  // Maximum pad number per pad row (TPC pad-row geometry), used to center the
+  // input window across rows of different width.
+  std::vector<int> pad_row_max{
+    65, 65, 65, 67, 67, 67, 69, 69, 69, 71, 71, 71, 73, 73, 73, 73, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 81, 81, 81, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 91, 91, 91, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 99, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 79, 81, 81, 81, 83, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 101, 101, 101, 103, 103, 103, 105, 109, 109, 111, 111, 111, 113, 113, 113, 115, 115, 115, 117, 117, 117, 117, 117, 119, 119, 121, 121, 123, 123, 123, 125, 125, 127, 127, 127, 129, 129, 131, 131, 131, 133, 133, 135, 135, 137, 137};
+
   static GPUd() void updateClusterInner(const GPUSettingsRec&, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uchar*);
 
   static GPUd() void updateClusterOuter(ushort, ushort, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*);

From 3f6c934987d68cce26ca1c63c07dc2038be3850b Mon Sep 17 00:00:00 2001
From: Christian Sonnabend
Date: Wed, 29 May 2024 11:38:33 +0200
Subject: [PATCH 03/23] Adding a compiling and running version with
 single-threaded ONNX model executions. Clusters are not getting published yet
 (FIXME)

---
 GPU/GPUTracking/CMakeLists.txt                |   2 +-
 .../Definitions/GPUDefGPUParameters.h         |   6 +
 .../Global/GPUChainTrackingClusterizer.cxx    |  16 +-
 .../TPCClusterFinder/GPUTPCClusterFinder.h    |   5 +
 .../TPCClusterFinder/GPUTPCNNClusterizer.cxx  | 161 +++++++++++-------
 .../TPCClusterFinder/GPUTPCNNClusterizer.h    |  22 +--
 GPU/GPUTracking/kernels.cmake                 |   1 +
 7 files changed, 133 insertions(+), 80 deletions(-)

diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt
index 63abf760bf87a..8b3a37894810c 100644
--- a/GPU/GPUTracking/CMakeLists.txt
+++ b/GPU/GPUTracking/CMakeLists.txt
@@ -195,8 +195,8 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2" OR CONFIG_O2_EXTENSIONS)
         TPCClusterFinder/GPUTPCCFChargeMapFiller.cxx
         TPCClusterFinder/GPUTPCCFPeakFinder.cxx
         TPCClusterFinder/GPUTPCCFNoiseSuppression.cxx
-        TPCClusterFinder/GPUTPCCFClusterizer.cxx
         TPCClusterFinder/GPUTPCNNClusterizer.cxx
+        TPCClusterFinder/GPUTPCCFClusterizer.cxx
         TPCClusterFinder/GPUTPCCFDeconvolution.cxx
         TPCClusterFinder/GPUTPCCFMCLabelFlattener.cxx
         TPCClusterFinder/GPUTPCCFDecodeZS.cxx

diff --git a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
index 4bb8303ee9a96..d8eba2a9ad384 100644
--- a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
+++ b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
@@ -79,6 +79,7 @@
   #define GPUCA_LB_GPUTPCCFNoiseSuppression 512
   #define GPUCA_LB_GPUTPCCFDeconvolution 512
   #define GPUCA_LB_GPUTPCCFClusterizer 448
+  #define GPUCA_LB_GPUTPCNNClusterizer 448
   #define GPUCA_LB_COMPRESSION_GATHER 1024
   #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5
   #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
@@ -143,6 +144,7 @@
   #define GPUCA_LB_GPUTPCCFNoiseSuppression 512
   #define GPUCA_LB_GPUTPCCFDeconvolution 512
   #define GPUCA_LB_GPUTPCCFClusterizer 512
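+  // Launch bounds for the new NN clusterizer kernel, mirroring the
+  // GPUTPCCFClusterizer value of this parameter set. Note that
+  // GPUTPCNNClusterizer.h still derives its SCRATCH_PAD_WORK_GROUP_SIZE from
+  // GPUCA_LB_GPUTPCCFClusterizer, so the two defines have to stay in sync
+  // until the NN kernel gets its own tuned value.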
+ #define GPUCA_LB_GPUTPCNNClusterizer 512 #define GPUCA_LB_COMPRESSION_GATHER 1024 #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5 #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20 @@ -207,6 +209,7 @@ #define GPUCA_LB_GPUTPCCFNoiseSuppression 448 #define GPUCA_LB_GPUTPCCFDeconvolution 384 #define GPUCA_LB_GPUTPCCFClusterizer 448 + #define GPUCA_LB_GPUTPCNNClusterizer 448 #define GPUCA_LB_COMPRESSION_GATHER 1024 #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4 #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20 @@ -475,6 +478,9 @@ #ifndef GPUCA_LB_GPUTPCCFClusterizer #define GPUCA_LB_GPUTPCCFClusterizer 512 #endif + #ifndef GPUCA_LB_GPUTPCNNClusterizer + #define GPUCA_LB_GPUTPCNNClusterizer 512 + #endif #ifndef GPUCA_LB_GPUTrackingRefitKernel_mode0asGPU #define GPUCA_LB_GPUTrackingRefitKernel_mode0asGPU 256 #endif diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 8a6a899f35a1a..26878e6111bd5 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -18,7 +18,6 @@ #include "GPUO2DataTypes.h" #include "GPUMemorySizeScalers.h" #include "GPUTrackingInputProvider.h" -#include "GPUTPCNNClusterizer.h" #include #ifdef GPUCA_O2_LIB @@ -875,7 +874,15 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput, bool applyNNclus runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSlice}}, 0); + if(doGPU){ + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSlice}}, 0); + } else { + std::string path_class = "/lustre/alice/users/csonnab/PhD/jobs/clusterization/NN/output/normalized_qCenter/o2sim_150324_50Ev_10000QED_PbPb_13t7p/classification/3D_FCNN_1cls_03_04_2024_10M_FP16_addIndex/network/net_onnx.onnx", path_reg = "/lustre/alice/users/csonnab/PhD/jobs/clusterization/NN/output/normalized_qCenter/o2sim_150324_50Ev_10000QED_PbPb_13t7p/regression/3D_FCNN_1cls_05_04_2024_10M_FP16_addIndex/network/net_onnx.onnx"; + clusterer.model_class.init(path_class, 1, 1); + clusterer.model_reg.init(path_reg, 1, 1); + + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + } if (doGPU && propagateMCLabels) { TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane); if (doGPU) { @@ -886,8 +893,9 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput, bool applyNNclus } else { // FIXME: Here I need to apply the neural network // runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); - GPUCA_NAMESPACE::gpu::GPUTPCNNClusterizer nn_clus; - nn_clus.exec({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + // GPUTPCNNClusterizer nn_clus; + // nn_clus.exec({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); } } if (GetProcessingSettings().debugLevel >= 3) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index ca89053797a47..ae40ff780b25a 
100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -19,6 +19,9 @@ #include "GPUProcessor.h" #include "GPUDataTypes.h" #include "CfFragment.h" +#include "ML/onnx_interface.h" + +using namespace o2::ml; namespace o2 { @@ -141,6 +144,8 @@ class GPUTPCClusterFinder : public GPUProcessor short mZSOffsetId = -1; short mOutputId = -1; + OnnxModel model_class, model_reg; + #ifndef GPUCA_GPUCODE void DumpDigits(std::ostream& out); void DumpChargeMap(std::ostream& out, std::string_view); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 6c64c54ca5193..7c19802825eb6 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -34,10 +34,14 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlo tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; - GPUTPCNNClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); + GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, true, 0.16, true); + + // tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; +// + // GPUTPCNNClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); } -void GPUTPCNNClusterizer::exec(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char onlyMC) +GPUd() void GPUTPCNNClusterizer::exec(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char onlyMC) { Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY( @@ -45,27 +49,37 @@ void GPUTPCNNClusterizer::exec(int nBlocks, int nThreads, int iBlock, int iThrea tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; - OnnxModel model_class, model_reg; std::string path_class = "", path_reg = ""; - model_class.init(path_class, 1, 0); - model_reg.init(path_reg, 1, 0); + clusterer.model_class.init(path_class, 1, 0); + clusterer.model_reg.init(path_reg, 1, 0); - GPUTPCNNClusterizer::nn_clusterizer(model_class, model_reg, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, 1, 0.16, 1); + GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, true, 0.16, true); } int GPUTPCNNClusterizer::padOffset(int row_ref, int row_current) { + std::vector pad_row_max{ + 65, 65, 65, 67, 67, 67, 69, 69, 69, 71, 71, 71, 73, 73, 73, 73, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 81, 81, 81, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 91, 91, 91, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 99, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 79, 81, 81, 81, 83, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 101, 101, 101, 103, 103, 103, 105, 109, 109, 111, 111, 111, 113, 113, 113, 115, 115, 115, 117, 117, 117, 117, 117, 119, 119, 121, 121, 123, 123, 123, 125, 125, 127, 127, 127, 129, 129, 131, 131, 131, 133, 133, 135, 135, 137, 137 + }; return (int)((pad_row_max[row_ref] - pad_row_max[row_current]) / 2); } // --------------------------------- bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift) { + std::vector pad_row_max{ + 65, 65, 65, 67, 67, 67, 69, 69, 69, 71, 71, 71, 73, 73, 73, 73, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 81, 81, 81, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 91, 91, 91, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 99, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 79, 81, 81, 81, 83, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 101, 101, 101, 103, 103, 103, 105, 109, 109, 111, 111, 111, 113, 113, 113, 115, 115, 115, 117, 117, 117, 117, 117, 119, 119, 121, 121, 123, 123, 123, 125, 125, 127, 127, 127, 129, 129, 131, 131, 131, 133, 133, 135, 135, 137, 137 + }; if (row < 0 || pad < 0) { return true; } else if (row <= 62) { - if (pad < (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] - pad_row_max[row]) / 2 || pad > (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] + pad_row_max[row]) / 2) { + // if (pad < (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] - pad_row_max[row]) / 2 || pad > (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] + pad_row_max[row]) / 2) { + // return true; + // } else { + // return false; + // } + if (pad < 0 || pad > pad_row_max[row]) { return true; } else { return false; @@ -73,7 +87,12 @@ bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift) } else if (row <= 62 + global_shift) { return true; } else if (row <= o2::tpc::constants::MAXGLOBALPADROW-1 + global_shift) { - if (pad < (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] - pad_row_max[row - global_shift]) / 2 || pad > 
(pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] + pad_row_max[row - global_shift]) / 2) { + //if (pad < (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] - pad_row_max[row - global_shift]) / 2 || pad > (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] + pad_row_max[row - global_shift]) / 2) { + // return true; + //} else { + // return false; + //} + if (pad < 0 || pad > pad_row_max[row]) { return true; } else { return false; @@ -85,7 +104,7 @@ bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift) } } -void GPUTPCNNClusterizer::nn_clusterizer(OnnxModel model_class, OnnxModel model_reg, +GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int iBlock, int iThread, processorType& clusterer, const CfFragment& fragment, GPUSharedMemory& smem, @@ -97,73 +116,93 @@ void GPUTPCNNClusterizer::nn_clusterizer(OnnxModel model_class, OnnxModel model_ uint maxClusterPerRow, uint* clusterInRow, tpc::ClusterNative* clusterByRow, - uint* clusterPosInRow + uint* clusterPosInRow, int in_row, int in_pad, int in_time, bool add_index_data, float class_threshold, bool sigmoid_transform){ - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - std::vector input_data(clusterer.mPmemory->counters.nClusters * (2*in_row + 1) * (2*in_pad + 1) * (2*in_time + 1)); + std::vector input_data(((2*in_row + 1) * (2*in_pad + 1) * (2*in_time + 1) + (add_index_data ? 3 : 0)), -1.f); float classification_threshold = class_threshold; if(sigmoid_transform){ classification_threshold = (float)std::log(class_threshold/(1.f-class_threshold)); } - for(float cls = 0; cls < clusterer.mPmemory->counters.nClusters; cls++){ - ChargePos peak = clusterer.mPfilteredPeakPositions[cls]; - int row = peak.row(), pad = peak.pad(), time = peak.time(); - float central_charge = chargeMap[peak].unpack(); - unsigned int glo_idx = cls * ((2*in_row + 1) + (2*in_pad + 1) * (2*in_time + 1)); - for(int r = -in_row; r <= in_row; r++){ - for(int p = -in_pad; p <= in_pad; p++){ - for(int t = -in_time; t <= in_time; t++){ - int offset = padOffset(row, row + r); - if(isBoundary(row + r, pad + p + offset)){ - continue; - } else { - unsigned int idx = glo_idx + (row + r) * (2*in_pad + 1) * (2*in_time + 1) + (pad + p) * (2*in_time + 1) + (time + t); - ChargePos tmp_pos(row + r, pad + p + offset, time + t); - input_data[idx] = (chargeMap[tmp_pos].unpack() / central_charge); - } + uint idx = get_global_id(0); + uint cls = CAMath::Min(idx, clusternum - 1); + + // For certain configurations dummy work items are added, so the total + // number of work items is dividable by 64. + // These dummy items also compute the last cluster but discard the result. 
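// A minimal sketch of the two conventions used just above, in isolation. The
// names logitThreshold, kLaneMultiple and paddedItems are illustrative, not
// identifiers from this patch; CAMath::Min plays the role of std::min on the
// device.
#include <cmath>

// The sigmoid_transform branch moves the acceptance cut into the network's raw
// output space: sigmoid(x) > p is equivalent to x > log(p / (1 - p)), so no
// sigmoid has to be evaluated per cluster. E.g. p = 0.16 gives
// log(0.16 / 0.84), roughly -1.66.
inline float logitThreshold(float p)
{
  return std::log(p / (1.f - p));
}

// The index clamp below pads the launch size up to a multiple of the
// work-group size (64) and lets the surplus lanes recompute the last cluster,
// whose result is then discarded, instead of branching out early.
constexpr unsigned int kLaneMultiple = 64;
inline unsigned int paddedItems(unsigned int nClusters)
{
  return ((nClusters + kLaneMultiple - 1) / kLaneMultiple) * kLaneMultiple; // e.g. 1000 -> 1024
}
// In-kernel equivalent: unsigned int cls = std::min(idx, nClusters - 1);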
+ + ChargePos peak = clusterer.mPfilteredPeakPositions[cls]; + int row = peak.row(), pad = peak.pad(), time = peak.time(); + float central_charge = chargeMap[peak].unpack(); + CPU_ONLY(labelAcc->collect(peak, central_charge)); + // unsigned int glo_idx = cls * ((2*in_row + 1) + (2*in_pad + 1) * (2*in_time + 1)); + unsigned int write_idx = 0; + for(int r = -in_row; r <= in_row; r++){ + for(int p = -in_pad; p <= in_pad; p++){ + for(int t = -in_time; t <= in_time; t++){ + int offset = GPUTPCNNClusterizer::padOffset(row, row + r); + if(GPUTPCNNClusterizer::isBoundary(row + r, pad + p + offset, in_row)){ + continue; + } else { + // unsigned int loc_idx = (row + r) * (2*in_pad + 1) * (2*in_time + 1) + (pad + p) * (2*in_time + 1) + (time + t); + ChargePos tmp_pos(row + r, pad + p + offset, time + t); + input_data[write_idx] = (chargeMap[tmp_pos].unpack() / central_charge); + write_idx++; } } + if(idx == 100){ + LOG(info) << "[" << input_data[write_idx-7] << ", " << input_data[write_idx-6] << ", " << input_data[write_idx-5] << ", " << input_data[write_idx-4] << ", " << input_data[write_idx-3] << ", " << input_data[write_idx-2] << ", " << input_data[write_idx-1] << "]"; + } + } + } + if(add_index_data){ + input_data[input_data.size()-3] = 1; + input_data[input_data.size()-2] = (float)peak.row() / 152.f; + input_data[input_data.size()-1] = (float)peak.pad() / 138.f; + if(idx == 100){ + LOG(info) << "[" << input_data[input_data.size()-3] << ", " << input_data[input_data.size()-2] << ", " << input_data[input_data.size()-1] << "]"; } } - std::vector out_class = model_class.inference_vector(input_data, clusterer.mPmemory->counters.nClusters); - std::vector out_reg = model_reg.inference_vector(input_data, clusterer.mPmemory->counters.nClusters); - int num_outputs = model_reg.getNumOutputNodes()[0][1]; - - for(int cls = 0; cls < clusterer.mPmemory->counters.nClusters; cls++){ - if(out_class > classification_threshold){ - int idx = cls * num_outputs; - ChargePos peak = clusterer.mPfilteredPeakPositions[cls]; - ClusterAccumulator pc; - pc.setFull(chargeMap[peak].unpack() * out_reg[idx + 4], peak.pad() + out_reg[idx], out_reg[idx + 2], peak.time() + out_reg[idx + 1], out_reg[idx + 3], 0, 0); - tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(pos, chargeMap[peak].unpack(), myCluster, clusterer.Param()); - if (rejectCluster) { - if (clusterPosInRow) { - clusterPosInRow[idx] = maxClusterPerRow; - } - return; - } - uint rowIndex = 0; - if (clusterByRow != nullptr) { - rowIndex = sortIntoBuckets( - clusterer, - myCluster, - pos.row(), - maxClusterPerRow, - clusterInRow, - clusterByRow); - if (clusterPosInRow != nullptr) { - clusterPosInRow[idx] = rowIndex; - } - } else if (clusterPosInRow) { - rowIndex = clusterPosInRow[idx]; + std::vector out_class = clusterer.model_class.inference_vector(input_data, 1); + std::vector out_reg = clusterer.model_reg.inference_vector(input_data, 1); + int num_outputs = clusterer.model_reg.getNumOutputNodes()[0][1]; + + if(idx == 100){ + LOG(info) << "Classification model: " << out_class[0]; + LOG(info) << "Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4]; + } + + if(out_class[0] > classification_threshold){ + ClusterAccumulator pc; + pc.setFull(chargeMap[peak].unpack() * out_reg[4], peak.pad() + out_reg[0], out_reg[2], fragment.start + peak.time() + out_reg[1], out_reg[3], 0, 0); + tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(peak, chargeMap[peak].unpack(), myCluster, 
clusterer.Param()); + if (rejectCluster) { + if (clusterPosInRow) { + clusterPosInRow[idx] = maxClusterPerRow; } + return; + } - CPU_ONLY(labelAcc->commit(pos.row(), rowIndex, maxClusterPerRow)); + uint rowIndex = 0; + if (clusterByRow != nullptr) { + rowIndex = sortIntoBuckets( + clusterer, + myCluster, + peak.row(), + maxClusterPerRow, + clusterInRow, + clusterByRow); + if (clusterPosInRow != nullptr) { + clusterPosInRow[idx] = rowIndex; + } + } else if (clusterPosInRow) { + rowIndex = clusterPosInRow[idx]; } + + CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, maxClusterPerRow)); } } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 56ffcbc842223..905e6f860a90f 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -12,8 +12,8 @@ /// \file GPUTPCNNClusterizer.h /// \author Christian Sonnabend -#ifndef O2_GPU_CLUSTERIZER_H -#define O2_GPU_CLUSTERIZER_H +#ifndef O2_GPU_NN_CLUSTERIZER_H +#define O2_GPU_NN_CLUSTERIZER_H #include "clusterFinderDefs.h" #include "GPUGeneralKernels.h" @@ -21,9 +21,6 @@ #include "GPUTPCClusterFinder.h" #include "Array2D.h" #include "PackedCharge.h" -#include "ML/onnx_interface.h" - -using namespace o2::ml; namespace o2::tpc { @@ -39,7 +36,7 @@ class MCLabelAccumulator; class GPUTPCNNClusterizer : public GPUKernelTemplate { public: - static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFClusterizer); + static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizer); struct GPUSharedMemory { ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_BUILD_N]; @@ -64,10 +61,10 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate static GPUd() void computeClustersImpl(int, int, int, int, processorType&, const CfFragment&, GPUSharedMemory&, const Array2D&, const ChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint, uint, uint*, tpc::ClusterNative*, uint*); - void exec(int, int, int, int, GPUSharedMemory&, processorType&, char); - int padOffset(int); - bool isBoundary(int, int, int); - static void nn_clusterizer(OnnxModel, OnnxModel, + static GPUd() void exec(int, int, int, int, GPUSharedMemory&, processorType&, char); + static int padOffset(int, int); + static bool isBoundary(int, int, int); + static GPUd() void nn_clusterizer(int, int, int, int, processorType&, const CfFragment&, GPUSharedMemory&, @@ -80,13 +77,10 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate uint*, tpc::ClusterNative*, uint*, - int = 3, int = 3, int = 3, bool = true); + int = 3, int = 3, int = 3, bool = 1, float = 0.16, bool = true); private: // --------------------------------- - std::vector pad_row_max{ - 65, 65, 65, 67, 67, 67, 69, 69, 69, 71, 71, 71, 73, 73, 73, 73, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 81, 81, 81, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 91, 91, 91, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 99, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 79, 81, 81, 81, 83, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 101, 101, 101, 103, 103, 103, 105, 109, 109, 111, 111, 111, 113, 113, 113, 115, 115, 115, 117, 117, 117, 117, 117, 119, 119, 121, 121, 123, 123, 123, 125, 125, 127, 127, 127, 129, 129, 131, 131, 131, 133, 133, 135, 135, 137, 137 - }; static GPUd() void 
updateClusterInner(const GPUSettingsRec&, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uchar*); diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index d4f5ca93e9def..b0270511c2249 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -117,6 +117,7 @@ o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression" "= TPCCLUS o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks" "= TPCCLUSTERFINDER" LB single) o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB single) o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUSTERFINDER" LB single char onlyMC) +o2_gpu_add_kernel("GPUTPCNNClusterizer" "= TPCCLUSTERFINDER" LB single char onlyMC) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets" "= TPCCLUSTERFINDER" NO single) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, flatten" "= TPCCLUSTERFINDER" NO single GPUTPCLinearLabels* out) o2_gpu_add_kernel("GPUTPCCFStreamCompaction, scanStart" "= TPCCLUSTERFINDER" LB single int iBuf int stage) From 8ba6805ebd889bebf4b11972170570bdd99892cf Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 29 May 2024 21:11:04 +0200 Subject: [PATCH 04/23] Clusters now working by a hack --- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 7c19802825eb6..afee680bc0ceb 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -32,7 +32,7 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlo CPU_ONLY( MCLabelAccumulator labelAcc(clusterer)); - tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow; // (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, true, 0.16, true); @@ -142,7 +142,7 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i for(int p = -in_pad; p <= in_pad; p++){ for(int t = -in_time; t <= in_time; t++){ int offset = GPUTPCNNClusterizer::padOffset(row, row + r); - if(GPUTPCNNClusterizer::isBoundary(row + r, pad + p + offset, in_row)){ + if(GPUTPCNNClusterizer::isBoundary(row + r, pad + p, in_row)){ continue; } else { // unsigned int loc_idx = (row + r) * (2*in_pad + 1) * (2*in_time + 1) + (pad + p) * (2*in_time + 1) + (time + t); @@ -151,18 +151,18 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i write_idx++; } } - if(idx == 100){ - LOG(info) << "[" << input_data[write_idx-7] << ", " << input_data[write_idx-6] << ", " << input_data[write_idx-5] << ", " << input_data[write_idx-4] << ", " << input_data[write_idx-3] << ", " << input_data[write_idx-2] << ", " << input_data[write_idx-1] << "]"; - } + // if(idx == 100){ + // LOG(info) << "[" << input_data[write_idx-7] << ", " << input_data[write_idx-6] << ", " << input_data[write_idx-5] << ", " << input_data[write_idx-4] << ", " << input_data[write_idx-3] << ", " << input_data[write_idx-2] << ", " << input_data[write_idx-1] << "]"; + // } } } if(add_index_data){ input_data[input_data.size()-3] = 1; input_data[input_data.size()-2] = (float)peak.row() / 152.f; input_data[input_data.size()-1] = (float)peak.pad() / 138.f; - if(idx == 100){ - LOG(info) << "[" << input_data[input_data.size()-3] << ", " << input_data[input_data.size()-2] << ", " << input_data[input_data.size()-1] << "]"; - } + // if(idx == 100){ + // LOG(info) << "[" << input_data[input_data.size()-3] << ", " << input_data[input_data.size()-2] << ", " << input_data[input_data.size()-1] << "]"; + // } } std::vector out_class = clusterer.model_class.inference_vector(input_data, 1); @@ -170,16 +170,17 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i int num_outputs = clusterer.model_reg.getNumOutputNodes()[0][1]; if(idx == 100){ - LOG(info) << "Classification model: " << out_class[0]; + LOG(info) << "Classification model: " << out_class[0] << " (>? 
" << classification_threshold << ")"; LOG(info) << "Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4]; } if(out_class[0] > classification_threshold){ ClusterAccumulator pc; - pc.setFull(chargeMap[peak].unpack() * out_reg[4], peak.pad() + out_reg[0], out_reg[2], fragment.start + peak.time() + out_reg[1], out_reg[3], 0, 0); + pc.setFull(central_charge * out_reg[4], peak.pad() + out_reg[0], out_reg[2], fragment.start + peak.time() + out_reg[1], out_reg[3], 0, 0); tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(peak, chargeMap[peak].unpack(), myCluster, clusterer.Param()); + bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param()); if (rejectCluster) { + LOG(warning) << "Cluster rejected!"; if (clusterPosInRow) { clusterPosInRow[idx] = maxClusterPerRow; } From 6ec3c46d37e82b2f37f648ff3750d14f8d72f5b1 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 6 Jun 2024 17:49:38 +0200 Subject: [PATCH 05/23] Working implementation of settings via GPUSettings.h and --configKeyValues "GPU_proc.[setting]=...;..." --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 9 ++++++ GPU/GPUTracking/Global/GPUChainTracking.cxx | 2 +- GPU/GPUTracking/Global/GPUChainTracking.h | 2 +- .../Global/GPUChainTrackingClusterizer.cxx | 28 +++++++++++-------- .../TPCClusterFinder/GPUTPCClusterFinder.h | 7 +++++ .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 2 +- 6 files changed, 35 insertions(+), 15 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 777ea1e70b0d8..b3f38c6ab81d2 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -280,6 +280,15 @@ AddOption(tpcDownscaledEdx, unsigned char, 0, "", 0, "If != 0, downscale dEdx pr AddOption(tpcMaxAttachedClustersPerSectorRow, unsigned int, 51000, "", 0, "Maximum number of TPC attached clusters which can be decoded per SectorRow") AddOption(tpcUseOldCPUDecoding, bool, false, "", 0, "Enable old CPU-based TPC decoding") AddOption(RTCcacheFolder, std::string, "./rtccache/", "", 0, "Folder in which the cache file is stored") +AddOption(applyNNclusterizer, int, 0, "", 0, "(Bool, default = 0), if the neural network clsuterizer should be used.") +AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path") +AddOption(nnRegressionPath, std::string, "./network_reg.onnx", "", 0, "The regression network path") +AddOption(nnClassThreshold, float, 0.16, "", 0, "The cutoff at which clusters will be accepted / rejected.") +AddOption(nnSigmoidTrafoThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. 
This depends on how the network was trained (with a sigmoid as activation function in the last layer or not).")
+AddOption(nnAddIndexData, int, 1, "", 0, "If normalized index data (sector, row, pad) should be appended to the input")
+AddOption(nnSizeInputRow, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)")
+AddOption(nnSizeInputPad, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)")
+AddOption(nnSizeInputTime, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)")
 AddVariable(eventDisplay, GPUCA_NAMESPACE::gpu::GPUDisplayFrontendInterface*, nullptr)
 AddSubConfig(GPUSettingsProcessingRTC, rtc)
 AddSubConfig(GPUSettingsProcessingParam, param)
diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx
index 68615f47d05db..7a202c852b895 100644
--- a/GPU/GPUTracking/Global/GPUChainTracking.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx
@@ -722,7 +722,7 @@ int GPUChainTracking::RunChain()
       return 1;
     }
   } else if (mIOPtrs.tpcPackedDigits || mIOPtrs.tpcZS) {
-    if (runRecoStep(RecoStep::TPCClusterFinding, &GPUChainTracking::RunTPCClusterizer, false, true)) { // FIXME: This enables the neural network clusterization -> Need to actually set this as configurable
+    if (runRecoStep(RecoStep::TPCClusterFinding, &GPUChainTracking::RunTPCClusterizer, false)) { // FIXME: This enables the neural network clusterization -> Need to actually set this as configurable
       return 1;
     }
   }
diff --git a/GPU/GPUTracking/Global/GPUChainTracking.h
index 032ad0524ccff..89f2ecd10f65f 100644
--- a/GPU/GPUTracking/Global/GPUChainTracking.h
+++ b/GPU/GPUTracking/Global/GPUChainTracking.h
@@ -161,7 +161,7 @@ class GPUChainTracking : public GPUChain, GPUReconstructionHelpers::helperDelega
   void SetQAFromForeignChain(GPUChainTracking* chain) { mQAFromForeignChain = chain; }
   // Processing functions
-  int RunTPCClusterizer(bool synchronizeOutput = true, bool applyNNclusterizer = false);
+  int RunTPCClusterizer(bool synchronizeOutput = true);
   int ForwardTPCDigits();
   int RunTPCTrackingSlices();
   int RunTPCTrackingMerger(bool synchronizeOutput = true);
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index 26878e6111bd5..6ed3406646abb 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -566,7 +566,7 @@ int GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers)
 #endif
 // TODO: Clusterizer not working with OCL1 (Clusterizer on CPU, Tracking on GPU)
-int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput, bool applyNNclusterizer)
+int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 {
   if (param().rec.fwdTPCDigitsAsClusters) {
     return ForwardTPCDigits();
@@ -837,7 +837,7 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput, bool applyNNclus
         if (clusterer.mPmemory->counters.nPeaks == 0) {
           continue;
         }
-        if(!applyNNclusterizer){
+        if(!GetProcessingSettings().applyNNclusterizer){
           runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
           runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
         } else {
@@ -877,25 +877,29 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput, bool applyNNclus
         if(doGPU){
           runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSlice}}, 0);
         } else {
-          std::string path_class = "
"/lustre/alice/users/csonnab/PhD/jobs/clusterization/NN/output/normalized_qCenter/o2sim_150324_50Ev_10000QED_PbPb_13t7p/classification/3D_FCNN_1cls_03_04_2024_10M_FP16_addIndex/network/net_onnx.onnx", path_reg = "/lustre/alice/users/csonnab/PhD/jobs/clusterization/NN/output/normalized_qCenter/o2sim_150324_50Ev_10000QED_PbPb_13t7p/regression/3D_FCNN_1cls_05_04_2024_10M_FP16_addIndex/network/net_onnx.onnx"; - clusterer.model_class.init(path_class, 1, 1); - clusterer.model_reg.init(path_reg, 1, 1); - - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + if(GetProcessingSettings().applyNNclusterizer){ + clusterer.model_class.init(GetProcessingSettings().nnClassificationPath, 1, 1); + clusterer.model_reg.init(GetProcessingSettings().nnRegressionPath, 1, 1); + clusterer.nnSizeInputRow = GetProcessingSettings().nnSizeInputRow; + clusterer.nnSizeInputPad = GetProcessingSettings().nnSizeInputPad; + clusterer.nnSizeInputTime = GetProcessingSettings().nnSizeInputTime; + clusterer.nnAddIndexData = GetProcessingSettings().nnAddIndexData; + clusterer.nnClassThreshold = GetProcessingSettings().nnClassThreshold; + clusterer.nnSigmoidTrafoThreshold = GetProcessingSettings().nnSigmoidTrafoThreshold; + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + } else { + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + } } if (doGPU && propagateMCLabels) { TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane); if (doGPU) { SynchronizeStream(lane); } - if(!applyNNclusterizer){ + if(!GetProcessingSettings().applyNNclusterizer){ runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); } else { - // FIXME: Here I need to apply the neural network - // runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); - // GPUTPCNNClusterizer nn_clus; - // nn_clus.exec({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); } } if (GetProcessingSettings().debugLevel >= 3) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index ae40ff780b25a..a449eb23ef426 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -144,6 +144,13 @@ class GPUTPCClusterFinder : public GPUProcessor short mZSOffsetId = -1; short mOutputId = -1; + int nnSizeInputRow = 3; + int nnSizeInputPad = 3; + int nnSizeInputTime = 3; + bool nnAddIndexData = true; + float nnClassThreshold = 0.16; + bool nnSigmoidTrafoThreshold = 1; + OnnxModel model_class, model_reg; #ifndef GPUCA_GPUCODE diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index afee680bc0ceb..d2656531c6df1 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -34,7 +34,7 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlo tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow; // (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; - GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, true, 0.16, true); + GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, clusterer.nnSizeInputRow, clusterer.nnSizeInputPad, clusterer.nnSizeInputTime, clusterer.nnAddIndexData, clusterer.nnClassThreshold, clusterer.nnSigmoidTrafoThreshold); // tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; // From ab4653a78a470e740478ed719f24bfed0b8fc0cb Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 24 Jun 2024 08:08:01 +0200 Subject: [PATCH 06/23] Modifying the onnx_interface to include the right headers --- Common/ML/include/ML/onnx_interface.h | 6 +++++- GPU/GPUTracking/ML/onnx_interface.h | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/Common/ML/include/ML/onnx_interface.h b/Common/ML/include/ML/onnx_interface.h index 506311c067351..d3676b7a3f87a 100644 --- a/Common/ML/include/ML/onnx_interface.h +++ b/Common/ML/include/ML/onnx_interface.h @@ -21,7 +21,11 @@ #define GPU_ML_ONNX_INTERFACE_H // C++ and system includes -#include +#if __has_include() +#include +#else +#include +#endif #include #include #include diff --git a/GPU/GPUTracking/ML/onnx_interface.h b/GPU/GPUTracking/ML/onnx_interface.h index 506311c067351..d3676b7a3f87a 100644 --- a/GPU/GPUTracking/ML/onnx_interface.h +++ b/GPU/GPUTracking/ML/onnx_interface.h @@ -21,7 +21,11 @@ #define GPU_ML_ONNX_INTERFACE_H // C++ and system includes -#include +#if __has_include() +#include +#else +#include +#endif #include #include #include From 04084c8fd1ea9be525a7368afe5567112d4549cc Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 24 Jun 2024 13:21:09 +0200 Subject: [PATCH 07/23] Adjusting initialization for new ONNXRuntime version --- Common/ML/include/ML/onnx_interface.h | 26 ++++--- Common/ML/src/onnx_interface.cxx | 98 +++++++++++++++++++-------- GPU/GPUTracking/ML/onnx_interface.cxx | 98 +++++++++++++++++++-------- GPU/GPUTracking/ML/onnx_interface.h | 19 ++++-- 4 files changed, 171 insertions(+), 70 deletions(-) diff --git a/Common/ML/include/ML/onnx_interface.h b/Common/ML/include/ML/onnx_interface.h index d3676b7a3f87a..fcc02a49996ea 100644 --- a/Common/ML/include/ML/onnx_interface.h +++ b/Common/ML/include/ML/onnx_interface.h @@ -17,8 +17,8 @@ /// \brief A general-purpose class for ONNX models /// -#ifndef GPU_ML_ONNX_INTERFACE_H -#define GPU_ML_ONNX_INTERFACE_H +#ifndef COMMON_ML_ONNX_INTERFACE_H +#define COMMON_ML_ONNX_INTERFACE_H // C++ and system includes #if __has_include() @@ -43,10 +43,9 @@ namespace ml class OnnxModel { - public: - OnnxModel() = default; - ~OnnxModel() = default; + OnnxModel() : mMemoryInfo(Ort::MemoryInfo::CreateCpu(OrtAllocatorType, OrtMemType)) {}; + virtual ~OnnxModel() = default; // Inferencing void init(std::string, bool = false, int = 0); @@ -56,11 +55,19 @@ class OnnxModel template std::vector inference_vector(T input, unsigned int size); // Reset session 
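// A minimal, self-contained sketch of the __has_include() dispatch these two
// patches introduce. The header paths and the ModelSession alias are
// assumptions for illustration, following ONNXRuntime's usual layout; they are
// not part of the O2 code. Older ONNXRuntime releases ship an experimental C++
// API (std::string-based Run(), shape helpers); newer ones provide only the
// stable API, which needs the manual name and shape handling seen in the later
// hunks.
#if __has_include(<onnxruntime/core/session/experimental_onnxruntime_cxx_api.h>)
#include <onnxruntime/core/session/experimental_onnxruntime_cxx_api.h>
using ModelSession = Ort::Experimental::Session;
#else
#include <onnxruntime_cxx_api.h>
using ModelSession = Ort::Session;
#endif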
- void resetSession() { mSession.reset(new Ort::Experimental::Session{*mEnv, modelPath, sessionOptions}); } + #if __has_include() + void resetSession() { mSession.reset(new Ort::Experimental::Session{*mEnv, modelPath, sessionOptions}); }; + #else + void resetSession() { mSession.reset(new Ort::Session{*mEnv, modelPath.c_str(), sessionOptions}); }; + #endif // Getters & Setters Ort::SessionOptions* getSessionOptions() { return &sessionOptions; } // For optimizations in post - std::shared_ptr getSession() { return mSession; } + #if __has_include() + std::shared_ptr getSession() { return mSession; } + #else + std::shared_ptr getSession() { return mSession; } + #endif std::vector> getNumInputNodes() const { return mInputShapes; } std::vector> getNumOutputNodes() const { return mOutputShapes; } void setActiveThreads(int); @@ -68,7 +75,8 @@ class OnnxModel private: // Environment variables for the ONNX runtime std::shared_ptr mEnv = nullptr; - std::shared_ptr mSession = nullptr; + std::shared_ptr mSession = nullptr; ///< ONNX session + Ort::MemoryInfo mMemoryInfo; Ort::SessionOptions sessionOptions; // Input & Output specifications of the loaded network @@ -89,4 +97,4 @@ class OnnxModel } // namespace GPUCA_NAMESPACE -#endif // GPU_ML_ONNX_INTERFACE_H \ No newline at end of file +#endif // COMMON_ML_ONNX_INTERFACE_H \ No newline at end of file diff --git a/Common/ML/src/onnx_interface.cxx b/Common/ML/src/onnx_interface.cxx index e7c952d6b8cdc..549575600a656 100644 --- a/Common/ML/src/onnx_interface.cxx +++ b/Common/ML/src/onnx_interface.cxx @@ -43,6 +43,11 @@ void OnnxModel::init(std::string localPath, bool enableOptimizations, int thread modelPath = localPath; activeThreads = threads; +#if __has_include() +#else + mMemoryInfo = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault); +#endif + /// Enableing optimizations if(threads != 0){ // sessionOptions.SetInterOpNumThreads(1); @@ -63,12 +68,28 @@ void OnnxModel::init(std::string localPath, bool enableOptimizations, int thread } mEnv = std::make_shared(ORT_LOGGING_LEVEL_WARNING, "onnx-model"); - mSession = std::make_shared(*mEnv, modelPath, sessionOptions); - - mInputNames = mSession->GetInputNames(); - mInputShapes = mSession->GetInputShapes(); - mOutputNames = mSession->GetOutputNames(); - mOutputShapes = mSession->GetOutputShapes(); + #if __has_include() + mSession = std::make_shared(*mEnv, modelPath, sessionOptions); + mInputNames = mSession->GetInputNames(); + mInputShapes = mSession->GetInputShapes(); + mOutputNames = mSession->GetOutputNames(); + mOutputShapes = mSession->GetOutputShapes(); + #else + mSession = std::make_shared(*mEnv, modelPath.c_str(), sessionOptions); + Ort::AllocatorWithDefaultOptions tmpAllocator; + for (size_t i = 0; i < mSession->GetInputCount(); ++i) { + mInputNames.push_back(mSession->GetInputNameAllocated(i, tmpAllocator).get()); + } + for (size_t i = 0; i < mSession->GetInputCount(); ++i) { + mInputShapes.emplace_back(mSession->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + for (size_t i = 0; i < mSession->GetOutputCount(); ++i) { + mOutputNames.push_back(mSession->GetOutputNameAllocated(i, tmpAllocator).get()); + } + for (size_t i = 0; i < mSession->GetOutputCount(); ++i) { + mOutputShapes.emplace_back(mSession->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + #endif LOG(info) << "Input Nodes:"; for (size_t i = 0; i < mInputNames.size(); i++) { @@ -121,7 +142,6 @@ void OnnxModel::init(std::string localPath, bool 
enableOptimizations, int thread template float* OnnxModel::inference(T input, unsigned int size) { - std::vector inputShape = mInputShapes[0]; inputShape[0] = size; std::vector inputTensors; @@ -129,22 +149,36 @@ float* OnnxModel::inference(T input, unsigned int size) for(auto elem : inputShape){ mem_size*=elem; } +#if __has_include() inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); - // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return outputValues; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } + auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); + inputTensors.clear(); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return outputValues; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } +#else + std::vector tmpInputs; + std::vector tmpOutputs; + inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), 1)); + try { + auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); + inputTensors.clear(); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return outputValues; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } +#endif + // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; return nullptr; } template std::vector OnnxModel::inference_vector(T input, unsigned int size) { - std::vector inputShape = mInputShapes[0]; inputShape[0] = size; std::vector inputTensors; @@ -153,21 +187,29 @@ std::vector OnnxModel::inference_vector(T input, unsigned int size) for(auto elem : inputShape){ mem_size*=elem; } +#if __has_include() inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); - // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - float* outputValues = outputTensors[0].GetTensorMutableData(); - std::vector outputVector{outputValues, outputValues + size * mOutputShapes[0][1]}; - // for(int s = 0; s < size; s++){ - // for(int o = 0; o < mOutputShapes[0][1]; o++){ - // outputValues.push_back(tmp_output_values[s*(int)mOutputShapes[0][1] + o]); - // } - // } - return outputVector; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } + auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); + inputTensors.clear(); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } +#else + std::vector tmpInputs; + std::vector tmpOutputs; + inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), 1)); + try { + auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, 
tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); + inputTensors.clear(); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } +#endif return std::vector{}; } diff --git a/GPU/GPUTracking/ML/onnx_interface.cxx b/GPU/GPUTracking/ML/onnx_interface.cxx index e7c952d6b8cdc..549575600a656 100644 --- a/GPU/GPUTracking/ML/onnx_interface.cxx +++ b/GPU/GPUTracking/ML/onnx_interface.cxx @@ -43,6 +43,11 @@ void OnnxModel::init(std::string localPath, bool enableOptimizations, int thread modelPath = localPath; activeThreads = threads; +#if __has_include() +#else + mMemoryInfo = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault); +#endif + /// Enableing optimizations if(threads != 0){ // sessionOptions.SetInterOpNumThreads(1); @@ -63,12 +68,28 @@ void OnnxModel::init(std::string localPath, bool enableOptimizations, int thread } mEnv = std::make_shared(ORT_LOGGING_LEVEL_WARNING, "onnx-model"); - mSession = std::make_shared(*mEnv, modelPath, sessionOptions); - - mInputNames = mSession->GetInputNames(); - mInputShapes = mSession->GetInputShapes(); - mOutputNames = mSession->GetOutputNames(); - mOutputShapes = mSession->GetOutputShapes(); + #if __has_include() + mSession = std::make_shared(*mEnv, modelPath, sessionOptions); + mInputNames = mSession->GetInputNames(); + mInputShapes = mSession->GetInputShapes(); + mOutputNames = mSession->GetOutputNames(); + mOutputShapes = mSession->GetOutputShapes(); + #else + mSession = std::make_shared(*mEnv, modelPath.c_str(), sessionOptions); + Ort::AllocatorWithDefaultOptions tmpAllocator; + for (size_t i = 0; i < mSession->GetInputCount(); ++i) { + mInputNames.push_back(mSession->GetInputNameAllocated(i, tmpAllocator).get()); + } + for (size_t i = 0; i < mSession->GetInputCount(); ++i) { + mInputShapes.emplace_back(mSession->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + for (size_t i = 0; i < mSession->GetOutputCount(); ++i) { + mOutputNames.push_back(mSession->GetOutputNameAllocated(i, tmpAllocator).get()); + } + for (size_t i = 0; i < mSession->GetOutputCount(); ++i) { + mOutputShapes.emplace_back(mSession->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + #endif LOG(info) << "Input Nodes:"; for (size_t i = 0; i < mInputNames.size(); i++) { @@ -121,7 +142,6 @@ void OnnxModel::init(std::string localPath, bool enableOptimizations, int thread template float* OnnxModel::inference(T input, unsigned int size) { - std::vector inputShape = mInputShapes[0]; inputShape[0] = size; std::vector inputTensors; @@ -129,22 +149,36 @@ float* OnnxModel::inference(T input, unsigned int size) for(auto elem : inputShape){ mem_size*=elem; } +#if __has_include() inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); - // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return outputValues; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } + auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); + inputTensors.clear(); + 
float* outputValues = outputTensors[0].GetTensorMutableData(); + return outputValues; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } +#else + std::vector tmpInputs; + std::vector tmpOutputs; + inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), 1)); + try { + auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); + inputTensors.clear(); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return outputValues; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } +#endif + // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; return nullptr; } template std::vector OnnxModel::inference_vector(T input, unsigned int size) { - std::vector inputShape = mInputShapes[0]; inputShape[0] = size; std::vector inputTensors; @@ -153,21 +187,29 @@ std::vector OnnxModel::inference_vector(T input, unsigned int size) for(auto elem : inputShape){ mem_size*=elem; } +#if __has_include() inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); - // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - float* outputValues = outputTensors[0].GetTensorMutableData(); - std::vector outputVector{outputValues, outputValues + size * mOutputShapes[0][1]}; - // for(int s = 0; s < size; s++){ - // for(int o = 0; o < mOutputShapes[0][1]; o++){ - // outputValues.push_back(tmp_output_values[s*(int)mOutputShapes[0][1] + o]); - // } - // } - return outputVector; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } + auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); + inputTensors.clear(); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } +#else + std::vector tmpInputs; + std::vector tmpOutputs; + inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), 1)); + try { + auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); + inputTensors.clear(); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } +#endif return std::vector{}; } diff --git a/GPU/GPUTracking/ML/onnx_interface.h b/GPU/GPUTracking/ML/onnx_interface.h index d3676b7a3f87a..5ee2bd716d257 100644 --- a/GPU/GPUTracking/ML/onnx_interface.h +++ b/GPU/GPUTracking/ML/onnx_interface.h @@ -45,8 +45,8 @@ class OnnxModel { public: - OnnxModel() = default; - ~OnnxModel() = default; + OnnxModel(OrtAllocatorType allocatorType = OrtDeviceAllocator, OrtMemType memoryType = OrtMemTypeCPU) : mMemoryInfo(Ort::MemoryInfo::CreateCpu(allocatorType, memoryType)) {}; + virtual ~OnnxModel() = default; 
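// A sketch of what the stable-API code path of inference_vector() above boils
// down to: Ort::Session::Run() takes const char* arrays, so the std::string
// node names collected at init() time are re-marshalled on each call. All
// parameter names here are illustrative; outPerSample stands in for
// mOutputShapes[0][1].
#include <onnxruntime_cxx_api.h>
#include <cstdint>
#include <string>
#include <vector>

std::vector<float> runStableApi(Ort::Session& session, Ort::MemoryInfo& memInfo,
                                const std::vector<std::string>& inputNames,
                                const std::vector<std::string>& outputNames,
                                std::vector<float>& input, std::vector<int64_t>& shape,
                                size_t nSamples, size_t outPerSample)
{
  std::vector<const char*> ins, outs;
  for (const auto& n : inputNames) {
    ins.push_back(n.c_str());
  }
  for (const auto& n : outputNames) {
    outs.push_back(n.c_str());
  }
  // Wrap the caller's buffer as a tensor; no copy is made.
  Ort::Value tensor = Ort::Value::CreateTensor<float>(memInfo, input.data(), input.size(),
                                                      shape.data(), shape.size());
  auto results = session.Run(Ort::RunOptions{nullptr}, ins.data(), &tensor, 1,
                             outs.data(), outs.size());
  float* raw = results[0].GetTensorMutableData<float>();
  return std::vector<float>(raw, raw + nSamples * outPerSample);
}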
// Inferencing void init(std::string, bool = false, int = 0); @@ -56,11 +56,19 @@ class OnnxModel template std::vector inference_vector(T input, unsigned int size); // Reset session - void resetSession() { mSession.reset(new Ort::Experimental::Session{*mEnv, modelPath, sessionOptions}); } + #if __has_include() + void resetSession() { mSession.reset(new Ort::Experimental::Session{*mEnv, modelPath, sessionOptions}); }; + #else + void resetSession() { mSession.reset(new Ort::Session{*mEnv, modelPath.c_str(), sessionOptions}); }; + #endif // Getters & Setters Ort::SessionOptions* getSessionOptions() { return &sessionOptions; } // For optimizations in post - std::shared_ptr getSession() { return mSession; } + #if __has_include() + std::shared_ptr getSession() { return mSession; } + #else + std::shared_ptr getSession() { return mSession; } + #endif std::vector> getNumInputNodes() const { return mInputShapes; } std::vector> getNumOutputNodes() const { return mOutputShapes; } void setActiveThreads(int); @@ -68,7 +76,8 @@ class OnnxModel private: // Environment variables for the ONNX runtime std::shared_ptr mEnv = nullptr; - std::shared_ptr mSession = nullptr; + std::shared_ptr mSession = nullptr; ///< ONNX session + Ort::MemoryInfo mMemoryInfo; Ort::SessionOptions sessionOptions; // Input & Output specifications of the loaded network From 01dc4a1bd96f3c6094f1368604dff895754a17d3 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 26 Jun 2024 09:53:36 +0200 Subject: [PATCH 08/23] Adjusting global settings and CF code for several settings --- Common/ML/src/onnx_interface.cxx | 4 +- GPU/GPUTracking/Definitions/GPUSettingsList.h | 1 + .../Global/GPUChainTrackingClusterizer.cxx | 28 +++++------ GPU/GPUTracking/ML/onnx_interface.cxx | 49 +++++++++++-------- GPU/GPUTracking/ML/onnx_interface.h | 2 +- .../TPCClusterFinder/GPUTPCClusterFinder.h | 1 + .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 38 +++++++------- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 2 +- 8 files changed, 67 insertions(+), 58 deletions(-) diff --git a/Common/ML/src/onnx_interface.cxx b/Common/ML/src/onnx_interface.cxx index 549575600a656..c348d4577d47f 100644 --- a/Common/ML/src/onnx_interface.cxx +++ b/Common/ML/src/onnx_interface.cxx @@ -200,9 +200,9 @@ std::vector OnnxModel::inference_vector(T input, unsigned int size) #else std::vector tmpInputs; std::vector tmpOutputs; - inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), 1)); + inputTensors.emplace_back(Ort::Value::CreateTensor(input.data(), mem_size, inputShape)); try { - auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); + auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); inputTensors.clear(); float* outputValues = outputTensors[0].GetTensorMutableData(); return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 5e9d3499eda77..bc42d50d4a88a 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -283,6 +283,7 @@ AddOption(tpcMaxAttachedClustersPerSectorRow, unsigned int, 51000, "", 0, "Maxim AddOption(tpcUseOldCPUDecoding, bool, false, "", 0, "Enable old CPU-based TPC decoding") AddOption(RTCcacheFolder, std::string, "./rtccache/", "", 0, "Folder in which the cache file is stored") 
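// The options below are wired to the command line through --configKeyValues
// with the "GPU_proc." prefix (see the subject of PATCH 05/23); a hypothetical
// invocation enabling the NN clusterizer could look like:
//   --configKeyValues "GPU_proc.applyNNclusterizer=1;GPU_proc.nnClusterizerVerbosity=2;GPU_proc.nnClassThreshold=0.16"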
AddOption(applyNNclusterizer, int, 0, "", 0, "(Bool, default = 0), if the neural network clusterizer should be used.")
+AddOption(nnClusterizerVerbosity, int, 1, "", 0, "0: No messages; 1: Warnings; 2: Warnings + major debugs; >=3: All debugs")
 AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path")
 AddOption(nnRegressionPath, std::string, "./network_reg.onnx", "", 0, "The regression network path")
 AddOption(nnClassThreshold, float, 0.16, "", 0, "The cutoff at which clusters will be accepted / rejected.")
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index 6ed3406646abb..44cd1a5f62f4c 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -874,23 +874,21 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}});
         DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
-        if(doGPU){
-          runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSlice}}, 0);
+        if(GetProcessingSettings().applyNNclusterizer){
+          clusterer.model_class.init(GetProcessingSettings().nnClassificationPath, 1, 1, GetProcessingSettings().nnClusterizerVerbosity);
+          clusterer.model_reg.init(GetProcessingSettings().nnRegressionPath, 1, 1, GetProcessingSettings().nnClusterizerVerbosity);
+          clusterer.nnSizeInputRow = GetProcessingSettings().nnSizeInputRow;
+          clusterer.nnSizeInputPad = GetProcessingSettings().nnSizeInputPad;
+          clusterer.nnSizeInputTime = GetProcessingSettings().nnSizeInputTime;
+          clusterer.nnAddIndexData = GetProcessingSettings().nnAddIndexData;
+          clusterer.nnClassThreshold = GetProcessingSettings().nnClassThreshold;
+          clusterer.nnSigmoidTrafoThreshold = GetProcessingSettings().nnSigmoidTrafoThreshold;
+          clusterer.nnClusterizerVerbosity = GetProcessingSettings().nnClusterizerVerbosity;
+          runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0);
         } else {
-          if(GetProcessingSettings().applyNNclusterizer){
-            clusterer.model_class.init(GetProcessingSettings().nnClassificationPath, 1, 1);
-            clusterer.model_reg.init(GetProcessingSettings().nnRegressionPath, 1, 1);
-            clusterer.nnSizeInputRow = GetProcessingSettings().nnSizeInputRow;
-            clusterer.nnSizeInputPad = GetProcessingSettings().nnSizeInputPad;
-            clusterer.nnSizeInputTime = GetProcessingSettings().nnSizeInputTime;
-            clusterer.nnAddIndexData = GetProcessingSettings().nnAddIndexData;
-            clusterer.nnClassThreshold = GetProcessingSettings().nnClassThreshold;
-            clusterer.nnSigmoidTrafoThreshold = GetProcessingSettings().nnSigmoidTrafoThreshold;
-            runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1);
-          } else {
-            runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1);
-          }
+          runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0);
         }
+
         if (doGPU && propagateMCLabels) {
           TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane);
           if (doGPU) {
diff --git a/GPU/GPUTracking/ML/onnx_interface.cxx
index 549575600a656..9bb5137ec63dd 100644
---
a/GPU/GPUTracking/ML/onnx_interface.cxx +++ b/GPU/GPUTracking/ML/onnx_interface.cxx @@ -35,11 +35,13 @@ std::string OnnxModel::printShape(const std::vector& v) return ss.str(); } -void OnnxModel::init(std::string localPath, bool enableOptimizations, int threads) +void OnnxModel::init(std::string localPath, bool enableOptimizations, int threads, int verbosity) { - LOG(info) << "--- ONNX-ML model ---"; - LOG(info) << "Taking model from: " << localPath; + if(verbosity > 1){ + LOG(info) << "--- ONNX-ML model ---"; + LOG(info) << "Taking model from: " << localPath; + } modelPath = localPath; activeThreads = threads; @@ -91,17 +93,18 @@ void OnnxModel::init(std::string localPath, bool enableOptimizations, int thread } #endif - LOG(info) << "Input Nodes:"; - for (size_t i = 0; i < mInputNames.size(); i++) { - LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); - } + if(verbosity > 1){ + LOG(info) << "Input Nodes:"; + for (size_t i = 0; i < mInputNames.size(); i++) { + LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); + } - LOG(info) << "Output Nodes:"; - for (size_t i = 0; i < mOutputNames.size(); i++) { - LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); + LOG(info) << "Output Nodes:"; + for (size_t i = 0; i < mOutputNames.size(); i++) { + LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); + } + LOG(info) << "--- Model initialized! ---"; } - - LOG(info) << "--- Model initialized! ---"; } // float* OnnxModel::inference(std::vector input, int device_id) @@ -200,15 +203,21 @@ std::vector OnnxModel::inference_vector(T input, unsigned int size) #else std::vector tmpInputs; std::vector tmpOutputs; - inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), 1)); + for (unsigned int i = 0; i < mInputNames.size(); i++) { + tmpInputs.emplace_back(mInputNames[i].c_str()); + } + for (unsigned int i = 0; i < mOutputNames.size(); i++) { + tmpOutputs.emplace_back(mOutputNames[i].c_str()); + } + inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); try { - auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } + auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); + inputTensors.clear(); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } #endif return std::vector{}; } diff --git a/GPU/GPUTracking/ML/onnx_interface.h b/GPU/GPUTracking/ML/onnx_interface.h index 5ee2bd716d257..17c45f439dc63 100644 --- a/GPU/GPUTracking/ML/onnx_interface.h +++ b/GPU/GPUTracking/ML/onnx_interface.h @@ -49,7 +49,7 @@ class OnnxModel virtual ~OnnxModel() = default; // Inferencing - void init(std::string, bool = false, int = 0); + void init(std::string, bool = false, int = 0, int = 0); // float* 
inference(std::vector, int = 0); // float* inference(std::vector, int = 0); template float* inference(T input, unsigned int size); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index a449eb23ef426..aed00623ef167 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -150,6 +150,7 @@ class GPUTPCClusterFinder : public GPUProcessor bool nnAddIndexData = true; float nnClassThreshold = 0.16; bool nnSigmoidTrafoThreshold = 1; + int nnClusterizerVerbosity = 1; OnnxModel model_class, model_reg; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index d2656531c6df1..d7e3226e0d54c 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -34,28 +34,28 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlo tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow; // (onlyMC) ? nullptr : clusterer.mPclusterByRow; - GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, clusterer.nnSizeInputRow, clusterer.nnSizeInputPad, clusterer.nnSizeInputTime, clusterer.nnAddIndexData, clusterer.nnClassThreshold, clusterer.nnSigmoidTrafoThreshold); + GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, clusterer.nnSizeInputRow, clusterer.nnSizeInputPad, clusterer.nnSizeInputTime, clusterer.nnAddIndexData, clusterer.nnClassThreshold, clusterer.nnSigmoidTrafoThreshold, clusterer.nnClusterizerVerbosity); // tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; // // GPUTPCNNClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); } -GPUd() void GPUTPCNNClusterizer::exec(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char onlyMC) -{ - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - CPU_ONLY( - MCLabelAccumulator labelAcc(clusterer)); - - tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; - - std::string path_class = "", path_reg = ""; - - clusterer.model_class.init(path_class, 1, 0); - clusterer.model_reg.init(path_reg, 1, 0); - - GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, true, 0.16, true); -} +// GPUd() void GPUTPCNNClusterizer::exec(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char onlyMC) +// { +// Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); +// CPU_ONLY( +// MCLabelAccumulator labelAcc(clusterer)); +// +// tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; +// +// std::string path_class = "", path_reg = ""; +// +// clusterer.model_class.init(path_class, 1, 0); +// clusterer.model_reg.init(path_reg, 1, 0); +// +// GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, true, 0.16, true); +// } int GPUTPCNNClusterizer::padOffset(int row_ref, int row_current) { @@ -117,7 +117,7 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i uint* clusterInRow, tpc::ClusterNative* clusterByRow, uint* clusterPosInRow, - int in_row, int in_pad, int in_time, bool add_index_data, float class_threshold, bool sigmoid_transform){ + int in_row, int in_pad, int in_time, bool add_index_data, float class_threshold, bool sigmoid_transform, int verbosity){ std::vector input_data(((2*in_row + 1) * (2*in_pad + 1) * (2*in_time + 1) + (add_index_data ? 3 : 0)), -1.f); float classification_threshold = class_threshold; @@ -169,7 +169,7 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i std::vector out_reg = clusterer.model_reg.inference_vector(input_data, 1); int num_outputs = clusterer.model_reg.getNumOutputNodes()[0][1]; - if(idx == 100){ + if((verbosity > 4) && idx == 100){ LOG(info) << "Classification model: " << out_class[0] << " (>? 
" << classification_threshold << ")"; LOG(info) << "Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4]; } @@ -179,7 +179,7 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i pc.setFull(central_charge * out_reg[4], peak.pad() + out_reg[0], out_reg[2], fragment.start + peak.time() + out_reg[1], out_reg[3], 0, 0); tpc::ClusterNative myCluster; bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param()); - if (rejectCluster) { + if ((verbosity > 0) && rejectCluster) { LOG(warning) << "Cluster rejected!"; if (clusterPosInRow) { clusterPosInRow[idx] = maxClusterPerRow; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 905e6f860a90f..7fbf5a806a916 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -77,7 +77,7 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate uint*, tpc::ClusterNative*, uint*, - int = 3, int = 3, int = 3, bool = 1, float = 0.16, bool = true); + int = 3, int = 3, int = 3, bool = 1, float = 0.16, bool = true, int = 1); private: // --------------------------------- From accd7abaac7a2fce98a280ec6e4d8fa2e8eb6254 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 3 Jul 2024 13:39:11 +0200 Subject: [PATCH 09/23] Adding return statement if cluster is rejected --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index d7e3226e0d54c..3c2dadaf660b1 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -202,8 +202,12 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i } else if (clusterPosInRow) { rowIndex = clusterPosInRow[idx]; } - CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, maxClusterPerRow)); + } else { + if (clusterPosInRow) { + clusterPosInRow[idx] = maxClusterPerRow; + } + return; } } From 3473a066755dc4ae23ce7965d7b77cb7d5ffb020 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 4 Jul 2024 14:10:58 +0200 Subject: [PATCH 10/23] Adding some statements back --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 3c2dadaf660b1..98f7cdee72b0c 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -32,7 +32,7 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlo CPU_ONLY( MCLabelAccumulator labelAcc(clusterer)); - tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow; // (onlyMC) ? nullptr : clusterer.mPclusterByRow; + tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, clusterer.nnSizeInputRow, clusterer.nnSizeInputPad, clusterer.nnSizeInputTime, clusterer.nnAddIndexData, clusterer.nnClassThreshold, clusterer.nnSigmoidTrafoThreshold, clusterer.nnClusterizerVerbosity); @@ -210,6 +210,10 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i return; } + if((verbosity > 4) && idx == 100){ + LOG(info) << "Clusterization done!"; + } + } From df21c963bc4cd132eb5eb175160bf5c76e264fe3 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 17 Oct 2024 14:09:20 +0200 Subject: [PATCH 11/23] Update to latest status of gpu clusterization --- Common/CMakeLists.txt | 1 + Common/ML/CMakeLists.txt | 5 +- Common/ML/include/ML/3rdparty/GPUORTFloat16.h | 867 ++++++++++++++++++ Common/ML/include/ML/onnx_interface.h | 100 -- Common/ML/include/ML/ort_interface.h | 94 ++ Common/ML/src/onnx_interface.cxx | 226 ----- Common/ML/src/ort_interface.cxx | 262 ++++++ GPU/GPUTracking/CMakeLists.txt | 3 +- GPU/GPUTracking/Definitions/GPUSettingsList.h | 28 +- GPU/GPUTracking/Global/GPUChainTracking.cxx | 2 +- .../Global/GPUChainTrackingClusterizer.cxx | 67 +- GPU/GPUTracking/ML/onnx_interface.cxx | 235 ----- GPU/GPUTracking/ML/onnx_interface.h | 101 -- GPU/GPUTracking/TPCClusterFinder/ChargePos.h | 1 + .../TPCClusterFinder/GPUTPCClusterFinder.h | 25 +- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 446 ++++++--- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 12 +- 17 files changed, 1651 insertions(+), 824 deletions(-) create mode 100644 Common/ML/include/ML/3rdparty/GPUORTFloat16.h delete mode 100644 Common/ML/include/ML/onnx_interface.h create mode 100644 Common/ML/include/ML/ort_interface.h delete mode 100644 Common/ML/src/onnx_interface.cxx create mode 100644 Common/ML/src/ort_interface.cxx delete mode 100644 GPU/GPUTracking/ML/onnx_interface.cxx delete mode 100644 GPU/GPUTracking/ML/onnx_interface.h diff --git a/Common/CMakeLists.txt b/Common/CMakeLists.txt index f435e269575aa..0b92758e45f43 100644 --- a/Common/CMakeLists.txt +++ b/Common/CMakeLists.txt @@ -16,5 +16,6 @@ add_subdirectory(Types) add_subdirectory(Utils) add_subdirectory(SimConfig) add_subdirectory(DCAFitter) +add_subdirectory(ML) o2_data_file(COPY maps DESTINATION Common) diff --git a/Common/ML/CMakeLists.txt b/Common/ML/CMakeLists.txt index 60a07041da2e0..954d29d6e2793 100644 --- a/Common/ML/CMakeLists.txt +++ b/Common/ML/CMakeLists.txt @@ -10,7 +10,6 @@ # or submit itself to any jurisdiction. o2_add_library(ML - SOURCES src/onnx_interface.cxx + SOURCES src/ort_interface.cxx TARGETVARNAME targetName - PUBLIC_LINK_LIBRARIES O2::Framework ONNXRuntime::ONNXRuntime -) \ No newline at end of file + PRIVATE_LINK_LIBRARIES O2::Framework ONNXRuntime::ONNXRuntime) \ No newline at end of file diff --git a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h new file mode 100644 index 0000000000000..db65328409d3c --- /dev/null +++ b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h @@ -0,0 +1,867 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +// This code was created from: +// - https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_float16.h +// - https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_cxx_api.h + +#include +#include +#include +#include + +namespace o2 +{ + +namespace OrtDataType +{ + +namespace detail +{ + +enum class endian { +#if defined(_WIN32) + little = 0, + big = 1, + native = little, +#elif defined(__GNUC__) || defined(__clang__) + little = __ORDER_LITTLE_ENDIAN__, + big = __ORDER_BIG_ENDIAN__, + native = __BYTE_ORDER__, +#else +#error OrtDataType::detail::endian is not implemented in this environment. +#endif +}; + +static_assert( + endian::native == endian::little || endian::native == endian::big, + "Only little-endian or big-endian native byte orders are supported."); + +} // namespace detail + +/// +/// Shared implementation between public and internal classes. CRTP pattern. +/// +template +struct Float16Impl { + protected: + /// + /// Converts from float to uint16_t float16 representation + /// + /// + /// + constexpr static uint16_t ToUint16Impl(float v) noexcept; + + /// + /// Converts float16 to float + /// + /// float representation of float16 value + float ToFloatImpl() const noexcept; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + uint16_t AbsImpl() const noexcept + { + return static_cast(val & ~kSignMask); + } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + uint16_t NegateImpl() const noexcept + { + return IsNaN() ? val : static_cast(val ^ kSignMask); + } + + public: + // uint16_t special values + static constexpr uint16_t kSignMask = 0x8000U; + static constexpr uint16_t kBiasedExponentMask = 0x7C00U; + static constexpr uint16_t kPositiveInfinityBits = 0x7C00U; + static constexpr uint16_t kNegativeInfinityBits = 0xFC00U; + static constexpr uint16_t kPositiveQNaNBits = 0x7E00U; + static constexpr uint16_t kNegativeQNaNBits = 0xFE00U; + static constexpr uint16_t kEpsilonBits = 0x4170U; + static constexpr uint16_t kMinValueBits = 0xFBFFU; // Minimum normal number + static constexpr uint16_t kMaxValueBits = 0x7BFFU; // Largest normal number + static constexpr uint16_t kOneBits = 0x3C00U; + static constexpr uint16_t kMinusOneBits = 0xBC00U; + + uint16_t val{0}; + + Float16Impl() = default; + + /// + /// Checks if the value is negative + /// + /// true if negative + bool IsNegative() const noexcept + { + return static_cast(val) < 0; + } + + /// + /// Tests if the value is NaN + /// + /// true if NaN + bool IsNaN() const noexcept + { + return AbsImpl() > kPositiveInfinityBits; + } + + /// + /// Tests if the value is finite + /// + /// true if finite + bool IsFinite() const noexcept + { + return AbsImpl() < kPositiveInfinityBits; + } + + /// + /// Tests if the value represents positive infinity. + /// + /// true if positive infinity + bool IsPositiveInfinity() const noexcept + { + return val == kPositiveInfinityBits; + } + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + bool IsNegativeInfinity() const noexcept + { + return val == kNegativeInfinityBits; + } + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + bool IsInfinity() const noexcept + { + return AbsImpl() == kPositiveInfinityBits; + } + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. 
+ bool IsNaNOrZero() const noexcept + { + auto abs = AbsImpl(); + return (abs == 0 || abs > kPositiveInfinityBits); + } + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). + /// + /// True if so + bool IsNormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) != 0); // is not subnormal (has a non-zero exponent) + } + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + bool IsSubnormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) == 0); // is subnormal (has a zero exponent) + } + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + Derived Abs() const noexcept { return Derived::FromBits(AbsImpl()); } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + Derived Negate() const noexcept { return Derived::FromBits(NegateImpl()); } + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + static bool AreZero(const Float16Impl& lhs, const Float16Impl& rhs) noexcept + { + return static_cast((lhs.val | rhs.val) & ~kSignMask) == 0; + } + + bool operator==(const Float16Impl& rhs) const noexcept + { + if (IsNaN() || rhs.IsNaN()) { + // IEEE defines that NaN is not equal to anything, including itself. + return false; + } + return val == rhs.val; + } + + bool operator!=(const Float16Impl& rhs) const noexcept { return !(*this == rhs); } + + bool operator<(const Float16Impl& rhs) const noexcept + { + if (IsNaN() || rhs.IsNaN()) { + // IEEE defines that NaN is unordered with respect to everything, including itself. + return false; + } + + const bool left_is_negative = IsNegative(); + if (left_is_negative != rhs.IsNegative()) { + // When the signs of left and right differ, we know that left is less than right if it is + // the negative value. The exception to this is if both values are zero, in which case IEEE + // says they should be equal, even if the signs differ. + return left_is_negative && !AreZero(*this, rhs); + } + return (val != rhs.val) && ((val < rhs.val) ^ left_is_negative); + } +}; + +// The following Float16_t conversions are based on the code from +// Eigen library. + +// The conversion routines are Copyright (c) Fabian Giesen, 2016. +// The original license follows: +// +// Copyright (c) Fabian Giesen, 2016 +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +namespace detail +{ +union float32_bits { + unsigned int u; + float f; +}; +}; // namespace detail + +template +inline constexpr uint16_t Float16Impl::ToUint16Impl(float v) noexcept +{ + detail::float32_bits f{}; + f.f = v; + + constexpr detail::float32_bits f32infty = {255 << 23}; + constexpr detail::float32_bits f16max = {(127 + 16) << 23}; + constexpr detail::float32_bits denorm_magic = {((127 - 15) + (23 - 10) + 1) << 23}; + constexpr unsigned int sign_mask = 0x80000000u; + uint16_t val = static_cast(0x0u); + + unsigned int sign = f.u & sign_mask; + f.u ^= sign; + + // NOTE all the integer compares in this function can be safely + // compiled into signed compares since all operands are below + // 0x80000000. Important if you want fast straight SSE2 code + // (since there's no unsigned PCMPGTD). + + if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) + val = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf + } else { // (De)normalized number or zero + if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero + // use a magic value to align our 10 mantissa bits at the bottom of + // the float. as long as FP addition is round-to-nearest-even this + // just works. + f.f += denorm_magic.f; + + // and one integer subtract of the bias later, we have our final float! + val = static_cast(f.u - denorm_magic.u); + } else { + unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd + + // update exponent, rounding bias part 1 + // Equivalent to `f.u += ((unsigned int)(15 - 127) << 23) + 0xfff`, but + // without arithmetic overflow. + f.u += 0xc8000fffU; + // rounding bias part 2 + f.u += mant_odd; + // take the bits! + val = static_cast(f.u >> 13); + } + } + + val |= static_cast(sign >> 16); + return val; +} + +template +inline float Float16Impl::ToFloatImpl() const noexcept +{ + constexpr detail::float32_bits magic = {113 << 23}; + constexpr unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift + detail::float32_bits o{}; + + o.u = (val & 0x7fff) << 13; // exponent/mantissa bits + unsigned int exp = shifted_exp & o.u; // just the exponent + o.u += (127 - 15) << 23; // exponent adjust + + // handle exponent special cases + if (exp == shifted_exp) { // Inf/NaN? + o.u += (128 - 16) << 23; // extra exp adjust + } else if (exp == 0) { // Zero/Denormal? + o.u += 1 << 23; // extra exp adjust + o.f -= magic.f; // re-normalize + } + + // Attempt to workaround the Internal Compiler Error on ARM64 + // for bitwise | operator, including std::bitset +#if (defined _MSC_VER) && (defined _M_ARM || defined _M_ARM64 || defined _M_ARM64EC) + if (IsNegative()) { + return -o.f; + } +#else + // original code: + o.u |= (val & 0x8000U) << 16U; // sign bit +#endif + return o.f; +} + +/// Shared implementation between public and internal classes. CRTP pattern. 
+template +struct BFloat16Impl { + protected: + /// + /// Converts from float to uint16_t float16 representation + /// + /// + /// + static uint16_t ToUint16Impl(float v) noexcept; + + /// + /// Converts bfloat16 to float + /// + /// float representation of bfloat16 value + float ToFloatImpl() const noexcept; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + uint16_t AbsImpl() const noexcept + { + return static_cast(val & ~kSignMask); + } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + uint16_t NegateImpl() const noexcept + { + return IsNaN() ? val : static_cast(val ^ kSignMask); + } + + public: + // uint16_t special values + static constexpr uint16_t kSignMask = 0x8000U; + static constexpr uint16_t kBiasedExponentMask = 0x7F80U; + static constexpr uint16_t kPositiveInfinityBits = 0x7F80U; + static constexpr uint16_t kNegativeInfinityBits = 0xFF80U; + static constexpr uint16_t kPositiveQNaNBits = 0x7FC1U; + static constexpr uint16_t kNegativeQNaNBits = 0xFFC1U; + static constexpr uint16_t kSignaling_NaNBits = 0x7F80U; + static constexpr uint16_t kEpsilonBits = 0x0080U; + static constexpr uint16_t kMinValueBits = 0xFF7FU; + static constexpr uint16_t kMaxValueBits = 0x7F7FU; + static constexpr uint16_t kRoundToNearest = 0x7FFFU; + static constexpr uint16_t kOneBits = 0x3F80U; + static constexpr uint16_t kMinusOneBits = 0xBF80U; + + uint16_t val{0}; + + BFloat16Impl() = default; + + /// + /// Checks if the value is negative + /// + /// true if negative + bool IsNegative() const noexcept + { + return static_cast(val) < 0; + } + + /// + /// Tests if the value is NaN + /// + /// true if NaN + bool IsNaN() const noexcept + { + return AbsImpl() > kPositiveInfinityBits; + } + + /// + /// Tests if the value is finite + /// + /// true if finite + bool IsFinite() const noexcept + { + return AbsImpl() < kPositiveInfinityBits; + } + + /// + /// Tests if the value represents positive infinity. + /// + /// true if positive infinity + bool IsPositiveInfinity() const noexcept + { + return val == kPositiveInfinityBits; + } + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + bool IsNegativeInfinity() const noexcept + { + return val == kNegativeInfinityBits; + } + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + bool IsInfinity() const noexcept + { + return AbsImpl() == kPositiveInfinityBits; + } + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. + bool IsNaNOrZero() const noexcept + { + auto abs = AbsImpl(); + return (abs == 0 || abs > kPositiveInfinityBits); + } + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). + /// + /// True if so + bool IsNormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) != 0); // is not subnormal (has a non-zero exponent) + } + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + bool IsSubnormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) == 0); // is subnormal (has a zero exponent) + } + + /// + /// Creates an instance that represents absolute value. 
+ /// + /// Absolute value + Derived Abs() const noexcept { return Derived::FromBits(AbsImpl()); } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + Derived Negate() const noexcept { return Derived::FromBits(NegateImpl()); } + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + static bool AreZero(const BFloat16Impl& lhs, const BFloat16Impl& rhs) noexcept + { + // IEEE defines that positive and negative zero are equal, this gives us a quick equality check + // for two values by or'ing the private bits together and stripping the sign. They are both zero, + // and therefore equivalent, if the resulting value is still zero. + return static_cast((lhs.val | rhs.val) & ~kSignMask) == 0; + } +}; + +template +inline uint16_t BFloat16Impl::ToUint16Impl(float v) noexcept +{ + uint16_t result; + if (std::isnan(v)) { + result = kPositiveQNaNBits; + } else { + auto get_msb_half = [](float fl) { + uint16_t result; +#ifdef __cpp_if_constexpr + if constexpr (detail::endian::native == detail::endian::little) +#else + if (detail::endian::native == detail::endian::little) +#endif + { + std::memcpy(&result, reinterpret_cast(&fl) + sizeof(uint16_t), sizeof(uint16_t)); + } else { + std::memcpy(&result, &fl, sizeof(uint16_t)); + } + return result; + }; + + uint16_t upper_bits = get_msb_half(v); + union { + uint32_t U32; + float F32; + }; + F32 = v; + U32 += (upper_bits & 1) + kRoundToNearest; + result = get_msb_half(F32); + } + return result; +} + +template +inline float BFloat16Impl::ToFloatImpl() const noexcept +{ + if (IsNaN()) { + return std::numeric_limits::quiet_NaN(); + } + float result; + char* const first = reinterpret_cast(&result); + char* const second = first + sizeof(uint16_t); +#ifdef __cpp_if_constexpr + if constexpr (detail::endian::native == detail::endian::little) +#else + if (detail::endian::native == detail::endian::little) +#endif + { + std::memset(first, 0, sizeof(uint16_t)); + std::memcpy(second, &val, sizeof(uint16_t)); + } else { + std::memcpy(first, &val, sizeof(uint16_t)); + std::memset(second, 0, sizeof(uint16_t)); + } + return result; +} + +/** \brief IEEE 754 half-precision floating point data type + * + * \details This struct is used for converting float to float16 and back + * so the user could feed inputs and fetch outputs using these type. + * + * The size of the structure should align with uint16_t and one can freely cast + * uint16_t buffers to/from Ort::Float16_t to feed and retrieve data. + * + * \code{.unparsed} + * // This example demonstrates converion from float to float16 + * constexpr float values[] = {1.f, 2.f, 3.f, 4.f, 5.f}; + * std::vector fp16_values; + * fp16_values.reserve(std::size(values)); + * std::transform(std::begin(values), std::end(values), std::back_inserter(fp16_values), + * [](float value) { return Ort::Float16_t(value); }); + * + * \endcode + */ +struct Float16_t : OrtDataType::Float16Impl { + private: + /// + /// Constructor from a 16-bit representation of a float16 value + /// No conversion is done here. 
+ /// + /// 16-bit representation + constexpr explicit Float16_t(uint16_t v) noexcept { val = v; } + + public: + using Base = OrtDataType::Float16Impl; + + /// + /// Default constructor + /// + Float16_t() = default; + + /// + /// Explicit conversion to uint16_t representation of float16. + /// + /// uint16_t bit representation of float16 + /// new instance of Float16_t + constexpr static Float16_t FromBits(uint16_t v) noexcept { return Float16_t(v); } + + /// + /// __ctor from float. Float is converted into float16 16-bit representation. + /// + /// float value + explicit Float16_t(float v) noexcept { val = Base::ToUint16Impl(v); } + + /// + /// Converts float16 to float + /// + /// float representation of float16 value + float ToFloat() const noexcept { return Base::ToFloatImpl(); } + + /// + /// Checks if the value is negative + /// + /// true if negative + using Base::IsNegative; + + /// + /// Tests if the value is NaN + /// + /// true if NaN + using Base::IsNaN; + + /// + /// Tests if the value is finite + /// + /// true if finite + using Base::IsFinite; + + /// + /// Tests if the value represents positive infinity. + /// + /// true if positive infinity + using Base::IsPositiveInfinity; + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + using Base::IsNegativeInfinity; + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + using Base::IsInfinity; + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. + using Base::IsNaNOrZero; + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). + /// + /// True if so + using Base::IsNormal; + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + using Base::IsSubnormal; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + using Base::Abs; + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + using Base::Negate; + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + using Base::AreZero; + + /// + /// User defined conversion operator. Converts Float16_t to float. + /// + explicit operator float() const noexcept { return ToFloat(); } + + using Base::operator==; + using Base::operator!=; + using Base::operator<; +}; + +static_assert(sizeof(Float16_t) == sizeof(uint16_t), "Sizes must match"); + +/** \brief bfloat16 (Brain Floating Point) data type + * + * \details This struct is used for converting float to bfloat16 and back + * so the user could feed inputs and fetch outputs using these type. + * + * The size of the structure should align with uint16_t and one can freely cast + * uint16_t buffers to/from Ort::BFloat16_t to feed and retrieve data. 
+ * + * \code{.unparsed} + * // This example demonstrates converion from float to float16 + * constexpr float values[] = {1.f, 2.f, 3.f, 4.f, 5.f}; + * std::vector bfp16_values; + * bfp16_values.reserve(std::size(values)); + * std::transform(std::begin(values), std::end(values), std::back_inserter(bfp16_values), + * [](float value) { return Ort::BFloat16_t(value); }); + * + * \endcode + */ +struct BFloat16_t : OrtDataType::BFloat16Impl { + private: + /// + /// Constructor from a uint16_t representation of bfloat16 + /// used in FromBits() to escape overload resolution issue with + /// constructor from float. + /// No conversion is done. + /// + /// 16-bit bfloat16 value + constexpr explicit BFloat16_t(uint16_t v) noexcept { val = v; } + + public: + using Base = OrtDataType::BFloat16Impl; + + BFloat16_t() = default; + + /// + /// Explicit conversion to uint16_t representation of bfloat16. + /// + /// uint16_t bit representation of bfloat16 + /// new instance of BFloat16_t + static constexpr BFloat16_t FromBits(uint16_t v) noexcept { return BFloat16_t(v); } + + /// + /// __ctor from float. Float is converted into bfloat16 16-bit representation. + /// + /// float value + explicit BFloat16_t(float v) noexcept { val = Base::ToUint16Impl(v); } + + /// + /// Converts bfloat16 to float + /// + /// float representation of bfloat16 value + float ToFloat() const noexcept { return Base::ToFloatImpl(); } + + /// + /// Checks if the value is negative + /// + /// true if negative + using Base::IsNegative; + + /// + /// Tests if the value is NaN + /// + /// true if NaN + using Base::IsNaN; + + /// + /// Tests if the value is finite + /// + /// true if finite + using Base::IsFinite; + + /// + /// Tests if the value represents positive infinity. + /// + /// true if positive infinity + using Base::IsPositiveInfinity; + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + using Base::IsNegativeInfinity; + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + using Base::IsInfinity; + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. + using Base::IsNaNOrZero; + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). + /// + /// True if so + using Base::IsNormal; + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + using Base::IsSubnormal; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + using Base::Abs; + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + using Base::Negate; + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + using Base::AreZero; + + /// + /// User defined conversion operator. Converts BFloat16_t to float. 
+ /// + explicit operator float() const noexcept { return ToFloat(); } + + // We do not have an inherited impl for the below operators + // as the internal class implements them a little differently + bool operator==(const BFloat16_t& rhs) const noexcept; + bool operator!=(const BFloat16_t& rhs) const noexcept { return !(*this == rhs); } + bool operator<(const BFloat16_t& rhs) const noexcept; +}; + +static_assert(sizeof(BFloat16_t) == sizeof(uint16_t), "Sizes must match"); + +} // namespace OrtDataType + +} // namespace o2 \ No newline at end of file diff --git a/Common/ML/include/ML/onnx_interface.h b/Common/ML/include/ML/onnx_interface.h deleted file mode 100644 index fcc02a49996ea..0000000000000 --- a/Common/ML/include/ML/onnx_interface.h +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. - -/// -/// \file model.h -/// -/// \author Christian Sonnabend -/// -/// \brief A general-purpose class for ONNX models -/// - -#ifndef COMMON_ML_ONNX_INTERFACE_H -#define COMMON_ML_ONNX_INTERFACE_H - -// C++ and system includes -#if __has_include() -#include -#else -#include -#endif -#include -#include -#include -#include -#include - -// O2 includes -#include "Framework/Logger.h" - -namespace o2 -{ - -namespace ml -{ - -class OnnxModel -{ - public: - OnnxModel() : mMemoryInfo(Ort::MemoryInfo::CreateCpu(OrtAllocatorType, OrtMemType)) {}; - virtual ~OnnxModel() = default; - - // Inferencing - void init(std::string, bool = false, int = 0); - // float* inference(std::vector, int = 0); - // float* inference(std::vector, int = 0); - template float* inference(T input, unsigned int size); - template std::vector inference_vector(T input, unsigned int size); - - // Reset session - #if __has_include() - void resetSession() { mSession.reset(new Ort::Experimental::Session{*mEnv, modelPath, sessionOptions}); }; - #else - void resetSession() { mSession.reset(new Ort::Session{*mEnv, modelPath.c_str(), sessionOptions}); }; - #endif - - // Getters & Setters - Ort::SessionOptions* getSessionOptions() { return &sessionOptions; } // For optimizations in post - #if __has_include() - std::shared_ptr getSession() { return mSession; } - #else - std::shared_ptr getSession() { return mSession; } - #endif - std::vector> getNumInputNodes() const { return mInputShapes; } - std::vector> getNumOutputNodes() const { return mOutputShapes; } - void setActiveThreads(int); - - private: - // Environment variables for the ONNX runtime - std::shared_ptr mEnv = nullptr; - std::shared_ptr mSession = nullptr; ///< ONNX session - Ort::MemoryInfo mMemoryInfo; - Ort::SessionOptions sessionOptions; - - // Input & Output specifications of the loaded network - std::vector mInputNames; - std::vector> mInputShapes; - std::vector mOutputNames; - std::vector> mOutputShapes; - - // Environment settings - std::string modelPath; - int activeThreads = 0; - - // Internal function for printing the shape of tensors - std::string printShape(const std::vector&); -}; - -} // namespace gpu - -} // namespace GPUCA_NAMESPACE - -#endif // 
COMMON_ML_ONNX_INTERFACE_H
\ No newline at end of file
diff --git a/Common/ML/include/ML/ort_interface.h b/Common/ML/include/ML/ort_interface.h
new file mode 100644
index 0000000000000..a365860db3279
--- /dev/null
+++ b/Common/ML/include/ML/ort_interface.h
@@ -0,0 +1,94 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file ort_interface.h
+/// \author Christian Sonnabend
+/// \brief A header library for loading ONNX models and inferencing them on CPU and GPU
+
+#ifndef O2_ML_ORT_INTERFACE_H
+#define O2_ML_ORT_INTERFACE_H
+
+// C++ and system includes
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+// O2 includes
+#include "Framework/Logger.h"
+
+namespace o2
+{
+
+namespace ml
+{
+
+class OrtModel
+{
+
+ public:
+  // Constructor
+  OrtModel() = default;
+  OrtModel(std::unordered_map<std::string, std::string> optionsMap) { reset(optionsMap); }
+  void init(std::unordered_map<std::string, std::string> optionsMap) { reset(optionsMap); }
+  void reset(std::unordered_map<std::string, std::string>);
+
+  virtual ~OrtModel() = default;
+
+  // Conversion
+  template <class I, class O>
+  std::vector<O> v2v(std::vector<I>&, bool = true);
+
+  // Inferencing
+  template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/3rdparty/GPUORTFloat16.h
+  std::vector<O> inference(std::vector<I>&);
+
+  template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
+  std::vector<O> inference(std::vector<std::vector<I>>&);
+
+  // template <class I, class T, class O> // class I is the input data type, e.g. float, class T the throughput data type and class O is the output data type
+  // std::vector<O> inference(std::vector<I>&);
+
+  // Reset session
+  void resetSession();
+
+  std::vector<std::vector<int64_t>> getNumInputNodes() const { return mInputShapes; }
+  std::vector<std::vector<int64_t>> getNumOutputNodes() const { return mOutputShapes; }
+  std::vector<std::string> getInputNames() const { return mInputNames; }
+  std::vector<std::string> getOutputNames() const { return mOutputNames; }
+
+  void setActiveThreads(int threads) { intraOpNumThreads = threads; }
+
+ private:
+  // ORT variables -> need to be hidden as Pimpl
+  struct OrtVariables;
+  OrtVariables* pImplOrt;
+
+  // Input & Output specifications of the loaded network
+  std::vector<const char*> inputNamesChar, outputNamesChar;
+  std::vector<std::string> mInputNames, mOutputNames;
+  std::vector<std::vector<int64_t>> mInputShapes, mOutputShapes;
+
+  // Environment settings
+  std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda
+  int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0;
+
+  std::string printShape(const std::vector<int64_t>&);
+};
+
+} // namespace ml
+
+} // namespace o2
+
+#endif // O2_ML_ORT_INTERFACE_H
\ No newline at end of file
diff --git a/Common/ML/src/onnx_interface.cxx b/Common/ML/src/onnx_interface.cxx
deleted file mode 100644
index c348d4577d47f..0000000000000
--- a/Common/ML/src/onnx_interface.cxx
+++ /dev/null
@@ -1,226 +0,0 @@
-// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
-// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. - -/// -/// \file model.cxx -/// -/// \author Christian Sonnabend -/// -/// \brief A general-purpose class with functions for ONNX model applications -/// - -// ONNX includes -#include "ML/onnx_interface.h" - -namespace o2 -{ - -namespace ml -{ - -std::string OnnxModel::printShape(const std::vector& v) -{ - std::stringstream ss(""); - for (size_t i = 0; i < v.size() - 1; i++) - ss << v[i] << "x"; - ss << v[v.size() - 1]; - return ss.str(); -} - -void OnnxModel::init(std::string localPath, bool enableOptimizations, int threads) -{ - - LOG(info) << "--- ONNX-ML model ---"; - LOG(info) << "Taking model from: " << localPath; - modelPath = localPath; - activeThreads = threads; - -#if __has_include() -#else - mMemoryInfo = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault); -#endif - - /// Enableing optimizations - if(threads != 0){ - // sessionOptions.SetInterOpNumThreads(1); - if(threads == 1){ - sessionOptions.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); - } - else{ - sessionOptions.SetExecutionMode(ExecutionMode::ORT_PARALLEL); - sessionOptions.SetIntraOpNumThreads(threads); - } - } - if (enableOptimizations) { - // sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED); - sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); - // uint32_t coreml_flags = 0; - // coreml_flags |= COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE; - // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CoreML(sessionOptions, coreml_flags)); - } - - mEnv = std::make_shared(ORT_LOGGING_LEVEL_WARNING, "onnx-model"); - #if __has_include() - mSession = std::make_shared(*mEnv, modelPath, sessionOptions); - mInputNames = mSession->GetInputNames(); - mInputShapes = mSession->GetInputShapes(); - mOutputNames = mSession->GetOutputNames(); - mOutputShapes = mSession->GetOutputShapes(); - #else - mSession = std::make_shared(*mEnv, modelPath.c_str(), sessionOptions); - Ort::AllocatorWithDefaultOptions tmpAllocator; - for (size_t i = 0; i < mSession->GetInputCount(); ++i) { - mInputNames.push_back(mSession->GetInputNameAllocated(i, tmpAllocator).get()); - } - for (size_t i = 0; i < mSession->GetInputCount(); ++i) { - mInputShapes.emplace_back(mSession->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } - for (size_t i = 0; i < mSession->GetOutputCount(); ++i) { - mOutputNames.push_back(mSession->GetOutputNameAllocated(i, tmpAllocator).get()); - } - for (size_t i = 0; i < mSession->GetOutputCount(); ++i) { - mOutputShapes.emplace_back(mSession->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } - #endif - - LOG(info) << "Input Nodes:"; - for (size_t i = 0; i < mInputNames.size(); i++) { - LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); - } - - LOG(info) << "Output Nodes:"; - for (size_t i = 0; i < mOutputNames.size(); i++) { - LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); - } - - LOG(info) << "--- Model initialized! 
---"; -} - -// float* OnnxModel::inference(std::vector input, int device_id) -// { - -// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sessionOptions, device_id)); - -// try { -// auto outputTensors = mSession->Run(mInputNames, input, mOutputNames); -// float* outputValues = outputTensors[0].GetTensorMutableData(); -// return outputValues; -// } catch (const Ort::Exception& exception) { -// LOG(error) << "Error running model inference: " << exception.what(); -// } -// return nullptr; -// } - -// float* OnnxModel::inference(std::vector input, int device_id) -// { -// -// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sessionOptions, device_id)); -// -// int64_t size = input.size(); -// assert(size % mInputShapes[0][1] == 0); -// std::vector inputShape{size / mInputShapes[0][1], mInputShapes[0][1]}; -// std::vector inputTensors; -// inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), size, inputShape)); -// try { -// auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); -// float* outputValues = outputTensors[0].GetTensorMutableData(); -// return outputValues; -// } catch (const Ort::Exception& exception) { -// LOG(error) << "Error running model inference: " << exception.what(); -// } -// return nullptr; -// } - -template -float* OnnxModel::inference(T input, unsigned int size) -{ - std::vector inputShape = mInputShapes[0]; - inputShape[0] = size; - std::vector inputTensors; - size_t mem_size = 1; - for(auto elem : inputShape){ - mem_size*=elem; - } -#if __has_include() - inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); - try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return outputValues; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } -#else - std::vector tmpInputs; - std::vector tmpOutputs; - inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), 1)); - try { - auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return outputValues; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } -#endif - // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; - return nullptr; -} - -template -std::vector OnnxModel::inference_vector(T input, unsigned int size) -{ - std::vector inputShape = mInputShapes[0]; - inputShape[0] = size; - std::vector inputTensors; - // std::vector outputValues; - size_t mem_size = 1; - for(auto elem : inputShape){ - mem_size*=elem; - } -#if __has_include() - inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); - try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } -#else - std::vector tmpInputs; - 
std::vector tmpOutputs; - inputTensors.emplace_back(Ort::Value::CreateTensor(input.data(), mem_size, inputShape)); - try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } -#endif - return std::vector{}; -} - -void OnnxModel::setActiveThreads(int threads) -{ - activeThreads = threads; -} - -template float* OnnxModel::inference(std::vector, unsigned int); -template std::vector OnnxModel::inference_vector(std::vector, unsigned int); - -} // namespace gpu - -} // namespace GPUCA_NAMESPACE \ No newline at end of file diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx new file mode 100644 index 0000000000000..84a06ce1da068 --- /dev/null +++ b/Common/ML/src/ort_interface.cxx @@ -0,0 +1,262 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file ort_interface.cxx +/// \author Christian Sonnabend +/// \brief A header library for loading ONNX models and inferencing them on CPU and GPU + +#include "ML/ort_interface.h" +#include "ML/3rdparty/GPUORTFloat16.h" + +// ONNX includes +#include + +namespace o2 +{ + +namespace ml +{ + +struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file + // ORT runtime objects + Ort::RunOptions runOptions; + std::shared_ptr env = nullptr; + std::shared_ptr session = nullptr; ///< ONNX session + Ort::SessionOptions sessionOptions; + Ort::AllocatorWithDefaultOptions allocator; + Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); +}; + +void OrtModel::reset(std::unordered_map optionsMap){ + + pImplOrt = new OrtVariables(); + + // Load from options map + if(!optionsMap.contains("model-path")){ + LOG(fatal) << "(ORT) Model path cannot be empty!"; + } + modelPath = optionsMap["model-path"]; + device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU"); + dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); + deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); + allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); + intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); + loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 0); + enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); + enableOptimizations = (optionsMap.contains("enable-optimizations") ? 
std::stoi(optionsMap["enable-optimizations"]) : 0); + + std::string dev_mem_str = "Hip"; +#ifdef ORT_ROCM_BUILD + if(device == "ROCM") { + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId)); + LOG(info) << "(ORT) ROCM execution provider set"; + } +#endif +#ifdef ORT_MIGRAPHX_BUILD + if(device == "MIGRAPHX") { + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId)); + LOG(info) << "(ORT) MIGraphX execution provider set"; + } +#endif +#ifdef ORT_CUDA_BUILD + if(device == "CUDA") { + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId)); + LOG(info) << "(ORT) CUDA execution provider set"; + dev_mem_str = "Cuda"; + } +#endif + + if(allocateDeviceMemory){ + pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); + LOG(info) << "(ORT) Memory info set to on-device memory"; + } + + if(device == "CPU") { + (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); + if(intraOpNumThreads > 1){ + (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); + } else if(intraOpNumThreads == 1){ + (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + } + LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " threads"; + } + + (pImplOrt->sessionOptions).DisableMemPattern(); + (pImplOrt->sessionOptions).DisableCpuMemArena(); + + if(enableProfiling){ + if(optionsMap.contains("profiling-output-path")){ + (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); + } else { + LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. Disabling profiling for now."; + (pImplOrt->sessionOptions).DisableProfiling(); + } + } else { + (pImplOrt->sessionOptions).DisableProfiling(); + } + (pImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); + (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); + + pImplOrt->env = std::make_shared(OrtLoggingLevel(loggingLevel), (optionsMap["onnx-environment-name"].empty() ? 
"onnx_model_inference" : optionsMap["onnx-environment-name"].c_str())); + (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); + + for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { + mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { + mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { + mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { + mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + + inputNamesChar.resize(mInputNames.size(), nullptr); + std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), + [&](const std::string& str) { return str.c_str(); }); + outputNamesChar.resize(mOutputNames.size(), nullptr); + std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), + [&](const std::string& str) { return str.c_str(); }); + + // Print names + if(loggingLevel > 1) { + LOG(info) << "Input Nodes:"; + for (size_t i = 0; i < mInputNames.size(); i++) { + LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); + } + + LOG(info) << "Output Nodes:"; + for (size_t i = 0; i < mOutputNames.size(); i++) { + LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); + } + } +} + +void OrtModel::resetSession() { + (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); +} + +template +std::vector OrtModel::v2v(std::vector& input, bool clearInput) { + if constexpr (std::is_same_v){ + return input; + } else { + std::vector output(input.size()); + std::transform(std::begin(input), std::end(input), std::begin(output), [](I f) { return O(f); }); + if(clearInput) input.clear(); + return output; + } +} + +template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector& input){ + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(input)).data(), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template // class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector>& input){ + std::vector inputTensor; + for(auto i : input){ + std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(i)).data(), i.size(), inputShape.data(), inputShape.size())); + } + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +std::string OrtModel::printShape(const std::vector& v) +{ + std::stringstream ss(""); + for (size_t i = 0; i < v.size() - 1; i++) + ss << v[i] << "x"; + ss << v[v.size() - 1]; + return ss.str(); +} + +template <> std::vector OrtModel::inference(std::vector& input) { + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + float* outputValues = outputTensors[0].template GetTensorMutableData(); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template <> std::vector OrtModel::inference(std::vector& input) { + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + float* outputValues = outputTensors[0].template GetTensorMutableData(); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template <> std::vector OrtModel::inference(std::vector& input) { + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template <> std::vector OrtModel::inference(std::vector& input) { + 
std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template <> std::vector OrtModel::inference(std::vector>& input) { + std::vector inputTensor; + for(auto i : input){ + std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); + } + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +} // namespace ml + +} // namespace o2 \ No newline at end of file diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index ad8e53309beee..0efed3ad4c76c 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -63,7 +63,6 @@ set(SRCS Merger/GPUTPCGlobalDebugSortKernels.cxx Merger/GPUTPCGMPhysicalTrackModel.cxx Merger/GPUTPCGMPolynomialFieldManager.cxx - ML/onnx_interface.cxx DataTypes/GPUTRDTrack.cxx TRDTracking/GPUTRDTracker.cxx TRDTracking/GPUTRDTrackletWord.cxx @@ -313,7 +312,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") PUBLIC_LINK_LIBRARIES O2::GPUCommon O2::ReconstructionDataFormats O2::TPCFastTransformation - ONNXRuntime::ONNXRuntime + O2::ML PRIVATE_LINK_LIBRARIES O2::DataFormatsTPC SOURCES ${SRCS_DATATYPE_HEADERS}) target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAVE_O2HEADERS) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 31d46b928a33f..24c1ea6a6e2ce 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -294,16 +294,26 @@ AddOption(printSettings, bool, false, "", 0, "Print all settings when initializi AddVariable(eventDisplay, GPUCA_NAMESPACE::gpu::GPUDisplayFrontendInterface*, nullptr) AddSubConfig(GPUSettingsProcessingRTC, rtc) AddSubConfig(GPUSettingsProcessingParam, param) -AddOption(applyNNclusterizer, int, 0, "", 0, "(Bool, default = 0), if the neural network clsuterizer should be used.") -AddOption(nnClusterizerVerbosity, int, 1, "", 0, "0: No messages; 1: Warnings; 2: Warnings + major debugs; >3: All debugs") +AddOption(applyNNclusterizer, int, 0, "", 0, "(bool, default = 0), if the neural network clusterizer should be used.") +AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Specify inference device (cpu (default), rocm, cuda)") 
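+// NOTE: a minimal sketch of how the nnInference* options above can feed o2::ml::OrtModel; the exact
+// wiring in the chain is an assumption here, but the map keys are the ones parsed by OrtModel::reset():
+//   std::unordered_map<std::string, std::string> opts{
+//     {"model-path", GetProcessingSettings().nnClassificationPath},
+//     {"device", GetProcessingSettings().nnInferenceDevice},
+//     {"device-id", std::to_string(GetProcessingSettings().nnInferenceDeviceId)},
+//     {"allocate-device-memory", std::to_string(GetProcessingSettings().nnInferenceAllocateDevMem)},
+//     {"intra-op-num-threads", std::to_string(GetProcessingSettings().nnInferenceThreadsPerNN)},
+//     {"enable-optimizations", std::to_string(GetProcessingSettings().nnInferenceEnableOrtOptimization)},
+//     {"enable-profiling", std::to_string(GetProcessingSettings().nnInferenceOrtProfiling)}};
+//   o2::ml::OrtModel model(opts); // equivalently: model.init(opts)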
+AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Specify inference device id")
+AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0), if the device memory should be allocated for inference")
+AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16
+AddOption(nnInferenceThreadsPerNN, int, 0, "", 0, "Number of threads used to evaluate one neural network")
+AddOption(nnInferenceEnableOrtOptimization, unsigned int, 1, "", 0, "Enables graph optimizations in ONNX Runtime. Can be greater than 1!")
+AddOption(nnInferenceOrtProfiling, int, 0, "", 0, "Enables profiling of model execution in ONNX Runtime")
+AddOption(nnInferenceOrtProfilingPath, std::string, ".", "", 0, "If nnInferenceOrtProfiling is set, the path to store the profiling data")
+AddOption(nnInferenceVerbosity, int, 1, "", 0, "0: No messages; 1: Warnings; 2: Warnings + major debugs; >3: All debugs")
+AddOption(nnClusterizerAddIndexData, int, 1, "", 0, "If normalized index data (sector, row, pad) should be appended to the input")
+AddOption(nnClusterizerSizeInputRow, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)")
+AddOption(nnClusterizerSizeInputPad, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)")
+AddOption(nnClusterizerSizeInputTime, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)")
+AddOption(nnClusterizerUseCFregression, int, 0, "", 0, "(bool, default = false) If true, use the regression from the native clusterizer and not the NN")
+AddOption(nnClusterizerBatchedMode, unsigned int, 1, "", 0, "(int, default = 1) If >1, the NN is evaluated on batched input of size specified in this variable")
 AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path")
-AddOption(nnRegressionPath, std::string, "./network_reg.onnx", "", 0, "The regression network path")
-AddOption(nnClassThreshold, float, 0.16, "", 0, "The cutoff at which clusters will be accepted / rejected.")
-AddOption(nnSigmoidTrafoThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as acitvation function in the last layer or not).")
-AddOption(nnAddIndexData, int, 1, "", 0, "If normalized index data (sector, row, pad), should be appended to the input")
-AddOption(nnSizeInputRow, int, 3, "", 0, "Size of the input to the NN (currently calcualted as (length-1)/2")
-AddOption(nnSizeInputPad, int, 3, "", 0, "Size of the input to the NN (currently calcualted as (length-1)/2")
-AddOption(nnSizeInputTime, int, 3, "", 0, "Size of the input to the NN (currently calcualted as (length-1)/2")
+AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.")
+AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path")
+AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as activation function in the last layer or not).")
 AddHelp("help", 'h')
 EndConfig()
 #endif // __OPENCL__
diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx
index f413598c13f59..528c683944ef1 100644
--- a/GPU/GPUTracking/Global/GPUChainTracking.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx
@@ -742,7 +742,7 @@ int32_t GPUChainTracking::RunChain()
       return 1;
     }
   } else if (mIOPtrs.tpcPackedDigits || mIOPtrs.tpcZS) {
-    if (runRecoStep(RecoStep::TPCClusterFinding, &GPUChainTracking::RunTPCClusterizer, false)) { // FIXME: This enables the neural network clusterization -> Need to actually set this as configurable
+    if (runRecoStep(RecoStep::TPCClusterFinding, &GPUChainTracking::RunTPCClusterizer, false)) {
       return 1;
     }
   }
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index e69c3d15c6fc2..eafd50a72424f 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -841,7 +841,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
         runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::updatePeaks>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
       } else {
-        // FIXME: This needs to be removed when I actually apply the NN! For now its onyl to make the code work
+        // FIXME: This potentially needs to be removed when I actually apply the NN. For now it's only to make the code work
         runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
         runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::updatePeaks>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}});
       }
@@ -875,16 +875,60 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
       if(GetProcessingSettings().applyNNclusterizer){
-        clusterer.model_class.init(GetProcessingSettings().nnClassificationPath, 1, 1, GetProcessingSettings().nnClusterizerVerbosity);
-        clusterer.model_reg.init(GetProcessingSettings().nnRegressionPath, 1, 1, GetProcessingSettings().nnClusterizerVerbosity);
-        clusterer.nnSizeInputRow = GetProcessingSettings().nnSizeInputRow;
-        clusterer.nnSizeInputPad = GetProcessingSettings().nnSizeInputPad;
-        clusterer.nnSizeInputTime = GetProcessingSettings().nnSizeInputTime;
-        clusterer.nnAddIndexData = GetProcessingSettings().nnAddIndexData;
+        // Settings for the clusterizer
+        clusterer.nnClusterizerUseCFregression = GetProcessingSettings().nnClusterizerUseCFregression;
+        clusterer.nnClusterizerSizeInputRow = GetProcessingSettings().nnClusterizerSizeInputRow;
+        clusterer.nnClusterizerSizeInputPad = GetProcessingSettings().nnClusterizerSizeInputPad;
+        clusterer.nnClusterizerSizeInputTime = GetProcessingSettings().nnClusterizerSizeInputTime;
+        clusterer.nnClusterizerAddIndexData = GetProcessingSettings().nnClusterizerAddIndexData;
+        clusterer.nnClusterizerElementSize = ((2*clusterer.nnClusterizerSizeInputRow + 1) * (2*clusterer.nnClusterizerSizeInputPad + 1) * (2*clusterer.nnClusterizerSizeInputTime + 1)) + (clusterer.nnClusterizerAddIndexData ? 3 : 0);
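[Editorial sketch, not part of the patch: a quick sanity check of the input-size bookkeeping above. The constants mirror the defaults in GPUSettingsList.h; nothing else is assumed.]

#include <iostream>

int main()
{
  int sizeRow = 3, sizePad = 3, sizeTime = 3; // defaults of nnClusterizerSizeInput{Row,Pad,Time}
  bool addIndexData = true;                   // default of nnClusterizerAddIndexData
  // Same formula as the nnClusterizerElementSize line above:
  int elementSize = (2 * sizeRow + 1) * (2 * sizePad + 1) * (2 * sizeTime + 1) + (addIndexData ? 3 : 0);
  std::cout << elementSize << std::endl;      // 7*7*7 + 3 = 346 input values per cluster candidate
  return 0;
}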
+        clusterer.nnClusterizerBatchedMode = GetProcessingSettings().nnClusterizerBatchedMode;
+        clusterer.nnClusterizerVerbosity = GetProcessingSettings().nnInferenceVerbosity;
+
+        // Settings for the NN evaluation
         clusterer.nnClassThreshold = GetProcessingSettings().nnClassThreshold;
-        clusterer.nnSigmoidTrafoThreshold = GetProcessingSettings().nnSigmoidTrafoThreshold;
-        clusterer.nnClusterizerVerbosity = GetProcessingSettings().nnClusterizerVerbosity;
-        runKernel<GPUTPCNNClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0);
+        clusterer.nnSigmoidTrafoClassThreshold = GetProcessingSettings().nnSigmoidTrafoClassThreshold;
+
+        // Settings for the neural network evaluation
+        clusterer.OrtOptions = {
+          {"model-path", GetProcessingSettings().nnClassificationPath},
+          {"device", GetProcessingSettings().nnInferenceDevice},
+          {"device-id", std::to_string(GetProcessingSettings().nnInferenceDeviceId)},
+          {"allocate-device-memory", std::to_string(GetProcessingSettings().nnInferenceAllocateDevMem)},
+          {"dtype", GetProcessingSettings().nnInferenceDtype},
+          {"intra-op-num-threads", std::to_string(GetProcessingSettings().nnInferenceThreadsPerNN)},
+          {"enable-optimizations", std::to_string(GetProcessingSettings().nnInferenceEnableOrtOptimization)},
+          {"enable-profiling", std::to_string(GetProcessingSettings().nnInferenceOrtProfiling)},
+          {"profiling-output-path", GetProcessingSettings().nnInferenceOrtProfilingPath},
+          {"logging-level", std::to_string(GetProcessingSettings().nnInferenceVerbosity)}
+        };
+        clusterer.model_class.init(clusterer.OrtOptions);
+        if(!clusterer.nnClusterizerUseCFregression){
+          std::vector<std::string> reg_model_paths = o2::utils::Str::tokenize(GetProcessingSettings().nnRegressionPath, ':');
+          if(clusterer.model_class.getNumOutputNodes()[0][1] == 1){
+            clusterer.OrtOptions["model-path"] = reg_model_paths[0];
+            clusterer.model_reg_1.init(clusterer.OrtOptions);
+          } else {
+            if(reg_model_paths.size() == 1){
+              clusterer.OrtOptions["model-path"] = reg_model_paths[0];
+              clusterer.model_reg_1.init(clusterer.OrtOptions);
+            } else {
+              clusterer.OrtOptions["model-path"] = reg_model_paths[0];
+              clusterer.model_reg_1.init(clusterer.OrtOptions);
+              clusterer.OrtOptions["model-path"] = reg_model_paths[1];
+              clusterer.model_reg_2.init(clusterer.OrtOptions);
+            }
+          }
+        } else {
+          runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}});
+          DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
+        }
+
+        if(clusterer.nnSigmoidTrafoClassThreshold){
+          // Inverse sigmoid transformation
+          clusterer.nnClassThreshold = (float)std::log(clusterer.nnClassThreshold/(1.f-clusterer.nnClassThreshold));
+        }
+        runKernel<GPUTPCNNClusterizer>({GetGrid(std::ceil(clusterer.mPmemory->counters.nClusters / 
(float)clusterer.nnClusterizerBatchedMode), lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); } } + if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("Sector %02d Fragment %02d Lane %d: Found clusters: digits %u peaks %u clusters %u", iSlice, fragment.index, lane, (int32_t)clusterer.mPmemory->counters.nPositions, (int32_t)clusterer.mPmemory->counters.nPeaks, (int32_t)clusterer.mPmemory->counters.nClusters); } diff --git a/GPU/GPUTracking/ML/onnx_interface.cxx b/GPU/GPUTracking/ML/onnx_interface.cxx deleted file mode 100644 index 9bb5137ec63dd..0000000000000 --- a/GPU/GPUTracking/ML/onnx_interface.cxx +++ /dev/null @@ -1,235 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. - -/// -/// \file model.cxx -/// -/// \author Christian Sonnabend -/// -/// \brief A general-purpose class with functions for ONNX model applications -/// - -// ONNX includes -#include "ML/onnx_interface.h" - -namespace o2 -{ - -namespace ml -{ - -std::string OnnxModel::printShape(const std::vector& v) -{ - std::stringstream ss(""); - for (size_t i = 0; i < v.size() - 1; i++) - ss << v[i] << "x"; - ss << v[v.size() - 1]; - return ss.str(); -} - -void OnnxModel::init(std::string localPath, bool enableOptimizations, int threads, int verbosity) -{ - - if(verbosity > 1){ - LOG(info) << "--- ONNX-ML model ---"; - LOG(info) << "Taking model from: " << localPath; - } - modelPath = localPath; - activeThreads = threads; - -#if __has_include() -#else - mMemoryInfo = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault); -#endif - - /// Enableing optimizations - if(threads != 0){ - // sessionOptions.SetInterOpNumThreads(1); - if(threads == 1){ - sessionOptions.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); - } - else{ - sessionOptions.SetExecutionMode(ExecutionMode::ORT_PARALLEL); - sessionOptions.SetIntraOpNumThreads(threads); - } - } - if (enableOptimizations) { - // sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED); - sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); - // uint32_t coreml_flags = 0; - // coreml_flags |= COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE; - // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CoreML(sessionOptions, coreml_flags)); - } - - mEnv = std::make_shared(ORT_LOGGING_LEVEL_WARNING, "onnx-model"); - #if __has_include() - mSession = std::make_shared(*mEnv, modelPath, sessionOptions); - mInputNames = mSession->GetInputNames(); - mInputShapes = mSession->GetInputShapes(); - mOutputNames = mSession->GetOutputNames(); - mOutputShapes = mSession->GetOutputShapes(); - #else - mSession = std::make_shared(*mEnv, modelPath.c_str(), sessionOptions); - Ort::AllocatorWithDefaultOptions tmpAllocator; - for (size_t i = 0; i < mSession->GetInputCount(); ++i) { - mInputNames.push_back(mSession->GetInputNameAllocated(i, tmpAllocator).get()); - } - for (size_t i = 0; i < mSession->GetInputCount(); ++i) { - 
mInputShapes.emplace_back(mSession->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } - for (size_t i = 0; i < mSession->GetOutputCount(); ++i) { - mOutputNames.push_back(mSession->GetOutputNameAllocated(i, tmpAllocator).get()); - } - for (size_t i = 0; i < mSession->GetOutputCount(); ++i) { - mOutputShapes.emplace_back(mSession->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } - #endif - - if(verbosity > 1){ - LOG(info) << "Input Nodes:"; - for (size_t i = 0; i < mInputNames.size(); i++) { - LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); - } - - LOG(info) << "Output Nodes:"; - for (size_t i = 0; i < mOutputNames.size(); i++) { - LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); - } - LOG(info) << "--- Model initialized! ---"; - } -} - -// float* OnnxModel::inference(std::vector input, int device_id) -// { - -// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sessionOptions, device_id)); - -// try { -// auto outputTensors = mSession->Run(mInputNames, input, mOutputNames); -// float* outputValues = outputTensors[0].GetTensorMutableData(); -// return outputValues; -// } catch (const Ort::Exception& exception) { -// LOG(error) << "Error running model inference: " << exception.what(); -// } -// return nullptr; -// } - -// float* OnnxModel::inference(std::vector input, int device_id) -// { -// -// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sessionOptions, device_id)); -// -// int64_t size = input.size(); -// assert(size % mInputShapes[0][1] == 0); -// std::vector inputShape{size / mInputShapes[0][1], mInputShapes[0][1]}; -// std::vector inputTensors; -// inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), size, inputShape)); -// try { -// auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); -// float* outputValues = outputTensors[0].GetTensorMutableData(); -// return outputValues; -// } catch (const Ort::Exception& exception) { -// LOG(error) << "Error running model inference: " << exception.what(); -// } -// return nullptr; -// } - -template -float* OnnxModel::inference(T input, unsigned int size) -{ - std::vector inputShape = mInputShapes[0]; - inputShape[0] = size; - std::vector inputTensors; - size_t mem_size = 1; - for(auto elem : inputShape){ - mem_size*=elem; - } -#if __has_include() - inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); - try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return outputValues; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } -#else - std::vector tmpInputs; - std::vector tmpOutputs; - inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), 1)); - try { - auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return outputValues; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } -#endif - // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; - return nullptr; -} - 
-template -std::vector OnnxModel::inference_vector(T input, unsigned int size) -{ - std::vector inputShape = mInputShapes[0]; - inputShape[0] = size; - std::vector inputTensors; - // std::vector outputValues; - size_t mem_size = 1; - for(auto elem : inputShape){ - mem_size*=elem; - } -#if __has_include() - inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); - try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } -#else - std::vector tmpInputs; - std::vector tmpOutputs; - for (unsigned int i = 0; i < mInputNames.size(); i++) { - tmpInputs.emplace_back(mInputNames[i].c_str()); - } - for (unsigned int i = 0; i < mOutputNames.size(); i++) { - tmpOutputs.emplace_back(mOutputNames[i].c_str()); - } - inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); - try { - auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } -#endif - return std::vector{}; -} - -void OnnxModel::setActiveThreads(int threads) -{ - activeThreads = threads; -} - -template float* OnnxModel::inference(std::vector, unsigned int); -template std::vector OnnxModel::inference_vector(std::vector, unsigned int); - -} // namespace gpu - -} // namespace GPUCA_NAMESPACE \ No newline at end of file diff --git a/GPU/GPUTracking/ML/onnx_interface.h b/GPU/GPUTracking/ML/onnx_interface.h deleted file mode 100644 index 17c45f439dc63..0000000000000 --- a/GPU/GPUTracking/ML/onnx_interface.h +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. 
- -/// -/// \file model.h -/// -/// \author Christian Sonnabend -/// -/// \brief A general-purpose class for ONNX models -/// - -#ifndef GPU_ML_ONNX_INTERFACE_H -#define GPU_ML_ONNX_INTERFACE_H - -// C++ and system includes -#if __has_include() -#include -#else -#include -#endif -#include -#include -#include -#include -#include - -// O2 includes -#include "Framework/Logger.h" - -namespace o2 -{ - -namespace ml -{ - -class OnnxModel -{ - - public: - OnnxModel(OrtAllocatorType allocatorType = OrtDeviceAllocator, OrtMemType memoryType = OrtMemTypeCPU) : mMemoryInfo(Ort::MemoryInfo::CreateCpu(allocatorType, memoryType)) {}; - virtual ~OnnxModel() = default; - - // Inferencing - void init(std::string, bool = false, int = 0, int = 0); - // float* inference(std::vector, int = 0); - // float* inference(std::vector, int = 0); - template float* inference(T input, unsigned int size); - template std::vector inference_vector(T input, unsigned int size); - - // Reset session - #if __has_include() - void resetSession() { mSession.reset(new Ort::Experimental::Session{*mEnv, modelPath, sessionOptions}); }; - #else - void resetSession() { mSession.reset(new Ort::Session{*mEnv, modelPath.c_str(), sessionOptions}); }; - #endif - - // Getters & Setters - Ort::SessionOptions* getSessionOptions() { return &sessionOptions; } // For optimizations in post - #if __has_include() - std::shared_ptr getSession() { return mSession; } - #else - std::shared_ptr getSession() { return mSession; } - #endif - std::vector> getNumInputNodes() const { return mInputShapes; } - std::vector> getNumOutputNodes() const { return mOutputShapes; } - void setActiveThreads(int); - - private: - // Environment variables for the ONNX runtime - std::shared_ptr mEnv = nullptr; - std::shared_ptr mSession = nullptr; ///< ONNX session - Ort::MemoryInfo mMemoryInfo; - Ort::SessionOptions sessionOptions; - - // Input & Output specifications of the loaded network - std::vector mInputNames; - std::vector> mInputShapes; - std::vector mOutputNames; - std::vector> mOutputShapes; - - // Environment settings - std::string modelPath; - int activeThreads = 0; - - // Internal function for printing the shape of tensors - std::string printShape(const std::vector&); -}; - -} // namespace gpu - -} // namespace GPUCA_NAMESPACE - -#endif // GPU_ML_ONNX_INTERFACE_H \ No newline at end of file diff --git a/GPU/GPUTracking/TPCClusterFinder/ChargePos.h b/GPU/GPUTracking/TPCClusterFinder/ChargePos.h index f5ca9dbedd5ac..c2ee542f65434 100644 --- a/GPU/GPUTracking/TPCClusterFinder/ChargePos.h +++ b/GPU/GPUTracking/TPCClusterFinder/ChargePos.h @@ -47,6 +47,7 @@ struct ChargePos { GPUdi() tpccf::Row row() const { return gpad / TPC_PADS_PER_ROW_PADDED; } GPUdi() tpccf::Pad pad() const { return gpad % TPC_PADS_PER_ROW_PADDED - GPUCF_PADDING_PAD; } GPUdi() tpccf::TPCFragmentTime time() const { return timePadded - GPUCF_PADDING_TIME; } + GPUdi() tpccf::TPCFragmentTime globalTime() const { return timePadded; } private: // Maps the position of a pad given as row and index in that row to a unique diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index 10b52ca05da71..130453e833911 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -19,7 +19,8 @@ #include "GPUProcessor.h" #include "GPUDataTypes.h" #include "CfFragment.h" -#include "ML/onnx_interface.h" +#include "ML/ort_interface.h" +#include "ML/3rdparty/GPUORTFloat16.h" using 
namespace o2::ml;

@@ -144,16 +145,20 @@ class GPUTPCClusterFinder : public GPUProcessor
   int16_t mZSOffsetId = -1;
   int16_t mOutputId = -1;

-  int nnSizeInputRow = 3;
-  int nnSizeInputPad = 3;
-  int nnSizeInputTime = 3;
-  bool nnAddIndexData = true;
+  int nnClusterizerSizeInputRow = 3;
+  int nnClusterizerSizeInputPad = 3;
+  int nnClusterizerSizeInputTime = 3;
+  int nnClusterizerElementSize = -1;
+  bool nnClusterizerAddIndexData = true;
   float nnClassThreshold = 0.16;
-  bool nnSigmoidTrafoThreshold = 1;
-  int nnClusterizerVerbosity = 1;
-
-  OnnxModel model_class, model_reg;
-
+  bool nnSigmoidTrafoClassThreshold = 1;
+  int nnClusterizerUseCFregression = 0;
+  int nnClusterizerBatchedMode = 1;
+  int nnClusterizerVerbosity = 0;
+
+  std::unordered_map<std::string, std::string> OrtOptions;
+  OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters
+
 #ifndef GPUCA_GPUCODE
   void DumpDigits(std::ostream& out);
   void DumpChargeMap(std::ostream& out, std::string_view);
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
index 98f7cdee72b0c..e6cf745ce3101 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
@@ -34,76 +34,63 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlo
   tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;

-  GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, clusterer.nnSizeInputRow, clusterer.nnSizeInputPad, clusterer.nnSizeInputTime, clusterer.nnAddIndexData, clusterer.nnClassThreshold, clusterer.nnSigmoidTrafoThreshold, clusterer.nnClusterizerVerbosity);
-
+  if(clusterer.OrtOptions["dtype"].find("32") != std::string::npos){
+    GPUTPCNNClusterizer::nn_clusterizer<float>(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
+  } else if(clusterer.OrtOptions["dtype"].find("16") != std::string::npos) {
+    GPUTPCNNClusterizer::nn_clusterizer<OrtDataType::Float16_t>(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
+  } else {
+    LOG(fatal) << "Unsupported data type for neural network clusterizer!";
+  }
   // tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
   //
   // GPUTPCNNClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
 }
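[Editorial sketch, not part of the patch: the dispatch above keys on substrings of the configured nnInferenceDtype option. Assuming the only valid values are "fp32" and "fp16" (as documented in GPUSettingsList.h), the mapping is:]

#include <iostream>
#include <string>

int main()
{
  // Same substring logic as in Thread<0> above.
  for (const std::string dtype : {"fp32", "fp16", "int8"}) {
    if (dtype.find("32") != std::string::npos) {
      std::cout << dtype << " -> nn_clusterizer<float>\n";
    } else if (dtype.find("16") != std::string::npos) {
      std::cout << dtype << " -> nn_clusterizer<OrtDataType::Float16_t>\n";
    } else {
      std::cout << dtype << " -> fatal: unsupported dtype\n";
    }
  }
  return 0;
}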
-// GPUd() void GPUTPCNNClusterizer::exec(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char onlyMC)
-// {
-//   Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
-//   CPU_ONLY(
-//     MCLabelAccumulator labelAcc(clusterer));
-//
-//   tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
-//
-//   std::string path_class = "", path_reg = "";
-//
-//   clusterer.model_class.init(path_class, 1, 0);
-//   clusterer.model_reg.init(path_reg, 1, 0);
-//
-//   GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, true, 0.16, true);
-// }
+int GPUTPCNNClusterizer::padOffset(int row_ref, int row_current, const GPUTPCGeometry& geo)
+{
+  return (int)((geo.NPads(row_current) - geo.NPads(row_ref)) / 2);
+}

-int GPUTPCNNClusterizer::padOffset(int row_ref, int row_current)
+int GPUTPCNNClusterizer::rowOffset(int row, int global_shift)
 {
-  std::vector<int> pad_row_max{
-    65, 65, 65, 67, 67, 67, 69, 69, 69, 71, 71, 71, 73, 73, 73, 73, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 81, 81, 81, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 91, 91, 91, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 99, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 79, 81, 81, 81, 83, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 101, 101, 101, 103, 103, 103, 105, 109, 109, 111, 111, 111, 113, 113, 113, 115, 115, 115, 117, 117, 117, 117, 117, 119, 119, 121, 121, 123, 123, 123, 125, 125, 127, 127, 127, 129, 129, 131, 131, 131, 133, 133, 135, 135, 137, 137
-  };
-  return (int)((pad_row_max[row_ref] - pad_row_max[row_current]) / 2);
+  return (row > 62 ? global_shift : 0);
 }
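[Editorial sketch, not part of the patch: what the two helpers above compute. The pad counts below are invented for illustration; the real values come from GPUTPCGeometry::NPads.]

#include <iostream>

static int nPadsFake(int row) { return (row <= 62) ? 66 + (row / 8) * 2 : 76 + ((row - 63) / 10) * 2; } // made-up pad plane

static int padOffset(int row_ref, int row_current) { return (nPadsFake(row_current) - nPadsFake(row_ref)) / 2; }
static int rowOffset(int row, int global_shift) { return (row > 62 ? global_shift : 0); }

int main()
{
  // padOffset shifts the window by half the pad-count difference, so the
  // window stays centered when it spans rows of different widths.
  std::cout << padOffset(40, 45) << "\n";
  // rowOffset accounts for the artificial gap rows inserted between IROC and OROC:
  std::cout << rowOffset(70, 3) << "\n"; // OROC row: shifted by global_shift
  std::cout << rowOffset(10, 3) << "\n"; // IROC row: no shift
  return 0;
}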
 // ---------------------------------

-bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift)
+bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const GPUTPCGeometry& geo)
 {
-  std::vector<int> pad_row_max{
-    65, 65, 65, 67, 67, 67, 69, 69, 69, 71, 71, 71, 73, 73, 73, 73, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 81, 81, 81, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 91, 91, 91, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 99, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 79, 81, 81, 81, 83, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 101, 101, 101, 103, 103, 103, 105, 109, 109, 111, 111, 111, 113, 113, 113, 115, 115, 115, 117, 117, 117, 117, 117, 119, 119, 121, 121, 123, 123, 123, 125, 125, 127, 127, 127, 129, 129, 131, 131, 131, 133, 133, 135, 135, 137, 137
-  };
-  if (row < 0 || pad < 0) {
+  if (pad < 0 || row < 0) { // Faster short-circuit
     return true;
   } else if (row <= 62) {
-    // if (pad < (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] - pad_row_max[row]) / 2 || pad > (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] + pad_row_max[row]) / 2) {
+    // if (pad < (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] - geo.NPads(row)) / 2 || pad > (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] + geo.NPads(row)) / 2) {
    //   return true;
    // } else {
    //   return false;
    // }
-    if (pad < 0 || pad > pad_row_max[row]) {
+    if (pad < 0 || pad > geo.NPads(row)) {
      return true;
    } else {
      return false;
    }
-  } else if (row <= 62 + global_shift) {
+  } else if (row <= 62 + global_shift) { // to account for the gap between IROC and OROC. Charge will be set to -1 in order to signal boundary to the neural network
    return true;
  } else if (row <= o2::tpc::constants::MAXGLOBALPADROW-1 + global_shift) {
-    //if (pad < (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] - pad_row_max[row - global_shift]) / 2 || pad > (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] + pad_row_max[row - global_shift]) / 2) {
+    //if (pad < (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] - geo.NPads(row)- global_shift]) / 2 || pad > (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] + geo.NPads(row)- global_shift]) / 2) {
    // return true;
    //} else {
    // return false;
    //}
-    if (pad < 0 || pad > pad_row_max[row]) {
+    if (pad < 0 || pad > geo.NPads(row)) {
      return true;
    } else {
      return false;
    }
-  } else if (row > o2::tpc::constants::MAXGLOBALPADROW-1 + global_shift) {
-    return true;
  } else {
-    return false;
+    return true;
  }
 }

+template <class T>
 GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int iBlock, int iThread, processorType& clusterer,
                                                 const CfFragment& fragment,
@@ -116,104 +103,321 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i
                                                 uint maxClusterPerRow,
                                                 uint* clusterInRow,
                                                 tpc::ClusterNative* clusterByRow,
-                                                uint* clusterPosInRow,
-                                                int in_row, int in_pad, int in_time, bool add_index_data, float class_threshold, bool sigmoid_transform, int verbosity){
+                                                uint* clusterPosInRow){

-  std::vector<float> input_data(((2*in_row + 1) * (2*in_pad + 1) * (2*in_time + 1) + (add_index_data ? 3 : 0)), -1.f);
-  float classification_threshold = class_threshold;
-  if(sigmoid_transform){
-    classification_threshold = (float)std::log(class_threshold/(1.f-class_threshold));
-  }
-
-  uint idx = get_global_id(0);
-  uint cls = CAMath::Min(idx, clusternum - 1);
+  uint glo_idx = get_global_id(0) * clusterer.nnClusterizerBatchedMode;
+  if(glo_idx >= clusternum){
+    return;
+  }

-  // For certain configurations dummy work items are added, so the total
-  // number of work items is dividable by 64.
-  // These dummy items also compute the last cluster but discard the result.
-
-  ChargePos peak = clusterer.mPfilteredPeakPositions[cls];
-  int row = peak.row(), pad = peak.pad(), time = peak.time();
-  float central_charge = chargeMap[peak].unpack();
-  CPU_ONLY(labelAcc->collect(peak, central_charge));
-  // unsigned int glo_idx = cls * ((2*in_row + 1) + (2*in_pad + 1) * (2*in_time + 1));
-  unsigned int write_idx = 0;
-  for(int r = -in_row; r <= in_row; r++){
-    for(int p = -in_pad; p <= in_pad; p++){
-      for(int t = -in_time; t <= in_time; t++){
-        int offset = GPUTPCNNClusterizer::padOffset(row, row + r);
-        if(GPUTPCNNClusterizer::isBoundary(row + r, pad + p, in_row)){
-          continue;
-        } else {
-          // unsigned int loc_idx = (row + r) * (2*in_pad + 1) * (2*in_time + 1) + (pad + p) * (2*in_time + 1) + (time + t);
-          ChargePos tmp_pos(row + r, pad + p + offset, time + t);
-          input_data[write_idx] = (chargeMap[tmp_pos].unpack() / central_charge);
-          write_idx++;
+  std::vector<float> central_charges(clusterer.nnClusterizerBatchedMode, -1.f);
+  std::vector<T> input_data(clusterer.nnClusterizerElementSize * clusterer.nnClusterizerBatchedMode, (T)-1.f);
+  std::vector<ChargePos> peak_positions(clusterer.nnClusterizerBatchedMode);
+  unsigned int write_idx = 0;
+
+  for(int batch_counter = 0; batch_counter < clusterer.nnClusterizerBatchedMode; batch_counter++){
+
+    uint cls = CAMath::Min(glo_idx + batch_counter, clusternum - 1);
+
+    ChargePos peak = clusterer.mPfilteredPeakPositions[cls];
+    int row = peak.row(), pad = peak.pad(), time = peak.time();
+    float central_charge = chargeMap[peak].unpack();
+
+    peak_positions[batch_counter] = peak;
+    central_charges[batch_counter] = central_charge;
+
+    // unsigned int batch_offset = batch_counter * clusterer.nnClusterizerElementSize;
+    for(int r = -clusterer.nnClusterizerSizeInputRow; r <= clusterer.nnClusterizerSizeInputRow; r++){
+      bool push_mc_label = (r == 0);
+      int pad_offset = GPUTPCNNClusterizer::padOffset(row, row + r, clusterer.Param().tpcGeometry);
+      int row_offset = GPUTPCNNClusterizer::rowOffset(row, clusterer.nnClusterizerSizeInputRow);
+      for(int p = -clusterer.nnClusterizerSizeInputPad; p <= clusterer.nnClusterizerSizeInputPad; p++){
+        push_mc_label &= (std::abs(p) < 2); // Use inner 3x3 (pad-time) window: |p| <= 1
+        bool is_boundary = GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p + pad_offset, clusterer.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry);
+        for(int t = -clusterer.nnClusterizerSizeInputTime; t <= clusterer.nnClusterizerSizeInputTime; t++){
+          push_mc_label &= (std::abs(t) < 2); // Use inner 3x3 (pad-time) window: |t| <= 1
+          if(!is_boundary){
+            ChargePos tmp_pos(row + r, pad + p + pad_offset, time + t);
+            input_data[write_idx] = (T)(chargeMap[tmp_pos].unpack() / central_charge);
+            if(push_mc_label){
+              ChargePos tmp_pos_mc(row, pad + p, time + t);
+              CPU_ONLY(labelAcc->collect(tmp_pos, chargeMap[tmp_pos_mc].unpack()));
+            }
+          }
+          write_idx++;
         }
      }
    }
-  //     if(idx == 100){
-  //       LOG(info) << "[" << input_data[write_idx-7] << ", " << input_data[write_idx-6] << ", " << input_data[write_idx-5] << ", " << input_data[write_idx-4] << ", " << input_data[write_idx-3] << ", " << input_data[write_idx-2] << ", " << input_data[write_idx-1] << "]";
-  //     }
+    if(clusterer.nnClusterizerAddIndexData){
+      input_data[write_idx] = (T)(clusterer.mISlice / 36.f);
+      input_data[write_idx + 1] = (T)(row / 152.f);
+      input_data[write_idx + 2] = (T)((float)pad / clusterer.Param().tpcGeometry.NPads(row));
+      write_idx+=3;
+      // if(idx == 100){
+      //   LOG(info) << "[" << input_data[input_data.size()-3] << ", " << input_data[input_data.size()-2] << ", " << input_data[input_data.size()-1] << "]";
+      // }
+    }
+  }
-  }
-  if(add_index_data){
-    input_data[input_data.size()-3] = 1;
-    input_data[input_data.size()-2] = (float)peak.row() / 152.f;
-    input_data[input_data.size()-1] = (float)peak.pad() / 138.f;
-    // if(idx == 100){
-    //   LOG(info) << "[" << input_data[input_data.size()-3] << ", " << input_data[input_data.size()-2] << ", " << input_data[input_data.size()-1] << "]";
-    // }
-  }

-  std::vector<float> out_class = clusterer.model_class.inference_vector(input_data, 1);
-  std::vector<float> out_reg = clusterer.model_reg.inference_vector(input_data, 1);
-  int num_outputs = clusterer.model_reg.getNumOutputNodes()[0][1];
+  std::vector<int> index_class_2;
+  std::vector<T> out_class = clusterer.model_class.inference<T, T>(input_data);
+  // LOG(info) << "input_data.size(): " << input_data.size() << "; write_idx: " << write_idx << "; out_class.size(): " << out_class.size();
+  int num_output_classes = clusterer.model_class.getNumOutputNodes()[0][1];
+
+  if(num_output_classes > 1){
+    std::vector<T> tmp_out_class(clusterer.nnClusterizerBatchedMode);
+    for(int cls_idx = 0; cls_idx < clusterer.nnClusterizerBatchedMode; cls_idx++){
+      auto elem_iterator = out_class.begin() + (unsigned int)(cls_idx*num_output_classes);
+      tmp_out_class[cls_idx] = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator+num_output_classes)) - 1; // -1 since a 2-class classifier will have 3 outputs: classes 0, 1, 2
+      if(tmp_out_class[cls_idx] > 1){
+        index_class_2.push_back(cls_idx);
+      }
+    }
+    out_class = tmp_out_class;
+  }
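[Editorial sketch, not part of the patch: why the inverse-sigmoid transformation of nnClassThreshold (applied in GPUChainTrackingClusterizer.cxx above) lets the raw network output be compared directly against the threshold. Values below are made up.]

#include <cmath>
#include <iostream>
#include <vector>

int main()
{
  // If the network ends in a sigmoid, then sigmoid(x) > t  <=>  x > log(t / (1 - t)),
  // so the sigmoid never has to be evaluated per candidate.
  float t = 0.5f;                                     // configured nnClassThreshold
  float logit_cut = std::log(t / (1.f - t));          // = 0 for t = 0.5
  std::vector<float> raw_outputs{-1.2f, 0.3f, 2.0f};  // pre-sigmoid outputs (made up)
  for (float x : raw_outputs) {
    std::cout << x << (x > logit_cut ? " -> accept\n" : " -> reject\n");
  }
  return 0;
}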
" << classification_threshold << ")"; - LOG(info) << "Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4]; - } + if(!clusterer.nnClusterizerUseCFregression) { + + std::vector out_reg = clusterer.model_reg_1.inference(input_data), tmp_out_reg_2; + if(index_class_2.size() > 0){ + std::vector tmp_in_reg_2(index_class_2.size() * clusterer.nnClusterizerElementSize); + int fill_counter = 0; + for(int cls_idx : index_class_2){ + int from_idx = cls_idx*clusterer.nnClusterizerElementSize, to_idx = fill_counter * clusterer.nnClusterizerElementSize; + for(int reg_idx = 0; reg_idx < clusterer.nnClusterizerElementSize; reg_idx++){ + tmp_in_reg_2[to_idx + reg_idx] = input_data[from_idx + reg_idx]; + } + fill_counter++; + } + tmp_out_reg_2 = clusterer.model_reg_2.inference(input_data); + } + + input_data.clear(); - if(out_class[0] > classification_threshold){ - ClusterAccumulator pc; - pc.setFull(central_charge * out_reg[4], peak.pad() + out_reg[0], out_reg[2], fragment.start + peak.time() + out_reg[1], out_reg[3], 0, 0); - tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param()); - if ((verbosity > 0) && rejectCluster) { - LOG(warning) << "Cluster rejected!"; - if (clusterPosInRow) { - clusterPosInRow[idx] = maxClusterPerRow; + if((clusterer.nnClusterizerVerbosity >= 4) && glo_idx == 0){ + LOG(info) << "[CF] Classification model: " << out_class[0] << " (>? " << clusterer.nnClassThreshold << ")"; + LOG(info) << "[CF] Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4]; } - return; - } - uint rowIndex = 0; - if (clusterByRow != nullptr) { - rowIndex = sortIntoBuckets( - clusterer, - myCluster, - peak.row(), - maxClusterPerRow, - clusterInRow, - clusterByRow); - if (clusterPosInRow != nullptr) { - clusterPosInRow[idx] = rowIndex; + int num_outputs_1 = clusterer.model_reg_1.getNumOutputNodes()[0][1], num_outputs_2 = 0, counter_class_2_idcs = 0; + if(num_output_classes > 1){ + num_outputs_2 = clusterer.model_reg_2.getNumOutputNodes()[0][1]; } - } else if (clusterPosInRow) { - rowIndex = clusterPosInRow[idx]; - } - CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, maxClusterPerRow)); - } else { - if (clusterPosInRow) { - clusterPosInRow[idx] = maxClusterPerRow; - } - return; - } - if((verbosity > 4) && idx == 100){ - LOG(info) << "Clusterization done!"; - } + for(int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) { + + if (glo_idx + element >= clusternum) { + return; + } + int model_output_index = element*num_outputs_1; + if(out_class[element] > clusterer.nnClassThreshold) { + if((num_output_classes == 1) || ((num_output_classes > 1) && (out_class[element] < 2))) { + // CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); + ClusterAccumulator pc; + + ClusterAccumulator dummy_pc; + CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); + + // Dummy build to push MC labels + buildCluster( + calib, + chargeMap, + peak_positions[element], + smem.posBcast, + smem.buf, + smem.innerAboveThreshold, + &dummy_pc, + labelAcc); + + if (fragment.isOverlap(peak_positions[element].time())) { + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + + pc.setFull(central_charges[element] * out_reg[model_output_index + 4], peak_positions[element].pad() + out_reg[model_output_index + 0], out_reg[model_output_index 
+    for(int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) {
+
+      if (glo_idx + element >= clusternum) {
+        return;
+      }
+      int model_output_index = element*num_outputs_1;
+      if(out_class[element] > clusterer.nnClassThreshold) {
+        if((num_output_classes == 1) || ((num_output_classes > 1) && (out_class[element] < 2))) {
+          // CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+          ClusterAccumulator pc;
+
+          ClusterAccumulator dummy_pc;
+          CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+
+          // Dummy build to push MC labels
+          buildCluster(
+            calib,
+            chargeMap,
+            peak_positions[element],
+            smem.posBcast,
+            smem.buf,
+            smem.innerAboveThreshold,
+            &dummy_pc,
+            labelAcc);
+
+          if (fragment.isOverlap(peak_positions[element].time())) {
+            if (clusterPosInRow) {
+              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+            }
+            continue;
+          }
+
+          pc.setFull(central_charges[element] * out_reg[model_output_index + 4], peak_positions[element].pad() + out_reg[model_output_index + 0], out_reg[model_output_index + 2], fragment.start + peak_positions[element].time() + out_reg[model_output_index + 1], out_reg[model_output_index + 3], 0, 0);
+          // LOG(info) << "Example: " << num_outputs_1 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3];
+
+          tpc::ClusterNative myCluster;
+          bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param());
+          if (rejectCluster) {
+            if(clusterer.nnClusterizerVerbosity > 3){
+              LOG(warning) << "[CF] Cluster rejected!";
+            }
+            if (clusterPosInRow) {
+              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+            }
+            continue;
+          }
+
+          uint rowIndex = 0;
+          if (clusterByRow != nullptr) {
+            rowIndex = sortIntoBuckets(
+              clusterer,
+              myCluster,
+              peak_positions[element].row(),
+              maxClusterPerRow,
+              clusterInRow,
+              clusterByRow);
+            if (clusterPosInRow != nullptr) {
+              clusterPosInRow[glo_idx + element] = rowIndex;
+            }
+          } else if (clusterPosInRow) {
+            rowIndex = clusterPosInRow[glo_idx + element];
+          }
+          CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow));
+        } else {
+          model_output_index = index_class_2[counter_class_2_idcs]*num_outputs_2;
+          counter_class_2_idcs++;
+
+          // Cluster 1
+          CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+          ClusterAccumulator pc;
+
+          if (fragment.isOverlap(peak_positions[element].time())) {
+            if (clusterPosInRow) {
+              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+            }
+            continue;
+          }
+
+          pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 8], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 0], tmp_out_reg_2[model_output_index + 4], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 2], tmp_out_reg_2[model_output_index + 6], 0, 0);
+          // LOG(info) << "Example: " << num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3];
+
+          tpc::ClusterNative myCluster;
+          bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param());
+          if (rejectCluster) {
+            if(clusterer.nnClusterizerVerbosity > 3){
+              LOG(warning) << "[CF] Cluster rejected!";
+            }
+            if (clusterPosInRow) {
+              clusterPosInRow[glo_idx + element] = maxClusterPerRow;
+            }
+            continue;
+          }
+
+          uint rowIndex = 0;
+          if (clusterByRow != nullptr) {
+            rowIndex = sortIntoBuckets(
+              clusterer,
+              myCluster,
+              peak_positions[element].row(),
+              maxClusterPerRow,
+              clusterInRow,
+              clusterByRow);
+            if (clusterPosInRow != nullptr) {
+              clusterPosInRow[glo_idx + element] = rowIndex;
+            }
+          } else if (clusterPosInRow) {
+            rowIndex = clusterPosInRow[glo_idx + element];
+          }
+          CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow));
+
+          // Cluster 2
+          CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element]));
+          pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 9], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 1], tmp_out_reg_2[model_output_index + 5], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 3], tmp_out_reg_2[model_output_index + 7], 0, 0);
+          // LOG(info) << "Example: " << 
num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; + rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); + if (rejectCluster) { + if(clusterer.nnClusterizerVerbosity > 3){ + LOG(warning) << "[CF] Cluster rejected!"; + } + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + + rowIndex = 0; + if (clusterByRow != nullptr) { + rowIndex = sortIntoBuckets( + clusterer, + myCluster, + peak_positions[element].row(), + maxClusterPerRow, + clusterInRow, + clusterByRow); + if (clusterPosInRow != nullptr) { + clusterPosInRow[glo_idx + element] = rowIndex; + } + } else if (clusterPosInRow) { + rowIndex = clusterPosInRow[glo_idx + element]; + } + CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); + } + } + } + + } else { + + input_data.clear(); + for(int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) { + if (glo_idx + element >= clusternum) { + return; + } + + if(out_class[element] > clusterer.nnClassThreshold) { + + ClusterAccumulator pc; + CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); + + buildCluster( + calib, + chargeMap, + peak_positions[element], + smem.posBcast, + smem.buf, + smem.innerAboveThreshold, + &pc, + labelAcc); + + if (fragment.isOverlap(peak_positions[element].time())) { + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + pc.finalize(peak_positions[element], central_charges[element], fragment.start, clusterer.Param().tpcGeometry); + + tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); + + if (rejectCluster) { + if(clusterer.nnClusterizerVerbosity > 3){ + LOG(warning) << "[CF] Cluster rejected!"; + } + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + + uint rowIndex = 0; + if (clusterByRow != nullptr) { + rowIndex = sortIntoBuckets( + clusterer, + myCluster, + peak_positions[element].row(), + maxClusterPerRow, + clusterInRow, + clusterByRow); + if (clusterPosInRow != nullptr) { + clusterPosInRow[glo_idx + element] = rowIndex; + } + } else if (clusterPosInRow) { + rowIndex = clusterPosInRow[glo_idx + element]; + } + + CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); + } + } + } + + if(clusterer.nnClusterizerVerbosity > 4){ + LOG(info) << "[CF] Clusterization done!"; + } } @@ -449,4 +653,4 @@ GPUd() uint GPUTPCNNClusterizer::sortIntoBuckets(processorType& clusterer, const CAMath::AtomicExch(&elemsInBucket[row], maxElemsPerBucket); } return index; -} +} \ No newline at end of file diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 7fbf5a806a916..42104ae2099d3 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -62,8 +62,11 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate static GPUd() void computeClustersImpl(int, int, int, int, processorType&, const CfFragment&, GPUSharedMemory&, const Array2D&, const ChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint, uint, uint*, 
tpc::ClusterNative*, uint*); static GPUd() void exec(int, int, int, int, GPUSharedMemory&, processorType&, char); - static int padOffset(int, int); - static bool isBoundary(int, int, int); + static int padOffset(int, int, const GPUTPCGeometry&); + static int rowOffset(int, int); + static bool isBoundary(int, int, int, const GPUTPCGeometry&); + + template static GPUd() void nn_clusterizer(int, int, int, int, processorType&, const CfFragment&, @@ -76,8 +79,7 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate uint, uint*, tpc::ClusterNative*, - uint*, - int = 3, int = 3, int = 3, bool = 1, float = 0.16, bool = true, int = 1); + uint*); private: // --------------------------------- @@ -93,4 +95,4 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate } // namespace GPUCA_NAMESPACE::gpu -#endif +#endif \ No newline at end of file From 06737fd8d044a75d4e6da947a3ae6792c7ae42af Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 18 Oct 2024 09:16:01 +0200 Subject: [PATCH 12/23] Fixing uchar -> uint8_t --- GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h | 6 +++--- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx | 8 ++++---- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h index 534cc44513286..d308b8bd6efa7 100644 --- a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h +++ b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h @@ -43,7 +43,7 @@ class ClusterAccumulator GPUd() void finalize(const ChargePos&, tpccf::Charge, tpccf::TPCTime, const GPUTPCGeometry&); GPUd() bool toNative(const ChargePos&, tpccf::Charge, tpc::ClusterNative&, const GPUParam&) const; - GPUd() void setFull(float qtot, float padMean, float padSigma, float timeMean, float timeSigma, uchar splitInTime, uchar splitInPad){ + GPUd() void setFull(float qtot, float padMean, float padSigma, float timeMean, float timeSigma, uint8_t splitInTime, uint8_t splitInPad){ mQtot = qtot; mPadMean = padMean; mPadSigma = padSigma; @@ -57,8 +57,8 @@ class ClusterAccumulator GPUd() void setPadSigma(float padSigma) { mPadSigma = padSigma; } GPUd() void setTimeMean(float timeMean) { mTimeMean = timeMean; } GPUd() void setTimeSigma(float timeSigma) { mTimeSigma = timeSigma; } - GPUd() void setSplitInTime(uchar splitInTime) { mSplitInTime = splitInTime; } - GPUd() void setSplitInPad(uchar splitInPad) { mSplitInPad = splitInPad; } + GPUd() void setSplitInTime(uint8_t splitInTime) { mSplitInTime = splitInTime; } + GPUd() void setSplitInPad(uint8_t splitInPad) { mSplitInPad = splitInPad; } private: float mQtot = 0; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index e6cf745ce3101..f5e094a3c363e 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -505,9 +505,9 @@ GPUdii() void GPUTPCNNClusterizer::updateClusterInner( const ChargePos& pos, ClusterAccumulator* cluster, MCLabelAccumulator* labelAcc, - uchar* innerAboveThreshold) + uint8_t* innerAboveThreshold) { - uchar aboveThreshold = 0; + uint8_t aboveThreshold = 0; GPUCA_UNROLL(U(), U()) for (ushort i = 0; i < N; i++) { @@ -520,7 +520,7 @@ GPUdii() void GPUTPCNNClusterizer::updateClusterInner( CPU_ONLY( labelAcc->collect(pos.delta(d), q)); - aboveThreshold |= (uchar(q > calib.tpc.cfInnerThreshold) << i); + 
aboveThreshold |= (uint8_t(q > calib.tpc.cfInnerThreshold) << i); } innerAboveThreshold[lid] = aboveThreshold; @@ -558,7 +558,7 @@ GPUdii() void GPUTPCNNClusterizer::buildCluster( ChargePos pos, ChargePos* posBcast, PackedCharge* buf, - uchar* innerAboveThreshold, + uint8_t* innerAboveThreshold, ClusterAccumulator* myCluster, MCLabelAccumulator* labelAcc) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 42104ae2099d3..51a5c29022421 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -40,7 +40,7 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate struct GPUSharedMemory { ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_BUILD_N]; - uchar innerAboveThreshold[SCRATCH_PAD_WORK_GROUP_SIZE]; + uint8_t innerAboveThreshold[SCRATCH_PAD_WORK_GROUP_SIZE]; }; #ifdef GPUCA_HAVE_O2HEADERS @@ -84,11 +84,11 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate private: // --------------------------------- - static GPUd() void updateClusterInner(const GPUSettingsRec&, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uchar*); + static GPUd() void updateClusterInner(const GPUSettingsRec&, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uint8_t*); static GPUd() void updateClusterOuter(ushort, ushort, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*); - static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D&, ChargePos, ChargePos*, PackedCharge*, uchar*, ClusterAccumulator*, MCLabelAccumulator*); + static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D&, ChargePos, ChargePos*, PackedCharge*, uint8_t*, ClusterAccumulator*, MCLabelAccumulator*); static GPUd() uint sortIntoBuckets(processorType&, const tpc::ClusterNative&, uint, uint, uint*, tpc::ClusterNative*); }; From b14844990173a00a66d9e2ad62185232ab3992d6 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 18 Oct 2024 09:55:31 +0200 Subject: [PATCH 13/23] Adding utils header --- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 1 + 1 file changed, 1 insertion(+) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index eafd50a72424f..0f22a7472feac 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -37,6 +37,7 @@ #endif #include "utils/strtag.h" +#include #ifndef GPUCA_NO_VC #include From 534da50f248210cff92acdeac763f4f74a2de30e Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 21 Oct 2024 09:40:43 +0200 Subject: [PATCH 14/23] Updating kernels.cmake to uint8_t --- GPU/GPUTracking/kernels.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index 5b5aed94a7472..b6490c0c5b4c6 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -116,7 +116,7 @@ o2_gpu_add_kernel("GPUTPCCFPeakFinder" "= TPCCLUS o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression" "= TPCCLUSTERFINDER" LB single) o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks" "= TPCCLUSTERFINDER" LB single) o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB single) 
-o2_gpu_add_kernel("GPUTPCNNClusterizer" "= TPCCLUSTERFINDER" LB single char onlyMC) +o2_gpu_add_kernel("GPUTPCNNClusterizer" "= TPCCLUSTERFINDER" LB single int8_t onlyMC) o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUSTERFINDER" LB single int8_t onlyMC) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets" "= TPCCLUSTERFINDER" NO single) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, flatten" "= TPCCLUSTERFINDER" NO single GPUTPCLinearLabels* out) From bb2cb6e48d12f71fb634b1429bf284db23bb97ee Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Mon, 21 Oct 2024 07:41:20 +0000 Subject: [PATCH 15/23] Please consider the following formatting changes --- Common/ML/include/ML/ort_interface.h | 76 ++- Common/ML/src/ort_interface.cxx | 88 +-- .../Global/GPUChainTrackingClusterizer.cxx | 23 +- .../TPCClusterFinder/ClusterAccumulator.h | 3 +- .../TPCClusterFinder/GPUTPCClusterFinder.h | 2 +- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 539 +++++++++--------- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 26 +- 7 files changed, 385 insertions(+), 372 deletions(-) diff --git a/Common/ML/include/ML/ort_interface.h b/Common/ML/include/ML/ort_interface.h index a365860db3279..2fe9a44a0623c 100644 --- a/Common/ML/include/ML/ort_interface.h +++ b/Common/ML/include/ML/ort_interface.h @@ -35,60 +35,58 @@ namespace ml class OrtModel { - public: - // Constructor - OrtModel() = default; - OrtModel(std::unordered_map optionsMap){ reset(optionsMap); } - void init(std::unordered_map optionsMap){ reset(optionsMap); } - void reset(std::unordered_map); + public: + // Constructor + OrtModel() = default; + OrtModel(std::unordered_map optionsMap) { reset(optionsMap); } + void init(std::unordered_map optionsMap) { reset(optionsMap); } + void reset(std::unordered_map); - virtual ~OrtModel() = default; + virtual ~OrtModel() = default; - // Conversion - template - std::vector v2v(std::vector&, bool = true); + // Conversion + template + std::vector v2v(std::vector&, bool = true); - // Inferencing - template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h - std::vector inference(std::vector&); + // Inferencing + template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h + std::vector inference(std::vector&); - template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h - std::vector inference(std::vector>&); + template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h + std::vector inference(std::vector>&); - // template // class I is the input data type, e.g. float, class T the throughput data type and class O is the output data type - // std::vector inference(std::vector&); + // template // class I is the input data type, e.g. 
float, class T the throughput data type and class O is the output data type + // std::vector inference(std::vector&); - // Reset session - void resetSession(); + // Reset session + void resetSession(); - std::vector> getNumInputNodes() const { return mInputShapes; } - std::vector> getNumOutputNodes() const { return mOutputShapes; } - std::vector getInputNames() const { return mInputNames; } - std::vector getOutputNames() const { return mOutputNames; } + std::vector> getNumInputNodes() const { return mInputShapes; } + std::vector> getNumOutputNodes() const { return mOutputShapes; } + std::vector getInputNames() const { return mInputNames; } + std::vector getOutputNames() const { return mOutputNames; } - void setActiveThreads(int threads) { intraOpNumThreads = threads; } + void setActiveThreads(int threads) { intraOpNumThreads = threads; } - private: + private: + // ORT variables -> need to be hidden as Pimpl + struct OrtVariables; + OrtVariables* pImplOrt; - // ORT variables -> need to be hidden as Pimpl - struct OrtVariables; - OrtVariables* pImplOrt; + // Input & Output specifications of the loaded network + std::vector inputNamesChar, outputNamesChar; + std::vector mInputNames, mOutputNames; + std::vector> mInputShapes, mOutputShapes; - // Input & Output specifications of the loaded network - std::vector inputNamesChar, outputNamesChar; - std::vector mInputNames, mOutputNames; - std::vector> mInputShapes, mOutputShapes; - - // Environment settings - std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda - int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; - - std::string printShape(const std::vector&); + // Environment settings + std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda + int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; + std::string printShape(const std::vector&); }; } // namespace ml -} // namespace ml +} // namespace o2 #endif // O2_ML_ORT_INTERFACE_H \ No newline at end of file diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx index 84a06ce1da068..8ebe0588b4a2b 100644 --- a/Common/ML/src/ort_interface.cxx +++ b/Common/ML/src/ort_interface.cxx @@ -25,7 +25,7 @@ namespace o2 namespace ml { -struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file +struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file // ORT runtime objects Ort::RunOptions runOptions; std::shared_ptr env = nullptr; @@ -35,12 +35,13 @@ struct OrtModel::OrtVariables { // The actual implementation is hidden in the . Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); }; -void OrtModel::reset(std::unordered_map optionsMap){ +void OrtModel::reset(std::unordered_map optionsMap) +{ pImplOrt = new OrtVariables(); // Load from options map - if(!optionsMap.contains("model-path")){ + if (!optionsMap.contains("model-path")) { LOG(fatal) << "(ORT) Model path cannot be empty!"; } modelPath = optionsMap["model-path"]; @@ -48,42 +49,42 @@ void OrtModel::reset(std::unordered_map optionsMap){ dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); deviceId = (optionsMap.contains("device-id") ? 
std::stoi(optionsMap["device-id"]) : 0); allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); - intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); + intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 0); enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); std::string dev_mem_str = "Hip"; #ifdef ORT_ROCM_BUILD - if(device == "ROCM") { + if (device == "ROCM") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) ROCM execution provider set"; } #endif #ifdef ORT_MIGRAPHX_BUILD - if(device == "MIGRAPHX") { + if (device == "MIGRAPHX") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) MIGraphX execution provider set"; } #endif #ifdef ORT_CUDA_BUILD - if(device == "CUDA") { + if (device == "CUDA") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) CUDA execution provider set"; dev_mem_str = "Cuda"; } #endif - if(allocateDeviceMemory){ + if (allocateDeviceMemory) { pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); LOG(info) << "(ORT) Memory info set to on-device memory"; } - if(device == "CPU") { + if (device == "CPU") { (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); - if(intraOpNumThreads > 1){ + if (intraOpNumThreads > 1) { (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); - } else if(intraOpNumThreads == 1){ + } else if (intraOpNumThreads == 1) { (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); } LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " threads"; @@ -92,8 +93,8 @@ void OrtModel::reset(std::unordered_map optionsMap){ (pImplOrt->sessionOptions).DisableMemPattern(); (pImplOrt->sessionOptions).DisableCpuMemArena(); - if(enableProfiling){ - if(optionsMap.contains("profiling-output-path")){ + if (enableProfiling) { + if (optionsMap.contains("profiling-output-path")) { (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); } else { LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. 
Disabling profiling for now."; @@ -109,27 +110,27 @@ void OrtModel::reset(std::unordered_map optionsMap){ (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); + mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); } for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); } for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); + mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); } for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); } inputNamesChar.resize(mInputNames.size(), nullptr); std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), - [&](const std::string& str) { return str.c_str(); }); + [&](const std::string& str) { return str.c_str(); }); outputNamesChar.resize(mOutputNames.size(), nullptr); std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), - [&](const std::string& str) { return str.c_str(); }); + [&](const std::string& str) { return str.c_str(); }); // Print names - if(loggingLevel > 1) { + if (loggingLevel > 1) { LOG(info) << "Input Nodes:"; for (size_t i = 0; i < mInputNames.size(); i++) { LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); @@ -142,24 +143,28 @@ void OrtModel::reset(std::unordered_map optionsMap){ } } -void OrtModel::resetSession() { +void OrtModel::resetSession() +{ (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); } -template -std::vector OrtModel::v2v(std::vector& input, bool clearInput) { - if constexpr (std::is_same_v){ +template +std::vector OrtModel::v2v(std::vector& input, bool clearInput) +{ + if constexpr (std::is_same_v) { return input; } else { std::vector output(input.size()); std::transform(std::begin(input), std::end(input), std::begin(output), [](I f) { return O(f); }); - if(clearInput) input.clear(); + if (clearInput) + input.clear(); return output; } } -template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -std::vector OrtModel::inference(std::vector& input){ +template // class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(input)).data(), input.size(), inputShape.data(), inputShape.size())); @@ -171,10 +176,11 @@ std::vector OrtModel::inference(std::vector& input){ return outputValuesVec; } -template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -std::vector OrtModel::inference(std::vector>& input){ +template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector>& input) +{ std::vector inputTensor; - for(auto i : input){ + for (auto i : input) { std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(i)).data(), i.size(), inputShape.data(), inputShape.size())); } @@ -195,7 +201,9 @@ std::string OrtModel::printShape(const std::vector& v) return ss.str(); } -template <> std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); @@ -207,7 +215,9 @@ template <> std::vector OrtModel::inference(std::vector std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); @@ -219,7 +229,9 @@ template <> std::vector OrtModel::inference std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); @@ -231,7 +243,9 @@ template <> std::vector OrtModel::inference std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); @@ -243,9 +257,11 @@ template <> std::vector OrtModel::inference std::vector OrtModel::inference(std::vector>& input) { +template <> +std::vector OrtModel::inference(std::vector>& input) +{ std::vector inputTensor; - for(auto i : input){ + for (auto i : input) { std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), 
(int64_t)mInputShapes[0][1]}; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); } diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 0f22a7472feac..d8470fdc2bf10 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -838,7 +838,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (clusterer.mPmemory->counters.nPeaks == 0) { continue; } - if(!GetProcessingSettings().applyNNclusterizer){ + if (!GetProcessingSettings().applyNNclusterizer) { runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); } else { @@ -875,14 +875,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); - if(GetProcessingSettings().applyNNclusterizer){ + if (GetProcessingSettings().applyNNclusterizer) { // Settings for the clusterizer clusterer.nnClusterizerUseCFregression = GetProcessingSettings().nnClusterizerUseCFregression; clusterer.nnClusterizerSizeInputRow = GetProcessingSettings().nnClusterizerSizeInputRow; clusterer.nnClusterizerSizeInputPad = GetProcessingSettings().nnClusterizerSizeInputPad; clusterer.nnClusterizerSizeInputTime = GetProcessingSettings().nnClusterizerSizeInputTime; clusterer.nnClusterizerAddIndexData = GetProcessingSettings().nnClusterizerAddIndexData; - clusterer.nnClusterizerElementSize = ((2*clusterer.nnClusterizerSizeInputRow + 1) * (2*clusterer.nnClusterizerSizeInputPad + 1) * (2*clusterer.nnClusterizerSizeInputTime + 1)) + (clusterer.nnClusterizerAddIndexData ? 3 : 0); + clusterer.nnClusterizerElementSize = ((2 * clusterer.nnClusterizerSizeInputRow + 1) * (2 * clusterer.nnClusterizerSizeInputPad + 1) * (2 * clusterer.nnClusterizerSizeInputTime + 1)) + (clusterer.nnClusterizerAddIndexData ? 
3 : 0); clusterer.nnClusterizerBatchedMode = GetProcessingSettings().nnClusterizerBatchedMode; clusterer.nnClusterizerVerbosity = GetProcessingSettings().nnInferenceVerbosity; @@ -893,7 +893,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Settings for the neural network evaluation clusterer.OrtOptions = { {"model-path", GetProcessingSettings().nnClassificationPath}, - {"device", GetProcessingSettings().nnInferenceDevice}, + {"device", GetProcessingSettings().nnInferenceDevice}, {"device-id", std::to_string(GetProcessingSettings().nnInferenceDeviceId)}, {"allocate-device-memory", std::to_string(GetProcessingSettings().nnInferenceAllocateDevMem)}, {"dtype", GetProcessingSettings().nnInferenceDtype}, @@ -901,16 +901,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) {"enable-optimizations", std::to_string(GetProcessingSettings().nnInferenceEnableOrtOptimization)}, {"enable-profiling", std::to_string(GetProcessingSettings().nnInferenceOrtProfiling)}, {"profiling-output-path", GetProcessingSettings().nnInferenceOrtProfilingPath}, - {"logging-level", std::to_string(GetProcessingSettings().nnInferenceVerbosity)} - }; + {"logging-level", std::to_string(GetProcessingSettings().nnInferenceVerbosity)}}; clusterer.model_class.init(clusterer.OrtOptions); - if(!clusterer.nnClusterizerUseCFregression){ + if (!clusterer.nnClusterizerUseCFregression) { std::vector reg_model_paths = o2::utils::Str::tokenize(GetProcessingSettings().nnRegressionPath, ':'); - if(clusterer.model_class.getNumOutputNodes()[0][1] == 1){ + if (clusterer.model_class.getNumOutputNodes()[0][1] == 1) { clusterer.OrtOptions["model-path"] = reg_model_paths[0]; clusterer.model_reg_1.init(clusterer.OrtOptions); } else { - if(reg_model_paths.size() == 1){ + if (reg_model_paths.size() == 1) { clusterer.OrtOptions["model-path"] = reg_model_paths[0]; clusterer.model_reg_1.init(clusterer.OrtOptions); } else { @@ -925,9 +924,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); } - if(clusterer.nnSigmoidTrafoClassThreshold){ + if (clusterer.nnSigmoidTrafoClassThreshold) { // Inverse sigmoid transformation - clusterer.nnClassThreshold = (float)std::log(clusterer.nnClassThreshold/(1.f-clusterer.nnClassThreshold)); + clusterer.nnClassThreshold = (float)std::log(clusterer.nnClassThreshold / (1.f - clusterer.nnClassThreshold)); } runKernel({GetGrid(std::ceil(clusterer.mPmemory->counters.nClusters / (float)clusterer.nnClusterizerBatchedMode), lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0); } else { @@ -939,7 +938,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (doGPU) { SynchronizeStream(lane); } - if(!GetProcessingSettings().applyNNclusterizer){ + if (!GetProcessingSettings().applyNNclusterizer) { runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); } else { runKernel({GetGrid(std::ceil(clusterer.mPmemory->counters.nClusters / (float)clusterer.nnClusterizerBatchedMode), lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); diff --git a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h index d308b8bd6efa7..b7e535a107eac 100644 --- a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h +++ b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h @@ -43,7 +43,8 @@ class 
ClusterAccumulator
 GPUd() void finalize(const ChargePos&, tpccf::Charge, tpccf::TPCTime, const GPUTPCGeometry&);
 GPUd() bool toNative(const ChargePos&, tpccf::Charge, tpc::ClusterNative&, const GPUParam&) const;
- GPUd() void setFull(float qtot, float padMean, float padSigma, float timeMean, float timeSigma, uint8_t splitInTime, uint8_t splitInPad){
+ GPUd() void setFull(float qtot, float padMean, float padSigma, float timeMean, float timeSigma, uint8_t splitInTime, uint8_t splitInPad)
+ {
 mQtot = qtot;
 mPadMean = padMean;
 mPadSigma = padSigma;
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h
index 130453e833911..fd420357073e9 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h
@@ -158,7 +158,7 @@ class GPUTPCClusterFinder : public GPUProcessor
 std::unordered_map<std::string, std::string> OrtOptions;
 OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters
-
+
 #ifndef GPUCA_GPUCODE
 void DumpDigits(std::ostream& out);
 void DumpChargeMap(std::ostream& out, std::string_view);
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
index f5e094a3c363e..ba8fac2a397e9 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
@@ -34,15 +34,15 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlo
 tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
- if(clusterer.OrtOptions["dtype"].find("32") != std::string::npos){
+ if (clusterer.OrtOptions["dtype"].find("32") != std::string::npos) {
 GPUTPCNNClusterizer::nn_clusterizer<float>(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
- } else if(clusterer.OrtOptions["dtype"].find("16") != std::string::npos) {
+ } else if (clusterer.OrtOptions["dtype"].find("16") != std::string::npos) {
 GPUTPCNNClusterizer::nn_clusterizer<OrtDataType::Float16_t>(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
 } else {
 LOG(fatal) << "Unsupported data type for neural network clusterizer!";
 }
 // tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
-//
+ //
 // GPUTPCNNClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
 }
@@ -74,12 +74,12 @@ bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const G
 }
 } else if (row <= 62 + global_shift) { // to account for the gap between IROC and OROC.
Charge will be set to -1 in order to signal boundary to the neural network return true; - } else if (row <= o2::tpc::constants::MAXGLOBALPADROW-1 + global_shift) { - //if (pad < (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] - geo.NPads(row)- global_shift]) / 2 || pad > (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] + geo.NPads(row)- global_shift]) / 2) { - // return true; - //} else { - // return false; - //} + } else if (row <= o2::tpc::constants::MAXGLOBALPADROW - 1 + global_shift) { + // if (pad < (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] - geo.NPads(row)- global_shift]) / 2 || pad > (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] + geo.NPads(row)- global_shift]) / 2) { + // return true; + // } else { + // return false; + // } if (pad < 0 || pad > geo.NPads(row)) { return true; } else { @@ -92,277 +92,135 @@ bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const G template GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int iBlock, int iThread, - processorType& clusterer, - const CfFragment& fragment, - GPUSharedMemory& smem, - const Array2D& chargeMap, - const ChargePos* filteredPeakPositions, - const GPUSettingsRec& calib, - MCLabelAccumulator* labelAcc, - uint clusternum, - uint maxClusterPerRow, - uint* clusterInRow, - tpc::ClusterNative* clusterByRow, - uint* clusterPosInRow){ - - uint glo_idx = get_global_id(0) * clusterer.nnClusterizerBatchedMode; - if(glo_idx >= clusternum){ - return; - } + processorType& clusterer, + const CfFragment& fragment, + GPUSharedMemory& smem, + const Array2D& chargeMap, + const ChargePos* filteredPeakPositions, + const GPUSettingsRec& calib, + MCLabelAccumulator* labelAcc, + uint clusternum, + uint maxClusterPerRow, + uint* clusterInRow, + tpc::ClusterNative* clusterByRow, + uint* clusterPosInRow) +{ + + uint glo_idx = get_global_id(0) * clusterer.nnClusterizerBatchedMode; + if (glo_idx >= clusternum) { + return; + } - std::vector central_charges(clusterer.nnClusterizerBatchedMode, -1.f); - std::vector input_data(clusterer.nnClusterizerElementSize * clusterer.nnClusterizerBatchedMode, (T)-1.f); - std::vector peak_positions(clusterer.nnClusterizerBatchedMode); - unsigned int write_idx = 0; - - for(int batch_counter = 0; batch_counter < clusterer.nnClusterizerBatchedMode; batch_counter++){ - - uint cls = CAMath::Min(glo_idx + batch_counter, clusternum - 1); - - ChargePos peak = clusterer.mPfilteredPeakPositions[cls]; - int row = peak.row(), pad = peak.pad(), time = peak.time(); - float central_charge = chargeMap[peak].unpack(); - - peak_positions[batch_counter] = peak; - central_charges[batch_counter] = central_charge; - - // unsigned int batch_offset = batch_counter * clusterer.nnClusterizerElementSize; - for(int r = -clusterer.nnClusterizerSizeInputRow; r <= clusterer.nnClusterizerSizeInputRow; r++){ - bool push_mc_label = (r == 0); - int pad_offset = GPUTPCNNClusterizer::padOffset(row, row + r, clusterer.Param().tpcGeometry); - int row_offset = GPUTPCNNClusterizer::rowOffset(row, clusterer.nnClusterizerSizeInputRow); - for(int p = -clusterer.nnClusterizerSizeInputPad; p <= clusterer.nnClusterizerSizeInputPad; p++){ - push_mc_label &= (std::abs(p) < 2); // Use inner 5x5 window - bool is_boundary = GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p + pad_offset, clusterer.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry); - for(int t = -clusterer.nnClusterizerSizeInputTime; t <= clusterer.nnClusterizerSizeInputTime; t++){ - push_mc_label &= (std::abs(t) < 2); // Use 
inner 5x5 window - if(!is_boundary){ - ChargePos tmp_pos(row + r, pad + p + pad_offset, time + t); - input_data[write_idx] = (T)(chargeMap[tmp_pos].unpack() / central_charge); - if(push_mc_label){ - ChargePos tmp_pos_mc(row, pad + p, time + t); - CPU_ONLY(labelAcc->collect(tmp_pos, chargeMap[tmp_pos_mc].unpack())); - } + std::vector central_charges(clusterer.nnClusterizerBatchedMode, -1.f); + std::vector input_data(clusterer.nnClusterizerElementSize * clusterer.nnClusterizerBatchedMode, (T)-1.f); + std::vector peak_positions(clusterer.nnClusterizerBatchedMode); + unsigned int write_idx = 0; + + for (int batch_counter = 0; batch_counter < clusterer.nnClusterizerBatchedMode; batch_counter++) { + + uint cls = CAMath::Min(glo_idx + batch_counter, clusternum - 1); + + ChargePos peak = clusterer.mPfilteredPeakPositions[cls]; + int row = peak.row(), pad = peak.pad(), time = peak.time(); + float central_charge = chargeMap[peak].unpack(); + + peak_positions[batch_counter] = peak; + central_charges[batch_counter] = central_charge; + + // unsigned int batch_offset = batch_counter * clusterer.nnClusterizerElementSize; + for (int r = -clusterer.nnClusterizerSizeInputRow; r <= clusterer.nnClusterizerSizeInputRow; r++) { + bool push_mc_label = (r == 0); + int pad_offset = GPUTPCNNClusterizer::padOffset(row, row + r, clusterer.Param().tpcGeometry); + int row_offset = GPUTPCNNClusterizer::rowOffset(row, clusterer.nnClusterizerSizeInputRow); + for (int p = -clusterer.nnClusterizerSizeInputPad; p <= clusterer.nnClusterizerSizeInputPad; p++) { + push_mc_label &= (std::abs(p) < 2); // Use inner 5x5 window + bool is_boundary = GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p + pad_offset, clusterer.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry); + for (int t = -clusterer.nnClusterizerSizeInputTime; t <= clusterer.nnClusterizerSizeInputTime; t++) { + push_mc_label &= (std::abs(t) < 2); // Use inner 5x5 window + if (!is_boundary) { + ChargePos tmp_pos(row + r, pad + p + pad_offset, time + t); + input_data[write_idx] = (T)(chargeMap[tmp_pos].unpack() / central_charge); + if (push_mc_label) { + ChargePos tmp_pos_mc(row, pad + p, time + t); + CPU_ONLY(labelAcc->collect(tmp_pos, chargeMap[tmp_pos_mc].unpack())); } - write_idx++; } + write_idx++; } } - if(clusterer.nnClusterizerAddIndexData){ - input_data[write_idx] = (T)(clusterer.mISlice / 36.f); - input_data[write_idx + 1] = (T)(row / 152.f); - input_data[write_idx + 2] = (T)((float)pad / clusterer.Param().tpcGeometry.NPads(row)); - write_idx+=3; - // if(idx == 100){ - // LOG(info) << "[" << input_data[input_data.size()-3] << ", " << input_data[input_data.size()-2] << ", " << input_data[input_data.size()-1] << "]"; - // } - } } + if (clusterer.nnClusterizerAddIndexData) { + input_data[write_idx] = (T)(clusterer.mISlice / 36.f); + input_data[write_idx + 1] = (T)(row / 152.f); + input_data[write_idx + 2] = (T)((float)pad / clusterer.Param().tpcGeometry.NPads(row)); + write_idx += 3; + // if(idx == 100){ + // LOG(info) << "[" << input_data[input_data.size()-3] << ", " << input_data[input_data.size()-2] << ", " << input_data[input_data.size()-1] << "]"; + // } + } + } - std::vector index_class_2; - std::vector out_class = clusterer.model_class.inference(input_data); - // LOG(info) << "input_data.size(): " << input_data.size() << "; write_idx: " << write_idx << "; out_class.size(): " << out_class.size(); - int num_output_classes = clusterer.model_class.getNumOutputNodes()[0][1]; - - if(num_output_classes > 1){ - std::vector 
tmp_out_class(clusterer.nnClusterizerBatchedMode); - for(int cls_idx = 0; cls_idx < clusterer.nnClusterizerBatchedMode; cls_idx++){ - auto elem_iterator = out_class.begin() + (unsigned int)(cls_idx*num_output_classes); - tmp_out_class[cls_idx] = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator+num_output_classes)) - 1; // -1 since 2-class classifier will have 3 outputs: classes 0, 1, 2 - if(tmp_out_class[cls_idx] > 1){ - index_class_2.push_back(cls_idx); - } + std::vector index_class_2; + std::vector out_class = clusterer.model_class.inference(input_data); + // LOG(info) << "input_data.size(): " << input_data.size() << "; write_idx: " << write_idx << "; out_class.size(): " << out_class.size(); + int num_output_classes = clusterer.model_class.getNumOutputNodes()[0][1]; + + if (num_output_classes > 1) { + std::vector tmp_out_class(clusterer.nnClusterizerBatchedMode); + for (int cls_idx = 0; cls_idx < clusterer.nnClusterizerBatchedMode; cls_idx++) { + auto elem_iterator = out_class.begin() + (unsigned int)(cls_idx * num_output_classes); + tmp_out_class[cls_idx] = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + num_output_classes)) - 1; // -1 since 2-class classifier will have 3 outputs: classes 0, 1, 2 + if (tmp_out_class[cls_idx] > 1) { + index_class_2.push_back(cls_idx); } - out_class = tmp_out_class; } + out_class = tmp_out_class; + } - if(!clusterer.nnClusterizerUseCFregression) { + if (!clusterer.nnClusterizerUseCFregression) { - std::vector out_reg = clusterer.model_reg_1.inference(input_data), tmp_out_reg_2; - if(index_class_2.size() > 0){ - std::vector tmp_in_reg_2(index_class_2.size() * clusterer.nnClusterizerElementSize); - int fill_counter = 0; - for(int cls_idx : index_class_2){ - int from_idx = cls_idx*clusterer.nnClusterizerElementSize, to_idx = fill_counter * clusterer.nnClusterizerElementSize; - for(int reg_idx = 0; reg_idx < clusterer.nnClusterizerElementSize; reg_idx++){ - tmp_in_reg_2[to_idx + reg_idx] = input_data[from_idx + reg_idx]; - } - fill_counter++; + std::vector out_reg = clusterer.model_reg_1.inference(input_data), tmp_out_reg_2; + if (index_class_2.size() > 0) { + std::vector tmp_in_reg_2(index_class_2.size() * clusterer.nnClusterizerElementSize); + int fill_counter = 0; + for (int cls_idx : index_class_2) { + int from_idx = cls_idx * clusterer.nnClusterizerElementSize, to_idx = fill_counter * clusterer.nnClusterizerElementSize; + for (int reg_idx = 0; reg_idx < clusterer.nnClusterizerElementSize; reg_idx++) { + tmp_in_reg_2[to_idx + reg_idx] = input_data[from_idx + reg_idx]; } - tmp_out_reg_2 = clusterer.model_reg_2.inference(input_data); - } - - input_data.clear(); - - if((clusterer.nnClusterizerVerbosity >= 4) && glo_idx == 0){ - LOG(info) << "[CF] Classification model: " << out_class[0] << " (>? 
" << clusterer.nnClassThreshold << ")"; - LOG(info) << "[CF] Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4]; - } - - int num_outputs_1 = clusterer.model_reg_1.getNumOutputNodes()[0][1], num_outputs_2 = 0, counter_class_2_idcs = 0; - if(num_output_classes > 1){ - num_outputs_2 = clusterer.model_reg_2.getNumOutputNodes()[0][1]; + fill_counter++; } + tmp_out_reg_2 = clusterer.model_reg_2.inference(input_data); + } - for(int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) { - - if (glo_idx + element >= clusternum) { - return; - } - - int model_output_index = element*num_outputs_1; - if(out_class[element] > clusterer.nnClassThreshold) { - if((num_output_classes == 1) || ((num_output_classes > 1) && (out_class[element] < 2))) { - // CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); - ClusterAccumulator pc; - - ClusterAccumulator dummy_pc; - CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); - - // Dummy build to push MC labels - buildCluster( - calib, - chargeMap, - peak_positions[element], - smem.posBcast, - smem.buf, - smem.innerAboveThreshold, - &dummy_pc, - labelAcc); - - if (fragment.isOverlap(peak_positions[element].time())) { - if (clusterPosInRow) { - clusterPosInRow[glo_idx + element] = maxClusterPerRow; - } - continue; - } + input_data.clear(); - pc.setFull(central_charges[element] * out_reg[model_output_index + 4], peak_positions[element].pad() + out_reg[model_output_index + 0], out_reg[model_output_index + 2], fragment.start + peak_positions[element].time() + out_reg[model_output_index + 1], out_reg[model_output_index + 3], 0, 0); - // LOG(info) << "Example: " << num_outputs_1 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; - - tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); - if (rejectCluster) { - if(clusterer.nnClusterizerVerbosity > 3){ - LOG(warning) << "[CF] Cluster rejected!"; - } - if (clusterPosInRow) { - clusterPosInRow[glo_idx + element] = maxClusterPerRow; - } - continue; - } - - uint rowIndex = 0; - if (clusterByRow != nullptr) { - rowIndex = sortIntoBuckets( - clusterer, - myCluster, - peak_positions[element].row(), - maxClusterPerRow, - clusterInRow, - clusterByRow); - if (clusterPosInRow != nullptr) { - clusterPosInRow[glo_idx + element] = rowIndex; - } - } else if (clusterPosInRow) { - rowIndex = clusterPosInRow[glo_idx + element]; - } - CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); - } else { - model_output_index = index_class_2[counter_class_2_idcs]*num_outputs_2; - counter_class_2_idcs++; - - // Cluster 1 - CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); - ClusterAccumulator pc; - - if (fragment.isOverlap(peak_positions[element].time())) { - if (clusterPosInRow) { - clusterPosInRow[glo_idx + element] = maxClusterPerRow; - } - continue; - } + if ((clusterer.nnClusterizerVerbosity >= 4) && glo_idx == 0) { + LOG(info) << "[CF] Classification model: " << out_class[0] << " (>? 
" << clusterer.nnClassThreshold << ")"; + LOG(info) << "[CF] Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4]; + } - pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 8], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 4], tmp_out_reg_2[model_output_index + 2], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 2], tmp_out_reg_2[model_output_index + 6], 0, 0); - // LOG(info) << "Example: " << num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; - - tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); - if (rejectCluster) { - if(clusterer.nnClusterizerVerbosity > 3){ - LOG(warning) << "[CF] Cluster rejected!"; - } - if (clusterPosInRow) { - clusterPosInRow[glo_idx + element] = maxClusterPerRow; - } - continue; - } + int num_outputs_1 = clusterer.model_reg_1.getNumOutputNodes()[0][1], num_outputs_2 = 0, counter_class_2_idcs = 0; + if (num_output_classes > 1) { + num_outputs_2 = clusterer.model_reg_2.getNumOutputNodes()[0][1]; + } - uint rowIndex = 0; - if (clusterByRow != nullptr) { - rowIndex = sortIntoBuckets( - clusterer, - myCluster, - peak_positions[element].row(), - maxClusterPerRow, - clusterInRow, - clusterByRow); - if (clusterPosInRow != nullptr) { - clusterPosInRow[glo_idx + element] = rowIndex; - } - } else if (clusterPosInRow) { - rowIndex = clusterPosInRow[glo_idx + element]; - } - CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); - - // Cluster 2 - CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); - pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 9], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 1], tmp_out_reg_2[model_output_index + 5], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 3], tmp_out_reg_2[model_output_index + 7], 0, 0); - // LOG(info) << "Example: " << num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; - rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); - if (rejectCluster) { - if(clusterer.nnClusterizerVerbosity > 3){ - LOG(warning) << "[CF] Cluster rejected!"; - } - if (clusterPosInRow) { - clusterPosInRow[glo_idx + element] = maxClusterPerRow; - } - continue; - } + for (int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) { - rowIndex = 0; - if (clusterByRow != nullptr) { - rowIndex = sortIntoBuckets( - clusterer, - myCluster, - peak_positions[element].row(), - maxClusterPerRow, - clusterInRow, - clusterByRow); - if (clusterPosInRow != nullptr) { - clusterPosInRow[glo_idx + element] = rowIndex; - } - } else if (clusterPosInRow) { - rowIndex = clusterPosInRow[glo_idx + element]; - } - CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); - } - } + if (glo_idx + element >= clusternum) { + return; } - } else { - - input_data.clear(); - for(int element = 0; element 
< clusterer.nnClusterizerBatchedMode; element++) { - if (glo_idx + element >= clusternum) { - return; - } - - if(out_class[element] > clusterer.nnClassThreshold) { - + int model_output_index = element * num_outputs_1; + if (out_class[element] > clusterer.nnClassThreshold) { + if ((num_output_classes == 1) || ((num_output_classes > 1) && (out_class[element] < 2))) { + // CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); ClusterAccumulator pc; + + ClusterAccumulator dummy_pc; CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); + // Dummy build to push MC labels buildCluster( calib, chargeMap, @@ -370,7 +228,7 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i smem.posBcast, smem.buf, smem.innerAboveThreshold, - &pc, + &dummy_pc, labelAcc); if (fragment.isOverlap(peak_positions[element].time())) { @@ -379,20 +237,67 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i } continue; } - pc.finalize(peak_positions[element], central_charges[element], fragment.start, clusterer.Param().tpcGeometry); + + pc.setFull(central_charges[element] * out_reg[model_output_index + 4], peak_positions[element].pad() + out_reg[model_output_index + 0], out_reg[model_output_index + 2], fragment.start + peak_positions[element].time() + out_reg[model_output_index + 1], out_reg[model_output_index + 3], 0, 0); + // LOG(info) << "Example: " << num_outputs_1 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; tpc::ClusterNative myCluster; bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); + if (rejectCluster) { + if (clusterer.nnClusterizerVerbosity > 3) { + LOG(warning) << "[CF] Cluster rejected!"; + } + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + + uint rowIndex = 0; + if (clusterByRow != nullptr) { + rowIndex = sortIntoBuckets( + clusterer, + myCluster, + peak_positions[element].row(), + maxClusterPerRow, + clusterInRow, + clusterByRow); + if (clusterPosInRow != nullptr) { + clusterPosInRow[glo_idx + element] = rowIndex; + } + } else if (clusterPosInRow) { + rowIndex = clusterPosInRow[glo_idx + element]; + } + CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); + } else { + model_output_index = index_class_2[counter_class_2_idcs] * num_outputs_2; + counter_class_2_idcs++; + // Cluster 1 + CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); + ClusterAccumulator pc; + + if (fragment.isOverlap(peak_positions[element].time())) { + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + + pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 8], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 4], tmp_out_reg_2[model_output_index + 2], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 2], tmp_out_reg_2[model_output_index + 6], 0, 0); + // LOG(info) << "Example: " << num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; + 
+ tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); if (rejectCluster) { - if(clusterer.nnClusterizerVerbosity > 3){ - LOG(warning) << "[CF] Cluster rejected!"; - } - if (clusterPosInRow) { - clusterPosInRow[glo_idx + element] = maxClusterPerRow; - } - continue; + if (clusterer.nnClusterizerVerbosity > 3) { + LOG(warning) << "[CF] Cluster rejected!"; + } + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; } + continue; + } uint rowIndex = 0; if (clusterByRow != nullptr) { @@ -409,18 +314,112 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i } else if (clusterPosInRow) { rowIndex = clusterPosInRow[glo_idx + element]; } + CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); + // Cluster 2 + CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); + pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 9], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 1], tmp_out_reg_2[model_output_index + 5], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 3], tmp_out_reg_2[model_output_index + 7], 0, 0); + // LOG(info) << "Example: " << num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; + rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); + if (rejectCluster) { + if (clusterer.nnClusterizerVerbosity > 3) { + LOG(warning) << "[CF] Cluster rejected!"; + } + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + + rowIndex = 0; + if (clusterByRow != nullptr) { + rowIndex = sortIntoBuckets( + clusterer, + myCluster, + peak_positions[element].row(), + maxClusterPerRow, + clusterInRow, + clusterByRow); + if (clusterPosInRow != nullptr) { + clusterPosInRow[glo_idx + element] = rowIndex; + } + } else if (clusterPosInRow) { + rowIndex = clusterPosInRow[glo_idx + element]; + } CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); } } } - if(clusterer.nnClusterizerVerbosity > 4){ - LOG(info) << "[CF] Clusterization done!"; - } -} + } else { + + input_data.clear(); + for (int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) { + if (glo_idx + element >= clusternum) { + return; + } + + if (out_class[element] > clusterer.nnClassThreshold) { + ClusterAccumulator pc; + CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); + buildCluster( + calib, + chargeMap, + peak_positions[element], + smem.posBcast, + smem.buf, + smem.innerAboveThreshold, + &pc, + labelAcc); + + if (fragment.isOverlap(peak_positions[element].time())) { + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + pc.finalize(peak_positions[element], central_charges[element], fragment.start, clusterer.Param().tpcGeometry); + + tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); + + if (rejectCluster) { + if (clusterer.nnClusterizerVerbosity > 3) { + LOG(warning) << "[CF] Cluster rejected!"; + } + if (clusterPosInRow) { + clusterPosInRow[glo_idx 
+ element] = maxClusterPerRow; + } + continue; + } + + uint rowIndex = 0; + if (clusterByRow != nullptr) { + rowIndex = sortIntoBuckets( + clusterer, + myCluster, + peak_positions[element].row(), + maxClusterPerRow, + clusterInRow, + clusterByRow); + if (clusterPosInRow != nullptr) { + clusterPosInRow[glo_idx + element] = rowIndex; + } + } else if (clusterPosInRow) { + rowIndex = clusterPosInRow[glo_idx + element]; + } + + CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); + } + } + } + + if (clusterer.nnClusterizerVerbosity > 4) { + LOG(info) << "[CF] Clusterization done!"; + } +} GPUdii() void GPUTPCNNClusterizer::computeClustersImpl(int nBlocks, int nThreads, int iBlock, int iThread, processorType& clusterer, diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 51a5c29022421..98d979d28cf15 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -66,20 +66,20 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate static int rowOffset(int, int); static bool isBoundary(int, int, int, const GPUTPCGeometry&); - template + template static GPUd() void nn_clusterizer(int, int, int, int, - processorType&, - const CfFragment&, - GPUSharedMemory&, - const Array2D&, - const ChargePos*, - const GPUSettingsRec&, - MCLabelAccumulator*, - uint, - uint, - uint*, - tpc::ClusterNative*, - uint*); + processorType&, + const CfFragment&, + GPUSharedMemory&, + const Array2D&, + const ChargePos*, + const GPUSettingsRec&, + MCLabelAccumulator*, + uint, + uint, + uint*, + tpc::ClusterNative*, + uint*); private: // --------------------------------- From 25093b33e1472d21a14e6396aa1d9fe1953d6b1b Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 18 Nov 2024 12:50:31 +0100 Subject: [PATCH 16/23] Adding an ONNX CPU library in the O2 framework --- Common/CMakeLists.txt | 1 + Common/ML/CMakeLists.txt | 15 + Common/ML/include/ML/3rdparty/GPUORTFloat16.h | 867 ++++++++++++++++++ Common/ML/include/ML/ort_interface.h | 94 ++ Common/ML/src/ort_interface.cxx | 262 ++++++ 5 files changed, 1239 insertions(+) create mode 100644 Common/ML/CMakeLists.txt create mode 100644 Common/ML/include/ML/3rdparty/GPUORTFloat16.h create mode 100644 Common/ML/include/ML/ort_interface.h create mode 100644 Common/ML/src/ort_interface.cxx diff --git a/Common/CMakeLists.txt b/Common/CMakeLists.txt index f435e269575aa..0b92758e45f43 100644 --- a/Common/CMakeLists.txt +++ b/Common/CMakeLists.txt @@ -16,5 +16,6 @@ add_subdirectory(Types) add_subdirectory(Utils) add_subdirectory(SimConfig) add_subdirectory(DCAFitter) +add_subdirectory(ML) o2_data_file(COPY maps DESTINATION Common) diff --git a/Common/ML/CMakeLists.txt b/Common/ML/CMakeLists.txt new file mode 100644 index 0000000000000..954d29d6e2793 --- /dev/null +++ b/Common/ML/CMakeLists.txt @@ -0,0 +1,15 @@ +# Copyright 2019-2020 CERN and copyright holders of ALICE O2. +# See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +# All rights not expressly granted are reserved. +# +# This software is distributed under the terms of the GNU General Public +# License v3 (GPL Version 3), copied verbatim in the file "COPYING". +# +# In applying this license CERN does not waive the privileges and immunities +# granted to it by virtue of its status as an Intergovernmental Organization +# or submit itself to any jurisdiction. 
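+# Linking sketch (hypothetical consumer target, shown for illustration only): a library
+# that needs the ORT wrapper can pull it in through the exported O2::ML alias, e.g.
+#   o2_add_library(MyInferenceLib
+#                  SOURCES src/MyInference.cxx
+#                  PRIVATE_LINK_LIBRARIES O2::ML)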
+ +o2_add_library(ML + SOURCES src/ort_interface.cxx + TARGETVARNAME targetName + PRIVATE_LINK_LIBRARIES O2::Framework ONNXRuntime::ONNXRuntime) \ No newline at end of file diff --git a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h new file mode 100644 index 0000000000000..db65328409d3c --- /dev/null +++ b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h @@ -0,0 +1,867 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +// This code was created from: +// - https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_float16.h +// - https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_cxx_api.h + +#include +#include +#include +#include + +namespace o2 +{ + +namespace OrtDataType +{ + +namespace detail +{ + +enum class endian { +#if defined(_WIN32) + little = 0, + big = 1, + native = little, +#elif defined(__GNUC__) || defined(__clang__) + little = __ORDER_LITTLE_ENDIAN__, + big = __ORDER_BIG_ENDIAN__, + native = __BYTE_ORDER__, +#else +#error OrtDataType::detail::endian is not implemented in this environment. +#endif +}; + +static_assert( + endian::native == endian::little || endian::native == endian::big, + "Only little-endian or big-endian native byte orders are supported."); + +} // namespace detail + +/// +/// Shared implementation between public and internal classes. CRTP pattern. +/// +template +struct Float16Impl { + protected: + /// + /// Converts from float to uint16_t float16 representation + /// + /// + /// + constexpr static uint16_t ToUint16Impl(float v) noexcept; + + /// + /// Converts float16 to float + /// + /// float representation of float16 value + float ToFloatImpl() const noexcept; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + uint16_t AbsImpl() const noexcept + { + return static_cast(val & ~kSignMask); + } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + uint16_t NegateImpl() const noexcept + { + return IsNaN() ? val : static_cast(val ^ kSignMask); + } + + public: + // uint16_t special values + static constexpr uint16_t kSignMask = 0x8000U; + static constexpr uint16_t kBiasedExponentMask = 0x7C00U; + static constexpr uint16_t kPositiveInfinityBits = 0x7C00U; + static constexpr uint16_t kNegativeInfinityBits = 0xFC00U; + static constexpr uint16_t kPositiveQNaNBits = 0x7E00U; + static constexpr uint16_t kNegativeQNaNBits = 0xFE00U; + static constexpr uint16_t kEpsilonBits = 0x4170U; + static constexpr uint16_t kMinValueBits = 0xFBFFU; // Minimum normal number + static constexpr uint16_t kMaxValueBits = 0x7BFFU; // Largest normal number + static constexpr uint16_t kOneBits = 0x3C00U; + static constexpr uint16_t kMinusOneBits = 0xBC00U; + + uint16_t val{0}; + + Float16Impl() = default; + + /// + /// Checks if the value is negative + /// + /// true if negative + bool IsNegative() const noexcept + { + return static_cast(val) < 0; + } + + /// + /// Tests if the value is NaN + /// + /// true if NaN + bool IsNaN() const noexcept + { + return AbsImpl() > kPositiveInfinityBits; + } + + /// + /// Tests if the value is finite + /// + /// true if finite + bool IsFinite() const noexcept + { + return AbsImpl() < kPositiveInfinityBits; + } + + /// + /// Tests if the value represents positive infinity. 
+ /// + /// true if positive infinity + bool IsPositiveInfinity() const noexcept + { + return val == kPositiveInfinityBits; + } + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + bool IsNegativeInfinity() const noexcept + { + return val == kNegativeInfinityBits; + } + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + bool IsInfinity() const noexcept + { + return AbsImpl() == kPositiveInfinityBits; + } + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. + bool IsNaNOrZero() const noexcept + { + auto abs = AbsImpl(); + return (abs == 0 || abs > kPositiveInfinityBits); + } + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). + /// + /// True if so + bool IsNormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) != 0); // is not subnormal (has a non-zero exponent) + } + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + bool IsSubnormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) == 0); // is subnormal (has a zero exponent) + } + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + Derived Abs() const noexcept { return Derived::FromBits(AbsImpl()); } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + Derived Negate() const noexcept { return Derived::FromBits(NegateImpl()); } + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + static bool AreZero(const Float16Impl& lhs, const Float16Impl& rhs) noexcept + { + return static_cast((lhs.val | rhs.val) & ~kSignMask) == 0; + } + + bool operator==(const Float16Impl& rhs) const noexcept + { + if (IsNaN() || rhs.IsNaN()) { + // IEEE defines that NaN is not equal to anything, including itself. + return false; + } + return val == rhs.val; + } + + bool operator!=(const Float16Impl& rhs) const noexcept { return !(*this == rhs); } + + bool operator<(const Float16Impl& rhs) const noexcept + { + if (IsNaN() || rhs.IsNaN()) { + // IEEE defines that NaN is unordered with respect to everything, including itself. + return false; + } + + const bool left_is_negative = IsNegative(); + if (left_is_negative != rhs.IsNegative()) { + // When the signs of left and right differ, we know that left is less than right if it is + // the negative value. The exception to this is if both values are zero, in which case IEEE + // says they should be equal, even if the signs differ. + return left_is_negative && !AreZero(*this, rhs); + } + return (val != rhs.val) && ((val < rhs.val) ^ left_is_negative); + } +}; + +// The following Float16_t conversions are based on the code from +// Eigen library. + +// The conversion routines are Copyright (c) Fabian Giesen, 2016. +// The original license follows: +// +// Copyright (c) Fabian Giesen, 2016 +// All rights reserved. 
+// Redistribution and use in source and binary forms, with or without +// modification, are permitted. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +namespace detail +{ +union float32_bits { + unsigned int u; + float f; +}; +}; // namespace detail + +template +inline constexpr uint16_t Float16Impl::ToUint16Impl(float v) noexcept +{ + detail::float32_bits f{}; + f.f = v; + + constexpr detail::float32_bits f32infty = {255 << 23}; + constexpr detail::float32_bits f16max = {(127 + 16) << 23}; + constexpr detail::float32_bits denorm_magic = {((127 - 15) + (23 - 10) + 1) << 23}; + constexpr unsigned int sign_mask = 0x80000000u; + uint16_t val = static_cast(0x0u); + + unsigned int sign = f.u & sign_mask; + f.u ^= sign; + + // NOTE all the integer compares in this function can be safely + // compiled into signed compares since all operands are below + // 0x80000000. Important if you want fast straight SSE2 code + // (since there's no unsigned PCMPGTD). + + if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) + val = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf + } else { // (De)normalized number or zero + if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero + // use a magic value to align our 10 mantissa bits at the bottom of + // the float. as long as FP addition is round-to-nearest-even this + // just works. + f.f += denorm_magic.f; + + // and one integer subtract of the bias later, we have our final float! + val = static_cast(f.u - denorm_magic.u); + } else { + unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd + + // update exponent, rounding bias part 1 + // Equivalent to `f.u += ((unsigned int)(15 - 127) << 23) + 0xfff`, but + // without arithmetic overflow. + f.u += 0xc8000fffU; + // rounding bias part 2 + f.u += mant_odd; + // take the bits! + val = static_cast(f.u >> 13); + } + } + + val |= static_cast(sign >> 16); + return val; +} + +template +inline float Float16Impl::ToFloatImpl() const noexcept +{ + constexpr detail::float32_bits magic = {113 << 23}; + constexpr unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift + detail::float32_bits o{}; + + o.u = (val & 0x7fff) << 13; // exponent/mantissa bits + unsigned int exp = shifted_exp & o.u; // just the exponent + o.u += (127 - 15) << 23; // exponent adjust + + // handle exponent special cases + if (exp == shifted_exp) { // Inf/NaN? + o.u += (128 - 16) << 23; // extra exp adjust + } else if (exp == 0) { // Zero/Denormal? 
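+    // Editorial gloss (behavior unchanged; this restates the Eigen/fgiesen trick used
+    // here): 'magic' is 113 << 23, i.e. 2^-14 as a float. A subnormal half stores a
+    // zero exponent, so after the unconditional (127 - 15) exponent shift above,
+    // bumping the exponent field by one and subtracting 2^-14 in float arithmetic
+    // lets the FPU renormalize the mantissa for us.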
+ o.u += 1 << 23; // extra exp adjust + o.f -= magic.f; // re-normalize + } + + // Attempt to workaround the Internal Compiler Error on ARM64 + // for bitwise | operator, including std::bitset +#if (defined _MSC_VER) && (defined _M_ARM || defined _M_ARM64 || defined _M_ARM64EC) + if (IsNegative()) { + return -o.f; + } +#else + // original code: + o.u |= (val & 0x8000U) << 16U; // sign bit +#endif + return o.f; +} + +/// Shared implementation between public and internal classes. CRTP pattern. +template +struct BFloat16Impl { + protected: + /// + /// Converts from float to uint16_t float16 representation + /// + /// + /// + static uint16_t ToUint16Impl(float v) noexcept; + + /// + /// Converts bfloat16 to float + /// + /// float representation of bfloat16 value + float ToFloatImpl() const noexcept; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + uint16_t AbsImpl() const noexcept + { + return static_cast(val & ~kSignMask); + } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + uint16_t NegateImpl() const noexcept + { + return IsNaN() ? val : static_cast(val ^ kSignMask); + } + + public: + // uint16_t special values + static constexpr uint16_t kSignMask = 0x8000U; + static constexpr uint16_t kBiasedExponentMask = 0x7F80U; + static constexpr uint16_t kPositiveInfinityBits = 0x7F80U; + static constexpr uint16_t kNegativeInfinityBits = 0xFF80U; + static constexpr uint16_t kPositiveQNaNBits = 0x7FC1U; + static constexpr uint16_t kNegativeQNaNBits = 0xFFC1U; + static constexpr uint16_t kSignaling_NaNBits = 0x7F80U; + static constexpr uint16_t kEpsilonBits = 0x0080U; + static constexpr uint16_t kMinValueBits = 0xFF7FU; + static constexpr uint16_t kMaxValueBits = 0x7F7FU; + static constexpr uint16_t kRoundToNearest = 0x7FFFU; + static constexpr uint16_t kOneBits = 0x3F80U; + static constexpr uint16_t kMinusOneBits = 0xBF80U; + + uint16_t val{0}; + + BFloat16Impl() = default; + + /// + /// Checks if the value is negative + /// + /// true if negative + bool IsNegative() const noexcept + { + return static_cast(val) < 0; + } + + /// + /// Tests if the value is NaN + /// + /// true if NaN + bool IsNaN() const noexcept + { + return AbsImpl() > kPositiveInfinityBits; + } + + /// + /// Tests if the value is finite + /// + /// true if finite + bool IsFinite() const noexcept + { + return AbsImpl() < kPositiveInfinityBits; + } + + /// + /// Tests if the value represents positive infinity. + /// + /// true if positive infinity + bool IsPositiveInfinity() const noexcept + { + return val == kPositiveInfinityBits; + } + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + bool IsNegativeInfinity() const noexcept + { + return val == kNegativeInfinityBits; + } + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + bool IsInfinity() const noexcept + { + return AbsImpl() == kPositiveInfinityBits; + } + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. + bool IsNaNOrZero() const noexcept + { + auto abs = AbsImpl(); + return (abs == 0 || abs > kPositiveInfinityBits); + } + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). 
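+  /// A value is normal when its biased exponent is neither all zeros nor all ones.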
+ /// + /// True if so + bool IsNormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) != 0); // is not subnormal (has a non-zero exponent) + } + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + bool IsSubnormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) == 0); // is subnormal (has a zero exponent) + } + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + Derived Abs() const noexcept { return Derived::FromBits(AbsImpl()); } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + Derived Negate() const noexcept { return Derived::FromBits(NegateImpl()); } + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + static bool AreZero(const BFloat16Impl& lhs, const BFloat16Impl& rhs) noexcept + { + // IEEE defines that positive and negative zero are equal, this gives us a quick equality check + // for two values by or'ing the private bits together and stripping the sign. They are both zero, + // and therefore equivalent, if the resulting value is still zero. + return static_cast((lhs.val | rhs.val) & ~kSignMask) == 0; + } +}; + +template +inline uint16_t BFloat16Impl::ToUint16Impl(float v) noexcept +{ + uint16_t result; + if (std::isnan(v)) { + result = kPositiveQNaNBits; + } else { + auto get_msb_half = [](float fl) { + uint16_t result; +#ifdef __cpp_if_constexpr + if constexpr (detail::endian::native == detail::endian::little) +#else + if (detail::endian::native == detail::endian::little) +#endif + { + std::memcpy(&result, reinterpret_cast(&fl) + sizeof(uint16_t), sizeof(uint16_t)); + } else { + std::memcpy(&result, &fl, sizeof(uint16_t)); + } + return result; + }; + + uint16_t upper_bits = get_msb_half(v); + union { + uint32_t U32; + float F32; + }; + F32 = v; + U32 += (upper_bits & 1) + kRoundToNearest; + result = get_msb_half(F32); + } + return result; +} + +template +inline float BFloat16Impl::ToFloatImpl() const noexcept +{ + if (IsNaN()) { + return std::numeric_limits::quiet_NaN(); + } + float result; + char* const first = reinterpret_cast(&result); + char* const second = first + sizeof(uint16_t); +#ifdef __cpp_if_constexpr + if constexpr (detail::endian::native == detail::endian::little) +#else + if (detail::endian::native == detail::endian::little) +#endif + { + std::memset(first, 0, sizeof(uint16_t)); + std::memcpy(second, &val, sizeof(uint16_t)); + } else { + std::memcpy(first, &val, sizeof(uint16_t)); + std::memset(second, 0, sizeof(uint16_t)); + } + return result; +} + +/** \brief IEEE 754 half-precision floating point data type + * + * \details This struct is used for converting float to float16 and back + * so the user could feed inputs and fetch outputs using these type. + * + * The size of the structure should align with uint16_t and one can freely cast + * uint16_t buffers to/from Ort::Float16_t to feed and retrieve data. 
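+ *
+ * (Editor's note, illustrative addition) The bit pattern can also be set and
+ * read back directly, without going through a float conversion:
+ *
+ * \code{.unparsed}
+ * Ort::Float16_t one = Ort::Float16_t::FromBits(0x3C00); // 0x3C00 is 1.0 in IEEE fp16
+ * float f = one.ToFloat();                               // f == 1.0f
+ * \endcode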
+ *
+ * \code{.unparsed}
+ * // This example demonstrates conversion from float to float16
+ * constexpr float values[] = {1.f, 2.f, 3.f, 4.f, 5.f};
+ * std::vector<Ort::Float16_t> fp16_values;
+ * fp16_values.reserve(std::size(values));
+ * std::transform(std::begin(values), std::end(values), std::back_inserter(fp16_values),
+ *                [](float value) { return Ort::Float16_t(value); });
+ *
+ * \endcode
+ */
+struct Float16_t : OrtDataType::Float16Impl<Float16_t> {
+ private:
+  ///
+  /// Constructor from a 16-bit representation of a float16 value
+  /// No conversion is done here.
+  ///
+  /// 16-bit representation
+  constexpr explicit Float16_t(uint16_t v) noexcept { val = v; }
+
+ public:
+  using Base = OrtDataType::Float16Impl<Float16_t>;
+
+  ///
+  /// Default constructor
+  ///
+  Float16_t() = default;
+
+  ///
+  /// Explicit conversion to uint16_t representation of float16.
+  ///
+  /// uint16_t bit representation of float16
+  /// new instance of Float16_t
+  constexpr static Float16_t FromBits(uint16_t v) noexcept { return Float16_t(v); }
+
+  ///
+  /// Constructor from float. Float is converted into float16 16-bit representation.
+  ///
+  /// float value
+  explicit Float16_t(float v) noexcept { val = Base::ToUint16Impl(v); }
+
+  ///
+  /// Converts float16 to float
+  ///
+  /// float representation of float16 value
+  float ToFloat() const noexcept { return Base::ToFloatImpl(); }
+
+  ///
+  /// Checks if the value is negative
+  ///
+  /// true if negative
+  using Base::IsNegative;
+
+  ///
+  /// Tests if the value is NaN
+  ///
+  /// true if NaN
+  using Base::IsNaN;
+
+  ///
+  /// Tests if the value is finite
+  ///
+  /// true if finite
+  using Base::IsFinite;
+
+  ///
+  /// Tests if the value represents positive infinity.
+  ///
+  /// true if positive infinity
+  using Base::IsPositiveInfinity;
+
+  ///
+  /// Tests if the value represents negative infinity
+  ///
+  /// true if negative infinity
+  using Base::IsNegativeInfinity;
+
+  ///
+  /// Tests if the value is either positive or negative infinity.
+  ///
+  /// True if absolute value is infinity
+  using Base::IsInfinity;
+
+  ///
+  /// Tests if the value is NaN or zero. Useful for comparisons.
+  ///
+  /// True if NaN or zero.
+  using Base::IsNaNOrZero;
+
+  ///
+  /// Tests if the value is normal (not zero, subnormal, infinite, or NaN).
+  ///
+  /// True if so
+  using Base::IsNormal;
+
+  ///
+  /// Tests if the value is subnormal (denormal).
+  ///
+  /// True if so
+  using Base::IsSubnormal;
+
+  ///
+  /// Creates an instance that represents absolute value.
+  ///
+  /// Absolute value
+  using Base::Abs;
+
+  ///
+  /// Creates a new instance with the sign flipped.
+  ///
+  /// Flipped sign instance
+  using Base::Negate;
+
+  ///
+  /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check
+  /// for two values by or'ing the private bits together and stripping the sign. They are both zero,
+  /// and therefore equivalent, if the resulting value is still zero.
+  ///
+  /// first value
+  /// second value
+  /// True if both arguments represent zero
+  using Base::AreZero;
+
+  ///
+  /// User defined conversion operator. Converts Float16_t to float.
+  ///
+  explicit operator float() const noexcept { return ToFloat(); }
+
+  using Base::operator==;
+  using Base::operator!=;
+  using Base::operator<;
+};
+
+static_assert(sizeof(Float16_t) == sizeof(uint16_t), "Sizes must match");
+
+/** \brief bfloat16 (Brain Floating Point) data type
+ *
+ * \details This struct is used for converting float to bfloat16 and back
+ * so the user can feed inputs and fetch outputs using this type.
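+ *
+ * (Editor's note) bfloat16 keeps the full 8-bit exponent of float32 and only
+ * 7 explicit mantissa bits, so the conversion below is essentially a
+ * round-to-nearest-even copy of the upper 16 bits of the float32 bit pattern.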
+ *
+ * The size of the structure should align with uint16_t and one can freely cast
+ * uint16_t buffers to/from Ort::BFloat16_t to feed and retrieve data.
+ *
+ * \code{.unparsed}
+ * // This example demonstrates conversion from float to bfloat16
+ * constexpr float values[] = {1.f, 2.f, 3.f, 4.f, 5.f};
+ * std::vector<Ort::BFloat16_t> bfp16_values;
+ * bfp16_values.reserve(std::size(values));
+ * std::transform(std::begin(values), std::end(values), std::back_inserter(bfp16_values),
+ *                [](float value) { return Ort::BFloat16_t(value); });
+ *
+ * \endcode
+ */
+struct BFloat16_t : OrtDataType::BFloat16Impl<BFloat16_t> {
+ private:
+  ///
+  /// Constructor from a uint16_t representation of bfloat16
+  /// used in FromBits() to avoid an overload resolution issue with the
+  /// constructor from float.
+  /// No conversion is done.
+  ///
+  /// 16-bit bfloat16 value
+  constexpr explicit BFloat16_t(uint16_t v) noexcept { val = v; }
+
+ public:
+  using Base = OrtDataType::BFloat16Impl<BFloat16_t>;
+
+  BFloat16_t() = default;
+
+  ///
+  /// Explicit conversion to uint16_t representation of bfloat16.
+  ///
+  /// uint16_t bit representation of bfloat16
+  /// new instance of BFloat16_t
+  static constexpr BFloat16_t FromBits(uint16_t v) noexcept { return BFloat16_t(v); }
+
+  ///
+  /// Constructor from float. Float is converted into bfloat16 16-bit representation.
+  ///
+  /// float value
+  explicit BFloat16_t(float v) noexcept { val = Base::ToUint16Impl(v); }
+
+  ///
+  /// Converts bfloat16 to float
+  ///
+  /// float representation of bfloat16 value
+  float ToFloat() const noexcept { return Base::ToFloatImpl(); }
+
+  ///
+  /// Checks if the value is negative
+  ///
+  /// true if negative
+  using Base::IsNegative;
+
+  ///
+  /// Tests if the value is NaN
+  ///
+  /// true if NaN
+  using Base::IsNaN;
+
+  ///
+  /// Tests if the value is finite
+  ///
+  /// true if finite
+  using Base::IsFinite;
+
+  ///
+  /// Tests if the value represents positive infinity.
+  ///
+  /// true if positive infinity
+  using Base::IsPositiveInfinity;
+
+  ///
+  /// Tests if the value represents negative infinity
+  ///
+  /// true if negative infinity
+  using Base::IsNegativeInfinity;
+
+  ///
+  /// Tests if the value is either positive or negative infinity.
+  ///
+  /// True if absolute value is infinity
+  using Base::IsInfinity;
+
+  ///
+  /// Tests if the value is NaN or zero. Useful for comparisons.
+  ///
+  /// True if NaN or zero.
+  using Base::IsNaNOrZero;
+
+  ///
+  /// Tests if the value is normal (not zero, subnormal, infinite, or NaN).
+  ///
+  /// True if so
+  using Base::IsNormal;
+
+  ///
+  /// Tests if the value is subnormal (denormal).
+  ///
+  /// True if so
+  using Base::IsSubnormal;
+
+  ///
+  /// Creates an instance that represents absolute value.
+  ///
+  /// Absolute value
+  using Base::Abs;
+
+  ///
+  /// Creates a new instance with the sign flipped.
+  ///
+  /// Flipped sign instance
+  using Base::Negate;
+
+  ///
+  /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check
+  /// for two values by or'ing the private bits together and stripping the sign. They are both zero,
+  /// and therefore equivalent, if the resulting value is still zero.
+  ///
+  /// first value
+  /// second value
+  /// True if both arguments represent zero
+  using Base::AreZero;
+
+  ///
+  /// User defined conversion operator. Converts BFloat16_t to float.
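+  /// The conversion is exact: every bfloat16 value is exactly representable as a float.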
+ /// + explicit operator float() const noexcept { return ToFloat(); } + + // We do not have an inherited impl for the below operators + // as the internal class implements them a little differently + bool operator==(const BFloat16_t& rhs) const noexcept; + bool operator!=(const BFloat16_t& rhs) const noexcept { return !(*this == rhs); } + bool operator<(const BFloat16_t& rhs) const noexcept; +}; + +static_assert(sizeof(BFloat16_t) == sizeof(uint16_t), "Sizes must match"); + +} // namespace OrtDataType + +} // namespace o2 \ No newline at end of file diff --git a/Common/ML/include/ML/ort_interface.h b/Common/ML/include/ML/ort_interface.h new file mode 100644 index 0000000000000..a365860db3279 --- /dev/null +++ b/Common/ML/include/ML/ort_interface.h @@ -0,0 +1,94 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file ort_interface.h +/// \author Christian Sonnabend +/// \brief A header library for loading ONNX models and inferencing them on CPU and GPU + +#ifndef O2_ML_ONNX_INTERFACE_H +#define O2_ML_ONNX_INTERFACE_H + +// C++ and system includes +#include +#include +#include +#include +#include + +// O2 includes +#include "Framework/Logger.h" + +namespace o2 +{ + +namespace ml +{ + +class OrtModel +{ + + public: + // Constructor + OrtModel() = default; + OrtModel(std::unordered_map optionsMap){ reset(optionsMap); } + void init(std::unordered_map optionsMap){ reset(optionsMap); } + void reset(std::unordered_map); + + virtual ~OrtModel() = default; + + // Conversion + template + std::vector v2v(std::vector&, bool = true); + + // Inferencing + template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h + std::vector inference(std::vector&); + + template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h + std::vector inference(std::vector>&); + + // template // class I is the input data type, e.g. 
float, class T the throughput data type and class O is the output data type + // std::vector inference(std::vector&); + + // Reset session + void resetSession(); + + std::vector> getNumInputNodes() const { return mInputShapes; } + std::vector> getNumOutputNodes() const { return mOutputShapes; } + std::vector getInputNames() const { return mInputNames; } + std::vector getOutputNames() const { return mOutputNames; } + + void setActiveThreads(int threads) { intraOpNumThreads = threads; } + + private: + + // ORT variables -> need to be hidden as Pimpl + struct OrtVariables; + OrtVariables* pImplOrt; + + // Input & Output specifications of the loaded network + std::vector inputNamesChar, outputNamesChar; + std::vector mInputNames, mOutputNames; + std::vector> mInputShapes, mOutputShapes; + + // Environment settings + std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda + int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; + + std::string printShape(const std::vector&); + +}; + +} // namespace ml + +} // namespace ml + +#endif // O2_ML_ORT_INTERFACE_H \ No newline at end of file diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx new file mode 100644 index 0000000000000..84a06ce1da068 --- /dev/null +++ b/Common/ML/src/ort_interface.cxx @@ -0,0 +1,262 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file ort_interface.cxx +/// \author Christian Sonnabend +/// \brief A header library for loading ONNX models and inferencing them on CPU and GPU + +#include "ML/ort_interface.h" +#include "ML/3rdparty/GPUORTFloat16.h" + +// ONNX includes +#include + +namespace o2 +{ + +namespace ml +{ + +struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file + // ORT runtime objects + Ort::RunOptions runOptions; + std::shared_ptr env = nullptr; + std::shared_ptr session = nullptr; ///< ONNX session + Ort::SessionOptions sessionOptions; + Ort::AllocatorWithDefaultOptions allocator; + Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); +}; + +void OrtModel::reset(std::unordered_map optionsMap){ + + pImplOrt = new OrtVariables(); + + // Load from options map + if(!optionsMap.contains("model-path")){ + LOG(fatal) << "(ORT) Model path cannot be empty!"; + } + modelPath = optionsMap["model-path"]; + device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU"); + dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); + deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); + allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); + intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); + loggingLevel = (optionsMap.contains("logging-level") ? 
std::stoi(optionsMap["logging-level"]) : 0); + enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); + enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); + + std::string dev_mem_str = "Hip"; +#ifdef ORT_ROCM_BUILD + if(device == "ROCM") { + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId)); + LOG(info) << "(ORT) ROCM execution provider set"; + } +#endif +#ifdef ORT_MIGRAPHX_BUILD + if(device == "MIGRAPHX") { + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId)); + LOG(info) << "(ORT) MIGraphX execution provider set"; + } +#endif +#ifdef ORT_CUDA_BUILD + if(device == "CUDA") { + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId)); + LOG(info) << "(ORT) CUDA execution provider set"; + dev_mem_str = "Cuda"; + } +#endif + + if(allocateDeviceMemory){ + pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); + LOG(info) << "(ORT) Memory info set to on-device memory"; + } + + if(device == "CPU") { + (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); + if(intraOpNumThreads > 1){ + (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); + } else if(intraOpNumThreads == 1){ + (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + } + LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " threads"; + } + + (pImplOrt->sessionOptions).DisableMemPattern(); + (pImplOrt->sessionOptions).DisableCpuMemArena(); + + if(enableProfiling){ + if(optionsMap.contains("profiling-output-path")){ + (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); + } else { + LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. Disabling profiling for now."; + (pImplOrt->sessionOptions).DisableProfiling(); + } + } else { + (pImplOrt->sessionOptions).DisableProfiling(); + } + (pImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); + (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); + + pImplOrt->env = std::make_shared(OrtLoggingLevel(loggingLevel), (optionsMap["onnx-environment-name"].empty() ? 
"onnx_model_inference" : optionsMap["onnx-environment-name"].c_str())); + (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); + + for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { + mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { + mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { + mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { + mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + + inputNamesChar.resize(mInputNames.size(), nullptr); + std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), + [&](const std::string& str) { return str.c_str(); }); + outputNamesChar.resize(mOutputNames.size(), nullptr); + std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), + [&](const std::string& str) { return str.c_str(); }); + + // Print names + if(loggingLevel > 1) { + LOG(info) << "Input Nodes:"; + for (size_t i = 0; i < mInputNames.size(); i++) { + LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); + } + + LOG(info) << "Output Nodes:"; + for (size_t i = 0; i < mOutputNames.size(); i++) { + LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); + } + } +} + +void OrtModel::resetSession() { + (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); +} + +template +std::vector OrtModel::v2v(std::vector& input, bool clearInput) { + if constexpr (std::is_same_v){ + return input; + } else { + std::vector output(input.size()); + std::transform(std::begin(input), std::end(input), std::begin(output), [](I f) { return O(f); }); + if(clearInput) input.clear(); + return output; + } +} + +template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector& input){ + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(input)).data(), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template // class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector>& input){ + std::vector inputTensor; + for(auto i : input){ + std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(i)).data(), i.size(), inputShape.data(), inputShape.size())); + } + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +std::string OrtModel::printShape(const std::vector& v) +{ + std::stringstream ss(""); + for (size_t i = 0; i < v.size() - 1; i++) + ss << v[i] << "x"; + ss << v[v.size() - 1]; + return ss.str(); +} + +template <> std::vector OrtModel::inference(std::vector& input) { + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + float* outputValues = outputTensors[0].template GetTensorMutableData(); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template <> std::vector OrtModel::inference(std::vector& input) { + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + float* outputValues = outputTensors[0].template GetTensorMutableData(); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template <> std::vector OrtModel::inference(std::vector& input) { + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template <> std::vector OrtModel::inference(std::vector& input) { + 
std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template <> std::vector OrtModel::inference(std::vector>& input) { + std::vector inputTensor; + for(auto i : input){ + std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); + } + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +} // namespace ml + +} // namespace o2 \ No newline at end of file From 9232328476bbafb06cc660c2f122d81b67da9d73 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Mon, 18 Nov 2024 18:48:18 +0000 Subject: [PATCH 17/23] Please consider the following formatting changes --- Common/ML/include/ML/ort_interface.h | 76 ++++++++++++------------ Common/ML/src/ort_interface.cxx | 88 ++++++++++++++++------------ 2 files changed, 89 insertions(+), 75 deletions(-) diff --git a/Common/ML/include/ML/ort_interface.h b/Common/ML/include/ML/ort_interface.h index a365860db3279..2fe9a44a0623c 100644 --- a/Common/ML/include/ML/ort_interface.h +++ b/Common/ML/include/ML/ort_interface.h @@ -35,60 +35,58 @@ namespace ml class OrtModel { - public: - // Constructor - OrtModel() = default; - OrtModel(std::unordered_map optionsMap){ reset(optionsMap); } - void init(std::unordered_map optionsMap){ reset(optionsMap); } - void reset(std::unordered_map); + public: + // Constructor + OrtModel() = default; + OrtModel(std::unordered_map optionsMap) { reset(optionsMap); } + void init(std::unordered_map optionsMap) { reset(optionsMap); } + void reset(std::unordered_map); - virtual ~OrtModel() = default; + virtual ~OrtModel() = default; - // Conversion - template - std::vector v2v(std::vector&, bool = true); + // Conversion + template + std::vector v2v(std::vector&, bool = true); - // Inferencing - template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h - std::vector inference(std::vector&); + // Inferencing + template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h + std::vector inference(std::vector&); - template // class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h - std::vector inference(std::vector>&); + template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h + std::vector inference(std::vector>&); - // template // class I is the input data type, e.g. float, class T the throughput data type and class O is the output data type - // std::vector inference(std::vector&); + // template // class I is the input data type, e.g. float, class T the throughput data type and class O is the output data type + // std::vector inference(std::vector&); - // Reset session - void resetSession(); + // Reset session + void resetSession(); - std::vector> getNumInputNodes() const { return mInputShapes; } - std::vector> getNumOutputNodes() const { return mOutputShapes; } - std::vector getInputNames() const { return mInputNames; } - std::vector getOutputNames() const { return mOutputNames; } + std::vector> getNumInputNodes() const { return mInputShapes; } + std::vector> getNumOutputNodes() const { return mOutputShapes; } + std::vector getInputNames() const { return mInputNames; } + std::vector getOutputNames() const { return mOutputNames; } - void setActiveThreads(int threads) { intraOpNumThreads = threads; } + void setActiveThreads(int threads) { intraOpNumThreads = threads; } - private: + private: + // ORT variables -> need to be hidden as Pimpl + struct OrtVariables; + OrtVariables* pImplOrt; - // ORT variables -> need to be hidden as Pimpl - struct OrtVariables; - OrtVariables* pImplOrt; + // Input & Output specifications of the loaded network + std::vector inputNamesChar, outputNamesChar; + std::vector mInputNames, mOutputNames; + std::vector> mInputShapes, mOutputShapes; - // Input & Output specifications of the loaded network - std::vector inputNamesChar, outputNamesChar; - std::vector mInputNames, mOutputNames; - std::vector> mInputShapes, mOutputShapes; - - // Environment settings - std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda - int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; - - std::string printShape(const std::vector&); + // Environment settings + std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda + int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; + std::string printShape(const std::vector&); }; } // namespace ml -} // namespace ml +} // namespace o2 #endif // O2_ML_ORT_INTERFACE_H \ No newline at end of file diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx index 84a06ce1da068..8ebe0588b4a2b 100644 --- a/Common/ML/src/ort_interface.cxx +++ b/Common/ML/src/ort_interface.cxx @@ -25,7 +25,7 @@ namespace o2 namespace ml { -struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file +struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file // ORT runtime objects Ort::RunOptions runOptions; std::shared_ptr env = nullptr; @@ -35,12 +35,13 @@ struct OrtModel::OrtVariables { // The actual implementation is hidden in the . 
Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); }; -void OrtModel::reset(std::unordered_map optionsMap){ +void OrtModel::reset(std::unordered_map optionsMap) +{ pImplOrt = new OrtVariables(); // Load from options map - if(!optionsMap.contains("model-path")){ + if (!optionsMap.contains("model-path")) { LOG(fatal) << "(ORT) Model path cannot be empty!"; } modelPath = optionsMap["model-path"]; @@ -48,42 +49,42 @@ void OrtModel::reset(std::unordered_map optionsMap){ dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); - intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); + intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 0); enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); std::string dev_mem_str = "Hip"; #ifdef ORT_ROCM_BUILD - if(device == "ROCM") { + if (device == "ROCM") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) ROCM execution provider set"; } #endif #ifdef ORT_MIGRAPHX_BUILD - if(device == "MIGRAPHX") { + if (device == "MIGRAPHX") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) MIGraphX execution provider set"; } #endif #ifdef ORT_CUDA_BUILD - if(device == "CUDA") { + if (device == "CUDA") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) CUDA execution provider set"; dev_mem_str = "Cuda"; } #endif - if(allocateDeviceMemory){ + if (allocateDeviceMemory) { pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); LOG(info) << "(ORT) Memory info set to on-device memory"; } - if(device == "CPU") { + if (device == "CPU") { (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); - if(intraOpNumThreads > 1){ + if (intraOpNumThreads > 1) { (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); - } else if(intraOpNumThreads == 1){ + } else if (intraOpNumThreads == 1) { (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); } LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " threads"; @@ -92,8 +93,8 @@ void OrtModel::reset(std::unordered_map optionsMap){ (pImplOrt->sessionOptions).DisableMemPattern(); (pImplOrt->sessionOptions).DisableCpuMemArena(); - if(enableProfiling){ - if(optionsMap.contains("profiling-output-path")){ + if (enableProfiling) { + if (optionsMap.contains("profiling-output-path")) { (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); } else { LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. 
Disabling profiling for now."; @@ -109,27 +110,27 @@ void OrtModel::reset(std::unordered_map optionsMap){ (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); + mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); } for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); } for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); + mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); } for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); } inputNamesChar.resize(mInputNames.size(), nullptr); std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), - [&](const std::string& str) { return str.c_str(); }); + [&](const std::string& str) { return str.c_str(); }); outputNamesChar.resize(mOutputNames.size(), nullptr); std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), - [&](const std::string& str) { return str.c_str(); }); + [&](const std::string& str) { return str.c_str(); }); // Print names - if(loggingLevel > 1) { + if (loggingLevel > 1) { LOG(info) << "Input Nodes:"; for (size_t i = 0; i < mInputNames.size(); i++) { LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); @@ -142,24 +143,28 @@ void OrtModel::reset(std::unordered_map optionsMap){ } } -void OrtModel::resetSession() { +void OrtModel::resetSession() +{ (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); } -template -std::vector OrtModel::v2v(std::vector& input, bool clearInput) { - if constexpr (std::is_same_v){ +template +std::vector OrtModel::v2v(std::vector& input, bool clearInput) +{ + if constexpr (std::is_same_v) { return input; } else { std::vector output(input.size()); std::transform(std::begin(input), std::end(input), std::begin(output), [](I f) { return O(f); }); - if(clearInput) input.clear(); + if (clearInput) + input.clear(); return output; } } -template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -std::vector OrtModel::inference(std::vector& input){ +template // class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(input)).data(), input.size(), inputShape.data(), inputShape.size())); @@ -171,10 +176,11 @@ std::vector OrtModel::inference(std::vector& input){ return outputValuesVec; } -template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -std::vector OrtModel::inference(std::vector>& input){ +template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector>& input) +{ std::vector inputTensor; - for(auto i : input){ + for (auto i : input) { std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(i)).data(), i.size(), inputShape.data(), inputShape.size())); } @@ -195,7 +201,9 @@ std::string OrtModel::printShape(const std::vector& v) return ss.str(); } -template <> std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); @@ -207,7 +215,9 @@ template <> std::vector OrtModel::inference(std::vector std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); @@ -219,7 +229,9 @@ template <> std::vector OrtModel::inference std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); @@ -231,7 +243,9 @@ template <> std::vector OrtModel::inference std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); @@ -243,9 +257,11 @@ template <> std::vector OrtModel::inference std::vector OrtModel::inference(std::vector>& input) { +template <> +std::vector OrtModel::inference(std::vector>& input) +{ std::vector inputTensor; - for(auto i : input){ + for (auto i : input) { std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), 
(int64_t)mInputShapes[0][1]}; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); } From 7251c5cfb30266479d3f8d7df38c733ba65add77 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 19 Nov 2024 09:23:26 +0100 Subject: [PATCH 18/23] Fixing macOS build issues with calling O*.data() --- Common/ML/src/ort_interface.cxx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx index 8ebe0588b4a2b..222dab55e6e6b 100644 --- a/Common/ML/src/ort_interface.cxx +++ b/Common/ML/src/ort_interface.cxx @@ -167,7 +167,7 @@ std::vector OrtModel::inference(std::vector& input) { std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(input)).data(), input.size(), inputShape.data(), inputShape.size())); + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); // input.clear(); auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); @@ -182,7 +182,7 @@ std::vector OrtModel::inference(std::vector>& input) std::vector inputTensor; for (auto i : input) { std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(i)).data(), i.size(), inputShape.data(), inputShape.size())); + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); } // input.clear(); auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); From d0f4dd8271a880c3152cc4e7ae511bb8439aa466 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 19 Nov 2024 20:40:17 +0100 Subject: [PATCH 19/23] Fixing compiler issues and char -> uint8_t --- Common/ML/src/ort_interface.cxx | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx index 222dab55e6e6b..cf60a3369613a 100644 --- a/Common/ML/src/ort_interface.cxx +++ b/Common/ML/src/ort_interface.cxx @@ -107,7 +107,7 @@ void OrtModel::reset(std::unordered_map optionsMap) (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); pImplOrt->env = std::make_shared(OrtLoggingLevel(loggingLevel), (optionsMap["onnx-environment-name"].empty() ? 
"onnx_model_inference" : optionsMap["onnx-environment-name"].c_str())); - (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); + pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); @@ -145,7 +145,7 @@ void OrtModel::reset(std::unordered_map optionsMap) void OrtModel::resetSession() { - (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); + (pImplOrt->session).reset(std::make_shared{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); } template @@ -156,8 +156,9 @@ std::vector OrtModel::v2v(std::vector& input, bool clearInput) } else { std::vector output(input.size()); std::transform(std::begin(input), std::end(input), std::begin(output), [](I f) { return O(f); }); - if (clearInput) + if (clearInput) { input.clear(); + } return output; } } @@ -195,8 +196,9 @@ std::vector OrtModel::inference(std::vector>& input) std::string OrtModel::printShape(const std::vector& v) { std::stringstream ss(""); - for (size_t i = 0; i < v.size() - 1; i++) + for (size_t i = 0; i < v.size() - 1; i++) { ss << v[i] << "x"; + } ss << v[v.size() - 1]; return ss.str(); } From 7859ab25223ec10c475bbbfa4c6b2da09dfcc609 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 19 Nov 2024 21:09:14 +0100 Subject: [PATCH 20/23] Fixing curly braces --- Common/ML/src/ort_interface.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx index cf60a3369613a..feeebe99fa6fa 100644 --- a/Common/ML/src/ort_interface.cxx +++ b/Common/ML/src/ort_interface.cxx @@ -145,7 +145,7 @@ void OrtModel::reset(std::unordered_map optionsMap) void OrtModel::resetSession() { - (pImplOrt->session).reset(std::make_shared{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); + (pImplOrt->session).reset(std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions)); } template From c6cb3e6f2992f9328185c360c1590a412f401575 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 19 Nov 2024 22:29:48 +0100 Subject: [PATCH 21/23] Fixing std::make_shared --- Common/ML/src/ort_interface.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx index feeebe99fa6fa..160fdbadf84e4 100644 --- a/Common/ML/src/ort_interface.cxx +++ b/Common/ML/src/ort_interface.cxx @@ -145,7 +145,7 @@ void OrtModel::reset(std::unordered_map optionsMap) void OrtModel::resetSession() { - (pImplOrt->session).reset(std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions)); + pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); } template From 40bc4371920d9f7b51469d58135d7ee742ea5606 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 20 Nov 2024 10:38:11 +0100 Subject: [PATCH 22/23] Changing order for --- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index bccff6328cb1d..c528f65c3924f 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ 
b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -12,6 +12,8 @@ /// \file GPUChainTrackingClusterizer.cxx /// \author David Rohr +#include + #include "GPUChainTracking.h" #include "GPUChainTrackingDefs.h" #include "GPULogging.h" @@ -37,7 +39,6 @@ #endif #include "utils/strtag.h" -#include #ifndef GPUCA_NO_VC #include From 52b033f0c9594fc5238c986037c3dc9645a04841 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 17 Dec 2024 22:46:16 +0100 Subject: [PATCH 23/23] Bug-fixing file name --- GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index fd420357073e9..af5315ddae4ac 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -19,7 +19,7 @@ #include "GPUProcessor.h" #include "GPUDataTypes.h" #include "CfFragment.h" -#include "ML/ort_interface.h" +#include "ML/OrtInterface.h" #include "ML/3rdparty/GPUORTFloat16.h" using namespace o2::ml;
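
Editor's note (not part of the patch series): none of the patches above shows a
call site for OrtModel, so a minimal usage sketch follows. Its assumptions are
flagged explicitly: the option keys are the ones parsed in OrtModel::reset()
("model-path", "device", "intra-op-num-threads", "logging-level"), the network
has a single [N, F] input tensor and a single output, "model.onnx" is a
placeholder path, and the float-to-float specialization of inference() is used.

    #include "ML/OrtInterface.h" // named ML/ort_interface.h before PATCH 23
    #include <vector>

    using namespace o2::ml;

    int main()
    {
      // Options are passed as strings and parsed in OrtModel::reset();
      // note that reset() matches the upper-case device spelling "CPU".
      OrtModel model({{"model-path", "model.onnx"},  // placeholder path
                      {"device", "CPU"},
                      {"intra-op-num-threads", "1"}, // sequential execution mode
                      {"logging-level", "2"}});      // > 1 also prints the I/O shapes

      // One input row of zeros; F is the second dimension of the first input shape.
      std::vector<float> input(model.getNumInputNodes()[0][1], 0.f);

      // float in, float out; other specializations cover OrtDataType::Float16_t.
      std::vector<float> output = model.inference<float, float>(input);
      return output.empty() ? 1 : 0;
    }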