uxlfoundation · akukanov · Nov 28, 2024 · Nov 25, 2024 · Nov 25, 2024 · Nov 25, 2024
diff --git a/doc/main/tbb_userguide/Advanced_Topic_Other_Kinds_of_Iteration_Spaces.rst b/doc/main/tbb_userguide/Advanced_Topic_Other_Kinds_of_Iteration_Spaces.rst
@@ -72,4 +72,40 @@ along its longest axis. When used with ``parallel_for``, it causes the
 loop to be "recursively blocked" in a way that improves cache usage.
 This nice cache behavior means that using ``parallel_for`` over a
 ``blocked_range2d<T>`` can make a loop run faster than the sequential
-equivalent, even on a single processor.
+equivalent, even on a single processor. 
+
+The ``blocked_range2d`` class template allows you to use different value
+types for its first dimension, *rows*, and its second dimension, *columns*.
+That means you can combine indexes, pointers, and iterators into a joint
+iteration space. Use the methods ``rows()`` and ``cols()`` to obtain
+``blocked_range`` objects that represent the respective dimensions.
+
+The ``blocked_range3d`` class template extends this approach to 3D by adding
+``pages()`` as the first dimension, followed by ``rows()`` and ``cols()``.
+
+The ``blocked_nd_range<T,N>`` class template represents a blocked iteration
+space of any dimensionality. Unlike the previously described 2D and 3D ranges,
+``blocked_nd_range`` uses the same value type for all its axes, and its
+constructor requires you to pass N instances of ``blocked_range<T>`` instead of
+individual boundary values. The change in the naming pattern reflects these
+differences.
+
+
+Example of a Multidimensional Iteration Space
+------------------------------------------------
+
+The example demonstrates the calculation of a 3-dimensional filter over a
+pack of feature maps.
+
+The ``convolution3d`` function iterates over the output cells, assigning to
+each cell the result of the ``kernel3d`` function that combines the values
+from a range in the feature maps.
+
+To run the computation in parallel, ``tbb::parallel_for`` is called with
+``tbb::blocked_nd_range<int,3>`` as an argument. The body function processes
+the received 3D subrange in nested loops, using the method ``dim`` to get
+the loop boundaries for each dimension.
+
+
+.. literalinclude:: ./snippets/blocked_nd_range_example.h
+   :language: c++
diff --git a/doc/main/tbb_userguide/parallel_for_os.rst b/doc/main/tbb_userguide/parallel_for_os.rst
@@ -55,8 +55,9 @@ before each identifier. The rest of the examples assume that such a
 Note the argument to ``operator()``. A ``blocked_range<T>`` is a
 template class provided by the library. It describes a one-dimensional
 iteration space over type ``T``. Class ``parallel_for`` works with other
-kinds of iteration spaces too. The library provides ``blocked_range2d``
-for two-dimensional spaces. You can define your own spaces as explained
+kinds of iteration spaces too. The library provides ``blocked_range2d``,
+``blocked_range3d``, and ``blocked_nd_range`` for multidimensional spaces.
+You can define your own spaces as explained
 in :ref:`Advanced_Topic_Other_Kinds_of_Iteration_Spaces`.
 
 

diff --git a/doc/main/tbb_userguide/snippets/blocked_nd_range_example.cpp b/doc/main/tbb_userguide/snippets/blocked_nd_range_example.cpp
@@ -0,0 +1,37 @@
+#include "blocked_nd_range_example.h"
+#include <vector>
+#include <cassert>
+
+// Exercises convolution3d on a pack of feature maps filled with ones and
+// verifies that every output cell equals the number of cells covered by the
+// kernel. Returns 0; a mismatch triggers the assert.
+int main() {
+    using matrix_t = std::vector<std::vector<float>>;
+    using tensor_t = std::vector<matrix_t>;
+
+    const int kernel_length = 9;
+    const int kernel_width = 5;
+    const int kernel_height = 5;
+
+    const int feature_maps_length = 128;
+    const int feature_maps_width = 16;
+    const int feature_maps_height = 16;
+
+    // The kernel must fit entirely inside the feature maps, so each output
+    // extent is the input extent minus the kernel extent plus one.
+    const int out_length = feature_maps_length - kernel_length + 1;
+    const int out_width = feature_maps_width - kernel_width + 1;
+    const int out_height = feature_maps_height - kernel_height + 1;
+
+    // Feature maps hold 1 in each cell; the output starts filled with zeros.
+    tensor_t feature_maps(feature_maps_length,
+                          matrix_t(feature_maps_width, std::vector<float>(feature_maps_height, 1.0f)));
+    tensor_t out(out_length,
+                 matrix_t(out_width, std::vector<float>(out_height, 0.f)));
+
+    // 3D convolution calculates the sum of all elements in the kernel
+    convolution3d(feature_maps, out,
+                  out_length, out_width, out_height,
+                  kernel_length, kernel_width, kernel_height);
+
+    // Checks correctness of convolution by equality to the expected sum of elements.
+    // The sum of 9 * 5 * 5 ones is a small integer, exactly representable in
+    // float, so an exact comparison is appropriate here.
+    const float expected = float(kernel_length * kernel_height * kernel_width);
+    // Iterate by const reference: iterating by value would copy every nested vector.
+    for (const auto& plane : out) {
+        for (const auto& row : plane) {
+            for (const float cell : row) {
+                assert(cell == expected && "convolution failed to calculate correctly");
+            }
+        }
+    }
+    return 0;
+}
diff --git a/doc/main/tbb_userguide/snippets/blocked_nd_range_example.h b/doc/main/tbb_userguide/snippets/blocked_nd_range_example.h
@@ -0,0 +1,37 @@
+#include "oneapi/tbb/blocked_nd_range.h"
+#include "oneapi/tbb/parallel_for.h"
+
+// Sums the values of feature_maps inside the box
+//   [i, i + kernel_length) x [j, j + kernel_width) x [k, k + kernel_height).
+//
+// feature_maps  - container indexed as feature_maps[a][b][c]; only read.
+// i, j, k       - coordinates of the box corner with the smallest indexes.
+// kernel_length - box extent along the first index.
+// kernel_width  - box extent along the second index.
+// kernel_height - box extent along the third index.
+//
+// Returns the accumulated sum as float (0 when any extent is not positive).
+// No bounds checking is done: the caller must ensure the box lies inside
+// feature_maps.
+template<typename Features>
+float kernel3d(const Features& feature_maps, int i, int j, int k,
+               int kernel_length, int kernel_width, int kernel_height) {
+    float result = 0.f;
+
+    for (int feature_i = i; feature_i < i + kernel_length; ++feature_i)
+        for (int feature_j = j; feature_j < j + kernel_width; ++feature_j)
+            // The third axis is bounded by kernel_height, not kernel_width.
+            for (int feature_k = k; feature_k < k + kernel_height; ++feature_k)
+                result += feature_maps[feature_i][feature_j][feature_k];
+
+    return result;
+}
+
+// Fills the output with the result of a 3D filter: every cell out[i][j][k]
+// is assigned kernel3d(feature_maps, i, j, k, kernel_length, kernel_width,
+// kernel_height). The work is distributed with oneapi::tbb::parallel_for over
+// a 3-dimensional blocked_nd_range.
+//
+// feature_maps - input indexed as feature_maps[i][j][k]; only read.
+// out          - output indexed as out[i][j][k]; written for
+//                0 <= i < out_length, 0 <= j < out_width, 0 <= k < out_height.
+// out_length, out_width, out_height - extents of the output iteration space.
+// kernel_length, kernel_width, kernel_height - kernel extents, forwarded
+//                unchanged to kernel3d.
+//
+// No bounds checking is done here; the caller must size feature_maps and out
+// consistently with the extents it passes.
+template<typename Features, typename Output>
+void convolution3d(const Features& feature_maps, Output& out,
+                   int out_length, int out_width, int out_height,
+                   int kernel_length, int kernel_width, int kernel_height) {
+    using range_t = oneapi::tbb::blocked_nd_range<int, 3>;
+
+    oneapi::tbb::parallel_for(
+        range_t({0, out_length}, {0, out_width}, {0, out_height}),
+        [&](const range_t& out_range) {
+            // The body receives a 3D subrange; dim(n) gives the loop
+            // boundaries of that subrange along axis n.
+            const auto out_x = out_range.dim(0);
+            const auto out_y = out_range.dim(1);
+            const auto out_z = out_range.dim(2);
+
+            for (int i = out_x.begin(); i < out_x.end(); ++i)
+                for (int j = out_y.begin(); j < out_y.end(); ++j)
+                    for (int k = out_z.begin(); k < out_z.end(); ++k)
+                        out[i][j][k] = kernel3d(feature_maps, i, j, k,
+                                                kernel_length, kernel_width, kernel_height);
+        }
+    );
+}