Merge pull request #73 from lm-konda/kernel_transform
Improve kernel transform
ruimashita authored Feb 8, 2019
2 parents 94488d9 + d79172d commit 9f3b5d8
Showing 8 changed files with 122 additions and 71 deletions.
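In brief: this change moves the NHWC to HWNoCNi kernel reordering out of the C++ FPGA runtime and into the Python compiler. A new _transpose_kernels pass in optimizer.py computes the reordered kernel words once at compile time, Constant carries them in a new transposed_data attribute, and input.tpl.cpp emits them under #if defined(RUN_ON_FPGA), which lets the kn2row runtime drop its per-inference kernel_transform_NHWC_to_HWNoCNi call and pass the (now const-qualified) kernel buffer straight to qconv_with_kn2row.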
4 changes: 2 additions & 2 deletions dlk/backends/include/fpga_utils.h
@@ -97,7 +97,7 @@ class MappedMem
 
 
   template<typename T>
-  memtype Write(T *data, unsigned int size)
+  memtype Write(const T *data, unsigned int size)
   {
    T *mem_ptr = (T *) mem;
    for(unsigned int i = 0; i < size; i++) {
@@ -107,7 +107,7 @@ class MappedMem
 
 
   template<typename T>
-  bool Check(T *data, unsigned int size)
+  bool Check(const T *data, unsigned int size)
   {
    bool success = true;
    T *mem_ptr = (T *) mem;
9 changes: 8 additions & 1 deletion dlk/python/dlk/core/operators.py
@@ -560,14 +560,16 @@ def __init__(self,
                  data: np.ndarray,
                  dimension_format: str = 'NHWC',
                  packed: bool = False,
-                 actual_shape: List[int] = []) -> None:
+                 actual_shape: List[int] = [],
+                 transposed_data: List[int] = None) -> None:
         """Init the variable.
         If the constant is hard quantized, data is packed and the actual shape
         must be expressed with `actual_shape`.
         """
         shape = list(data.shape) if not packed else actual_shape
         self._packed = packed
+        self._transposed_data = transposed_data
         super().__init__(name, shape, dtype, {}, data, dimension_format=dimension_format)
 
     def run_forward(self) -> np.ndarray:
@@ -577,6 +579,11 @@ def run_forward(self) -> np.ndarray:
     def is_packed(self) -> bool:
         return self._packed
 
+    @property
+    def transposed_data(self) -> List[int]:
+        """Return transposed data."""
+        return self._transposed_data
+
 
 class Output(Variable):
     """Output class."""
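For illustration, a minimal sketch of how the new keyword argument is meant to be used, mirroring the call site in optimizer.py's pass_pack_weights further down. The shapes and values here are made up, and the imports assume dlk's core package is on the path, as in the optimizer.py diff below:

    import numpy as np
    from core.data_types import Uint32
    from core.operators import Constant

    packed_words = np.zeros((18, 1), dtype=np.uint32)   # packer output (made-up size)
    kernel = Constant(
        'conv1_kernel_new',
        Uint32(),
        packed_words,
        packed=True,
        actual_shape=[16, 3, 3, 32],       # the unpacked NHWC kernel shape
        transposed_data=[0] * 144,         # precomputed HWNoCNi words (illustrative)
    )
    assert kernel.transposed_data == [0] * 144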
86 changes: 83 additions & 3 deletions dlk/python/dlk/core/optimizer.py
@@ -19,13 +19,85 @@
 
 from core.graph import Graph
 from core.graph_pattern_matching import get_nodes_in_branch, sort_graph
-from core.operators import Constant, Operator
+from core.operators import Constant, Operator, Conv
 from core.data_types import Uint32, QUANTIZED_NOT_PACKED
-from typing import cast
+from typing import cast, List, Any
 from collections import defaultdict
 from modules.packer import Packer

+def _transpose_kernels(kernel_data: np.ndarray,
+                       oh: int,
+                       ow: int,
+                       od: int,
+                       kh: int,
+                       kw: int,
+                       kd: int) -> List[int]:
+    """Calculates and prepares the transposed kernel data in advance.
+    Parameters
+    ----------
+    kernel_data : np.ndarray
+        The input data.
+    oh : int
+        output height
+    ow : int
+        output width
+    od : int
+        output depth
+    kh : int
+        kernel height
+    kw : int
+        kernel width
+    kd : int
+        kernel depth
+    """
+    NUM_PE = 16
+    NBIT_QDYPE = 32
+    MAX_NBIT_QINPUT = 2
+    MAX_NBIT_KERNEL = 1
+    num_qinput_per_qword = int(NBIT_QDYPE / MAX_NBIT_QINPUT)
+    num_qkernel_per_qword = int(NBIT_QDYPE / MAX_NBIT_KERNEL)
+    k_c_by_word = int((kd + (num_qkernel_per_qword - 1)) / num_qkernel_per_qword)
+    k_n_aligned_with_num_pe = int(((od + (NUM_PE - 1)) / NUM_PE) * NUM_PE)
+    if od < NUM_PE:
+        k_size = k_n_aligned_with_num_pe * kh * kw * k_c_by_word
+    else:
+        k_size = od * kh * kw * k_c_by_word
+
+    flatten_value = []
+    for elem in kernel_data:
+        flatten_value.extend(elem)
+    while len(flatten_value) != k_size:
+        flatten_value.extend("0")
+
+    copy_value = [0] * k_size
+    for i in range(od * kh * kw * k_c_by_word):
+        copy_value[i] = flatten_value[i]
+
+    transposed_values = [0] * k_size
+    if (od < NUM_PE):
+        kn_out = int(k_n_aligned_with_num_pe / NUM_PE)
+    else:
+        kn_out = int(od / NUM_PE)
+    idx_src = 0
+
+    for no in range(kn_out):
+        for ni in range(NUM_PE):
+            for h in range(kh):
+                for w in range(kw):
+                    for c in range(k_c_by_word):
+                        idx_dst = h * (kw * kn_out * k_c_by_word * NUM_PE)
+                        idx_dst += w * (kn_out * k_c_by_word * NUM_PE)
+                        idx_dst += no * (k_c_by_word * NUM_PE)
+                        idx_dst += c * (NUM_PE)
+                        idx_dst += ni
+                        transposed_values[idx_dst] = copy_value[idx_src]
+                        idx_src += 1
+
+    return transposed_values


 def pass_remove_identities(graph: Graph) -> None:
     """Removes those nodes of a Graph that satisfies the condition node.op_type() == Identity.
@@ -173,6 +245,7 @@ def pass_propagate_quantization_details_into_conv(graph: Graph) -> None:
     graph : Graph
         The input graph. It will be modified in-place.
     """
+
     exec_list = sort_graph(graph)
     qtypes = [
         'QTZ_binary_mean_scaling',
@@ -364,12 +437,19 @@ def pass_pack_weights(graph: Graph) -> None:
         data = packer.run(op_data.astype(np.float32), weight_quantizer.dimension)
 
         # Create the new constant with the quantized weights
+        oh = conv_node.height
+        ow = conv_node.width
+        od = conv_node.channel
+        kh = conv_node.kernel_height
+        kw = conv_node.kernel_width
+        kd = conv_node.input_ops['X'].channel
         quantized_constant = Constant(
             weight_quantizer.name + '_new',
             Uint32(),
             data,
             packed=True,
-            actual_shape=weight_quantizer.shape
+            actual_shape=weight_quantizer.shape,
+            transposed_data=_transpose_kernels(data, oh, ow, od, kh, kw, kd)
         )
 
         # get nodes to be removed after being disconnected
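As a sanity check on the index arithmetic in _transpose_kernels above, here is a small self-contained sketch (toy sizes, no channel padding, NUM_PE scaled down to 4 for readability; none of these names are part of dlk) showing that the loop nest realizes a plain N(o,i)HWC to HWNoCNi reshape and axis permutation:

    import numpy as np

    NUM_PE = 4                             # the real code uses 16
    od, kh, kw, k_c_by_word = 8, 2, 2, 3   # od already a multiple of NUM_PE
    kn_out = od // NUM_PE

    flat = np.arange(od * kh * kw * k_c_by_word)   # packed kernel words, NHWC order
    transposed = np.zeros_like(flat)

    idx_src = 0
    for no in range(kn_out):
        for ni in range(NUM_PE):
            for h in range(kh):
                for w in range(kw):
                    for c in range(k_c_by_word):
                        # same index formula as _transpose_kernels, folded into one line
                        idx_dst = (((h * kw + w) * kn_out + no) * k_c_by_word + c) * NUM_PE + ni
                        transposed[idx_dst] = flat[idx_src]
                        idx_src += 1

    # The same mapping as a reshape plus axis permutation:
    # source axes (no, ni, h, w, c) -> destination axes (h, w, no, c, ni)
    expected = (flat.reshape(kn_out, NUM_PE, kh, kw, k_c_by_word)
                    .transpose(2, 3, 0, 4, 1)
                    .ravel())
    assert (transposed == expected).all()

Placing ni on the fastest-moving destination axis presumably lets the NUM_PE processing elements fetch their per-PE kernel words from consecutive positions in one burst.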
2 changes: 0 additions & 2 deletions dlk/python/dlk/core/view.py
@@ -108,7 +108,6 @@ def run(self):
         nbit_qinput = 8 if x_op.op_type == 'Input' else 2
 
         if op.is_quantized and nbit_qinput == 2:
-            qconv_idx = 0 # temporary
             qk_elems = w_op.data.shape[1]
 
             kh = self.op.kernel_height
@@ -157,7 +156,6 @@ def run(self):
             binConv2D_struct.bin_kernel_ndata = {qk_elems};
             binConv2D_struct.bin_input_nwords = {qk_elems};
             binConv2D_struct.bin_input_ndata = {qk_elems}*{nbit_qinput};
-            binConv2D_struct.layer_index = {qconv_idx};
             binConv2D_struct.device_input_buf = device_input_buf;
             binConv2D_struct.device_output_buf = device_output_buf;
             binConv2D_struct.thresholds = {threshold};
2 changes: 1 addition & 1 deletion dlk/python/dlk/templates/include/de10_nano.h
@@ -126,7 +126,7 @@ class QconvWithKn2row {
 };
 
 void qconv_with_kn2row(unsigned long input_addr, unsigned long output_addr,
-                       T_UINT k_data_packed[], BIN_CONV_OUTPUT th_data[],
+                       const T_UINT k_data_packed[], BIN_CONV_OUTPUT th_data[],
                        unsigned in_w, unsigned in_h, unsigned in_c_by_word,
                        unsigned nbits_in_data, unsigned out_w, unsigned out_h,
                        unsigned out_c, unsigned k_w, unsigned k_h, unsigned pad,
4 changes: 2 additions & 2 deletions dlk/python/dlk/templates/include/memdriver.h
@@ -92,7 +92,7 @@ class MappedMem
 
 
   template<typename T>
-  memtype Write(T *data, unsigned int size)
+  memtype Write(const T *data, unsigned int size)
   {
    T *mem_ptr = (T *) mem;
    for(unsigned int i = 0; i < size; i++)
@@ -101,7 +101,7 @@ class MappedMem
 
 
   template<typename T>
-  bool Check(T *data, unsigned int size)
+  bool Check(const T *data, unsigned int size)
   {
    bool success = true;
    T *mem_ptr = (T *) mem;
20 changes: 20 additions & 0 deletions dlk/python/dlk/templates/manual/consts/input.tpl.cpp
@@ -22,10 +22,30 @@ limitations under the License.
 
 {% else -%}
 
+{% if node.transposed_data %}
+
+#if defined(RUN_ON_FPGA)
+{{ node.dtype.cpptype() }} {{ node.name }}[] = {
+{% for d in node.transposed_data -%}
+{{- d -}},
+{%- endfor %}
+};
+#else
+{{ node.dtype.cpptype() }} {{ node.name }}[] = {
+{% for d in node.data.flatten() -%}
+{{- d -}},
+{%- endfor %}
+};
+#endif
+
+{% else -%}
+
 {{ node.dtype.cpptype() }} {{ node.name }}[] = {
 {% for d in node.data.flatten() -%}
 {{- d -}},
 {%- endfor %}
 };
 
+{% endif %}
+
 {%- endif %}
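To see what the new branch above produces, here is a hypothetical rendering sketch with a stubbed-out node (deliberately simplified template text; jinja2 is assumed available, and cpptype() is replaced by a plain string for brevity):

    from types import SimpleNamespace
    from jinja2 import Template

    tpl = Template(
        "{% if node.transposed_data %}"
        "#if defined(RUN_ON_FPGA)\n"
        "{{ dtype }} {{ node.name }}[] = {"
        "{% for d in node.transposed_data %}{{ d }},{% endfor %}};\n"
        "#else\n"
        "{{ dtype }} {{ node.name }}[] = {"
        "{% for d in node.data %}{{ d }},{% endfor %}};\n"
        "#endif"
        "{% endif %}"
    )

    node = SimpleNamespace(name="conv1_kernel",
                           data=[1, 2, 3, 4],             # flattened original words
                           transposed_data=[3, 1, 4, 2])  # precomputed HWNoCNi words
    print(tpl.render(node=node, dtype="uint32_t"))
    # #if defined(RUN_ON_FPGA)
    # uint32_t conv1_kernel[] = {3,1,4,2,};
    # #else
    # uint32_t conv1_kernel[] = {1,2,3,4,};
    # #endif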
@@ -16,46 +16,18 @@ limitations under the License.
 #include <cassert>
 #include <cstdio>
 
 #include "network.h"
 #include "global.h"
 #include "de10_nano.h"
 #include "func/impl/quantized_conv2d_kn2row.h"
 #include "pack_input_to_qwords.h"
 #include "time_measurement.h"
 
 namespace {
-
-const unsigned int in_nbits = 2;
-const unsigned int byte_nbits = 8;
-
-void kernel_transform_NHWC_to_HWNoCNi(
-    const T_UINT src[],
-    T_UINT dst[],
-    const unsigned kn,
-    const unsigned kh,
-    const unsigned kw,
-    const unsigned kc,
-    const unsigned kn_in)
-{
-  unsigned idx_src = 0;
-  const unsigned kn_out = kn / kn_in;
-
-  for (unsigned no = 0; no < kn_out; no++)
-    for (unsigned ni = 0; ni < kn_in; ni++)
-      for (unsigned h = 0; h < kh; h++)
-        for (unsigned w = 0; w < kw; w++)
-          for (unsigned c = 0; c < kc; c++)
-          {
-            unsigned idx_dst = h * (kw * kn_out * kc * kn_in);
-            idx_dst += w * (kn_out * kc * kn_in);
-            idx_dst += no * (kc * kn_in);
-            idx_dst += c * (kn_in);
-            idx_dst += ni;
-            dst[idx_dst] = src[idx_src++];
-          }
+const unsigned int in_nbits = 2;
+const unsigned int byte_nbits = 8;
 }
 
-} // namespace
 
 namespace dlk {
 
 namespace impl {
@@ -118,24 +90,7 @@ void QuantizedConv2DKn2Row(QUANTIZED_NOT_PACKED input[],

   if (out_c_less_than_num_pe) {
 
-    const T_UINT k_n_aligend_with_num_pe =
-        ((k_n + (NUM_PE - 1)) / NUM_PE) * NUM_PE;
-    const T_UINT out_c_aligend_with_num_pe = k_n_aligend_with_num_pe;
-    const T_UINT k_size = k_n_aligend_with_num_pe * k_h * k_w * k_c_by_word;
-
-    T_UINT kernel_hwnocni[k_size];
-    T_UINT kernel_filled_extra[k_size];
-
-    for (size_t k = 0; k < k_n * k_h * k_w * k_c_by_word; k++) {
-      kernel_filled_extra[k] = kernel[k];
-    }
-
-    Measurement::Start("Kernel transpose NHWC to HWNoCNi");
-    kernel_transform_NHWC_to_HWNoCNi(kernel_filled_extra, kernel_hwnocni,
-                                     k_n_aligend_with_num_pe, k_h, k_w,
-                                     k_c_by_word, NUM_PE);
-    Measurement::Stop();
-
+    const T_UINT out_c_aligend_with_num_pe = ((k_n + (NUM_PE - 1)) / NUM_PE) * NUM_PE;
     T_UINT input_byte_size =
         (cp.input_height * cp.input_width * cp.kernel_depth * in_nbits) /
         byte_nbits;
@@ -153,7 +108,7 @@

     Measurement::Start("QConv2D with kn2row");
     de10_nano::qconv_with_kn2row(
-        p.device_input_phys_addr, p.device_output_phys_addr, kernel_hwnocni,
+        p.device_input_phys_addr, p.device_output_phys_addr, kernel,
         p.thresholds, in_w, in_h, in_c_by_word, MAX_NBIT_QINPUT, out_w, out_h,
         out_c_aligend_with_num_pe, k_w, k_h, cp.padding,
         cp.stride_along_height);
@@ -176,15 +131,6 @@ void QuantizedConv2DKn2Row(QUANTIZED_NOT_PACKED input[],
     }
 
   } else {
-
-    const T_UINT k_size = k_n * k_h * k_w * k_c_by_word;
-    T_UINT kernel_hwnocni[k_size];
-
-    Measurement::Start("Kernel transpose NHWC to HWNoCNi");
-    kernel_transform_NHWC_to_HWNoCNi(kernel, kernel_hwnocni, k_n, k_h, k_w,
-                                     k_c_by_word, NUM_PE);
-    Measurement::Stop();
-
     T_UINT input_byte_size =
         (cp.input_height * cp.input_width * cp.kernel_depth * in_nbits) /
         byte_nbits;
@@ -199,7 +145,7 @@

     Measurement::Start("QConv2D with kn2row");
     de10_nano::qconv_with_kn2row(
-        p.device_input_phys_addr, p.device_output_phys_addr, kernel_hwnocni,
+        p.device_input_phys_addr, p.device_output_phys_addr, kernel,
         p.thresholds, in_w, in_h, in_c_by_word, MAX_NBIT_QINPUT, out_w, out_h,
         out_c, k_w, k_h, cp.padding, cp.stride_along_height);
     Measurement::Stop();
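With the transpose gone, the only kernel-related arithmetic left on this FPGA path is rounding the output-channel count up to the PE width, the ((k_n + (NUM_PE - 1)) / NUM_PE) * NUM_PE expression above. As plain arithmetic (an illustrative sketch, not dlk code; NUM_PE is 16 in this code base):

    NUM_PE = 16

    def align_to_num_pe(k_n: int) -> int:
        """Round k_n up to the next multiple of NUM_PE (ceiling division)."""
        return ((k_n + NUM_PE - 1) // NUM_PE) * NUM_PE

    assert align_to_num_pe(10) == 16   # out_c < NUM_PE: padded up
    assert align_to_num_pe(16) == 16   # already aligned
    assert align_to_num_pe(33) == 48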
