Merge pull request #73 from lm-konda/kernel_transform
Improve kernel transform
ruimashita authored Feb 8, 2019
2 parents 94488d9 + d79172d commit 9f3b5d8
Showing 8 changed files with 122 additions and 71 deletions.
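In brief: this change moves the NHWC to HWNoCNi kernel reordering out of the C++ FPGA runtime and into the Python compiler. A new _transpose_kernels pass in optimizer.py computes the reordered kernel words once at compile time, Constant carries them in a new transposed_data attribute, and input.tpl.cpp emits them under #if defined(RUN_ON_FPGA), which lets the kn2row runtime drop its per-inference kernel_transform_NHWC_to_HWNoCNi call and pass the (now const-qualified) kernel buffer straight to qconv_with_kn2row.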
4 changes: 2 additions & 2 deletions dlk/backends/include/fpga_utils.h
@@ -97,7 +97,7 @@ class MappedMem
 
 
   template<typename T>
-  memtype Write(T *data, unsigned int size)
+  memtype Write(const T *data, unsigned int size)
   {
    T *mem_ptr = (T *) mem;
    for(unsigned int i = 0; i < size; i++) {
@@ -107,7 +107,7 @@ class MappedMem
 
 
   template<typename T>
-  bool Check(T *data, unsigned int size)
+  bool Check(const T *data, unsigned int size)
   {
    bool success = true;
    T *mem_ptr = (T *) mem;
9 changes: 8 additions & 1 deletion dlk/python/dlk/core/operators.py
@@ -560,14 +560,16 @@ def __init__(self,
                  data: np.ndarray,
                  dimension_format: str = 'NHWC',
                  packed: bool = False,
-                 actual_shape: List[int] = []) -> None:
+                 actual_shape: List[int] = [],
+                 transposed_data: List[int] = None) -> None:
         """Init the variable.
         If the constant is hard quantized, data is packed and the actual shape
         must be expressed with `actual_shape`.
         """
         shape = list(data.shape) if not packed else actual_shape
         self._packed = packed
+        self._transposed_data = transposed_data
         super().__init__(name, shape, dtype, {}, data, dimension_format=dimension_format)
 
     def run_forward(self) -> np.ndarray:
@@ -577,6 +579,11 @@ def run_forward(self) -> np.ndarray:
     def is_packed(self) -> bool:
         return self._packed
 
+    @property
+    def transposed_data(self) -> List[int]:
+        """Return transposed data."""
+        return self._transposed_data
+
 
 class Output(Variable):
     """Output class."""
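For illustration, a minimal sketch of how the new keyword argument is meant to be used, mirroring the call site in optimizer.py's pass_pack_weights further down. The shapes and values here are made up, and the imports assume dlk's core package is on the path, as in the optimizer.py diff below:

    import numpy as np
    from core.data_types import Uint32
    from core.operators import Constant

    packed_words = np.zeros((18, 1), dtype=np.uint32)   # packer output (made-up size)
    kernel = Constant(
        'conv1_kernel_new',
        Uint32(),
        packed_words,
        packed=True,
        actual_shape=[16, 3, 3, 32],       # the unpacked NHWC kernel shape
        transposed_data=[0] * 144,         # precomputed HWNoCNi words (illustrative)
    )
    assert kernel.transposed_data == [0] * 144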
86 changes: 83 additions & 3 deletions dlk/python/dlk/core/optimizer.py
@@ -19,13 +19,85 @@
 
 from core.graph import Graph
 from core.graph_pattern_matching import get_nodes_in_branch, sort_graph
-from core.operators import Constant, Operator
+from core.operators import Constant, Operator, Conv
 from core.data_types import Uint32, QUANTIZED_NOT_PACKED
-from typing import cast
+from typing import cast, List, Any
 from collections import defaultdict
 from modules.packer import Packer

+def _transpose_kernels(kernel_data: np.ndarray,
+                       oh: int,
+                       ow: int,
+                       od: int,
+                       kh: int,
+                       kw: int,
+                       kd: int) -> List[int]:
+    """Calculates and prepares the transposed kernel data in advance.
+    Parameters
+    ----------
+    kernel_data : np.ndarray
+        The input data.
+    oh : int
+        output height
+    ow : int
+        output width
+    od : int
+        output depth
+    kh : int
+        kernel height
+    kw : int
+        kernel width
+    kd : int
+        kernel depth
+    """
+    NUM_PE = 16
+    NBIT_QDYPE = 32
+    MAX_NBIT_QINPUT = 2
+    MAX_NBIT_KERNEL = 1
+    num_qinput_per_qword = int(NBIT_QDYPE / MAX_NBIT_QINPUT)
+    num_qkernel_per_qword = int(NBIT_QDYPE / MAX_NBIT_KERNEL)
+    k_c_by_word = int((kd + (num_qkernel_per_qword - 1)) / num_qkernel_per_qword)
+    k_n_aligned_with_num_pe = int(((od + (NUM_PE - 1)) / NUM_PE) * NUM_PE)
+    if od < NUM_PE:
+        k_size = k_n_aligned_with_num_pe * kh * kw * k_c_by_word
+    else:
+        k_size = od * kh * kw * k_c_by_word
+
+    flatten_value = []
+    for elem in kernel_data:
+        flatten_value.extend(elem)
+    while len(flatten_value) != k_size:
+        flatten_value.extend("0")
+
+    copy_value = [0] * k_size
+    for i in range(od * kh * kw * k_c_by_word):
+        copy_value[i] = flatten_value[i]
+
+    transposed_values = [0] * k_size
+    if (od < NUM_PE):
+        kn_out = int(k_n_aligned_with_num_pe / NUM_PE)
+    else:
+        kn_out = int(od / NUM_PE)
+    idx_src = 0
+
+    for no in range(kn_out):
+        for ni in range(NUM_PE):
+            for h in range(kh):
+                for w in range(kw):
+                    for c in range(k_c_by_word):
+                        idx_dst = h * (kw * kn_out * k_c_by_word * NUM_PE)
+                        idx_dst += w * (kn_out * k_c_by_word * NUM_PE)
+                        idx_dst += no * (k_c_by_word * NUM_PE)
+                        idx_dst += c * (NUM_PE)
+                        idx_dst += ni
+                        transposed_values[idx_dst] = copy_value[idx_src]
+                        idx_src += 1
+
+    return transposed_values


 def pass_remove_identities(graph: Graph) -> None:
     """Removes those nodes of a Graph that satisfies the condition node.op_type() == Identity.
@@ -173,6 +245,7 @@ def pass_propagate_quantization_details_into_conv(graph: Graph) -> None:
     graph : Graph
         The input graph. It will be modified in-place.
     """
+
     exec_list = sort_graph(graph)
     qtypes = [
         'QTZ_binary_mean_scaling',
@@ -364,12 +437,19 @@ def pass_pack_weights(graph: Graph) -> None:
         data = packer.run(op_data.astype(np.float32), weight_quantizer.dimension)
 
         # Create the new constant with the quantized weights
+        oh = conv_node.height
+        ow = conv_node.width
+        od = conv_node.channel
+        kh = conv_node.kernel_height
+        kw = conv_node.kernel_width
+        kd = conv_node.input_ops['X'].channel
         quantized_constant = Constant(
             weight_quantizer.name + '_new',
             Uint32(),
             data,
             packed=True,
-            actual_shape=weight_quantizer.shape
+            actual_shape=weight_quantizer.shape,
+            transposed_data=_transpose_kernels(data, oh, ow, od, kh, kw, kd)
         )
 
         # get nodes to be removed after being disconnected
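As a sanity check on the index arithmetic in _transpose_kernels above, here is a small self-contained sketch (toy sizes, no channel padding, NUM_PE scaled down to 4 for readability; none of these names are part of dlk) showing that the loop nest realizes a plain N(o,i)HWC to HWNoCNi reshape and axis permutation:

    import numpy as np

    NUM_PE = 4                             # the real code uses 16
    od, kh, kw, k_c_by_word = 8, 2, 2, 3   # od already a multiple of NUM_PE
    kn_out = od // NUM_PE

    flat = np.arange(od * kh * kw * k_c_by_word)   # packed kernel words, NHWC order
    transposed = np.zeros_like(flat)

    idx_src = 0
    for no in range(kn_out):
        for ni in range(NUM_PE):
            for h in range(kh):
                for w in range(kw):
                    for c in range(k_c_by_word):
                        # same index formula as _transpose_kernels, folded into one line
                        idx_dst = (((h * kw + w) * kn_out + no) * k_c_by_word + c) * NUM_PE + ni
                        transposed[idx_dst] = flat[idx_src]
                        idx_src += 1

    # The same mapping as a reshape plus axis permutation:
    # source axes (no, ni, h, w, c) -> destination axes (h, w, no, c, ni)
    expected = (flat.reshape(kn_out, NUM_PE, kh, kw, k_c_by_word)
                    .transpose(2, 3, 0, 4, 1)
                    .ravel())
    assert (transposed == expected).all()

Placing ni on the fastest-moving destination axis presumably lets the NUM_PE processing elements fetch their per-PE kernel words from consecutive positions in one burst.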
2 changes: 0 additions & 2 deletions dlk/python/dlk/core/view.py
@@ -108,7 +108,6 @@ def run(self):
         nbit_qinput = 8 if x_op.op_type == 'Input' else 2
 
         if op.is_quantized and nbit_qinput == 2:
-            qconv_idx = 0 # temporary
             qk_elems = w_op.data.shape[1]
 
             kh = self.op.kernel_height
@@ -157,7 +156,6 @@ def run(self):
             binConv2D_struct.bin_kernel_ndata = {qk_elems};
             binConv2D_struct.bin_input_nwords = {qk_elems};
             binConv2D_struct.bin_input_ndata = {qk_elems}*{nbit_qinput};
-            binConv2D_struct.layer_index = {qconv_idx};
             binConv2D_struct.device_input_buf = device_input_buf;
             binConv2D_struct.device_output_buf = device_output_buf;
             binConv2D_struct.thresholds = {threshold};
2 changes: 1 addition & 1 deletion dlk/python/dlk/templates/include/de10_nano.h
@@ -126,7 +126,7 @@ class QconvWithKn2row {
 };
 
 void qconv_with_kn2row(unsigned long input_addr, unsigned long output_addr,
-                       T_UINT k_data_packed[], BIN_CONV_OUTPUT th_data[],
+                       const T_UINT k_data_packed[], BIN_CONV_OUTPUT th_data[],
                        unsigned in_w, unsigned in_h, unsigned in_c_by_word,
                        unsigned nbits_in_data, unsigned out_w, unsigned out_h,
                        unsigned out_c, unsigned k_w, unsigned k_h, unsigned pad,
4 changes: 2 additions & 2 deletions dlk/python/dlk/templates/include/memdriver.h
@@ -92,7 +92,7 @@ class MappedMem
 
 
   template<typename T>
-  memtype Write(T *data, unsigned int size)
+  memtype Write(const T *data, unsigned int size)
   {
    T *mem_ptr = (T *) mem;
    for(unsigned int i = 0; i < size; i++)
@@ -101,7 +101,7 @@ class MappedMem
 
 
   template<typename T>
-  bool Check(T *data, unsigned int size)
+  bool Check(const T *data, unsigned int size)
   {
    bool success = true;
    T *mem_ptr = (T *) mem;
20 changes: 20 additions & 0 deletions dlk/python/dlk/templates/manual/consts/input.tpl.cpp
@@ -22,10 +22,30 @@ limitations under the License.
 
 {% else -%}
 
+{% if node.transposed_data %}
+
+#if defined(RUN_ON_FPGA)
+{{ node.dtype.cpptype() }} {{ node.name }}[] = {
+{% for d in node.transposed_data -%}
+{{- d -}},
+{%- endfor %}
+};
+#else
+{{ node.dtype.cpptype() }} {{ node.name }}[] = {
+{% for d in node.data.flatten() -%}
+{{- d -}},
+{%- endfor %}
+};
+#endif
+
+{% else -%}
+
 {{ node.dtype.cpptype() }} {{ node.name }}[] = {
 {% for d in node.data.flatten() -%}
 {{- d -}},
 {%- endfor %}
 };
 
+{% endif %}
+
 {%- endif %}
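To see what the new branch above produces, here is a hypothetical rendering sketch with a stubbed-out node (deliberately simplified template text; jinja2 is assumed available, and cpptype() is replaced by a plain string for brevity):

    from types import SimpleNamespace
    from jinja2 import Template

    tpl = Template(
        "{% if node.transposed_data %}"
        "#if defined(RUN_ON_FPGA)\n"
        "{{ dtype }} {{ node.name }}[] = {"
        "{% for d in node.transposed_data %}{{ d }},{% endfor %}};\n"
        "#else\n"
        "{{ dtype }} {{ node.name }}[] = {"
        "{% for d in node.data %}{{ d }},{% endfor %}};\n"
        "#endif"
        "{% endif %}"
    )

    node = SimpleNamespace(name="conv1_kernel",
                           data=[1, 2, 3, 4],             # flattened original words
                           transposed_data=[3, 1, 4, 2])  # precomputed HWNoCNi words
    print(tpl.render(node=node, dtype="uint32_t"))
    # #if defined(RUN_ON_FPGA)
    # uint32_t conv1_kernel[] = {3,1,4,2,};
    # #else
    # uint32_t conv1_kernel[] = {1,2,3,4,};
    # #endif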
@@ -16,46 +16,18 @@ limitations under the License.
 #include <cassert>
 #include <cstdio>
 
 #include "network.h"
 #include "global.h"
 #include "de10_nano.h"
 #include "func/impl/quantized_conv2d_kn2row.h"
 #include "pack_input_to_qwords.h"
 #include "time_measurement.h"
 
 namespace {
-
-const unsigned int in_nbits = 2;
-const unsigned int byte_nbits = 8;
-
-void kernel_transform_NHWC_to_HWNoCNi(
-    const T_UINT src[],
-    T_UINT dst[],
-    const unsigned kn,
-    const unsigned kh,
-    const unsigned kw,
-    const unsigned kc,
-    const unsigned kn_in)
-{
-  unsigned idx_src = 0;
-  const unsigned kn_out = kn / kn_in;
-
-  for (unsigned no = 0; no < kn_out; no++)
-    for (unsigned ni = 0; ni < kn_in; ni++)
-      for (unsigned h = 0; h < kh; h++)
-        for (unsigned w = 0; w < kw; w++)
-          for (unsigned c = 0; c < kc; c++)
-          {
-            unsigned idx_dst = h * (kw * kn_out * kc * kn_in);
-            idx_dst += w * (kn_out * kc * kn_in);
-            idx_dst += no * (kc * kn_in);
-            idx_dst += c * (kn_in);
-            idx_dst += ni;
-            dst[idx_dst] = src[idx_src++];
-          }
+const unsigned int in_nbits = 2;
+const unsigned int byte_nbits = 8;
 }
 
-} // namespace
 
 namespace dlk {
 
 namespace impl {
@@ -118,24 +90,7 @@ void QuantizedConv2DKn2Row(QUANTIZED_NOT_PACKED input[],

   if (out_c_less_than_num_pe) {
 
-    const T_UINT k_n_aligend_with_num_pe =
-        ((k_n + (NUM_PE - 1)) / NUM_PE) * NUM_PE;
-    const T_UINT out_c_aligend_with_num_pe = k_n_aligend_with_num_pe;
-    const T_UINT k_size = k_n_aligend_with_num_pe * k_h * k_w * k_c_by_word;
-
-    T_UINT kernel_hwnocni[k_size];
-    T_UINT kernel_filled_extra[k_size];
-
-    for (size_t k = 0; k < k_n * k_h * k_w * k_c_by_word; k++) {
-      kernel_filled_extra[k] = kernel[k];
-    }
-
-    Measurement::Start("Kernel transpose NHWC to HWNoCNi");
-    kernel_transform_NHWC_to_HWNoCNi(kernel_filled_extra, kernel_hwnocni,
-                                     k_n_aligend_with_num_pe, k_h, k_w,
-                                     k_c_by_word, NUM_PE);
-    Measurement::Stop();
-
+    const T_UINT out_c_aligend_with_num_pe = ((k_n + (NUM_PE - 1)) / NUM_PE) * NUM_PE;
     T_UINT input_byte_size =
         (cp.input_height * cp.input_width * cp.kernel_depth * in_nbits) /
         byte_nbits;
@@ -153,7 +108,7 @@

     Measurement::Start("QConv2D with kn2row");
     de10_nano::qconv_with_kn2row(
-        p.device_input_phys_addr, p.device_output_phys_addr, kernel_hwnocni,
+        p.device_input_phys_addr, p.device_output_phys_addr, kernel,
         p.thresholds, in_w, in_h, in_c_by_word, MAX_NBIT_QINPUT, out_w, out_h,
         out_c_aligend_with_num_pe, k_w, k_h, cp.padding,
         cp.stride_along_height);
@@ -176,15 +131,6 @@ void QuantizedConv2DKn2Row(QUANTIZED_NOT_PACKED input[],
     }
 
   } else {
-
-    const T_UINT k_size = k_n * k_h * k_w * k_c_by_word;
-    T_UINT kernel_hwnocni[k_size];
-
-    Measurement::Start("Kernel transpose NHWC to HWNoCNi");
-    kernel_transform_NHWC_to_HWNoCNi(kernel, kernel_hwnocni, k_n, k_h, k_w,
-                                     k_c_by_word, NUM_PE);
-    Measurement::Stop();
-
     T_UINT input_byte_size =
         (cp.input_height * cp.input_width * cp.kernel_depth * in_nbits) /
         byte_nbits;
@@ -199,7 +145,7 @@

     Measurement::Start("QConv2D with kn2row");
     de10_nano::qconv_with_kn2row(
-        p.device_input_phys_addr, p.device_output_phys_addr, kernel_hwnocni,
+        p.device_input_phys_addr, p.device_output_phys_addr, kernel,
         p.thresholds, in_w, in_h, in_c_by_word, MAX_NBIT_QINPUT, out_w, out_h,
         out_c, k_w, k_h, cp.padding, cp.stride_along_height);
     Measurement::Stop();
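With the transpose gone, the only kernel-related arithmetic left on this FPGA path is rounding the output-channel count up to the PE width, the ((k_n + (NUM_PE - 1)) / NUM_PE) * NUM_PE expression above. As plain arithmetic (an illustrative sketch, not dlk code; NUM_PE is 16 in this code base):

    NUM_PE = 16

    def align_to_num_pe(k_n: int) -> int:
        """Round k_n up to the next multiple of NUM_PE (ceiling division)."""
        return ((k_n + NUM_PE - 1) // NUM_PE) * NUM_PE

    assert align_to_num_pe(10) == 16   # out_c < NUM_PE: padded up
    assert align_to_num_pe(16) == 16   # already aligned
    assert align_to_num_pe(33) == 48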
