From 455d22d6e97db142d47b68e273b7833f7f4123c1 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Wed, 12 Dec 2018 07:51:40 +0900 Subject: [PATCH 01/50] prepare transposed array --- dlk/python/dlk/core/view.py | 47 +++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index 38fb28319..526c908b6 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -17,7 +17,7 @@ from core.data_types import * from textwrap import dedent - +qconv_idx = 0 class View(object): def __init__(self, op): self.op = op @@ -36,6 +36,7 @@ def shape_list(self): return ','.join(map(lambda x: str(x), self.op.shape)) def run(self): + global qconv_idx; op = self.op input_ops = op.input_ops output_ops = op.output_ops @@ -108,7 +109,6 @@ def run(self): nbit_qinput = 8 if x_op.op_type == 'Input' else 2 if op.is_quantized and nbit_qinput == 2: - qconv_idx = 0 # temporary qk_elems = w_op.data.shape[1] kh = self.op.kernel_height @@ -135,6 +135,46 @@ def run(self): nbit_aqtz = 2 max_value = 2.0 + NUM_PE = 16 + NBIT_QDYPE = 32 + MAX_NBIT_QINPUT = 2 + MAX_NBIT_KERNEL = 1 + num_qinput_per_qword = int(NBIT_QDYPE / MAX_NBIT_QINPUT) + num_qkernel_per_qword = int(NBIT_QDYPE / MAX_NBIT_KERNEL) + k_c_by_word = int((kd + (num_qkernel_per_qword - 1)) / num_qkernel_per_qword); + k_n_aligned_with_num_pe = int(((od + (NUM_PE - 1)) / NUM_PE) * NUM_PE); + if od < NUM_PE: + k_size = k_n_aligned_with_num_pe * kh * kw * k_c_by_word; + else: + k_size = od * kh * kw * k_c_by_word; + + flatten_value = [] + for elem in input_ops['W'].data: + flatten_value.extend(elem) + copy_value = [0] * k_size + for i in range(od * kh * kw * k_c_by_word): + copy_value[i] = flatten_value[i] + + transpose_values = [0] * k_size + if (od < NUM_PE): + kn_out = int(k_n_aligned_with_num_pe / NUM_PE) + else: + kn_out = int(od / NUM_PE) + idx_src = 0 + for no in range(kn_out): + for ni in range(NUM_PE): + for h in range(kh): + for w in range(kw): + for c in range(k_c_by_word): + idx_dst = h * (kw * kn_out * k_c_by_word * NUM_PE) + idx_dst += w * (kn_out * k_c_by_word * NUM_PE) + idx_dst += no * (k_c_by_word * NUM_PE) + idx_dst += c * (NUM_PE) + idx_dst += ni + transpose_values[idx_dst] = copy_value[idx_src] + idx_src += 1 + transpose_string = "{" + ",".join([str(temp) for temp in transpose_values]) + "}" + # temporary: formula which derive number of qinput is not complete render_string = self.format_string( f""" @@ -164,9 +204,12 @@ def run(self): binConv2D_struct.n_bit = {nbit_aqtz}; binConv2D_struct.max_value = {max_value}; + static T_UINT kernel_hwnocni{qconv_idx}[{k_size}] = {transpose_string}; + vecCoefficient.emplace_back(kernel_hwnocni{qconv_idx}); {conv_func}({inputs_string}, {op.name}, scaling_factors::{op.name}, binConv2D_struct); """ ) + qconv_idx += 1 else: # temporary From f20f052e40aff717c19ed0c5fe95492e170bee08 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Wed, 12 Dec 2018 07:52:35 +0900 Subject: [PATCH 02/50] value passing interface --- .../dlk/templates/include/network.tpl.h | 3 + .../impl/fpga/quantized_conv2d_kn2row.cpp | 59 ++----------------- dlk/python/dlk/templates/src/network.tpl.cpp | 2 + 3 files changed, 11 insertions(+), 53 deletions(-) diff --git a/dlk/python/dlk/templates/include/network.tpl.h b/dlk/python/dlk/templates/include/network.tpl.h index 8802bea65..ec321841b 100644 --- a/dlk/python/dlk/templates/include/network.tpl.h +++ b/dlk/python/dlk/templates/include/network.tpl.h @@ -18,10 +18,13 @@ limitations under the License. #include "global.h" #include "dma_buffer.h" +#include #define SYM_PUBLIC __attribute__ ((visibility ("default"))) #define SYM_LOCAL __attribute__ ((visibility ("hidden"))) +extern std::vector vecCoefficient; + class SYM_PUBLIC Network { public: diff --git a/dlk/python/dlk/templates/src/func/impl/fpga/quantized_conv2d_kn2row.cpp b/dlk/python/dlk/templates/src/func/impl/fpga/quantized_conv2d_kn2row.cpp index f92f9fbe9..09bdbf9d1 100644 --- a/dlk/python/dlk/templates/src/func/impl/fpga/quantized_conv2d_kn2row.cpp +++ b/dlk/python/dlk/templates/src/func/impl/fpga/quantized_conv2d_kn2row.cpp @@ -16,6 +16,7 @@ limitations under the License. #include #include +#include "network.h" #include "global.h" #include "de10_nano.h" #include "func/impl/quantized_conv2d_kn2row.h" @@ -23,39 +24,10 @@ limitations under the License. #include "time_measurement.h" namespace { - -const unsigned int in_nbits = 2; -const unsigned int byte_nbits = 8; - -void kernel_transform_NHWC_to_HWNoCNi( - const T_UINT src[], - T_UINT dst[], - const unsigned kn, - const unsigned kh, - const unsigned kw, - const unsigned kc, - const unsigned kn_in) -{ - unsigned idx_src = 0; - const unsigned kn_out = kn / kn_in; - - for (unsigned no = 0; no < kn_out; no++) - for (unsigned ni = 0; ni < kn_in; ni++) - for (unsigned h = 0; h < kh; h++) - for (unsigned w = 0; w < kw; w++) - for (unsigned c = 0; c < kc; c++) - { - unsigned idx_dst = h * (kw * kn_out * kc * kn_in); - idx_dst += w * (kn_out * kc * kn_in); - idx_dst += no * (kc * kn_in); - idx_dst += c * (kn_in); - idx_dst += ni; - dst[idx_dst] = src[idx_src++]; - } + const unsigned int in_nbits = 2; + const unsigned int byte_nbits = 8; } -} // namespace - namespace dlk { namespace impl { @@ -120,22 +92,9 @@ void QuantizedConv2DKn2Row(QUANTIZED_NOT_PACKED input[], if (out_c_less_than_num_pe) { - const T_UINT k_n_aligend_with_num_pe = - ((k_n + (NUM_PE - 1)) / NUM_PE) * NUM_PE; - const T_UINT out_c_aligend_with_num_pe = k_n_aligend_with_num_pe; - const T_UINT k_size = k_n_aligend_with_num_pe * k_h * k_w * k_c_by_word; - - T_UINT kernel_hwnocni[k_size]; - T_UINT kernel_filled_extra[k_size]; - - for (size_t k = 0; k < k_n * k_h * k_w * k_c_by_word; k++) { - kernel_filled_extra[k] = kernel[k]; - } + const T_UINT out_c_aligend_with_num_pe = ((k_n + (NUM_PE - 1)) / NUM_PE) * NUM_PE; Measurement::Start("Kernel transpose NHWC to HWNoCNi"); - kernel_transform_NHWC_to_HWNoCNi(kernel_filled_extra, kernel_hwnocni, - k_n_aligend_with_num_pe, k_h, k_w, - k_c_by_word, NUM_PE); Measurement::Stop(); T_UINT input_byte_size = @@ -155,7 +114,7 @@ void QuantizedConv2DKn2Row(QUANTIZED_NOT_PACKED input[], Measurement::Start("QConv2D with kn2row"); de10_nano::qconv_with_kn2row( - p.device_input_phys_addr, p.device_output_phys_addr, kernel_hwnocni, + p.device_input_phys_addr, p.device_output_phys_addr, vecCoefficient.at(p.layer_index), p.thresholds, in_w, in_h, in_c_by_word, MAX_NBIT_QINPUT, out_w, out_h, out_c_aligend_with_num_pe, k_w, k_h, cp.padding, cp.stride_along_height); @@ -178,13 +137,7 @@ void QuantizedConv2DKn2Row(QUANTIZED_NOT_PACKED input[], } } else { - - const T_UINT k_size = k_n * k_h * k_w * k_c_by_word; - T_UINT kernel_hwnocni[k_size]; - Measurement::Start("Kernel transpose NHWC to HWNoCNi"); - kernel_transform_NHWC_to_HWNoCNi(kernel, kernel_hwnocni, k_n, k_h, k_w, - k_c_by_word, NUM_PE); Measurement::Stop(); T_UINT input_byte_size = @@ -201,7 +154,7 @@ void QuantizedConv2DKn2Row(QUANTIZED_NOT_PACKED input[], Measurement::Start("QConv2D with kn2row"); de10_nano::qconv_with_kn2row( - p.device_input_phys_addr, p.device_output_phys_addr, kernel_hwnocni, + p.device_input_phys_addr, p.device_output_phys_addr, vecCoefficient.at(p.layer_index), p.thresholds, in_w, in_h, in_c_by_word, MAX_NBIT_QINPUT, out_w, out_h, out_c, k_w, k_h, cp.padding, cp.stride_along_height); Measurement::Stop(); diff --git a/dlk/python/dlk/templates/src/network.tpl.cpp b/dlk/python/dlk/templates/src/network.tpl.cpp index 159f61b2f..3e901c87b 100644 --- a/dlk/python/dlk/templates/src/network.tpl.cpp +++ b/dlk/python/dlk/templates/src/network.tpl.cpp @@ -136,6 +136,8 @@ void save_uint32_data(const std::string &name, uint32_t size, uint32_t *data, fl {{ '\n' -}} ///////////////////////////////////////// +std::vector vecCoefficient; + Network::Network() {} From ee573dcf0280c061cdb53d898bea7e1a50c1889e Mon Sep 17 00:00:00 2001 From: lm_konda Date: Wed, 12 Dec 2018 08:38:58 +0900 Subject: [PATCH 03/50] remove Kernel transpose NHWC to HWNoCNi section --- .../src/func/impl/fpga/quantized_conv2d_kn2row.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/dlk/python/dlk/templates/src/func/impl/fpga/quantized_conv2d_kn2row.cpp b/dlk/python/dlk/templates/src/func/impl/fpga/quantized_conv2d_kn2row.cpp index 09bdbf9d1..2bfe0b9c4 100644 --- a/dlk/python/dlk/templates/src/func/impl/fpga/quantized_conv2d_kn2row.cpp +++ b/dlk/python/dlk/templates/src/func/impl/fpga/quantized_conv2d_kn2row.cpp @@ -93,10 +93,6 @@ void QuantizedConv2DKn2Row(QUANTIZED_NOT_PACKED input[], if (out_c_less_than_num_pe) { const T_UINT out_c_aligend_with_num_pe = ((k_n + (NUM_PE - 1)) / NUM_PE) * NUM_PE; - - Measurement::Start("Kernel transpose NHWC to HWNoCNi"); - Measurement::Stop(); - T_UINT input_byte_size = (cp.input_height * cp.input_width * cp.kernel_depth * in_nbits) / byte_nbits; @@ -137,9 +133,6 @@ void QuantizedConv2DKn2Row(QUANTIZED_NOT_PACKED input[], } } else { - Measurement::Start("Kernel transpose NHWC to HWNoCNi"); - Measurement::Stop(); - T_UINT input_byte_size = (cp.input_height * cp.input_width * cp.kernel_depth * in_nbits) / byte_nbits; From 4e6fef5b26fd116785b90e15e27496c2059bc2fb Mon Sep 17 00:00:00 2001 From: lm_konda Date: Wed, 26 Dec 2018 14:09:46 +0900 Subject: [PATCH 04/50] modify kernel transpose --- dlk/python/dlk/core/view.py | 41 ++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index 526c908b6..333d25c14 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -161,18 +161,35 @@ def run(self): else: kn_out = int(od / NUM_PE) idx_src = 0 - for no in range(kn_out): - for ni in range(NUM_PE): - for h in range(kh): - for w in range(kw): - for c in range(k_c_by_word): - idx_dst = h * (kw * kn_out * k_c_by_word * NUM_PE) - idx_dst += w * (kn_out * k_c_by_word * NUM_PE) - idx_dst += no * (k_c_by_word * NUM_PE) - idx_dst += c * (NUM_PE) - idx_dst += ni - transpose_values[idx_dst] = copy_value[idx_src] - idx_src += 1 + + if op._dimension_format == "NHWC": + for no in range(kn_out): + for ni in range(NUM_PE): + for h in range(kh): + for w in range(kw): + for c in range(k_c_by_word): + idx_dst = h * (kw * kn_out * k_c_by_word * NUM_PE) + idx_dst += w * (kn_out * k_c_by_word * NUM_PE) + idx_dst += no * (k_c_by_word * NUM_PE) + idx_dst += c * (NUM_PE) + idx_dst += ni + transpose_values[idx_dst] = copy_value[idx_src] + idx_src += 1 + elif op._dimension_format == "NCHW": + for no in range(kn_out): + for ni in range(NUM_PE): + for c in range(k_c_by_word): + for h in range(kh): + for w in range(kw): + idx_dst = h * (kw * kn_out * k_c_by_word * NUM_PE) + idx_dst += w * (kn_out * k_c_by_word * NUM_PE) + idx_dst += no * (k_c_by_word * NUM_PE) + idx_dst += c * (NUM_PE) + idx_dst += ni + transpose_values[idx_dst] = copy_value[idx_src] + idx_src += 1 + else: + NotImplementedError("only NCHW and NHWC formats are suppported") transpose_string = "{" + ",".join([str(temp) for temp in transpose_values]) + "}" # temporary: formula which derive number of qinput is not complete From 4e5db3881d0dce9cbf3dbf792c57c1745cf47675 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Wed, 26 Dec 2018 16:59:33 +0900 Subject: [PATCH 05/50] for debug --- dlk/python/dlk/core/view.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index 333d25c14..1191a2a2a 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +::# -*- coding: utf-8 -*- # Copyright 2018 The Blueoil Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -192,6 +192,7 @@ def run(self): NotImplementedError("only NCHW and NHWC formats are suppported") transpose_string = "{" + ",".join([str(temp) for temp in transpose_values]) + "}" + print (transpose_string) # temporary: formula which derive number of qinput is not complete render_string = self.format_string( f""" From 9b01d493485782bb8f8dda95f342a6fcc18aa501 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Wed, 26 Dec 2018 17:17:59 +0900 Subject: [PATCH 06/50] for debug --- dlk/python/dlk/core/view.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index 1191a2a2a..a9f75c66e 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -1,4 +1,4 @@ -::# -*- coding: utf-8 -*- +# -*- coding: utf-8 -*- # Copyright 2018 The Blueoil Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -160,6 +160,8 @@ def run(self): kn_out = int(k_n_aligned_with_num_pe / NUM_PE) else: kn_out = int(od / NUM_PE) + if kn_out == 0: + kn_out = 1 idx_src = 0 if op._dimension_format == "NHWC": @@ -192,7 +194,6 @@ def run(self): NotImplementedError("only NCHW and NHWC formats are suppported") transpose_string = "{" + ",".join([str(temp) for temp in transpose_values]) + "}" - print (transpose_string) # temporary: formula which derive number of qinput is not complete render_string = self.format_string( f""" From e11a12a4a125b158ba3ba6acc950c8ce676d56d1 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Wed, 26 Dec 2018 18:05:29 +0900 Subject: [PATCH 07/50] Revert "for debug" This reverts commit 9b01d493485782bb8f8dda95f342a6fcc18aa501. --- dlk/python/dlk/core/view.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index a9f75c66e..1191a2a2a 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +::# -*- coding: utf-8 -*- # Copyright 2018 The Blueoil Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -160,8 +160,6 @@ def run(self): kn_out = int(k_n_aligned_with_num_pe / NUM_PE) else: kn_out = int(od / NUM_PE) - if kn_out == 0: - kn_out = 1 idx_src = 0 if op._dimension_format == "NHWC": @@ -194,6 +192,7 @@ def run(self): NotImplementedError("only NCHW and NHWC formats are suppported") transpose_string = "{" + ",".join([str(temp) for temp in transpose_values]) + "}" + print (transpose_string) # temporary: formula which derive number of qinput is not complete render_string = self.format_string( f""" From cef06da41534af76483063b501f60caf13c559a5 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Wed, 26 Dec 2018 18:05:58 +0900 Subject: [PATCH 08/50] Revert "Revert "for debug"" This reverts commit e11a12a4a125b158ba3ba6acc950c8ce676d56d1. --- dlk/python/dlk/core/view.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index 1191a2a2a..a9f75c66e 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -1,4 +1,4 @@ -::# -*- coding: utf-8 -*- +# -*- coding: utf-8 -*- # Copyright 2018 The Blueoil Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -160,6 +160,8 @@ def run(self): kn_out = int(k_n_aligned_with_num_pe / NUM_PE) else: kn_out = int(od / NUM_PE) + if kn_out == 0: + kn_out = 1 idx_src = 0 if op._dimension_format == "NHWC": @@ -192,7 +194,6 @@ def run(self): NotImplementedError("only NCHW and NHWC formats are suppported") transpose_string = "{" + ",".join([str(temp) for temp in transpose_values]) + "}" - print (transpose_string) # temporary: formula which derive number of qinput is not complete render_string = self.format_string( f""" From 2954a8f75617185cecb5f350dfa9f1b3f3b4e2c6 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Wed, 26 Dec 2018 18:07:38 +0900 Subject: [PATCH 09/50] Revert "for debug" This reverts commit 9b01d493485782bb8f8dda95f342a6fcc18aa501. --- dlk/python/dlk/core/view.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index a9f75c66e..1191a2a2a 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +::# -*- coding: utf-8 -*- # Copyright 2018 The Blueoil Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -160,8 +160,6 @@ def run(self): kn_out = int(k_n_aligned_with_num_pe / NUM_PE) else: kn_out = int(od / NUM_PE) - if kn_out == 0: - kn_out = 1 idx_src = 0 if op._dimension_format == "NHWC": @@ -194,6 +192,7 @@ def run(self): NotImplementedError("only NCHW and NHWC formats are suppported") transpose_string = "{" + ",".join([str(temp) for temp in transpose_values]) + "}" + print (transpose_string) # temporary: formula which derive number of qinput is not complete render_string = self.format_string( f""" From 33c94cce681f825a507ff6ec0245eb32e2604ac5 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Wed, 26 Dec 2018 18:08:31 +0900 Subject: [PATCH 10/50] Revert "for debug" This reverts commit 4e5db3881d0dce9cbf3dbf792c57c1745cf47675. --- dlk/python/dlk/core/view.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index 1191a2a2a..333d25c14 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -1,4 +1,4 @@ -::# -*- coding: utf-8 -*- +# -*- coding: utf-8 -*- # Copyright 2018 The Blueoil Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -192,7 +192,6 @@ def run(self): NotImplementedError("only NCHW and NHWC formats are suppported") transpose_string = "{" + ",".join([str(temp) for temp in transpose_values]) + "}" - print (transpose_string) # temporary: formula which derive number of qinput is not complete render_string = self.format_string( f""" From 75f9baa69d45feadcbf552b4726ee44b20f17e86 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Thu, 27 Dec 2018 09:23:22 +0900 Subject: [PATCH 11/50] for debug --- dlk/python/dlk/core/view.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index 333d25c14..a580941e6 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -163,6 +163,10 @@ def run(self): idx_src = 0 if op._dimension_format == "NHWC": + print ("NHWC") + print (kn_out) + print (kh) + print (kw) for no in range(kn_out): for ni in range(NUM_PE): for h in range(kh): @@ -175,7 +179,12 @@ def run(self): idx_dst += ni transpose_values[idx_dst] = copy_value[idx_src] idx_src += 1 + elif op._dimension_format == "NCHW": + print ("NCHW") + print (kn_out) + print (kh) + print (kw) for no in range(kn_out): for ni in range(NUM_PE): for c in range(k_c_by_word): @@ -192,6 +201,7 @@ def run(self): NotImplementedError("only NCHW and NHWC formats are suppported") transpose_string = "{" + ",".join([str(temp) for temp in transpose_values]) + "}" + print (transpose_string) // for debug # temporary: formula which derive number of qinput is not complete render_string = self.format_string( f""" From 450f964f021ffe205925be9f00e1ca3607ed9360 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Thu, 27 Dec 2018 10:03:49 +0900 Subject: [PATCH 12/50] for debug --- dlk/python/dlk/core/view.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index a580941e6..f782246ba 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -201,7 +201,7 @@ def run(self): NotImplementedError("only NCHW and NHWC formats are suppported") transpose_string = "{" + ",".join([str(temp) for temp in transpose_values]) + "}" - print (transpose_string) // for debug + print (transpose_string) # temporary: formula which derive number of qinput is not complete render_string = self.format_string( f""" From 10c189db4d715a4a57c08f71f79f8483be195ebd Mon Sep 17 00:00:00 2001 From: lm_konda Date: Thu, 27 Dec 2018 10:56:38 +0900 Subject: [PATCH 13/50] Revert "for debug" This reverts commit 450f964f021ffe205925be9f00e1ca3607ed9360. --- dlk/python/dlk/core/view.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index f782246ba..a580941e6 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -201,7 +201,7 @@ def run(self): NotImplementedError("only NCHW and NHWC formats are suppported") transpose_string = "{" + ",".join([str(temp) for temp in transpose_values]) + "}" - print (transpose_string) + print (transpose_string) // for debug # temporary: formula which derive number of qinput is not complete render_string = self.format_string( f""" From e920116e05a91fa31215cced22b3742fdbabdd68 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Thu, 27 Dec 2018 10:56:43 +0900 Subject: [PATCH 14/50] Revert "for debug" This reverts commit 75f9baa69d45feadcbf552b4726ee44b20f17e86. --- dlk/python/dlk/core/view.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index a580941e6..333d25c14 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -163,10 +163,6 @@ def run(self): idx_src = 0 if op._dimension_format == "NHWC": - print ("NHWC") - print (kn_out) - print (kh) - print (kw) for no in range(kn_out): for ni in range(NUM_PE): for h in range(kh): @@ -179,12 +175,7 @@ def run(self): idx_dst += ni transpose_values[idx_dst] = copy_value[idx_src] idx_src += 1 - elif op._dimension_format == "NCHW": - print ("NCHW") - print (kn_out) - print (kh) - print (kw) for no in range(kn_out): for ni in range(NUM_PE): for c in range(k_c_by_word): @@ -201,7 +192,6 @@ def run(self): NotImplementedError("only NCHW and NHWC formats are suppported") transpose_string = "{" + ",".join([str(temp) for temp in transpose_values]) + "}" - print (transpose_string) // for debug # temporary: formula which derive number of qinput is not complete render_string = self.format_string( f""" From a22a4c786568d3cd788acf9420ec5ef8b625a7ee Mon Sep 17 00:00:00 2001 From: lm_konda Date: Thu, 27 Dec 2018 10:59:00 +0900 Subject: [PATCH 15/50] for debug --- .../src/func/impl/fpga/quantized_conv2d_kn2row.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/dlk/python/dlk/templates/src/func/impl/fpga/quantized_conv2d_kn2row.cpp b/dlk/python/dlk/templates/src/func/impl/fpga/quantized_conv2d_kn2row.cpp index 2bfe0b9c4..9914bc489 100644 --- a/dlk/python/dlk/templates/src/func/impl/fpga/quantized_conv2d_kn2row.cpp +++ b/dlk/python/dlk/templates/src/func/impl/fpga/quantized_conv2d_kn2row.cpp @@ -108,6 +108,11 @@ void QuantizedConv2DKn2Row(QUANTIZED_NOT_PACKED input[], p.dma_input_buffer->sync_for_device(); p.dma_output_buffer->sync_for_device(); +#if 1 // DEBUG + std::cout << "debug" << std::endl; + std::cout << sizeof(vecCoefficient.size()) << std::endl; + std::cout << p.layer_index << std::endl; +#endif Measurement::Start("QConv2D with kn2row"); de10_nano::qconv_with_kn2row( p.device_input_phys_addr, p.device_output_phys_addr, vecCoefficient.at(p.layer_index), @@ -145,6 +150,11 @@ void QuantizedConv2DKn2Row(QUANTIZED_NOT_PACKED input[], p.dma_input_buffer->sync_for_device(); p.dma_output_buffer->sync_for_device(); +#if 1 // DEBUG + std::cout << "debug" << std::endl; + std::cout << sizeof(vecCoefficient.size()) << std::endl; + std::cout << p.layer_index << std::endl; +#endif Measurement::Start("QConv2D with kn2row"); de10_nano::qconv_with_kn2row( p.device_input_phys_addr, p.device_output_phys_addr, vecCoefficient.at(p.layer_index), From 541ab09a01505cfee49db1c2a713e5dd73103d0b Mon Sep 17 00:00:00 2001 From: lm_konda Date: Thu, 27 Dec 2018 12:31:14 +0900 Subject: [PATCH 16/50] for debug --- dlk/python/dlk/core/view.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index 333d25c14..2932ef643 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -226,6 +226,7 @@ def run(self): {conv_func}({inputs_string}, {op.name}, scaling_factors::{op.name}, binConv2D_struct); """ ) + print(qconv_idx) qconv_idx += 1 else: From dc7dc9165a5a516ff1ded53ba03a5598ddb2f0f0 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Thu, 27 Dec 2018 13:14:59 +0900 Subject: [PATCH 17/50] for debug --- dlk/python/dlk/core/view.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index 2932ef643..6f7cb76ab 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -17,8 +17,8 @@ from core.data_types import * from textwrap import dedent -qconv_idx = 0 class View(object): + qconv_idx = 0 def __init__(self, op): self.op = op self.reuse_buffer_str = '' @@ -36,7 +36,6 @@ def shape_list(self): return ','.join(map(lambda x: str(x), self.op.shape)) def run(self): - global qconv_idx; op = self.op input_ops = op.input_ops output_ops = op.output_ops @@ -214,20 +213,20 @@ def run(self): binConv2D_struct.bin_kernel_ndata = {qk_elems}; binConv2D_struct.bin_input_nwords = {qk_elems}; binConv2D_struct.bin_input_ndata = {qk_elems}*{nbit_qinput}; - binConv2D_struct.layer_index = {qconv_idx}; + binConv2D_struct.layer_index = {View.qconv_idx}; binConv2D_struct.device_input_buf = device_input_buf; binConv2D_struct.device_output_buf = device_output_buf; binConv2D_struct.thresholds = {threshold}; binConv2D_struct.n_bit = {nbit_aqtz}; binConv2D_struct.max_value = {max_value}; - static T_UINT kernel_hwnocni{qconv_idx}[{k_size}] = {transpose_string}; - vecCoefficient.emplace_back(kernel_hwnocni{qconv_idx}); + static T_UINT kernel_hwnocni{View.qconv_idx}[{k_size}] = {transpose_string}; + vecCoefficient.emplace_back(kernel_hwnocni{View.qconv_idx}); {conv_func}({inputs_string}, {op.name}, scaling_factors::{op.name}, binConv2D_struct); """ ) - print(qconv_idx) - qconv_idx += 1 + print(View.qconv_idx) + View.qconv_idx += 1 else: # temporary From b5ee75be3f14d3a8412b7bb2863cb62ca25f2b17 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Thu, 27 Dec 2018 13:39:31 +0900 Subject: [PATCH 18/50] for debug --- dlk/python/dlk/core/view.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index 6f7cb76ab..9c957230d 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -18,11 +18,11 @@ from textwrap import dedent class View(object): - qconv_idx = 0 def __init__(self, op): self.op = op self.reuse_buffer_str = '' - + self.qconv_idx = 0 + @property def rank(self): return len(self.node.shape) @@ -213,20 +213,20 @@ def run(self): binConv2D_struct.bin_kernel_ndata = {qk_elems}; binConv2D_struct.bin_input_nwords = {qk_elems}; binConv2D_struct.bin_input_ndata = {qk_elems}*{nbit_qinput}; - binConv2D_struct.layer_index = {View.qconv_idx}; + binConv2D_struct.layer_index = {self.qconv_idx}; binConv2D_struct.device_input_buf = device_input_buf; binConv2D_struct.device_output_buf = device_output_buf; binConv2D_struct.thresholds = {threshold}; binConv2D_struct.n_bit = {nbit_aqtz}; binConv2D_struct.max_value = {max_value}; - static T_UINT kernel_hwnocni{View.qconv_idx}[{k_size}] = {transpose_string}; - vecCoefficient.emplace_back(kernel_hwnocni{View.qconv_idx}); + static T_UINT kernel_hwnocni{self.qconv_idx}[{k_size}] = {transpose_string}; + vecCoefficient.emplace_back(kernel_hwnocni{self.qconv_idx}); {conv_func}({inputs_string}, {op.name}, scaling_factors::{op.name}, binConv2D_struct); """ ) - print(View.qconv_idx) - View.qconv_idx += 1 + print(self.qconv_idx) + self.qconv_idx += 1 else: # temporary From 485e04563fa18aff20dd6732d1ade0e6038d66bc Mon Sep 17 00:00:00 2001 From: lm_konda Date: Thu, 27 Dec 2018 13:56:54 +0900 Subject: [PATCH 19/50] Revert "for debug" This reverts commit b5ee75be3f14d3a8412b7bb2863cb62ca25f2b17. --- dlk/python/dlk/core/view.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index 9c957230d..6f7cb76ab 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -18,11 +18,11 @@ from textwrap import dedent class View(object): + qconv_idx = 0 def __init__(self, op): self.op = op self.reuse_buffer_str = '' - self.qconv_idx = 0 - + @property def rank(self): return len(self.node.shape) @@ -213,20 +213,20 @@ def run(self): binConv2D_struct.bin_kernel_ndata = {qk_elems}; binConv2D_struct.bin_input_nwords = {qk_elems}; binConv2D_struct.bin_input_ndata = {qk_elems}*{nbit_qinput}; - binConv2D_struct.layer_index = {self.qconv_idx}; + binConv2D_struct.layer_index = {View.qconv_idx}; binConv2D_struct.device_input_buf = device_input_buf; binConv2D_struct.device_output_buf = device_output_buf; binConv2D_struct.thresholds = {threshold}; binConv2D_struct.n_bit = {nbit_aqtz}; binConv2D_struct.max_value = {max_value}; - static T_UINT kernel_hwnocni{self.qconv_idx}[{k_size}] = {transpose_string}; - vecCoefficient.emplace_back(kernel_hwnocni{self.qconv_idx}); + static T_UINT kernel_hwnocni{View.qconv_idx}[{k_size}] = {transpose_string}; + vecCoefficient.emplace_back(kernel_hwnocni{View.qconv_idx}); {conv_func}({inputs_string}, {op.name}, scaling_factors::{op.name}, binConv2D_struct); """ ) - print(self.qconv_idx) - self.qconv_idx += 1 + print(View.qconv_idx) + View.qconv_idx += 1 else: # temporary From b7c6b38f6619fa6514881bb5c33c801cf0df9a16 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Thu, 27 Dec 2018 15:00:59 +0900 Subject: [PATCH 20/50] Revert "for debug" This reverts commit dc7dc9165a5a516ff1ded53ba03a5598ddb2f0f0. --- dlk/python/dlk/core/view.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index 6f7cb76ab..2932ef643 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -17,8 +17,8 @@ from core.data_types import * from textwrap import dedent +qconv_idx = 0 class View(object): - qconv_idx = 0 def __init__(self, op): self.op = op self.reuse_buffer_str = '' @@ -36,6 +36,7 @@ def shape_list(self): return ','.join(map(lambda x: str(x), self.op.shape)) def run(self): + global qconv_idx; op = self.op input_ops = op.input_ops output_ops = op.output_ops @@ -213,20 +214,20 @@ def run(self): binConv2D_struct.bin_kernel_ndata = {qk_elems}; binConv2D_struct.bin_input_nwords = {qk_elems}; binConv2D_struct.bin_input_ndata = {qk_elems}*{nbit_qinput}; - binConv2D_struct.layer_index = {View.qconv_idx}; + binConv2D_struct.layer_index = {qconv_idx}; binConv2D_struct.device_input_buf = device_input_buf; binConv2D_struct.device_output_buf = device_output_buf; binConv2D_struct.thresholds = {threshold}; binConv2D_struct.n_bit = {nbit_aqtz}; binConv2D_struct.max_value = {max_value}; - static T_UINT kernel_hwnocni{View.qconv_idx}[{k_size}] = {transpose_string}; - vecCoefficient.emplace_back(kernel_hwnocni{View.qconv_idx}); + static T_UINT kernel_hwnocni{qconv_idx}[{k_size}] = {transpose_string}; + vecCoefficient.emplace_back(kernel_hwnocni{qconv_idx}); {conv_func}({inputs_string}, {op.name}, scaling_factors::{op.name}, binConv2D_struct); """ ) - print(View.qconv_idx) - View.qconv_idx += 1 + print(qconv_idx) + qconv_idx += 1 else: # temporary From 62a63a3244e92bc71d0c112c4aa54e91ecd308ce Mon Sep 17 00:00:00 2001 From: lm_konda Date: Thu, 27 Dec 2018 15:01:22 +0900 Subject: [PATCH 21/50] Revert "for debug" This reverts commit 541ab09a01505cfee49db1c2a713e5dd73103d0b. --- dlk/python/dlk/core/view.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index 2932ef643..333d25c14 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -226,7 +226,6 @@ def run(self): {conv_func}({inputs_string}, {op.name}, scaling_factors::{op.name}, binConv2D_struct); """ ) - print(qconv_idx) qconv_idx += 1 else: From 394ef5161cee644efcaae129c16a2c873de7e5d4 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Thu, 27 Dec 2018 15:01:39 +0900 Subject: [PATCH 22/50] Revert "for debug" This reverts commit a22a4c786568d3cd788acf9420ec5ef8b625a7ee. --- .../src/func/impl/fpga/quantized_conv2d_kn2row.cpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/dlk/python/dlk/templates/src/func/impl/fpga/quantized_conv2d_kn2row.cpp b/dlk/python/dlk/templates/src/func/impl/fpga/quantized_conv2d_kn2row.cpp index 9914bc489..2bfe0b9c4 100644 --- a/dlk/python/dlk/templates/src/func/impl/fpga/quantized_conv2d_kn2row.cpp +++ b/dlk/python/dlk/templates/src/func/impl/fpga/quantized_conv2d_kn2row.cpp @@ -108,11 +108,6 @@ void QuantizedConv2DKn2Row(QUANTIZED_NOT_PACKED input[], p.dma_input_buffer->sync_for_device(); p.dma_output_buffer->sync_for_device(); -#if 1 // DEBUG - std::cout << "debug" << std::endl; - std::cout << sizeof(vecCoefficient.size()) << std::endl; - std::cout << p.layer_index << std::endl; -#endif Measurement::Start("QConv2D with kn2row"); de10_nano::qconv_with_kn2row( p.device_input_phys_addr, p.device_output_phys_addr, vecCoefficient.at(p.layer_index), @@ -150,11 +145,6 @@ void QuantizedConv2DKn2Row(QUANTIZED_NOT_PACKED input[], p.dma_input_buffer->sync_for_device(); p.dma_output_buffer->sync_for_device(); -#if 1 // DEBUG - std::cout << "debug" << std::endl; - std::cout << sizeof(vecCoefficient.size()) << std::endl; - std::cout << p.layer_index << std::endl; -#endif Measurement::Start("QConv2D with kn2row"); de10_nano::qconv_with_kn2row( p.device_input_phys_addr, p.device_output_phys_addr, vecCoefficient.at(p.layer_index), From 23a899a3af037dd9196bcde48cde402b53bf408d Mon Sep 17 00:00:00 2001 From: lm_konda Date: Thu, 27 Dec 2018 18:13:12 +0900 Subject: [PATCH 23/50] for debug --- dlk/python/dlk/core/view.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index 333d25c14..94d338425 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -36,7 +36,7 @@ def shape_list(self): return ','.join(map(lambda x: str(x), self.op.shape)) def run(self): - global qconv_idx; + global qconv_idx op = self.op input_ops = op.input_ops output_ops = op.output_ops @@ -227,6 +227,8 @@ def run(self): """ ) qconv_idx += 1 + if len(op.output_ops.keys()) < 1: + qconv_idx = 0 else: # temporary From 641763e99b7303db5e56b6e6edda402ec7754255 Mon Sep 17 00:00:00 2001 From: Nikolay Nez Date: Mon, 7 Jan 2019 17:37:31 +0900 Subject: [PATCH 24/50] Use const for kernel data --- dlk/backends/include/fpga_utils.h | 4 ++-- dlk/python/dlk/templates/include/de10_nano.h | 2 +- dlk/python/dlk/templates/include/memdriver.h | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dlk/backends/include/fpga_utils.h b/dlk/backends/include/fpga_utils.h index 2fc3812d0..f84110137 100644 --- a/dlk/backends/include/fpga_utils.h +++ b/dlk/backends/include/fpga_utils.h @@ -97,7 +97,7 @@ class MappedMem template - memtype Write(T *data, unsigned int size) + memtype Write(const T *data, unsigned int size) { T *mem_ptr = (T *) mem; for(unsigned int i = 0; i < size; i++) { @@ -107,7 +107,7 @@ class MappedMem template - bool Check(T *data, unsigned int size) + bool Check(const T *data, unsigned int size) { bool success = true; T *mem_ptr = (T *) mem; diff --git a/dlk/python/dlk/templates/include/de10_nano.h b/dlk/python/dlk/templates/include/de10_nano.h index 661e1aec1..07a21b13b 100644 --- a/dlk/python/dlk/templates/include/de10_nano.h +++ b/dlk/python/dlk/templates/include/de10_nano.h @@ -126,7 +126,7 @@ class QconvWithKn2row { }; void qconv_with_kn2row(unsigned long input_addr, unsigned long output_addr, - T_UINT k_data_packed[], BIN_CONV_OUTPUT th_data[], + const T_UINT k_data_packed[], BIN_CONV_OUTPUT th_data[], unsigned in_w, unsigned in_h, unsigned in_c_by_word, unsigned nbits_in_data, unsigned out_w, unsigned out_h, unsigned out_c, unsigned k_w, unsigned k_h, unsigned pad, diff --git a/dlk/python/dlk/templates/include/memdriver.h b/dlk/python/dlk/templates/include/memdriver.h index 2c2e363af..6a72c84f6 100644 --- a/dlk/python/dlk/templates/include/memdriver.h +++ b/dlk/python/dlk/templates/include/memdriver.h @@ -92,7 +92,7 @@ class MappedMem template - memtype Write(T *data, unsigned int size) + memtype Write(const T *data, unsigned int size) { T *mem_ptr = (T *) mem; for(unsigned int i = 0; i < size; i++) @@ -101,7 +101,7 @@ class MappedMem template - bool Check(T *data, unsigned int size) + bool Check(const T *data, unsigned int size) { bool success = true; T *mem_ptr = (T *) mem; From bbe3d7351d7da62ad7edf9eea7da1c67f7f1b0cf Mon Sep 17 00:00:00 2001 From: Nikolay Nez Date: Mon, 7 Jan 2019 17:38:25 +0900 Subject: [PATCH 25/50] Move kernel transposing to optimizer --- dlk/python/dlk/core/operators.py | 10 ++- dlk/python/dlk/core/optimizer.py | 68 ++++++++++++++++++- dlk/python/dlk/core/view.py | 65 ++---------------- .../dlk/templates/include/network.tpl.h | 2 - .../dlk/templates/manual/consts/input.tpl.cpp | 8 +++ .../dlk/templates/manual/consts/input.tpl.h | 3 + .../impl/fpga/quantized_conv2d_kn2row.cpp | 4 +- dlk/python/dlk/templates/src/network.tpl.cpp | 2 - 8 files changed, 95 insertions(+), 67 deletions(-) diff --git a/dlk/python/dlk/core/operators.py b/dlk/python/dlk/core/operators.py index 4f47d6187..71993a1e7 100644 --- a/dlk/python/dlk/core/operators.py +++ b/dlk/python/dlk/core/operators.py @@ -612,7 +612,8 @@ def __init__(self, data: np.ndarray, dimension_format: str = 'NHWC', packed: bool = False, - actual_shape: List[int] = []) -> None: + actual_shape: List[int] = [], + transposed_data=None) -> None: """Init the variable. If the constant is hard quantized, data is packed and the actual shape @@ -620,12 +621,19 @@ def __init__(self, """ shape = list(data.shape) if not packed else actual_shape self._packed = packed + self._transposed_data = transposed_data super().__init__(name, shape, dtype, {}, data, dimension_format=dimension_format) @property def is_packed(self) -> bool: return self._packed + @property + def transposed_data(self): + """Return transposed data.""" + return self._transposed_data + + class Output(Variable): """Output class.""" diff --git a/dlk/python/dlk/core/optimizer.py b/dlk/python/dlk/core/optimizer.py index 247bc1798..aa157faee 100644 --- a/dlk/python/dlk/core/optimizer.py +++ b/dlk/python/dlk/core/optimizer.py @@ -65,6 +65,65 @@ def node_is_activation_quantizer(node: Operator) -> bool: return node.op_type == 'QTZ_linear_mid_tread_half' +def transpose_kernels(kernel_data, dimension_format, oh, ow, od, kh, kw, kd): + NUM_PE = 16 + NBIT_QDYPE = 32 + MAX_NBIT_QINPUT = 2 + MAX_NBIT_KERNEL = 1 + num_qinput_per_qword = int(NBIT_QDYPE / MAX_NBIT_QINPUT) + num_qkernel_per_qword = int(NBIT_QDYPE / MAX_NBIT_KERNEL) + k_c_by_word = int((kd + (num_qkernel_per_qword - 1)) / num_qkernel_per_qword); + k_n_aligned_with_num_pe = int(((od + (NUM_PE - 1)) / NUM_PE) * NUM_PE); + if od < NUM_PE: + k_size = k_n_aligned_with_num_pe * kh * kw * k_c_by_word; + else: + k_size = od * kh * kw * k_c_by_word; + + flatten_value = [] + for elem in kernel_data: + flatten_value.extend(elem) + copy_value = [0] * k_size + for i in range(od * kh * kw * k_c_by_word): + copy_value[i] = flatten_value[i] + + transpose_values = [0] * k_size + if (od < NUM_PE): + kn_out = int(k_n_aligned_with_num_pe / NUM_PE) + else: + kn_out = int(od / NUM_PE) + idx_src = 0 + + if dimension_format == "NHWC": + for no in range(kn_out): + for ni in range(NUM_PE): + for h in range(kh): + for w in range(kw): + for c in range(k_c_by_word): + idx_dst = h * (kw * kn_out * k_c_by_word * NUM_PE) + idx_dst += w * (kn_out * k_c_by_word * NUM_PE) + idx_dst += no * (k_c_by_word * NUM_PE) + idx_dst += c * (NUM_PE) + idx_dst += ni + transpose_values[idx_dst] = copy_value[idx_src] + idx_src += 1 + elif dimension_format == "NCHW": + for no in range(kn_out): + for ni in range(NUM_PE): + for c in range(k_c_by_word): + for h in range(kh): + for w in range(kw): + idx_dst = h * (kw * kn_out * k_c_by_word * NUM_PE) + idx_dst += w * (kn_out * k_c_by_word * NUM_PE) + idx_dst += no * (k_c_by_word * NUM_PE) + idx_dst += c * (NUM_PE) + idx_dst += ni + transpose_values[idx_dst] = copy_value[idx_src] + idx_src += 1 + else: + NotImplementedError("only NCHW and NHWC formats are suppported") + + return transpose_values + class NHWC_Transposer(GraphRunner): """Transposer of all nodes to NHWC.""" @@ -312,6 +371,12 @@ def run_forward_conv(self, node: Conv, **kwargs: Any) -> None: for key, op in zip(node.input_names, ops): if self._is_prunable(op): + oh = node.height + ow = node.width + od = node.channel + kh = node.kernel_height + kw = node.kernel_width + kd = op.channel shape = op.shape op_data = node.quantizer.binarizer(op.data) data = packer.run(op_data.astype(np.float32), op.dimension) @@ -321,7 +386,8 @@ def run_forward_conv(self, node: Conv, **kwargs: Any) -> None: dtype, data, packed=True, - actual_shape=shape + actual_shape=shape, + transposed_data=transpose_kernels(data, node.dimension, oh, ow, od, kh, kw, kd) ) node.add_input(key, new_op) self._graph.add_op(new_op) diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index 94d338425..43204bad8 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -120,8 +120,10 @@ def run(self): for k, v in input_ops['X'].output_ops.items(): if v[0] == op: inputs_string = str(input_ops['X'].name) + '_' + str(k) + inputs_string_transposed = inputs_string + ', ' + input_ops['W'].name + '_transposed' inputs_string = inputs_string + ', ' + input_ops['W'].name else: + inputs_string_transposed = ', '.join(str(x.name) if k != 'W' else str(x.name) + '_transposed' for k, x in input_ops.items()) inputs_string = self.inputs_to_string(input_ops) if op.has_thresholds: @@ -135,63 +137,6 @@ def run(self): nbit_aqtz = 2 max_value = 2.0 - NUM_PE = 16 - NBIT_QDYPE = 32 - MAX_NBIT_QINPUT = 2 - MAX_NBIT_KERNEL = 1 - num_qinput_per_qword = int(NBIT_QDYPE / MAX_NBIT_QINPUT) - num_qkernel_per_qword = int(NBIT_QDYPE / MAX_NBIT_KERNEL) - k_c_by_word = int((kd + (num_qkernel_per_qword - 1)) / num_qkernel_per_qword); - k_n_aligned_with_num_pe = int(((od + (NUM_PE - 1)) / NUM_PE) * NUM_PE); - if od < NUM_PE: - k_size = k_n_aligned_with_num_pe * kh * kw * k_c_by_word; - else: - k_size = od * kh * kw * k_c_by_word; - - flatten_value = [] - for elem in input_ops['W'].data: - flatten_value.extend(elem) - copy_value = [0] * k_size - for i in range(od * kh * kw * k_c_by_word): - copy_value[i] = flatten_value[i] - - transpose_values = [0] * k_size - if (od < NUM_PE): - kn_out = int(k_n_aligned_with_num_pe / NUM_PE) - else: - kn_out = int(od / NUM_PE) - idx_src = 0 - - if op._dimension_format == "NHWC": - for no in range(kn_out): - for ni in range(NUM_PE): - for h in range(kh): - for w in range(kw): - for c in range(k_c_by_word): - idx_dst = h * (kw * kn_out * k_c_by_word * NUM_PE) - idx_dst += w * (kn_out * k_c_by_word * NUM_PE) - idx_dst += no * (k_c_by_word * NUM_PE) - idx_dst += c * (NUM_PE) - idx_dst += ni - transpose_values[idx_dst] = copy_value[idx_src] - idx_src += 1 - elif op._dimension_format == "NCHW": - for no in range(kn_out): - for ni in range(NUM_PE): - for c in range(k_c_by_word): - for h in range(kh): - for w in range(kw): - idx_dst = h * (kw * kn_out * k_c_by_word * NUM_PE) - idx_dst += w * (kn_out * k_c_by_word * NUM_PE) - idx_dst += no * (k_c_by_word * NUM_PE) - idx_dst += c * (NUM_PE) - idx_dst += ni - transpose_values[idx_dst] = copy_value[idx_src] - idx_src += 1 - else: - NotImplementedError("only NCHW and NHWC formats are suppported") - transpose_string = "{" + ",".join([str(temp) for temp in transpose_values]) + "}" - # temporary: formula which derive number of qinput is not complete render_string = self.format_string( f""" @@ -221,9 +166,11 @@ def run(self): binConv2D_struct.n_bit = {nbit_aqtz}; binConv2D_struct.max_value = {max_value}; - static T_UINT kernel_hwnocni{qconv_idx}[{k_size}] = {transpose_string}; - vecCoefficient.emplace_back(kernel_hwnocni{qconv_idx}); + #if defined RUN_ON_FPGA + {conv_func}({inputs_string_transposed}, {op.name}, scaling_factors::{op.name}, binConv2D_struct); + #else {conv_func}({inputs_string}, {op.name}, scaling_factors::{op.name}, binConv2D_struct); + #endif """ ) qconv_idx += 1 diff --git a/dlk/python/dlk/templates/include/network.tpl.h b/dlk/python/dlk/templates/include/network.tpl.h index ec321841b..4562efc8f 100644 --- a/dlk/python/dlk/templates/include/network.tpl.h +++ b/dlk/python/dlk/templates/include/network.tpl.h @@ -23,8 +23,6 @@ limitations under the License. #define SYM_PUBLIC __attribute__ ((visibility ("default"))) #define SYM_LOCAL __attribute__ ((visibility ("hidden"))) -extern std::vector vecCoefficient; - class SYM_PUBLIC Network { public: diff --git a/dlk/python/dlk/templates/manual/consts/input.tpl.cpp b/dlk/python/dlk/templates/manual/consts/input.tpl.cpp index 7575183d3..d5f75aa6e 100644 --- a/dlk/python/dlk/templates/manual/consts/input.tpl.cpp +++ b/dlk/python/dlk/templates/manual/consts/input.tpl.cpp @@ -28,4 +28,12 @@ limitations under the License. {%- endfor %} }; +{% if node.transposed_data %} +{{ node.dtype.cpptype() }} {{ node.name }}_transposed[] = { + {% for d in node.transposed_data -%} + {{- d -}}, + {%- endfor %} +}; +{% endif %} + {%- endif %} diff --git a/dlk/python/dlk/templates/manual/consts/input.tpl.h b/dlk/python/dlk/templates/manual/consts/input.tpl.h index c83acc91e..d50af3c47 100644 --- a/dlk/python/dlk/templates/manual/consts/input.tpl.h +++ b/dlk/python/dlk/templates/manual/consts/input.tpl.h @@ -25,6 +25,9 @@ extern {{ node.dtype.cpptype() }} {{ node.name }}; {% else -%} extern {{ node.dtype.cpptype() }} {{ node.name }}[]; +{% if node.transposed_data %} +extern {{ node.dtype.cpptype() }} {{ node.name }}_transposed[]; +{%- endif %} {%- endif %} diff --git a/dlk/python/dlk/templates/src/func/impl/fpga/quantized_conv2d_kn2row.cpp b/dlk/python/dlk/templates/src/func/impl/fpga/quantized_conv2d_kn2row.cpp index 2bfe0b9c4..00017413a 100644 --- a/dlk/python/dlk/templates/src/func/impl/fpga/quantized_conv2d_kn2row.cpp +++ b/dlk/python/dlk/templates/src/func/impl/fpga/quantized_conv2d_kn2row.cpp @@ -110,7 +110,7 @@ void QuantizedConv2DKn2Row(QUANTIZED_NOT_PACKED input[], Measurement::Start("QConv2D with kn2row"); de10_nano::qconv_with_kn2row( - p.device_input_phys_addr, p.device_output_phys_addr, vecCoefficient.at(p.layer_index), + p.device_input_phys_addr, p.device_output_phys_addr, kernel, p.thresholds, in_w, in_h, in_c_by_word, MAX_NBIT_QINPUT, out_w, out_h, out_c_aligend_with_num_pe, k_w, k_h, cp.padding, cp.stride_along_height); @@ -147,7 +147,7 @@ void QuantizedConv2DKn2Row(QUANTIZED_NOT_PACKED input[], Measurement::Start("QConv2D with kn2row"); de10_nano::qconv_with_kn2row( - p.device_input_phys_addr, p.device_output_phys_addr, vecCoefficient.at(p.layer_index), + p.device_input_phys_addr, p.device_output_phys_addr, kernel, p.thresholds, in_w, in_h, in_c_by_word, MAX_NBIT_QINPUT, out_w, out_h, out_c, k_w, k_h, cp.padding, cp.stride_along_height); Measurement::Stop(); diff --git a/dlk/python/dlk/templates/src/network.tpl.cpp b/dlk/python/dlk/templates/src/network.tpl.cpp index 3e901c87b..159f61b2f 100644 --- a/dlk/python/dlk/templates/src/network.tpl.cpp +++ b/dlk/python/dlk/templates/src/network.tpl.cpp @@ -136,8 +136,6 @@ void save_uint32_data(const std::string &name, uint32_t size, uint32_t *data, fl {{ '\n' -}} ///////////////////////////////////////// -std::vector vecCoefficient; - Network::Network() {} From 7466af17682224adb886fcfd55792a79b724ad75 Mon Sep 17 00:00:00 2001 From: konda Date: Mon, 7 Jan 2019 18:05:40 +0900 Subject: [PATCH 26/50] remove redundant file --- dlk/python/dlk/core/view.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index 43204bad8..914ff2cfa 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -17,7 +17,6 @@ from core.data_types import * from textwrap import dedent -qconv_idx = 0 class View(object): def __init__(self, op): self.op = op @@ -36,7 +35,6 @@ def shape_list(self): return ','.join(map(lambda x: str(x), self.op.shape)) def run(self): - global qconv_idx op = self.op input_ops = op.input_ops output_ops = op.output_ops @@ -159,7 +157,6 @@ def run(self): binConv2D_struct.bin_kernel_ndata = {qk_elems}; binConv2D_struct.bin_input_nwords = {qk_elems}; binConv2D_struct.bin_input_ndata = {qk_elems}*{nbit_qinput}; - binConv2D_struct.layer_index = {qconv_idx}; binConv2D_struct.device_input_buf = device_input_buf; binConv2D_struct.device_output_buf = device_output_buf; binConv2D_struct.thresholds = {threshold}; @@ -173,9 +170,6 @@ def run(self): #endif """ ) - qconv_idx += 1 - if len(op.output_ops.keys()) < 1: - qconv_idx = 0 else: # temporary From b49b0a93a4efb27aea37e53889562c41afa7c08e Mon Sep 17 00:00:00 2001 From: konda Date: Mon, 7 Jan 2019 18:06:38 +0900 Subject: [PATCH 27/50] Revert "remove redundant file" This reverts commit 7466af17682224adb886fcfd55792a79b724ad75. --- dlk/python/dlk/core/view.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index 914ff2cfa..43204bad8 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -17,6 +17,7 @@ from core.data_types import * from textwrap import dedent +qconv_idx = 0 class View(object): def __init__(self, op): self.op = op @@ -35,6 +36,7 @@ def shape_list(self): return ','.join(map(lambda x: str(x), self.op.shape)) def run(self): + global qconv_idx op = self.op input_ops = op.input_ops output_ops = op.output_ops @@ -157,6 +159,7 @@ def run(self): binConv2D_struct.bin_kernel_ndata = {qk_elems}; binConv2D_struct.bin_input_nwords = {qk_elems}; binConv2D_struct.bin_input_ndata = {qk_elems}*{nbit_qinput}; + binConv2D_struct.layer_index = {qconv_idx}; binConv2D_struct.device_input_buf = device_input_buf; binConv2D_struct.device_output_buf = device_output_buf; binConv2D_struct.thresholds = {threshold}; @@ -170,6 +173,9 @@ def run(self): #endif """ ) + qconv_idx += 1 + if len(op.output_ops.keys()) < 1: + qconv_idx = 0 else: # temporary From c0af670a94b29f3f16e076c97f51f9d0f0a9cca7 Mon Sep 17 00:00:00 2001 From: konda Date: Mon, 7 Jan 2019 18:07:32 +0900 Subject: [PATCH 28/50] delete redundant code --- dlk/python/dlk/core/view.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index 43204bad8..914ff2cfa 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -17,7 +17,6 @@ from core.data_types import * from textwrap import dedent -qconv_idx = 0 class View(object): def __init__(self, op): self.op = op @@ -36,7 +35,6 @@ def shape_list(self): return ','.join(map(lambda x: str(x), self.op.shape)) def run(self): - global qconv_idx op = self.op input_ops = op.input_ops output_ops = op.output_ops @@ -159,7 +157,6 @@ def run(self): binConv2D_struct.bin_kernel_ndata = {qk_elems}; binConv2D_struct.bin_input_nwords = {qk_elems}; binConv2D_struct.bin_input_ndata = {qk_elems}*{nbit_qinput}; - binConv2D_struct.layer_index = {qconv_idx}; binConv2D_struct.device_input_buf = device_input_buf; binConv2D_struct.device_output_buf = device_output_buf; binConv2D_struct.thresholds = {threshold}; @@ -173,9 +170,6 @@ def run(self): #endif """ ) - qconv_idx += 1 - if len(op.output_ops.keys()) < 1: - qconv_idx = 0 else: # temporary From 78c375308683573195840c3478f72b2b971bea03 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Tue, 8 Jan 2019 11:17:24 +0900 Subject: [PATCH 29/50] fix overrun for debug --- dlk/python/dlk/core/optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlk/python/dlk/core/optimizer.py b/dlk/python/dlk/core/optimizer.py index aa157faee..918afca41 100644 --- a/dlk/python/dlk/core/optimizer.py +++ b/dlk/python/dlk/core/optimizer.py @@ -79,7 +79,7 @@ def transpose_kernels(kernel_data, dimension_format, oh, ow, od, kh, kw, kd): else: k_size = od * kh * kw * k_c_by_word; - flatten_value = [] + flatten_value = [] * k_size for elem in kernel_data: flatten_value.extend(elem) copy_value = [0] * k_size From d3d3b2bdb1df633d87fb0440c4cabb31a3c646d0 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Tue, 8 Jan 2019 11:56:11 +0900 Subject: [PATCH 30/50] for debug --- dlk/python/dlk/core/optimizer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dlk/python/dlk/core/optimizer.py b/dlk/python/dlk/core/optimizer.py index 918afca41..cdc883363 100644 --- a/dlk/python/dlk/core/optimizer.py +++ b/dlk/python/dlk/core/optimizer.py @@ -79,9 +79,12 @@ def transpose_kernels(kernel_data, dimension_format, oh, ow, od, kh, kw, kd): else: k_size = od * kh * kw * k_c_by_word; - flatten_value = [] * k_size + flatten_value = [] for elem in kernel_data: flatten_value.extend(elem) + while len(flatten_value) != k_size: + flatten_value.extend(0) + copy_value = [0] * k_size for i in range(od * kh * kw * k_c_by_word): copy_value[i] = flatten_value[i] From ecf689c1140f353b461da1068a6580b9b8bc6c09 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Tue, 8 Jan 2019 12:31:20 +0900 Subject: [PATCH 31/50] for debug --- dlk/python/dlk/core/optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlk/python/dlk/core/optimizer.py b/dlk/python/dlk/core/optimizer.py index cdc883363..5152ac726 100644 --- a/dlk/python/dlk/core/optimizer.py +++ b/dlk/python/dlk/core/optimizer.py @@ -83,7 +83,7 @@ def transpose_kernels(kernel_data, dimension_format, oh, ow, od, kh, kw, kd): for elem in kernel_data: flatten_value.extend(elem) while len(flatten_value) != k_size: - flatten_value.extend(0) + flatten_value.extend("0") copy_value = [0] * k_size for i in range(od * kh * kw * k_c_by_word): From 17e265b6dbe4eeb91885560831f7c1f57628562f Mon Sep 17 00:00:00 2001 From: lm_konda Date: Tue, 8 Jan 2019 12:47:50 +0900 Subject: [PATCH 32/50] fix overrun --- dlk/python/dlk/core/optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlk/python/dlk/core/optimizer.py b/dlk/python/dlk/core/optimizer.py index 5152ac726..d21375b55 100644 --- a/dlk/python/dlk/core/optimizer.py +++ b/dlk/python/dlk/core/optimizer.py @@ -379,7 +379,7 @@ def run_forward_conv(self, node: Conv, **kwargs: Any) -> None: od = node.channel kh = node.kernel_height kw = node.kernel_width - kd = op.channel + kd = node.input_ops['X'].channel shape = op.shape op_data = node.quantizer.binarizer(op.data) data = packer.run(op_data.astype(np.float32), op.dimension) From 168320f2685d2d040f078bc13be10974ab08a894 Mon Sep 17 00:00:00 2001 From: konda Date: Tue, 8 Jan 2019 14:53:08 +0900 Subject: [PATCH 33/50] fix indent etc. --- dlk/python/dlk/core/operators.py | 1 - dlk/python/dlk/core/optimizer.py | 11 ++++++----- dlk/python/dlk/core/view.py | 4 +++- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/dlk/python/dlk/core/operators.py b/dlk/python/dlk/core/operators.py index 71993a1e7..fd03b4c5a 100644 --- a/dlk/python/dlk/core/operators.py +++ b/dlk/python/dlk/core/operators.py @@ -634,7 +634,6 @@ def transposed_data(self): return self._transposed_data - class Output(Variable): """Output class.""" diff --git a/dlk/python/dlk/core/optimizer.py b/dlk/python/dlk/core/optimizer.py index aa157faee..507de2d69 100644 --- a/dlk/python/dlk/core/optimizer.py +++ b/dlk/python/dlk/core/optimizer.py @@ -66,13 +66,13 @@ def node_is_activation_quantizer(node: Operator) -> bool: def transpose_kernels(kernel_data, dimension_format, oh, ow, od, kh, kw, kd): - NUM_PE = 16 - NBIT_QDYPE = 32 + NUM_PE = 16 + NBIT_QDYPE = 32 MAX_NBIT_QINPUT = 2 MAX_NBIT_KERNEL = 1 - num_qinput_per_qword = int(NBIT_QDYPE / MAX_NBIT_QINPUT) - num_qkernel_per_qword = int(NBIT_QDYPE / MAX_NBIT_KERNEL) - k_c_by_word = int((kd + (num_qkernel_per_qword - 1)) / num_qkernel_per_qword); + num_qinput_per_qword = int(NBIT_QDYPE / MAX_NBIT_QINPUT) + num_qkernel_per_qword = int(NBIT_QDYPE / MAX_NBIT_KERNEL) + k_c_by_word = int((kd + (num_qkernel_per_qword - 1)) / num_qkernel_per_qword); k_n_aligned_with_num_pe = int(((od + (NUM_PE - 1)) / NUM_PE) * NUM_PE); if od < NUM_PE: k_size = k_n_aligned_with_num_pe * kh * kw * k_c_by_word; @@ -127,6 +127,7 @@ def transpose_kernels(kernel_data, dimension_format, oh, ow, od, kh, kw, kd): class NHWC_Transposer(GraphRunner): """Transposer of all nodes to NHWC.""" + def _get_permutation(self, dim: str) -> List[int]: """Create a permutation from the source dimension.""" assert len(dim) == 4 and set(dim).issubset({'N', 'H', 'W', 'C', 'I', 'O'}), \ diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index 914ff2cfa..c71eb756d 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -17,6 +17,7 @@ from core.data_types import * from textwrap import dedent + class View(object): def __init__(self, op): self.op = op @@ -121,7 +122,8 @@ def run(self): inputs_string_transposed = inputs_string + ', ' + input_ops['W'].name + '_transposed' inputs_string = inputs_string + ', ' + input_ops['W'].name else: - inputs_string_transposed = ', '.join(str(x.name) if k != 'W' else str(x.name) + '_transposed' for k, x in input_ops.items()) + inputs_string_transposed = ', '.join(str(x.name) \ + if k != 'W' else str(x.name) + '_transposed' for k,x in input_ops.items()) inputs_string = self.inputs_to_string(input_ops) if op.has_thresholds: From 1c73e0ea6224060b16721bca2c86df20725d2884 Mon Sep 17 00:00:00 2001 From: konda Date: Tue, 8 Jan 2019 15:36:45 +0900 Subject: [PATCH 34/50] fix to pass test --- dlk/python/dlk/core/optimizer.py | 10 +++++----- dlk/python/dlk/core/view.py | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/dlk/python/dlk/core/optimizer.py b/dlk/python/dlk/core/optimizer.py index 27a42f5a9..5eee8f7c1 100644 --- a/dlk/python/dlk/core/optimizer.py +++ b/dlk/python/dlk/core/optimizer.py @@ -72,12 +72,12 @@ def transpose_kernels(kernel_data, dimension_format, oh, ow, od, kh, kw, kd): MAX_NBIT_KERNEL = 1 num_qinput_per_qword = int(NBIT_QDYPE / MAX_NBIT_QINPUT) num_qkernel_per_qword = int(NBIT_QDYPE / MAX_NBIT_KERNEL) - k_c_by_word = int((kd + (num_qkernel_per_qword - 1)) / num_qkernel_per_qword); - k_n_aligned_with_num_pe = int(((od + (NUM_PE - 1)) / NUM_PE) * NUM_PE); + k_c_by_word = int((kd + (num_qkernel_per_qword - 1)) / num_qkernel_per_qword) + k_n_aligned_with_num_pe = int(((od + (NUM_PE - 1)) / NUM_PE) * NUM_PE) if od < NUM_PE: - k_size = k_n_aligned_with_num_pe * kh * kw * k_c_by_word; + k_size = k_n_aligned_with_num_pe * kh * kw * k_c_by_word else: - k_size = od * kh * kw * k_c_by_word; + k_size = od * kh * kw * k_c_by_word flatten_value = [] for elem in kernel_data: @@ -127,10 +127,10 @@ def transpose_kernels(kernel_data, dimension_format, oh, ow, od, kh, kw, kd): return transpose_values + class NHWC_Transposer(GraphRunner): """Transposer of all nodes to NHWC.""" - def _get_permutation(self, dim: str) -> List[int]: """Create a permutation from the source dimension.""" assert len(dim) == 4 and set(dim).issubset({'N', 'H', 'W', 'C', 'I', 'O'}), \ diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index c71eb756d..b95f8f057 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -119,11 +119,11 @@ def run(self): for k, v in input_ops['X'].output_ops.items(): if v[0] == op: inputs_string = str(input_ops['X'].name) + '_' + str(k) - inputs_string_transposed = inputs_string + ', ' + input_ops['W'].name + '_transposed' + in_str_t = inputs_string + ', ' + input_ops['W'].name + '_transposed' inputs_string = inputs_string + ', ' + input_ops['W'].name else: - inputs_string_transposed = ', '.join(str(x.name) \ - if k != 'W' else str(x.name) + '_transposed' for k,x in input_ops.items()) + iops = input_ops + in_str_t = ', '.join(str(x.name) if k!='W' else str(x.name) + '_transposed' for k,x in iops.items()) inputs_string = self.inputs_to_string(input_ops) if op.has_thresholds: @@ -166,7 +166,7 @@ def run(self): binConv2D_struct.max_value = {max_value}; #if defined RUN_ON_FPGA - {conv_func}({inputs_string_transposed}, {op.name}, scaling_factors::{op.name}, binConv2D_struct); + {conv_func}({in_str_t}, {op.name}, scaling_factors::{op.name}, binConv2D_struct); #else {conv_func}({inputs_string}, {op.name}, scaling_factors::{op.name}, binConv2D_struct); #endif From 975ba6588fa3e2f628c135527f67fd9736cc7ea3 Mon Sep 17 00:00:00 2001 From: konda Date: Tue, 8 Jan 2019 16:16:44 +0900 Subject: [PATCH 35/50] fix to pass test --- dlk/python/dlk/core/view.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index b95f8f057..902a9221a 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -119,11 +119,11 @@ def run(self): for k, v in input_ops['X'].output_ops.items(): if v[0] == op: inputs_string = str(input_ops['X'].name) + '_' + str(k) - in_str_t = inputs_string + ', ' + input_ops['W'].name + '_transposed' + istrt = inputs_string + ', ' + input_ops['W'].name + '_transposed' inputs_string = inputs_string + ', ' + input_ops['W'].name else: iops = input_ops - in_str_t = ', '.join(str(x.name) if k!='W' else str(x.name) + '_transposed' for k,x in iops.items()) + istrt = ', '.join(str(x.name) if k != 'W' else str(x.name) + '_transposed' for k, x in iops.items()) inputs_string = self.inputs_to_string(input_ops) if op.has_thresholds: @@ -166,7 +166,7 @@ def run(self): binConv2D_struct.max_value = {max_value}; #if defined RUN_ON_FPGA - {conv_func}({in_str_t}, {op.name}, scaling_factors::{op.name}, binConv2D_struct); + {conv_func}({istrt}, {op.name}, scaling_factors::{op.name}, binConv2D_struct); #else {conv_func}({inputs_string}, {op.name}, scaling_factors::{op.name}, binConv2D_struct); #endif From bdbb1df5575834e7d041cbb72011a01fa9218d5f Mon Sep 17 00:00:00 2001 From: lm_konda Date: Wed, 9 Jan 2019 15:10:31 +0900 Subject: [PATCH 36/50] set layout same as tensorflow only --- dlk/python/dlk/core/optimizer.py | 40 ++++++++++---------------------- 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/dlk/python/dlk/core/optimizer.py b/dlk/python/dlk/core/optimizer.py index d21375b55..1bbed6188 100644 --- a/dlk/python/dlk/core/optimizer.py +++ b/dlk/python/dlk/core/optimizer.py @@ -96,34 +96,18 @@ def transpose_kernels(kernel_data, dimension_format, oh, ow, od, kh, kw, kd): kn_out = int(od / NUM_PE) idx_src = 0 - if dimension_format == "NHWC": - for no in range(kn_out): - for ni in range(NUM_PE): - for h in range(kh): - for w in range(kw): - for c in range(k_c_by_word): - idx_dst = h * (kw * kn_out * k_c_by_word * NUM_PE) - idx_dst += w * (kn_out * k_c_by_word * NUM_PE) - idx_dst += no * (k_c_by_word * NUM_PE) - idx_dst += c * (NUM_PE) - idx_dst += ni - transpose_values[idx_dst] = copy_value[idx_src] - idx_src += 1 - elif dimension_format == "NCHW": - for no in range(kn_out): - for ni in range(NUM_PE): - for c in range(k_c_by_word): - for h in range(kh): - for w in range(kw): - idx_dst = h * (kw * kn_out * k_c_by_word * NUM_PE) - idx_dst += w * (kn_out * k_c_by_word * NUM_PE) - idx_dst += no * (k_c_by_word * NUM_PE) - idx_dst += c * (NUM_PE) - idx_dst += ni - transpose_values[idx_dst] = copy_value[idx_src] - idx_src += 1 - else: - NotImplementedError("only NCHW and NHWC formats are suppported") + for no in range(kn_out): + for ni in range(NUM_PE): + for h in range(kh): + for w in range(kw): + for c in range(k_c_by_word): + idx_dst = h * (kw * kn_out * k_c_by_word * NUM_PE) + idx_dst += w * (kn_out * k_c_by_word * NUM_PE) + idx_dst += no * (k_c_by_word * NUM_PE) + idx_dst += c * (NUM_PE) + idx_dst += ni + transpose_values[idx_dst] = copy_value[idx_src] + idx_src += 1 return transpose_values From c327dcd0a58f739cb135001d52b0aa3736ad73f0 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Fri, 11 Jan 2019 10:55:59 +0900 Subject: [PATCH 37/50] reflect review --- dlk/python/dlk/core/operators.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dlk/python/dlk/core/operators.py b/dlk/python/dlk/core/operators.py index fd03b4c5a..50ca04760 100644 --- a/dlk/python/dlk/core/operators.py +++ b/dlk/python/dlk/core/operators.py @@ -613,7 +613,7 @@ def __init__(self, dimension_format: str = 'NHWC', packed: bool = False, actual_shape: List[int] = [], - transposed_data=None) -> None: + transposed_data: List[int] = None) -> None: """Init the variable. If the constant is hard quantized, data is packed and the actual shape @@ -629,7 +629,7 @@ def is_packed(self) -> bool: return self._packed @property - def transposed_data(self): + def transposed_data(self) -> List[int]: """Return transposed data.""" return self._transposed_data From 87de45f4f45d7942737cd40cce5237e38642d100 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Fri, 11 Jan 2019 16:42:02 +0900 Subject: [PATCH 38/50] add type to function arguments --- dlk/python/dlk/core/optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlk/python/dlk/core/optimizer.py b/dlk/python/dlk/core/optimizer.py index 2074db783..2a07bfb2b 100644 --- a/dlk/python/dlk/core/optimizer.py +++ b/dlk/python/dlk/core/optimizer.py @@ -65,7 +65,7 @@ def node_is_activation_quantizer(node: Operator) -> bool: return node.op_type == 'QTZ_linear_mid_tread_half' -def transpose_kernels(kernel_data, dimension_format, oh, ow, od, kh, kw, kd): +def transpose_kernels(kernel_data: np.ndarray, dimension_format: str, oh: int, ow: int, od: int, kh: int, kw: int, kd: int) -> List[int]: NUM_PE = 16 NBIT_QDYPE = 32 MAX_NBIT_QINPUT = 2 From 4be631c9e9b7b10edf12aa5c15f039bd29ce58b7 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Sat, 12 Jan 2019 16:27:27 +0900 Subject: [PATCH 39/50] fix to pass jenkins test --- dlk/python/dlk/core/optimizer.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/dlk/python/dlk/core/optimizer.py b/dlk/python/dlk/core/optimizer.py index 2a07bfb2b..64ffc2d1c 100644 --- a/dlk/python/dlk/core/optimizer.py +++ b/dlk/python/dlk/core/optimizer.py @@ -65,7 +65,14 @@ def node_is_activation_quantizer(node: Operator) -> bool: return node.op_type == 'QTZ_linear_mid_tread_half' -def transpose_kernels(kernel_data: np.ndarray, dimension_format: str, oh: int, ow: int, od: int, kh: int, kw: int, kd: int) -> List[int]: +def transpose_kernels(kernel_data: np.ndarray, + dimension_format: str, + oh: int, + ow: int, + od: int, + kh: int, + kw: int, + kd: int) -> List[int]: NUM_PE = 16 NBIT_QDYPE = 32 MAX_NBIT_QINPUT = 2 From 6330105f0fe4c9e164b3cf3b7fe0799697467d57 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Wed, 16 Jan 2019 13:17:26 +0900 Subject: [PATCH 40/50] fix variable name --- dlk/python/dlk/core/optimizer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dlk/python/dlk/core/optimizer.py b/dlk/python/dlk/core/optimizer.py index 64ffc2d1c..771c1984b 100644 --- a/dlk/python/dlk/core/optimizer.py +++ b/dlk/python/dlk/core/optimizer.py @@ -96,7 +96,7 @@ def transpose_kernels(kernel_data: np.ndarray, for i in range(od * kh * kw * k_c_by_word): copy_value[i] = flatten_value[i] - transpose_values = [0] * k_size + transposed_values = [0] * k_size if (od < NUM_PE): kn_out = int(k_n_aligned_with_num_pe / NUM_PE) else: @@ -113,10 +113,10 @@ def transpose_kernels(kernel_data: np.ndarray, idx_dst += no * (k_c_by_word * NUM_PE) idx_dst += c * (NUM_PE) idx_dst += ni - transpose_values[idx_dst] = copy_value[idx_src] + transposed_values[idx_dst] = copy_value[idx_src] idx_src += 1 - return transpose_values + return transposed_values class NHWC_Transposer(GraphRunner): From 8fb71df45758911ab09509cccce4a7879a578c1b Mon Sep 17 00:00:00 2001 From: lm_konda Date: Fri, 1 Feb 2019 15:21:38 +0900 Subject: [PATCH 41/50] enable graph opt --- dlk/python/dlk/core/optimizer.py | 89 ++------------------------------ 1 file changed, 5 insertions(+), 84 deletions(-) diff --git a/dlk/python/dlk/core/optimizer.py b/dlk/python/dlk/core/optimizer.py index d866cb17f..1a697623f 100644 --- a/dlk/python/dlk/core/optimizer.py +++ b/dlk/python/dlk/core/optimizer.py @@ -19,9 +19,9 @@ from core.graph import Graph from core.graph_pattern_matching import get_nodes_in_branch, sort_graph -from core.operators import Constant, Operator +from core.operators import Constant, Operator, Conv from core.data_types import Uint32, QUANTIZED_NOT_PACKED -from typing import cast +from typing import cast, List, Any from collections import defaultdict from modules.packer import Packer @@ -53,40 +53,13 @@ def pass_remove_identities(graph: Graph) -> None: v2.append(out_op) break break + to_be_removed.append(m) + for op in to_be_removed: graph.remove_op(op) -def node_is_conv(node: Operator) -> bool: - return node.op_type == 'Conv' - - -def node_is_concat(node: Operator) -> bool: - return node.op_type == 'ConcatV2' - - -def node_is_const(node: Operator) -> bool: - return node.op_type == 'Constant' - - -def node_is_qconv(node: Operator) -> bool: - return node.op_type == 'Conv' and cast(Conv, node).is_quantized - - -def node_is_input(node: Operator) -> bool: - return node.op_type == 'Input' - - -def node_is_weight_quantizer(node: Operator) -> bool: - return (node.op_type == 'QTZ_binary_mean_scaling' - or node.op_type == 'QTZ_binary_channel_wise_mean_scaling') - - -def node_is_activation_quantizer(node: Operator) -> bool: - return node.op_type == 'QTZ_linear_mid_tread_half' - - def transpose_kernels(kernel_data: np.ndarray, dimension_format: str, oh: int, @@ -141,50 +114,6 @@ def transpose_kernels(kernel_data: np.ndarray, return transposed_values -class NHWC_Transposer(GraphRunner): - """Transposer of all nodes to NHWC.""" - - def _get_permutation(self, dim: str) -> List[int]: - """Create a permutation from the source dimension.""" - assert len(dim) == 4 and set(dim).issubset({'N', 'H', 'W', 'C', 'I', 'O'}), \ - f'illegal dimension found: {dim}' - - if set(dim) == set('HWIO'): - dim = dim.replace('I', 'C') - dim = dim.replace('O', 'N') - - return list(map(lambda s: dim.index(s), 'NHWC')) - - def _check_and_transpose(self, node: Operator) -> None: - perm = self._get_permutation(node.dimension) - node.transpose(perm) - - def run_backward_input(self, node: Input, **kwargs: Any) -> None: - self._check_and_transpose(node) - - def run_backward_constant(self, node: Constant, **kwargs: Any) -> None: - if node.ndims == 4 and set(node.dimension).issubset({'N', 'H', 'W', 'C', 'I', 'O'}): - self._check_and_transpose(node) - - def run_backward_identity(self, node: Identity, **kwargs: Any) -> None: - if node.ndims == 4 and set(node.dimension).issubset({'N', 'H', 'W', 'C', 'I', 'O'}): - self._check_and_transpose(node) - - def run_backward_QTZ_binary_mean_scaling(self, node: QTZ_binary_mean_scaling, **kwargs: Any) -> None: - self._check_and_transpose(node) - - def run_backward_transpose(self, node: Transpose, **kwargs: Any) -> None: - raise NotImplementedError('Transposing Transpose operator is not supported yet.') - - def run_backward_conv(self, node: Conv, **kwargs: Any) -> None: - self._check_and_transpose(node) - - def run_backward_batch_normalization(self, node: BatchNormalization, **kwargs: Any) -> None: - self._check_and_transpose(node) - - def run_backward_QTZ_linear_mid_tread_half(self, node: QTZ_linear_mid_tread_half, **kwargs: Any) -> None: - self._check_and_transpose(node) - def pass_transpose(graph: Graph) -> None: """Changes the data order of every node to be NHWC. The fastest changing dimension is C @@ -332,14 +261,6 @@ def pass_propagate_quantization_details_into_conv(graph: Graph) -> None: quant_details[m.name] = qtzs if len(qtzs) == len(m.input_nodes) else [] # TODO: check if the quantizers use same n_bits - # if it can be precomputed - if self._has_precompute_value(in_op): - node.run_forward() - self._precomp_dic[node.name] = True # this node can be pruned - self._quantizers[node.name] = node # add itself as the quantizer - else: - self._precomp_dic[node.name] = False - def run_forward_conv(self, node: Conv, **kwargs: Any) -> None: ops: List[Operator] = [node.input_ops[i] for i in node.input_names if node.input_ops.get(i)] @@ -367,7 +288,7 @@ def run_forward_conv(self, node: Conv, **kwargs: Any) -> None: node.quantizer = list(quantizers.values())[0] for key, op in zip(node.input_names, ops): - + if self._is_prunable(op): oh = node.height ow = node.width From 9cc8c10edcf5dbb4260e51f71f32e41517b6a59a Mon Sep 17 00:00:00 2001 From: lm_konda Date: Mon, 4 Feb 2019 10:46:51 +0900 Subject: [PATCH 42/50] fix kernel transpose --- dlk/python/dlk/core/optimizer.py | 128 ++++++++++--------------------- 1 file changed, 42 insertions(+), 86 deletions(-) diff --git a/dlk/python/dlk/core/optimizer.py b/dlk/python/dlk/core/optimizer.py index 1a697623f..1b7e20eaa 100644 --- a/dlk/python/dlk/core/optimizer.py +++ b/dlk/python/dlk/core/optimizer.py @@ -25,41 +25,6 @@ from collections import defaultdict from modules.packer import Packer - -def pass_remove_identities(graph: Graph) -> None: - """Removes those nodes of a Graph that satisfies the condition node.op_type() == Identity. - - Parameters - ---------- - graph : Graph - The input graph. It will be modified in-place. - - """ - exec_list = [n for n in sort_graph(graph) if n.op_type == 'Identity'] - to_be_removed = list() - for m in exec_list: - """skip all identity.""" - in_op = m.input_ops['input'] - out_ops = m.output_ops['output'] - for out_op in out_ops: - for k, v in out_op.input_ops.items(): - if v == m: - # change the output's input to this identity's input - out_op.add_input(k, in_op) - # change the input's output to this identity's output - for k2, v2 in in_op.output_ops.items(): - if m in v2: - v2.remove(m) - v2.append(out_op) - break - break - - to_be_removed.append(m) - - for op in to_be_removed: - graph.remove_op(op) - - def transpose_kernels(kernel_data: np.ndarray, dimension_format: str, oh: int, @@ -114,6 +79,40 @@ def transpose_kernels(kernel_data: np.ndarray, return transposed_values +def pass_remove_identities(graph: Graph) -> None: + """Removes those nodes of a Graph that satisfies the condition node.op_type() == Identity. + + Parameters + ---------- + graph : Graph + The input graph. It will be modified in-place. + + """ + exec_list = [n for n in sort_graph(graph) if n.op_type == 'Identity'] + to_be_removed = list() + for m in exec_list: + """skip all identity.""" + in_op = m.input_ops['input'] + out_ops = m.output_ops['output'] + for out_op in out_ops: + for k, v in out_op.input_ops.items(): + if v == m: + # change the output's input to this identity's input + out_op.add_input(k, in_op) + # change the input's output to this identity's output + for k2, v2 in in_op.output_ops.items(): + if m in v2: + v2.remove(m) + v2.append(out_op) + break + break + + to_be_removed.append(m) + + for op in to_be_removed: + graph.remove_op(op) + + def pass_transpose(graph: Graph) -> None: """Changes the data order of every node to be NHWC. The fastest changing dimension is C @@ -261,56 +260,6 @@ def pass_propagate_quantization_details_into_conv(graph: Graph) -> None: quant_details[m.name] = qtzs if len(qtzs) == len(m.input_nodes) else [] # TODO: check if the quantizers use same n_bits - def run_forward_conv(self, node: Conv, **kwargs: Any) -> None: - ops: List[Operator] = [node.input_ops[i] for i in node.input_names if node.input_ops.get(i)] - - if self._hard_quantized and node in kwargs['qconv']: - # data is to be packed - ops_have_precomp_values = list(map(lambda x: self._has_precompute_value(x), ops)) - ops_are_prunable = list(map(lambda x: self._is_prunable(x), ops)) - - # check which input node can be pruned - if reduce(lambda x, y: x and y, ops_have_precomp_values): # all input has concrete values - node.run_forward() - self._precomp_dic[node.name] = True # this node can be pruned - quantizers = {op.name: self._quantizers[op.name] for op in ops if self._quantizers.get(op.name)} - if len(quantizers) > 1: - ValueError(f'{node.name}: multiple quantized inputs with {node.op_type} are not supported.') - self._quantizers[node.name] = list(quantizers.values())[0] - - else: # an input (must be weight) is to be quantized and packed - self._precomp_dic[node.name] = False - node.is_quantized = True - packer = Packer(self._quantized_bitwidth, self._wordsize) - quantizers = {op.name: self._quantizers[op.name] for op in ops if self._quantizers.get(op.name)} - if len(quantizers) > 1: - ValueError(f'{node.name}: multiple quantized inputs with {node.op_type} are not supported.') - node.quantizer = list(quantizers.values())[0] - - for key, op in zip(node.input_names, ops): - - if self._is_prunable(op): - oh = node.height - ow = node.width - od = node.channel - kh = node.kernel_height - kw = node.kernel_width - kd = node.input_ops['X'].channel - shape = op.shape - op_data = node.quantizer.binarizer(op.data) - data = packer.run(op_data.astype(np.float32), op.dimension) - dtype = op.dtype - new_op = Constant( - op.name + '_new', - dtype, - data, - packed=True, - actual_shape=shape, - transposed_data=transpose_kernels(data, node.dimension, oh, ow, od, kh, kw, kd) - ) - node.add_input(key, new_op) - self._graph.add_op(new_op) - self._prune(op) def pass_compute_thresholds(graph: Graph) -> None: """Given a Quantizer node Q: @@ -469,12 +418,19 @@ def pass_pack_weights(graph: Graph) -> None: data = packer.run(op_data.astype(np.float32), weight_quantizer.dimension) # Create the new constant with the quantized weights + oh = conv_node.height + ow = conv_node.width + od = conv_node.channel + kh = conv_node.kernel_height + kw = conv_node.kernel_width + kd = conv_node.input_ops['X'].channel quantized_constant = Constant( weight_quantizer.name + '_new', Uint32(), data, packed=True, - actual_shape=weight_quantizer.shape + actual_shape=weight_quantizer.shape, + transposed_data=transpose_kernels(data, conv_node.dimension, oh, ow, od, kh, kw, kd) ) # get nodes to be removed after being disconnected From a62ab502c8f02d0b5b694b7fa35dd0aaec5f06d6 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Mon, 4 Feb 2019 16:20:33 +0900 Subject: [PATCH 43/50] fix line --- dlk/python/dlk/core/optimizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dlk/python/dlk/core/optimizer.py b/dlk/python/dlk/core/optimizer.py index 1b7e20eaa..0e4408c66 100644 --- a/dlk/python/dlk/core/optimizer.py +++ b/dlk/python/dlk/core/optimizer.py @@ -25,6 +25,7 @@ from collections import defaultdict from modules.packer import Packer + def transpose_kernels(kernel_data: np.ndarray, dimension_format: str, oh: int, From 347ede93b08012a573de30e0200c303af3904fa4 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Tue, 5 Feb 2019 08:59:56 +0900 Subject: [PATCH 44/50] refactoring --- dlk/python/dlk/templates/include/network.tpl.h | 1 - 1 file changed, 1 deletion(-) diff --git a/dlk/python/dlk/templates/include/network.tpl.h b/dlk/python/dlk/templates/include/network.tpl.h index 4562efc8f..8802bea65 100644 --- a/dlk/python/dlk/templates/include/network.tpl.h +++ b/dlk/python/dlk/templates/include/network.tpl.h @@ -18,7 +18,6 @@ limitations under the License. #include "global.h" #include "dma_buffer.h" -#include #define SYM_PUBLIC __attribute__ ((visibility ("default"))) #define SYM_LOCAL __attribute__ ((visibility ("hidden"))) From 788519e3e14e50637e6548564fd918a1bf4eb9e7 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Wed, 6 Feb 2019 09:11:56 +0900 Subject: [PATCH 45/50] refactoring --- dlk/python/dlk/core/optimizer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dlk/python/dlk/core/optimizer.py b/dlk/python/dlk/core/optimizer.py index 0e4408c66..a0b614092 100644 --- a/dlk/python/dlk/core/optimizer.py +++ b/dlk/python/dlk/core/optimizer.py @@ -27,7 +27,6 @@ def transpose_kernels(kernel_data: np.ndarray, - dimension_format: str, oh: int, ow: int, od: int, @@ -431,7 +430,7 @@ def pass_pack_weights(graph: Graph) -> None: data, packed=True, actual_shape=weight_quantizer.shape, - transposed_data=transpose_kernels(data, conv_node.dimension, oh, ow, od, kh, kw, kd) + transposed_data=transpose_kernels(data, oh, ow, od, kh, kw, kd) ) # get nodes to be removed after being disconnected From 55b168e248ca0fd3e4c4c0924eee99b9b1155dbc Mon Sep 17 00:00:00 2001 From: lm_konda Date: Thu, 7 Feb 2019 10:39:51 +0900 Subject: [PATCH 46/50] fix naming convention --- dlk/python/dlk/core/optimizer.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/dlk/python/dlk/core/optimizer.py b/dlk/python/dlk/core/optimizer.py index a0b614092..ef3afe65d 100644 --- a/dlk/python/dlk/core/optimizer.py +++ b/dlk/python/dlk/core/optimizer.py @@ -26,13 +26,13 @@ from modules.packer import Packer -def transpose_kernels(kernel_data: np.ndarray, - oh: int, - ow: int, - od: int, - kh: int, - kw: int, - kd: int) -> List[int]: +def _transpose_kernels(kernel_data: np.ndarray, + oh: int, + ow: int, + od: int, + kh: int, + kw: int, + kd: int) -> List[int]: NUM_PE = 16 NBIT_QDYPE = 32 MAX_NBIT_QINPUT = 2 @@ -430,7 +430,7 @@ def pass_pack_weights(graph: Graph) -> None: data, packed=True, actual_shape=weight_quantizer.shape, - transposed_data=transpose_kernels(data, oh, ow, od, kh, kw, kd) + transposed_data=_transpose_kernels(data, oh, ow, od, kh, kw, kd) ) # get nodes to be removed after being disconnected From aeabe9d78b7450ffc1bc4a86e5be9f6e9f47a849 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Thu, 7 Feb 2019 10:40:05 +0900 Subject: [PATCH 47/50] refactoring --- dlk/python/dlk/templates/manual/consts/input.tpl.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/dlk/python/dlk/templates/manual/consts/input.tpl.cpp b/dlk/python/dlk/templates/manual/consts/input.tpl.cpp index d5f75aa6e..082a027fc 100644 --- a/dlk/python/dlk/templates/manual/consts/input.tpl.cpp +++ b/dlk/python/dlk/templates/manual/consts/input.tpl.cpp @@ -22,18 +22,28 @@ limitations under the License. {% else -%} +{% if node.transposed_data %} + {{ node.dtype.cpptype() }} {{ node.name }}[] = { {% for d in node.data.flatten() -%} {{- d -}}, {%- endfor %} }; -{% if node.transposed_data %} {{ node.dtype.cpptype() }} {{ node.name }}_transposed[] = { {% for d in node.transposed_data -%} {{- d -}}, {%- endfor %} }; + +{% else -%} + +{{ node.dtype.cpptype() }} {{ node.name }}[] = { + {% for d in node.data.flatten() -%} + {{- d -}}, + {%- endfor %} +}; + {% endif %} {%- endif %} From 5763a235ed3fb85aa59ea193e7f32b66ebfcbc71 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Thu, 7 Feb 2019 13:10:23 +0900 Subject: [PATCH 48/50] add comments for global function --- dlk/python/dlk/core/optimizer.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/dlk/python/dlk/core/optimizer.py b/dlk/python/dlk/core/optimizer.py index ef3afe65d..240f2fb53 100644 --- a/dlk/python/dlk/core/optimizer.py +++ b/dlk/python/dlk/core/optimizer.py @@ -33,6 +33,25 @@ def _transpose_kernels(kernel_data: np.ndarray, kh: int, kw: int, kd: int) -> List[int]: + """Calculates and prepares the transposed kernel data in advance. + + Parameters + ---------- + kernel_data : np.ndarray + The input data. It will be modified(transposed) in-place. + oh : int + output height + ow : int + output width + od : int + output depth + kh : int + kernel height + kw : int + kernel width + kd : int + kernel depth + """ NUM_PE = 16 NBIT_QDYPE = 32 MAX_NBIT_QINPUT = 2 From c02f69718a8086bdc2ee1f536fc8d5adfc465593 Mon Sep 17 00:00:00 2001 From: lm_konda Date: Thu, 7 Feb 2019 15:59:29 +0900 Subject: [PATCH 49/50] reduce constant fold and refactoring --- dlk/python/dlk/core/view.py | 7 ------- dlk/python/dlk/templates/manual/consts/input.tpl.cpp | 10 ++++++---- dlk/python/dlk/templates/manual/consts/input.tpl.h | 3 --- 3 files changed, 6 insertions(+), 14 deletions(-) diff --git a/dlk/python/dlk/core/view.py b/dlk/python/dlk/core/view.py index 98bdf86a1..fa67191e6 100644 --- a/dlk/python/dlk/core/view.py +++ b/dlk/python/dlk/core/view.py @@ -119,11 +119,8 @@ def run(self): for k, v in input_ops['X'].output_ops.items(): if v[0] == op: inputs_string = str(input_ops['X'].name) + '_' + str(k) - istrt = inputs_string + ', ' + input_ops['W'].name + '_transposed' inputs_string = inputs_string + ', ' + input_ops['W'].name else: - iops = input_ops - istrt = ', '.join(str(x.name) if k != 'W' else str(x.name) + '_transposed' for k, x in iops.items()) inputs_string = self.inputs_to_string(input_ops) if op.has_thresholds: @@ -165,11 +162,7 @@ def run(self): binConv2D_struct.n_bit = {nbit_aqtz}; binConv2D_struct.max_value = {max_value}; - #if defined RUN_ON_FPGA - {conv_func}({istrt}, {op.name}, scaling_factors::{op.name}, binConv2D_struct); - #else {conv_func}({inputs_string}, {op.name}, scaling_factors::{op.name}, binConv2D_struct); - #endif """ ) diff --git a/dlk/python/dlk/templates/manual/consts/input.tpl.cpp b/dlk/python/dlk/templates/manual/consts/input.tpl.cpp index 082a027fc..200986235 100644 --- a/dlk/python/dlk/templates/manual/consts/input.tpl.cpp +++ b/dlk/python/dlk/templates/manual/consts/input.tpl.cpp @@ -24,17 +24,19 @@ limitations under the License. {% if node.transposed_data %} +#if defined(RUN_ON_FPGA) {{ node.dtype.cpptype() }} {{ node.name }}[] = { - {% for d in node.data.flatten() -%} + {% for d in node.transposed_data -%} {{- d -}}, {%- endfor %} }; - -{{ node.dtype.cpptype() }} {{ node.name }}_transposed[] = { - {% for d in node.transposed_data -%} +#else +{{ node.dtype.cpptype() }} {{ node.name }}[] = { + {% for d in node.data.flatten() -%} {{- d -}}, {%- endfor %} }; +#endif {% else -%} diff --git a/dlk/python/dlk/templates/manual/consts/input.tpl.h b/dlk/python/dlk/templates/manual/consts/input.tpl.h index d50af3c47..c83acc91e 100644 --- a/dlk/python/dlk/templates/manual/consts/input.tpl.h +++ b/dlk/python/dlk/templates/manual/consts/input.tpl.h @@ -25,9 +25,6 @@ extern {{ node.dtype.cpptype() }} {{ node.name }}; {% else -%} extern {{ node.dtype.cpptype() }} {{ node.name }}[]; -{% if node.transposed_data %} -extern {{ node.dtype.cpptype() }} {{ node.name }}_transposed[]; -{%- endif %} {%- endif %} From 1d1c7d918cef3feecc249b0647dba95671e426cc Mon Sep 17 00:00:00 2001 From: lm_konda Date: Fri, 8 Feb 2019 13:36:22 +0900 Subject: [PATCH 50/50] fix comment --- dlk/python/dlk/core/optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlk/python/dlk/core/optimizer.py b/dlk/python/dlk/core/optimizer.py index 240f2fb53..f5d988447 100644 --- a/dlk/python/dlk/core/optimizer.py +++ b/dlk/python/dlk/core/optimizer.py @@ -38,7 +38,7 @@ def _transpose_kernels(kernel_data: np.ndarray, Parameters ---------- kernel_data : np.ndarray - The input data. It will be modified(transposed) in-place. + The input data. oh : int output height ow : int