From 8efab875091b171f4f39ff5dfcfd95dc77b7bd98 Mon Sep 17 00:00:00 2001 From: marktwtn Date: Thu, 3 Oct 2019 03:15:08 +0800 Subject: [PATCH 1/5] common/trinary: Accelerate trit tryte conversion with SIMD SSE4.2 The acceleration is enabled when the compiler option -msse4.2 is used and the input size of trit/tryte for conversion is larger than 384/128-bit. 128-bit is the basic operation unit of the most SSE instructions. The implementation is complex but it does accelerate the conversion speed with large input size. --- common/trinary/BUILD | 14 +- common/trinary/trit_tryte.c | 16 +- common/trinary/trit_tryte.h | 5 + common/trinary/trit_tryte_sse42.h | 274 ++++++++++++++++++++++++++++++ 4 files changed, 303 insertions(+), 6 deletions(-) create mode 100644 common/trinary/trit_tryte_sse42.h diff --git a/common/trinary/BUILD b/common/trinary/BUILD index 4f1d8346e0..c6bbd0e448 100644 --- a/common/trinary/BUILD +++ b/common/trinary/BUILD @@ -8,6 +8,15 @@ cc_library( ], ) +cc_library( + name = "trit_tryte_sse42", + hdrs = ["trit_tryte_sse42.h"], + deps = [ + ":tryte", + "//common:defs", + ], +) + cc_library( name = "trits", hdrs = ["trits.h"], @@ -102,7 +111,10 @@ cc_library( cc_library( name = "trit_tryte", srcs = ["trit_tryte.c"], - hdrs = ["trit_tryte.h"], + hdrs = [ + "trit_tryte.h", + "trit_tryte_sse42.h", + ], deps = [ ":trits", ":tryte", diff --git a/common/trinary/trit_tryte.c b/common/trinary/trit_tryte.c index 5e81ecdd46..e8e5d7e6a8 100644 --- a/common/trinary/trit_tryte.c +++ b/common/trinary/trit_tryte.c @@ -8,11 +8,9 @@ #include #include "common/trinary/trit_tryte.h" - -static const trit_t TRYTES_TRITS_LUT[TRYTE_SPACE_SIZE][NUMBER_OF_TRITS_IN_A_TRYTE] = { - {0, 0, 0}, {1, 0, 0}, {-1, 1, 0}, {0, 1, 0}, {1, 1, 0}, {-1, -1, 1}, {0, -1, 1}, {1, -1, 1}, {-1, 0, 1}, - {0, 0, 1}, {1, 0, 1}, {-1, 1, 1}, {0, 1, 1}, {1, 1, 1}, {-1, -1, -1}, {0, -1, -1}, {1, -1, -1}, {-1, 0, -1}, - {0, 0, -1}, {1, 0, -1}, {-1, 1, -1}, {0, 1, -1}, {1, 1, -1}, {-1, -1, 0}, {0, -1, 0}, {1, 
-1, 0}, {-1, 0, 0}}; +#if defined(__SSE4_2__) +#include "common/trinary/trit_tryte_sse42.h" +#endif trit_t get_trit_at(tryte_t const *const trytes, size_t const length, size_t const index) { size_t tindex = index / 3U; @@ -50,6 +48,9 @@ uint8_t set_trit_at(tryte_t *const trytes, size_t const length, size_t const ind } void trits_to_trytes(trit_t const *const trits, tryte_t *const trytes, size_t const length) { +#if defined(__SSE4_2__) + trits_to_trytes_sse42(trits, trytes, length); +#else int k = 0; for (size_t i = 0, j = 0; i < length; i += RADIX, j++) { @@ -63,6 +64,7 @@ void trits_to_trytes(trit_t const *const trits, tryte_t *const trytes, size_t co } trytes[j] = TRYTE_ALPHABET[k]; } +#endif } void trytes_to_trits(tryte_t const *const trytes, trit_t *const trits, size_t const length) { @@ -70,7 +72,11 @@ void trytes_to_trits(tryte_t const *const trytes, trit_t *const trits, size_t co return; } +#if defined(__SSE4_2__) + trytes_to_trits_sse42(trytes, trits, length); +#else for (size_t i = 0, j = 0; i < length; i++, j += RADIX) { memcpy(trits + j, TRYTES_TRITS_LUT[INDEX_OF_TRYTE(trytes[i])], NUMBER_OF_TRITS_IN_A_TRYTE); } +#endif } diff --git a/common/trinary/trit_tryte.h b/common/trinary/trit_tryte.h index a891a6bf5b..5fa2a05077 100644 --- a/common/trinary/trit_tryte.h +++ b/common/trinary/trit_tryte.h @@ -16,6 +16,11 @@ extern "C" { #endif +static const trit_t TRYTES_TRITS_LUT[TRYTE_SPACE_SIZE][NUMBER_OF_TRITS_IN_A_TRYTE] = { + {0, 0, 0}, {1, 0, 0}, {-1, 1, 0}, {0, 1, 0}, {1, 1, 0}, {-1, -1, 1}, {0, -1, 1}, {1, -1, 1}, {-1, 0, 1}, + {0, 0, 1}, {1, 0, 1}, {-1, 1, 1}, {0, 1, 1}, {1, 1, 1}, {-1, -1, -1}, {0, -1, -1}, {1, -1, -1}, {-1, 0, -1}, + {0, 0, -1}, {1, 0, -1}, {-1, 1, -1}, {0, 1, -1}, {1, 1, -1}, {-1, -1, 0}, {0, -1, 0}, {1, -1, 0}, {-1, 0, 0}}; + static inline size_t num_trytes_for_trits(size_t num_trits) { return (num_trits + NUMBER_OF_TRITS_IN_A_TRYTE - 1) / NUMBER_OF_TRITS_IN_A_TRYTE; } diff --git a/common/trinary/trit_tryte_sse42.h 
b/common/trinary/trit_tryte_sse42.h
new file mode 100644
index 0000000000..4a57997dfa
--- /dev/null
+++ b/common/trinary/trit_tryte_sse42.h
@@ -0,0 +1,263 @@
+#ifndef __COMMON_TRIT_TRYTE_SSE42_H_
+#define __COMMON_TRIT_TRYTE_SSE42_H_
+
+#include <nmmintrin.h>
+#include <string.h>
+
+#include "common/defs.h"
+#include "common/trinary/trit_tryte.h"
+#include "common/trinary/tryte.h"
+
+/* Number of 8-bit elements held by the given type */
+#define BLOCK_8BIT(type) (sizeof(type) / sizeof(int8_t))
+#define BYTE_OF_128BIT 16
+/* COMMA(1) expands to a comma, COMMA(0) expands to nothing */
+#define COMMA0
+#define COMMA1 ,
+#define COMMA(x) COMMA##x
+/* Byte indices with a stride of 3, i.e. the same trit of successive trytes */
+#define INDEX_3DIFF_0F 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F
+#define INDEX_3DIFF_1D 0x01, 0x04, 0x07, 0x0A, 0x0D
+#define INDEX_3DIFF_2E 0x02, 0x05, 0x08, 0x0B, 0x0E
+/* REPEAT(n, str) expands to n comma separated copies of str */
+#define REPEAT0(str)
+#define REPEAT1(str) str
+#define REPEAT2(str) REPEAT1(str), str
+#define REPEAT3(str) REPEAT2(str), str
+#define REPEAT4(str) REPEAT3(str), str
+#define REPEAT5(str) REPEAT4(str), str
+#define REPEAT6(str) REPEAT5(str), str
+#define REPEAT7(str) REPEAT6(str), str
+#define REPEAT8(str) REPEAT7(str), str
+#define REPEAT9(str) REPEAT8(str), str
+#define REPEAT10(str) REPEAT9(str), str
+#define REPEAT11(str) REPEAT10(str), str
+#define REPEAT(n, str) REPEAT##n(str)
+
+/**
+ * Converts trits to trytes. Whole groups of 48 trits (16 trytes) are converted
+ * with SSE instructions, the remaining trits with scalar code.
+ *
+ * @param trits The input trits
+ * @param trytes The output trytes, one for every started group of 3 trits
+ * @param length The number of input trits
+ */
+static inline void trits_to_trytes_sse42(trit_t const *const trits, tryte_t *const trytes, size_t const length) {
+  const size_t block_8bit = BLOCK_8BIT(__m128i);
+  /* A shuffle index with the most significant bit (0x80) set yields zero */
+  const int8_t set_msb = -128;
+  /* The tryte alphabet ordered by alphabet_offset: [0] holds the offsets
+   * 0 ~ 15 and [1] holds the offsets 16 ~ 26 */
+  const __m128i tryte_alphabet[2] = {
+      _mm_setr_epi8(TRYTE_ALPHABET[14], TRYTE_ALPHABET[15], TRYTE_ALPHABET[16], TRYTE_ALPHABET[17], TRYTE_ALPHABET[18],
+                    TRYTE_ALPHABET[19], TRYTE_ALPHABET[20], TRYTE_ALPHABET[21], TRYTE_ALPHABET[22], TRYTE_ALPHABET[23],
+                    TRYTE_ALPHABET[24], TRYTE_ALPHABET[25], TRYTE_ALPHABET[26], TRYTE_ALPHABET[0], TRYTE_ALPHABET[1],
+                    TRYTE_ALPHABET[2]),
+      _mm_setr_epi8(TRYTE_ALPHABET[3], TRYTE_ALPHABET[4], TRYTE_ALPHABET[5], TRYTE_ALPHABET[6], TRYTE_ALPHABET[7],
+                    TRYTE_ALPHABET[8], TRYTE_ALPHABET[9], TRYTE_ALPHABET[10], TRYTE_ALPHABET[11], TRYTE_ALPHABET[12],
+                    TRYTE_ALPHABET[13], 0, 0, 0, 0, 0)};
+  /* For shuffling the bytes of the input trits */
+  const __m128i shuffle_low[3] = {
+      _mm_setr_epi8(REPEAT(0, set_msb) COMMA(0) INDEX_3DIFF_0F COMMA(1) REPEAT(10, set_msb)),
+      _mm_setr_epi8(REPEAT(6, set_msb) COMMA(1) INDEX_3DIFF_2E COMMA(1) REPEAT(5, set_msb)),
+      _mm_setr_epi8(REPEAT(11, set_msb) COMMA(1) INDEX_3DIFF_1D COMMA(0) REPEAT(0, set_msb))};
+  const __m128i shuffle_mid[3] = {
+      _mm_setr_epi8(REPEAT(0, set_msb) COMMA(0) INDEX_3DIFF_1D COMMA(1) REPEAT(11, set_msb)),
+      _mm_setr_epi8(REPEAT(5, set_msb) COMMA(1) INDEX_3DIFF_0F COMMA(1) REPEAT(5, set_msb)),
+      _mm_setr_epi8(REPEAT(11, set_msb) COMMA(1) INDEX_3DIFF_2E COMMA(0) REPEAT(0, set_msb))};
+  const __m128i shuffle_high[3] = {
+      _mm_setr_epi8(REPEAT(0, set_msb) COMMA(0) INDEX_3DIFF_2E COMMA(1) REPEAT(11, set_msb)),
+      _mm_setr_epi8(REPEAT(5, set_msb) COMMA(1) INDEX_3DIFF_1D COMMA(1) REPEAT(6, set_msb)),
+      _mm_setr_epi8(REPEAT(10, set_msb) COMMA(1) INDEX_3DIFF_0F COMMA(0) REPEAT(0, set_msb))};
+  /* The 128-bit data with the repeated same bytes */
+  const __m128i offset_13 = _mm_set1_epi8(13);
+  const __m128i offset_15 = _mm_set1_epi8(15);
+  const __m128i offset_16 = _mm_set1_epi8(16);
+  /* The number of 48-trit groups handled by the SSE loop */
+  const size_t simd_groups = length / NUMBER_OF_TRITS_IN_A_TRYTE / block_8bit;
+
+  /* Start converting */
+  for (size_t i = 0; i < simd_groups; i++) {
+    /* Get trit data */
+    __m128i data_first = _mm_loadu_si128((__m128i const *)(trits) + i * 3);
+    __m128i data_mid = _mm_loadu_si128((__m128i const *)(trits) + i * 3 + 1);
+    __m128i data_last = _mm_loadu_si128((__m128i const *)(trits) + i * 3 + 2);
+    /*
+     * Each block represents a trit. The shuffles gather the trits by their
+     * position inside the tryte:
+     *
+     *   data_first = | a1 | a2 | a3 | ... | f1 |      low_trit  = | a1 | ... | f1 | ... | p1 |
+     *   data_mid   = | f2 | f3 | g1 | ... | k2 |  =>  mid_trit  = | a2 | ... | f2 | ... | p2 |
+     *   data_last  = | k3 | l1 | l2 | ... | p3 |      high_trit = | a3 | ... | f3 | ... | p3 |
+     */
+    __m128i low_trit = _mm_or_si128(
+        _mm_shuffle_epi8(data_first, shuffle_low[0]),
+        _mm_or_si128(_mm_shuffle_epi8(data_mid, shuffle_low[1]), _mm_shuffle_epi8(data_last, shuffle_low[2])));
+    __m128i mid_trit = _mm_or_si128(
+        _mm_shuffle_epi8(data_first, shuffle_mid[0]),
+        _mm_or_si128(_mm_shuffle_epi8(data_mid, shuffle_mid[1]), _mm_shuffle_epi8(data_last, shuffle_mid[2])));
+    __m128i high_trit = _mm_or_si128(
+        _mm_shuffle_epi8(data_first, shuffle_high[0]),
+        _mm_or_si128(_mm_shuffle_epi8(data_mid, shuffle_high[1]), _mm_shuffle_epi8(data_last, shuffle_high[2])));
+    /* low_result = (low_trit) */
+    __m128i low_result = low_trit;
+    /* mid_result = (mid_trit * 3) = (mid_trit + mid_trit + mid_trit) */
+    __m128i mid_result = _mm_add_epi8(mid_trit, _mm_add_epi8(mid_trit, mid_trit));
+    /* high_result = (high_trit * 9) = (high_trit + high_trit * 4 + high_trit * 4) */
+    __m128i high_trit_2 = _mm_add_epi8(high_trit, high_trit);
+    __m128i high_trit_4 = _mm_add_epi8(high_trit_2, high_trit_2);
+    __m128i high_result = _mm_add_epi8(high_trit, _mm_add_epi8(high_trit_4, high_trit_4));
+    /* alphabet_offset = (low_result + mid_result + high_result) */
+    __m128i alphabet_offset = _mm_add_epi8(low_result, _mm_add_epi8(mid_result, high_result));
+    /* Add 0x0D (13) to eliminate negative value */
+    alphabet_offset = _mm_add_epi8(alphabet_offset, offset_13);
+
+    /* Assign tryte alphabet */
+    /* If the offset is >= 16 (> 15), then the compared result byte = 0xFF,
+     * else = 0x00 */
+    __m128i cmp_result = _mm_cmpgt_epi8(alphabet_offset, offset_15);
+    /* Use the offset to get the correct tryte alphabet from tryte_alphabet[] */
+    __m128i result_lt = _mm_shuffle_epi8(tryte_alphabet[0], alphabet_offset);
+    /* alphabet_offset - 16 */
+    __m128i result_ge = _mm_shuffle_epi8(tryte_alphabet[1], _mm_sub_epi8(alphabet_offset, offset_16));
+    __m128i result = _mm_or_si128(_mm_andnot_si128(cmp_result, result_lt), _mm_and_si128(cmp_result, result_ge));
+    /* Store the tryte result */
+    _mm_storeu_si128((__m128i *)(trytes + i * block_8bit), result);
+  }
+  /* The rest of the trits */
+  for (size_t i = simd_groups * block_8bit; i * NUMBER_OF_TRITS_IN_A_TRYTE < length; i++) {
+    /* size_t keeps the index arithmetic unsigned and free of truncation */
+    const size_t used_trit = i * NUMBER_OF_TRITS_IN_A_TRYTE;
+    const size_t rest_trit = length - used_trit;
+    int k = 0;
+    for (size_t l = rest_trit < NUMBER_OF_TRITS_IN_A_TRYTE ? rest_trit : NUMBER_OF_TRITS_IN_A_TRYTE; l-- > 0;) {
+      k *= RADIX;
+      k += trits[used_trit + l];
+    }
+
+    if (k < 0) {
+      k += TRYTE_SPACE_SIZE;
+    }
+    trytes[i] = TRYTE_ALPHABET[k];
+  }
+}
+
+/**
+ * Converts trytes to trits. Whole groups of 16 trytes (48 trits) are converted
+ * with SSE instructions, the remaining trytes with the lookup table.
+ *
+ * @param trytes The input trytes
+ * @param trits The output trits, 3 for every input tryte
+ * @param length The number of input trytes
+ */
+static inline void trytes_to_trits_sse42(tryte_t const *const trytes, trit_t *const trits, size_t const length) {
+  /* The string comparison modes. They are enumerators since the comparison
+   * intrinsic requires a compile-time constant. */
+  enum {
+    /* Signed bytes, compare with the character set, expand to a byte mask */
+    match_set = _SIDD_SBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_UNIT_MASK,
+    /* Signed bytes, compare with the character ranges, expand to a byte mask */
+    match_range = _SIDD_SBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_UNIT_MASK
+  };
+  const size_t block_8bit = BLOCK_8BIT(__m128i);
+  /* For setting the most significant bit of a byte */
+  const int8_t set_msb = -128;
+  /* The set and range for indicating the trits value (1, -1) of the
+   * corresponding trytes. The trit value 0 needs no pattern: such a tryte
+   * matches neither pattern and its trit is left at 0. */
+  /* 'A', 'D', 'G', 'J', 'M', 'P', 'S', 'V', 'Y' */
+  const char set_low_trit_p1[BYTE_OF_128BIT] = "ADGJMPSVY";
+  /* 'B', 'E', 'H', 'K', 'N', 'Q', 'T', 'W', 'Z' */
+  const char set_low_trit_n1[BYTE_OF_128BIT] = "BEHKNQTWZ";
+  /* 'B', 'C', 'D', 'K', 'L', 'M', 'T', 'U', 'V' */
+  const char range_mid_trit_p1[BYTE_OF_128BIT] = "BDKMTV";
+  /* 'E', 'F', 'G', 'N', 'O', 'P', 'W', 'X', 'Y' */
+  const char range_mid_trit_n1[BYTE_OF_128BIT] = "EGNPWY";
+  /* 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M' */
+  const char range_high_trit_p1[BYTE_OF_128BIT] = "EM";
+  /* 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V' */
+  const char range_high_trit_n1[BYTE_OF_128BIT] = "NV";
+  /* Convert the char array to the 128-bit data */
+  const __m128i pattern_low_trit_p1 = _mm_loadu_si128((__m128i const *)(set_low_trit_p1));
+  const __m128i pattern_low_trit_n1 = _mm_loadu_si128((__m128i const *)(set_low_trit_n1));
+  const __m128i pattern_mid_trit_p1 = _mm_loadu_si128((__m128i const *)(range_mid_trit_p1));
+  const __m128i pattern_mid_trit_n1 = _mm_loadu_si128((__m128i const *)(range_mid_trit_n1));
+  const __m128i pattern_high_trit_p1 = _mm_loadu_si128((__m128i const *)(range_high_trit_p1));
+  const __m128i pattern_high_trit_n1 = _mm_loadu_si128((__m128i const *)(range_high_trit_n1));
+  /* The 128-bit data with the repeated same bytes */
+  const __m128i pos_one = _mm_set1_epi8(1);
+  /* For shuffling the bytes of the trits transformed from the input trytes */
+  const __m128i shuffle_first[3] = {
+      _mm_setr_epi8(0x00, REPEAT2(set_msb), 0x01, REPEAT2(set_msb), 0x02, REPEAT2(set_msb), 0x03, REPEAT2(set_msb),
+                    0x04, REPEAT2(set_msb), 0x05),
+      _mm_setr_epi8(REPEAT1(set_msb), 0x00, REPEAT2(set_msb), 0x01, REPEAT2(set_msb), 0x02, REPEAT2(set_msb), 0x03,
+                    REPEAT2(set_msb), 0x04, REPEAT2(set_msb)),
+      _mm_setr_epi8(REPEAT2(set_msb), 0x00, REPEAT2(set_msb), 0x01, REPEAT2(set_msb), 0x02, REPEAT2(set_msb), 0x03,
+                    REPEAT2(set_msb), 0x04, REPEAT1(set_msb))};
+  const __m128i shuffle_mid[3] = {
+      _mm_setr_epi8(REPEAT2(set_msb), 0x06, REPEAT2(set_msb), 0x07, REPEAT2(set_msb), 0x08, REPEAT2(set_msb), 0x09,
+                    REPEAT2(set_msb), 0x0A, REPEAT1(set_msb)),
+      _mm_setr_epi8(0x05, REPEAT2(set_msb), 0x06, REPEAT2(set_msb), 0x07, REPEAT2(set_msb), 0x08, REPEAT2(set_msb),
+                    0x09, REPEAT2(set_msb), 0x0A),
+      _mm_setr_epi8(REPEAT1(set_msb), 0x05, REPEAT2(set_msb), 0x06, REPEAT2(set_msb), 0x07, REPEAT2(set_msb), 0x08,
+                    REPEAT2(set_msb), 0x09, REPEAT2(set_msb))};
+  const __m128i shuffle_last[3] = {
+      _mm_setr_epi8(REPEAT1(set_msb), 0x0B, REPEAT2(set_msb), 0x0C, REPEAT2(set_msb), 0x0D, REPEAT2(set_msb), 0x0E,
+                    REPEAT2(set_msb), 0x0F, REPEAT2(set_msb)),
+      _mm_setr_epi8(REPEAT2(set_msb), 0x0B, REPEAT2(set_msb), 0x0C, REPEAT2(set_msb), 0x0D, REPEAT2(set_msb), 0x0E,
+                    REPEAT2(set_msb), 0x0F, REPEAT1(set_msb)),
+      _mm_setr_epi8(0x0A, REPEAT2(set_msb), 0x0B, REPEAT2(set_msb), 0x0C, REPEAT2(set_msb), 0x0D, REPEAT2(set_msb),
+                    0x0E, REPEAT2(set_msb), 0x0F)};
+
+  /* Start converting */
+  /* The for loop handles the group of the 128-bit characters without the
+   * end-of-string */
+  for (size_t i = 0; i < length / block_8bit; i++) {
+    /* Get tryte data */
+    __m128i data = _mm_loadu_si128((__m128i const *)(trytes) + i);
+
+    /* The masks for setting the corresponding trits: a byte is 0xFF when the
+     * tryte matches the pattern and 0x00 otherwise */
+    __m128i mask_low_trit_p1 = _mm_cmpistrm(pattern_low_trit_p1, data, match_set);
+    __m128i mask_low_trit_n1 = _mm_cmpistrm(pattern_low_trit_n1, data, match_set);
+    __m128i mask_mid_trit_p1 = _mm_cmpistrm(pattern_mid_trit_p1, data, match_range);
+    __m128i mask_mid_trit_n1 = _mm_cmpistrm(pattern_mid_trit_n1, data, match_range);
+    __m128i mask_high_trit_p1 = _mm_cmpistrm(pattern_high_trit_p1, data, match_range);
+    __m128i mask_high_trit_n1 = _mm_cmpistrm(pattern_high_trit_n1, data, match_range);
+
+    /* 0xFF is -1 as a signed byte, so a "-1" mask already is the trit value.
+     * A "+1" mask is reduced to the value 1. */
+    __m128i low_trit = _mm_or_si128(_mm_and_si128(mask_low_trit_p1, pos_one), mask_low_trit_n1);
+    __m128i mid_trit = _mm_or_si128(_mm_and_si128(mask_mid_trit_p1, pos_one), mask_mid_trit_n1);
+    __m128i high_trit = _mm_or_si128(_mm_and_si128(mask_high_trit_p1, pos_one), mask_high_trit_n1);
+    /*
+     * Each block represents a trit. The shuffles interleave the trits again:
+     *
+     *   low_trit  = | a1 | ... | f1 | ... | p1 |      data_first = | a1 | a2 | a3 | ... | f1 |
+     *   mid_trit  = | a2 | ... | f2 | ... | p2 |  =>  data_mid   = | f2 | f3 | g1 | ... | k2 |
+     *   high_trit = | a3 | ... | f3 | ... | p3 |      data_last  = | k3 | l1 | l2 | ... | p3 |
+     */
+    __m128i data_first = _mm_or_si128(
+        _mm_shuffle_epi8(low_trit, shuffle_first[0]),
+        _mm_or_si128(_mm_shuffle_epi8(mid_trit, shuffle_first[1]), _mm_shuffle_epi8(high_trit, shuffle_first[2])));
+    __m128i data_mid = _mm_or_si128(
+        _mm_shuffle_epi8(low_trit, shuffle_mid[0]),
+        _mm_or_si128(_mm_shuffle_epi8(mid_trit, shuffle_mid[1]), _mm_shuffle_epi8(high_trit, shuffle_mid[2])));
+    __m128i data_last = _mm_or_si128(
+        _mm_shuffle_epi8(low_trit, shuffle_last[0]),
+        _mm_or_si128(_mm_shuffle_epi8(mid_trit, shuffle_last[1]), _mm_shuffle_epi8(high_trit, shuffle_last[2])));
+
+    /* Store the 3 * 128-bit trits converted from trytes */
+    _mm_storeu_si128((__m128i *)(trits + (3 * i) * block_8bit), data_first);
+    _mm_storeu_si128((__m128i *)(trits + (3 * i + 1) * block_8bit), data_mid);
+    _mm_storeu_si128((__m128i *)(trits + (3 * i + 2) * block_8bit), data_last);
+  }
+  /* The rest of the trytes */
+  for (size_t i = (length / block_8bit) * block_8bit,
+              j = (length / block_8bit) * block_8bit * NUMBER_OF_TRITS_IN_A_TRYTE;
+       i < length; i++, j += RADIX) {
+    memcpy(trits + j, TRYTES_TRITS_LUT[INDEX_OF_TRYTE(trytes[i])], NUMBER_OF_TRITS_IN_A_TRYTE);
+  }
+}
+
+#endif  // __COMMON_TRIT_TRYTE_SSE42_H_

From b95aabe068b36c22b689fa2d7dc299871c83858e Mon Sep 17 00:00:00 2001
From: marktwtn
Date: Tue, 10 Dec 2019 19:26:19 +0800
Subject: [PATCH 2/5] common/trinary: Increase input data size for SSE4.2

In the original testing, the input trit/tryte data size is too small to
test the SIMD SSE4.2 trit tryte conversion acceleration.
trytes_to_trits minimum requirement: 128-bit = 16-tryte trits_to_trytes minimum requirement: 384-bit = 48-trit --- common/trinary/tests/test_trit_tryte.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/common/trinary/tests/test_trit_tryte.c b/common/trinary/tests/test_trit_tryte.c index 7b9a9fa1ad..546bff035e 100644 --- a/common/trinary/tests/test_trit_tryte.c +++ b/common/trinary/tests/test_trit_tryte.c @@ -10,23 +10,25 @@ #include "common/trinary/trit_tryte.h" -#define TRYTES_IN "AZN9" -#define EXP 1, 0, 0, -1, 0, 0, -1, -1, -1, 0, 0, 0 +#define TRYTES_IN "AZN9AZN9AZN9AZN9AZN9" +#define EXP \ + 1, 0, 0, -1, 0, 0, -1, -1, -1, 0, 0, 0, 1, 0, 0, -1, 0, 0, -1, -1, -1, 0, 0, 0, 1, 0, 0, -1, 0, 0, -1, -1, -1, 0, 0, \ + 0, 1, 0, 0, -1, 0, 0, -1, -1, -1, 0, 0, 0, 1, 0, 0, -1, 0, 0, -1, -1, -1, 0, 0, 0 void test_trits_to_trytes(void) { trit_t trits[] = {EXP}; - tryte_t trytes[4]; + tryte_t trytes[20]; tryte_t exp[] = {TRYTES_IN}; - trits_to_trytes(trits, trytes, 12); - TEST_ASSERT_EQUAL_MEMORY(exp, trytes, 4); + trits_to_trytes(trits, trytes, 60); + TEST_ASSERT_EQUAL_MEMORY(exp, trytes, 20); } void test_trytes_to_trits(void) { tryte_t trytes[] = {TRYTES_IN}; - trit_t trits[12]; + trit_t trits[60]; trit_t exp[] = {EXP}; - trytes_to_trits(trytes, trits, 4); - TEST_ASSERT_EQUAL_MEMORY(exp, trits, 12); + trytes_to_trits(trytes, trits, 20); + TEST_ASSERT_EQUAL_MEMORY(exp, trits, 60); } void test_get_trit_at(void) { From 71b89aac3de1c6aac6a38967ddd7f8a5b83fe7aa Mon Sep 17 00:00:00 2001 From: marktwtn Date: Thu, 13 Feb 2020 18:18:32 +0800 Subject: [PATCH 3/5] common/trinary: Add benchmark for trit tryte conversion The benchmark displays the minimum, maximum and average value of trit tryte conversion function of different input size. The range of input size can be modified in bench_trit_tryte.c. The default input/output tryte size range is 16 ~ 2048. 
---
 common/trinary/benchmark/BUILD              |   8 ++
 common/trinary/benchmark/bench_trit_tryte.c | 142 ++++++++++++++++++++
 2 files changed, 150 insertions(+)
 create mode 100644 common/trinary/benchmark/BUILD
 create mode 100644 common/trinary/benchmark/bench_trit_tryte.c

diff --git a/common/trinary/benchmark/BUILD b/common/trinary/benchmark/BUILD
new file mode 100644
index 0000000000..db21e891f9
--- /dev/null
+++ b/common/trinary/benchmark/BUILD
@@ -0,0 +1,8 @@
+cc_test(
+    name = "bench_trit_tryte",
+    timeout = "short",
+    srcs = ["bench_trit_tryte.c"],
+    deps = [
+        "//common/trinary:trit_tryte",
+    ],
+)
diff --git a/common/trinary/benchmark/bench_trit_tryte.c b/common/trinary/benchmark/bench_trit_tryte.c
new file mode 100644
index 0000000000..cdde82bd2d
--- /dev/null
+++ b/common/trinary/benchmark/bench_trit_tryte.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2020 IOTA Stiftung
+ * https://github.com/iotaledger/entangled
+ *
+ * Refer to the LICENSE file for licensing information
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "common/trinary/trit_tryte.h"
+
+// The minimum and maximum input/output tryte size for performance testing
+#define MIN_TRYTE_SIZE 16
+#define MAX_TRYTE_SIZE 2048
+// The number of times of the same input size testing
+#define TEST_TIMES 20
+
+static const tryte_t tryte_chars[27] = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
+                                        'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '9'};
+static const trit_t trit_nums[3] = {-1, 0, 1};
+
+// Returns the time elapsed between start and end in nanoseconds
+static long diff_in_nanosec(struct timespec start, struct timespec end) {
+  return (end.tv_sec - start.tv_sec) * 1000000000L + (end.tv_nsec - start.tv_nsec);
+}
+
+// Allocates size bytes, terminating the benchmark when the allocation fails
+static void *alloc_or_exit(size_t size) {
+  void *buffer = malloc(size);
+
+  if (buffer == NULL) {
+    fprintf(stderr, "Failed to allocate %zu bytes\n", size);
+    exit(EXIT_FAILURE);
+  }
+
+  return buffer;
+}
+
+// Prints the statistics of the run times measured for one input size
+static void print_run_times(long min, long max, long sum, unsigned int times) {
+  printf(" minimum: %ld nsec\n", min);
+  printf(" maximum: %ld nsec\n", max);
+  printf(" average: %ld nsec\n", sum / (long)times);
+}
+
+void test_trits_to_trytes(unsigned int trit_size, unsigned int times) {
+  struct timespec start, end;
+  long run_time;
+  long min = 0, max = 0, sum = 0;
+
+  // Nothing to measure, and the average would divide by zero
+  if (times == 0) {
+    return;
+  }
+
+  trit_t *trits = alloc_or_exit(sizeof(trit_t) * trit_size);
+  tryte_t *trytes = alloc_or_exit(sizeof(tryte_t) * num_trytes_for_trits(trit_size));
+
+  for (unsigned int count = 0; count < times; count++) {
+    // Generate random trits
+    for (unsigned int idx = 0; idx < trit_size; idx++) {
+      trits[idx] = trit_nums[rand() % 3];
+    }
+
+    // Execution time measurement
+    clock_gettime(CLOCK_MONOTONIC, &start);
+    trits_to_trytes(trits, trytes, trit_size);
+    clock_gettime(CLOCK_MONOTONIC, &end);
+    run_time = diff_in_nanosec(start, end);
+
+    max = (count == 0 || run_time > max) ? run_time : max;
+    min = (count == 0 || run_time < min) ? run_time : min;
+    sum += run_time;
+  }
+
+  printf("Input trit size: %u\n", trit_size);
+  print_run_times(min, max, sum, times);
+
+  free(trits);
+  free(trytes);
+}
+
+void test_trytes_to_trits(unsigned int tryte_size, unsigned int times) {
+  struct timespec start, end;
+  long run_time;
+  long min = 0, max = 0, sum = 0;
+
+  // Nothing to measure, and the average would divide by zero
+  if (times == 0) {
+    return;
+  }
+
+  tryte_t *trytes = alloc_or_exit(sizeof(tryte_t) * tryte_size);
+  trit_t *trits = alloc_or_exit(sizeof(trit_t) * 3 * tryte_size);
+
+  for (unsigned int count = 0; count < times; count++) {
+    // Generate random trytes
+    for (unsigned int idx = 0; idx < tryte_size; idx++) {
+      trytes[idx] = tryte_chars[rand() % 27];
+    }
+
+    // Execution time measurement
+    clock_gettime(CLOCK_MONOTONIC, &start);
+    trytes_to_trits(trytes, trits, tryte_size);
+    clock_gettime(CLOCK_MONOTONIC, &end);
+    run_time = diff_in_nanosec(start, end);
+
+    max = (count == 0 || run_time > max) ? run_time : max;
+    min = (count == 0 || run_time < min) ? run_time : min;
+    sum += run_time;
+  }
+
+  printf("Input tryte size: %u\n", tryte_size);
+  print_run_times(min, max, sum, times);
+
+  free(trytes);
+  free(trits);
+}
+
+int main(void) {
+  unsigned int size;
+
+  // Set random seed
+  srand((unsigned int)time(NULL));
+
+  printf("trytes_to_trits\n");
+
+  for (size = MIN_TRYTE_SIZE; size <= MAX_TRYTE_SIZE; size++) {
+    test_trytes_to_trits(size, TEST_TIMES);
+  }
+
+  printf("\n");
+  printf("trits_to_trytes\n");
+
+  for (size = MIN_TRYTE_SIZE; size <= MAX_TRYTE_SIZE; size++) {
+    test_trits_to_trytes(size * 3, TEST_TIMES);
+  }
+
+  return 0;
+}

From 11d2c0b8764e25ffeaa3e08d3bfc09c38de51dbc Mon Sep 17 00:00:00 2001
From: marktwtn
Date: Thu, 13 Feb 2020 20:08:42 +0800
Subject: [PATCH 4/5] common/trinary: Set the threshold of activating SSE4.2 acceleration

The threshold value is determined by the execution time difference.
The time difference should be at least 500 nanoseconds.
The threshold experiment is run on the CPU: AMD Ryzen 5 2400G with Radeon
Vega Graphics.

TODO: trytes_to_trits() is, on rare occasions, slower when using SSE4.2
acceleration with large input.
--- common/trinary/trit_tryte.c | 20 ++++++++++++++------ common/trinary/trit_tryte.h | 3 +++ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/common/trinary/trit_tryte.c b/common/trinary/trit_tryte.c index e8e5d7e6a8..b1fcb5b1d9 100644 --- a/common/trinary/trit_tryte.c +++ b/common/trinary/trit_tryte.c @@ -48,9 +48,16 @@ uint8_t set_trit_at(tryte_t *const trytes, size_t const length, size_t const ind } void trits_to_trytes(trit_t const *const trits, tryte_t *const trytes, size_t const length) { + if (length == 0) { + return; + } + #if defined(__SSE4_2__) - trits_to_trytes_sse42(trits, trytes, length); -#else + if (length >= TRITS_TO_TRYTES_THRESHOLD) { + trits_to_trytes_sse42(trits, trytes, length); + return; + } +#endif int k = 0; for (size_t i = 0, j = 0; i < length; i += RADIX, j++) { @@ -64,7 +71,6 @@ void trits_to_trytes(trit_t const *const trits, tryte_t *const trytes, size_t co } trytes[j] = TRYTE_ALPHABET[k]; } -#endif } void trytes_to_trits(tryte_t const *const trytes, trit_t *const trits, size_t const length) { @@ -73,10 +79,12 @@ void trytes_to_trits(tryte_t const *const trytes, trit_t *const trits, size_t co } #if defined(__SSE4_2__) - trytes_to_trits_sse42(trytes, trits, length); -#else + if (length >= TRYTES_TO_TRITS_THRESHOLD) { + trytes_to_trits_sse42(trytes, trits, length); + return; + } +#endif for (size_t i = 0, j = 0; i < length; i++, j += RADIX) { memcpy(trits + j, TRYTES_TRITS_LUT[INDEX_OF_TRYTE(trytes[i])], NUMBER_OF_TRITS_IN_A_TRYTE); } -#endif } diff --git a/common/trinary/trit_tryte.h b/common/trinary/trit_tryte.h index 5fa2a05077..f8e67cbe03 100644 --- a/common/trinary/trit_tryte.h +++ b/common/trinary/trit_tryte.h @@ -12,6 +12,9 @@ #include "common/trinary/trits.h" #include "common/trinary/tryte.h" +#define TRITS_TO_TRYTES_THRESHOLD 192 +#define TRYTES_TO_TRITS_THRESHOLD 736 + #ifdef __cplusplus extern "C" { #endif From 5dc9d898300b9cb17776e9158669d008560721c1 Mon Sep 17 00:00:00 2001 From: marktwtn Date: Thu, 13 Feb 2020 
20:47:41 +0800
Subject: [PATCH 5/5] common/trinary: Add trit tryte SSE4.2 acceleration testing

The trit tryte SSE4.2 acceleration testing is added since the threshold is
added. Otherwise, the acceleration would not be tested or the testing input
data would need to be larger than the threshold value.

The SSE4.2 header is only included when the compiler targets SSE4.2, since
it depends on the SSE4.2 intrinsics and would break every other build.
---
 common/trinary/tests/test_trit_tryte.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/common/trinary/tests/test_trit_tryte.c b/common/trinary/tests/test_trit_tryte.c
index 546bff035e..7ea0b8174e 100644
--- a/common/trinary/tests/test_trit_tryte.c
+++ b/common/trinary/tests/test_trit_tryte.c
@@ -9,6 +9,9 @@
 #include <unity/unity.h>
 
 #include "common/trinary/trit_tryte.h"
+#if defined(__SSE4_2__)
+#include "common/trinary/trit_tryte_sse42.h"
+#endif
 
 #define TRYTES_IN "AZN9AZN9AZN9AZN9AZN9"
 #define EXP \
@@ -31,6 +34,24 @@ void test_trytes_to_trits(void) {
   TEST_ASSERT_EQUAL_MEMORY(exp, trits, 60);
 }
 
+#if defined(__SSE4_2__)
+void test_trits_to_trytes_sse42(void) {
+  trit_t trits[] = {EXP};
+  tryte_t trytes[20];
+  tryte_t exp[] = {TRYTES_IN};
+  trits_to_trytes_sse42(trits, trytes, 60);
+  TEST_ASSERT_EQUAL_MEMORY(exp, trytes, 20);
+}
+
+void test_trytes_to_trits_sse42(void) {
+  tryte_t trytes[] = {TRYTES_IN};
+  trit_t trits[60];
+  trit_t exp[] = {EXP};
+  trytes_to_trits_sse42(trytes, trits, 20);
+  TEST_ASSERT_EQUAL_MEMORY(exp, trits, 60);
+}
+#endif
+
 void test_get_trit_at(void) {
   tryte_t trytes[] = {TRYTES_IN};
   trit_t trit;
@@ -59,6 +80,10 @@ int main(void) {
 
   RUN_TEST(test_trits_to_trytes);
   RUN_TEST(test_trytes_to_trits);
+#if defined(__SSE4_2__)
+  RUN_TEST(test_trits_to_trytes_sse42);
+  RUN_TEST(test_trytes_to_trits_sse42);
+#endif
   RUN_TEST(test_get_trit_at);
   RUN_TEST(test_set_trit_at);
 