diff --git a/CHANGES b/CHANGES index 83b2079..199a085 100644 --- a/CHANGES +++ b/CHANGES @@ -1,2 +1,6 @@ -* 2022-06-12 : Version 1.0.0 - * Initial public release of the esa-matchfinder. +Changes in 1.0.1 (June 19, 2022) +- Improved cache coherence for ARMv8 architecture. + +Changes in 1.0.0 (June 12, 2022) +- Initial public release of the esa-matchfinder. + diff --git a/README.md b/README.md index be0941b..f34330e 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,8 @@ The esa-matchfinder finds all distance optimal matches (between min_match_length The esa-matchfinder released under the [Apache License Version 2.0](LICENSE "Apache license") and is considered suitable for production use. However, no warranty or fitness for a particular purpose is expressed or implied. ## Changes +* June 19, 2022 (1.0.1) + * Improved cache coherence for ARMv8 architecture. * June 12, 2022 (1.0.0) * Initial public release of the esa-matchfinder. diff --git a/VERSION b/VERSION index afaf360..7f20734 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.0.0 \ No newline at end of file +1.0.1 \ No newline at end of file diff --git a/esa_matchfinder.c b/esa_matchfinder.c index ecbae39..67506ee 100644 --- a/esa_matchfinder.c +++ b/esa_matchfinder.c @@ -127,11 +127,11 @@ typedef struct ESA_MF_CONTEXT #endif #if defined(ESA_MF_HAS_BUILTIN_PREFECTCH) - #define esa_matchfinder_prefetchr(address) __builtin_prefetch((const void *)(address), 0, 0) - #define esa_matchfinder_prefetchw(address) __builtin_prefetch((const void *)(address), 1, 0) + #define esa_matchfinder_prefetchr(address) __builtin_prefetch((const void *)(address), 0, 3) + #define esa_matchfinder_prefetchw(address) __builtin_prefetch((const void *)(address), 1, 3) #elif defined (_M_IX86) || defined (_M_AMD64) #include - #define esa_matchfinder_prefetchr(address) _mm_prefetch((const void *)(address), _MM_HINT_NTA) + #define esa_matchfinder_prefetchr(address) _mm_prefetch((const void *)(address), _MM_HINT_T0) #define esa_matchfinder_prefetchw(address) _m_prefetchw((const void *)(address)) #elif defined (_M_ARM) #include @@ -139,8 +139,8 @@ typedef struct ESA_MF_CONTEXT #define esa_matchfinder_prefetchw(address) __prefetchw((const void *)(address)) #elif defined (_M_ARM64) #include - #define esa_matchfinder_prefetchr(address) __prefetch2((const void *)(address), 1) - #define esa_matchfinder_prefetchw(address) __prefetch2((const void *)(address), 17) + #define esa_matchfinder_prefetchr(address) __prefetch2((const void *)(address), 0) + #define esa_matchfinder_prefetchw(address) __prefetch2((const void *)(address), 16) #else #error Your compiler, configuration or platform is not supported. #endif @@ -384,7 +384,9 @@ static ptrdiff_t esa_matchfinder_build_interval_tree for (ptrdiff_t i = omp_block_start + omp_block_size - 1; i >= omp_block_start; i -= 1) { esa_matchfinder_prefetchr(&sa_parent_link[i - 2 * prefetch_distance]); + esa_matchfinder_prefetchw(&plcp_leaf_link[sa_parent_link[i - prefetch_distance]]); + esa_matchfinder_prefetchw(&sa_parent_link[next_interval_index - prefetch_distance]); uint64_t next_pos = sa_parent_link[i]; uint64_t next_lcp = (uint64_t)plcp_leaf_link[next_pos] - min_match_length; diff --git a/libsais/CHANGES b/libsais/CHANGES index beed505..c41b978 100644 --- a/libsais/CHANGES +++ b/libsais/CHANGES @@ -1,3 +1,6 @@ +Changes in 2.7.1 (June 19, 2022) +- Improved cache coherence for ARMv8 architecture. + Changes in 2.7.0 (April 12, 2022) - Support for longest common prefix array (LCP) construction. diff --git a/libsais/VERSION b/libsais/VERSION index 9aa3464..5588ae8 100644 --- a/libsais/VERSION +++ b/libsais/VERSION @@ -1 +1 @@ -2.7.0 \ No newline at end of file +2.7.1 \ No newline at end of file diff --git a/libsais/libsais.c b/libsais/libsais.c index 44cdc19..fcbfe7e 100644 --- a/libsais/libsais.c +++ b/libsais/libsais.c @@ -118,20 +118,20 @@ typedef struct LIBSAIS_UNBWT_CONTEXT #endif #if defined(HAS_BUILTIN_PREFECTCH) - #define libsais_prefetch(address) __builtin_prefetch((const void *)(address), 0, 0) - #define libsais_prefetchw(address) __builtin_prefetch((const void *)(address), 1, 0) + #define libsais_prefetchr(address) __builtin_prefetch((const void *)(address), 0, 3) + #define libsais_prefetchw(address) __builtin_prefetch((const void *)(address), 1, 3) #elif defined (_M_IX86) || defined (_M_AMD64) #include - #define libsais_prefetch(address) _mm_prefetch((const void *)(address), _MM_HINT_NTA) + #define libsais_prefetchr(address) _mm_prefetch((const void *)(address), _MM_HINT_T0) #define libsais_prefetchw(address) _m_prefetchw((const void *)(address)) #elif defined (_M_ARM) #include - #define libsais_prefetch(address) __prefetch((const void *)(address)) + #define libsais_prefetchr(address) __prefetch((const void *)(address)) #define libsais_prefetchw(address) __prefetchw((const void *)(address)) #elif defined (_M_ARM64) #include - #define libsais_prefetch(address) __prefetch2((const void *)(address), 1) - #define libsais_prefetchw(address) __prefetch2((const void *)(address), 17) + #define libsais_prefetchr(address) __prefetch2((const void *)(address), 0) + #define libsais_prefetchw(address) __prefetch2((const void *)(address), 16) #else #error Your compiler, configuration or platform is not supported. #endif @@ -287,7 +287,7 @@ static void libsais_place_cached_suffixes(sa_sint_t * RESTRICT SA, LIBSAIS_THREA fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) { - libsais_prefetch(&cache[i + 2 * prefetch_distance]); + libsais_prefetchr(&cache[i + 2 * prefetch_distance]); libsais_prefetchw(&SA[cache[i + prefetch_distance + 0].symbol]); libsais_prefetchw(&SA[cache[i + prefetch_distance + 1].symbol]); @@ -441,7 +441,7 @@ static void libsais_gather_lms_suffixes_8u(const uint8_t * RESTRICT T, sa_sint_t for (i = omp_block_start + omp_block_size - 2, j = omp_block_start + 3; i >= j; i -= 4) { - libsais_prefetch(&T[i - prefetch_distance]); + libsais_prefetchr(&T[i - prefetch_distance]); c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1); @@ -511,7 +511,7 @@ static sa_sint_t libsais_gather_lms_suffixes_32s(const sa_sint_t * RESTRICT T, s for (; i >= 3; i -= 4) { - libsais_prefetch(&T[i - prefetch_distance]); + libsais_prefetchr(&T[i - prefetch_distance]); c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 0; m -= ((s & 3) == 1); @@ -539,7 +539,7 @@ static sa_sint_t libsais_gather_compacted_lms_suffixes_32s(const sa_sint_t * RES for (; i >= 3; i -= 4) { - libsais_prefetch(&T[i - prefetch_distance]); + libsais_prefetchr(&T[i - prefetch_distance]); c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((fast_sint_t)(s & 3) == (c0 >= 0)); c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 0; m -= ((fast_sint_t)(s & 3) == (c1 >= 0)); @@ -570,7 +570,7 @@ static void libsais_count_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_s for (; i >= prefetch_distance + 3; i -= 4) { - libsais_prefetch(&T[i - 2 * prefetch_distance]); + libsais_prefetchr(&T[i - 2 * prefetch_distance]); libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]); @@ -614,7 +614,7 @@ static void libsais_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_s for (; i >= prefetch_distance + 3; i -= 4) { - libsais_prefetch(&T[i - 2 * prefetch_distance]); + libsais_prefetchr(&T[i - 2 * prefetch_distance]); libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]); @@ -658,7 +658,7 @@ static void libsais_count_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRI for (; i >= prefetch_distance + 3; i -= 4) { - libsais_prefetch(&T[i - 2 * prefetch_distance]); + libsais_prefetchr(&T[i - 2 * prefetch_distance]); libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]); @@ -707,7 +707,7 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_8u(const uint8_t * RESTRI for (i = m - 1, j = omp_block_start + 3; i >= j; i -= 4) { - libsais_prefetch(&T[i - prefetch_distance]); + libsais_prefetchr(&T[i - prefetch_distance]); c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1); buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; @@ -820,7 +820,7 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k(const sa_sint_t * for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) { - libsais_prefetch(&T[i - 2 * prefetch_distance]); + libsais_prefetchr(&T[i - 2 * prefetch_distance]); libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]); @@ -871,7 +871,7 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k(const sa_sint_t * for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) { - libsais_prefetch(&T[i - 2 * prefetch_distance]); + libsais_prefetchr(&T[i - 2 * prefetch_distance]); libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]); @@ -922,7 +922,7 @@ static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k(const sa for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) { - libsais_prefetch(&T[i - 2 * prefetch_distance]); + libsais_prefetchr(&T[i - 2 * prefetch_distance]); libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]); @@ -1334,7 +1334,7 @@ static void libsais_count_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t n fast_sint_t i, j; for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8) { - libsais_prefetch(&T[i + prefetch_distance]); + libsais_prefetchr(&T[i + prefetch_distance]); buckets[T[i + 0]]++; buckets[T[i + 1]]++; @@ -1541,12 +1541,12 @@ static void libsais_radix_sort_lms_suffixes_8u(const uint8_t * RESTRICT T, sa_si fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) { - libsais_prefetch(&SA[i - 2 * prefetch_distance]); + libsais_prefetchr(&SA[i - 2 * prefetch_distance]); - libsais_prefetch(&T[SA[i - prefetch_distance - 0]]); - libsais_prefetch(&T[SA[i - prefetch_distance - 1]]); - libsais_prefetch(&T[SA[i - prefetch_distance - 2]]); - libsais_prefetch(&T[SA[i - prefetch_distance - 3]]); + libsais_prefetchr(&T[SA[i - prefetch_distance - 0]]); + libsais_prefetchr(&T[SA[i - prefetch_distance - 1]]); + libsais_prefetchr(&T[SA[i - prefetch_distance - 2]]); + libsais_prefetchr(&T[SA[i - prefetch_distance - 3]]); sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0; sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1; @@ -1615,12 +1615,12 @@ static void libsais_radix_sort_lms_suffixes_32s_6k(const sa_sint_t * RESTRICT T, fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4) { - libsais_prefetch(&SA[i - 3 * prefetch_distance]); + libsais_prefetchr(&SA[i - 3 * prefetch_distance]); - libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0]]); - libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1]]); - libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 2]]); - libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 3]]); + libsais_prefetchr(&T[SA[i - 2 * prefetch_distance - 0]]); + libsais_prefetchr(&T[SA[i - 2 * prefetch_distance - 1]]); + libsais_prefetchr(&T[SA[i - 2 * prefetch_distance - 2]]); + libsais_prefetchr(&T[SA[i - 2 * prefetch_distance - 3]]); libsais_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 0]]]); libsais_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 1]]]); @@ -1646,12 +1646,12 @@ static void libsais_radix_sort_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4) { - libsais_prefetch(&SA[i - 3 * prefetch_distance]); + libsais_prefetchr(&SA[i - 3 * prefetch_distance]); - libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0]]); - libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1]]); - libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 2]]); - libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 3]]); + libsais_prefetchr(&T[SA[i - 2 * prefetch_distance - 0]]); + libsais_prefetchr(&T[SA[i - 2 * prefetch_distance - 1]]); + libsais_prefetchr(&T[SA[i - 2 * prefetch_distance - 2]]); + libsais_prefetchr(&T[SA[i - 2 * prefetch_distance - 3]]); libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 0]], 0)]); libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 1]], 0)]); @@ -1679,12 +1679,12 @@ static void libsais_radix_sort_lms_suffixes_32s_block_gather(const sa_sint_t * R fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) { - libsais_prefetch(&SA[i + 2 * prefetch_distance]); + libsais_prefetchr(&SA[i + 2 * prefetch_distance]); - libsais_prefetch(&T[SA[i + prefetch_distance + 0]]); - libsais_prefetch(&T[SA[i + prefetch_distance + 1]]); - libsais_prefetch(&T[SA[i + prefetch_distance + 2]]); - libsais_prefetch(&T[SA[i + prefetch_distance + 3]]); + libsais_prefetchr(&T[SA[i + prefetch_distance + 0]]); + libsais_prefetchr(&T[SA[i + prefetch_distance + 1]]); + libsais_prefetchr(&T[SA[i + prefetch_distance + 2]]); + libsais_prefetchr(&T[SA[i + prefetch_distance + 3]]); libsais_prefetchw(&cache[i + prefetch_distance]); @@ -1909,7 +1909,7 @@ static sa_sint_t libsais_radix_sort_lms_suffixes_32s_1k(const sa_sint_t * RESTRI for (; i >= prefetch_distance + 3; i -= 4) { - libsais_prefetch(&T[i - 2 * prefetch_distance]); + libsais_prefetchr(&T[i - 2 * prefetch_distance]); libsais_prefetchw(&buckets[T[i - prefetch_distance - 0]]); libsais_prefetchw(&buckets[T[i - prefetch_distance - 1]]); @@ -1950,7 +1950,7 @@ static void libsais_radix_sort_set_markers_32s_6k(sa_sint_t * RESTRICT SA, sa_si fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) { - libsais_prefetch(&induction_bucket[i + 2 * prefetch_distance]); + libsais_prefetchr(&induction_bucket[i + 2 * prefetch_distance]); libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 0]]); libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 1]]); @@ -1976,7 +1976,7 @@ static void libsais_radix_sort_set_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_si fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) { - libsais_prefetch(&induction_bucket[BUCKETS_INDEX2(i + 2 * prefetch_distance, 0)]); + libsais_prefetchr(&induction_bucket[BUCKETS_INDEX2(i + 2 * prefetch_distance, 0)]); libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 0, 0)]]); libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 1, 0)]]); @@ -2112,12 +2112,12 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u(const uint8_t * R fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) { - libsais_prefetch(&SA[i + 2 * prefetch_distance]); + libsais_prefetchr(&SA[i + 2 * prefetch_distance]); - libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); - libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2); - libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1); - libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2); + libsais_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); + libsais_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2); + libsais_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1); + libsais_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2); sa_sint_t p0 = SA[i + 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); SA[induction_bucket[v0]++] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; @@ -2149,12 +2149,12 @@ static void libsais_partial_sorting_scan_left_to_right_8u_block_prepare(const ui fast_sint_t i, j, count = 0; sa_sint_t d = 1; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) { - libsais_prefetch(&SA[i + 2 * prefetch_distance]); + libsais_prefetchr(&SA[i + 2 * prefetch_distance]); - libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); - libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2); - libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1); - libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2); + libsais_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); + libsais_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2); + libsais_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1); + libsais_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2); sa_sint_t p0 = cache[count].index = SA[i + 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); induction_bucket[v0]++; distinct_names[v0] = d; sa_sint_t p1 = cache[count].index = SA[i + 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); induction_bucket[v1]++; distinct_names[v1] = d; @@ -2179,7 +2179,7 @@ static void libsais_partial_sorting_scan_left_to_right_8u_block_place(sa_sint_t fast_sint_t i, j; for (i = 0, j = count - 1; i < j; i += 2) { - libsais_prefetch(&cache[i + prefetch_distance]); + libsais_prefetchr(&cache[i + prefetch_distance]); sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol; SA[induction_bucket[v0]++] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; @@ -2320,12 +2320,12 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k(const sa_sint fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) { - libsais_prefetch(&SA[i + 3 * prefetch_distance]); + libsais_prefetchr(&SA[i + 3 * prefetch_distance]); - libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 1); - libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 2); - libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 1); - libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 2); + libsais_prefetchr(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 1); + libsais_prefetchr(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 2); + libsais_prefetchr(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 1); + libsais_prefetchr(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 2); sa_sint_t p0 = SA[i + prefetch_distance + 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais_prefetchw(&buckets[v0]); sa_sint_t p1 = SA[i + prefetch_distance + 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais_prefetchw(&buckets[v1]); @@ -2358,8 +2358,8 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k(const sa_sint { libsais_prefetchw(&SA[i + 3 * prefetch_distance]); - sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts2]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); } sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts3]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); } @@ -2400,10 +2400,10 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k(const sa_sint_t * { libsais_prefetchw(&SA[i + 3 * prefetch_distance]); - sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); - sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); } - sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); } + sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); + sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetchr(&T[s2] - 2); } + sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetchr(&T[s3] - 2); } sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { SA[i + 0] = 0; SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { SA[i + 1] = 0; SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)); } @@ -2424,12 +2424,12 @@ static void libsais_partial_sorting_scan_left_to_right_32s_6k_block_gather(const fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) { - libsais_prefetch(&SA[i + 2 * prefetch_distance]); + libsais_prefetchr(&SA[i + 2 * prefetch_distance]); - libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); - libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2); - libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1); - libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2); + libsais_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); + libsais_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2); + libsais_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1); + libsais_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2); libsais_prefetchw(&cache[i + prefetch_distance]); @@ -2452,8 +2452,8 @@ static void libsais_partial_sorting_scan_left_to_right_32s_4k_block_gather(const { libsais_prefetchw(&SA[i + 2 * prefetch_distance]); - sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); libsais_prefetchw(&cache[i + prefetch_distance]); @@ -2476,8 +2476,8 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_gather(const { libsais_prefetchw(&SA[i + 2 * prefetch_distance]); - sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); libsais_prefetchw(&cache[i + prefetch_distance]); @@ -2943,12 +2943,12 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u(const uint8_t * R fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) { - libsais_prefetch(&SA[i - 2 * prefetch_distance]); + libsais_prefetchr(&SA[i - 2 * prefetch_distance]); - libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1); - libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2); - libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1); - libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2); + libsais_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1); + libsais_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2); + libsais_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1); + libsais_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2); sa_sint_t p0 = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); SA[--induction_bucket[v0]] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; @@ -2980,12 +2980,12 @@ static void libsais_partial_sorting_scan_right_to_left_8u_block_prepare(const ui fast_sint_t i, j, count = 0; sa_sint_t d = 1; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) { - libsais_prefetch(&SA[i - 2 * prefetch_distance]); + libsais_prefetchr(&SA[i - 2 * prefetch_distance]); - libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1); - libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2); - libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1); - libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2); + libsais_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1); + libsais_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2); + libsais_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1); + libsais_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2); sa_sint_t p0 = cache[count].index = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); induction_bucket[v0]++; distinct_names[v0] = d; sa_sint_t p1 = cache[count].index = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); induction_bucket[v1]++; distinct_names[v1] = d; @@ -3010,7 +3010,7 @@ static void libsais_partial_sorting_scan_right_to_left_8u_block_place(sa_sint_t fast_sint_t i, j; for (i = 0, j = count - 1; i < j; i += 2) { - libsais_prefetch(&cache[i + prefetch_distance]); + libsais_prefetchr(&cache[i + prefetch_distance]); sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol; SA[--induction_bucket[v0]] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; @@ -3149,12 +3149,12 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k(const sa_sint fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) { - libsais_prefetch(&SA[i - 3 * prefetch_distance]); + libsais_prefetchr(&SA[i - 3 * prefetch_distance]); - libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 1); - libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 2); - libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 1); - libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 2); + libsais_prefetchr(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 1); + libsais_prefetchr(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 2); + libsais_prefetchr(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 1); + libsais_prefetchr(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 2); sa_sint_t p0 = SA[i - prefetch_distance - 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais_prefetchw(&buckets[v0]); sa_sint_t p1 = SA[i - prefetch_distance - 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais_prefetchw(&buckets[v1]); @@ -3187,8 +3187,8 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(const sa_sint { libsais_prefetchw(&SA[i - 3 * prefetch_distance]); - sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts2]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); } sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts3]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); } @@ -3229,10 +3229,10 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k(const sa_sint_t * { libsais_prefetchw(&SA[i - 3 * prefetch_distance]); - sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); - sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); } - sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); } + sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); + sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetchr(&T[s2] - 2); } + sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetchr(&T[s3] - 2); } sa_sint_t p0 = SA[i - 0]; if (p0 > 0) { SA[i - 0] = 0; SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i - 1]; if (p1 > 0) { SA[i - 1] = 0; SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)); } @@ -3253,12 +3253,12 @@ static void libsais_partial_sorting_scan_right_to_left_32s_6k_block_gather(const fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) { - libsais_prefetch(&SA[i + 2 * prefetch_distance]); + libsais_prefetchr(&SA[i + 2 * prefetch_distance]); - libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); - libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2); - libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1); - libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2); + libsais_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); + libsais_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2); + libsais_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1); + libsais_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2); libsais_prefetchw(&cache[i + prefetch_distance]); @@ -3281,8 +3281,8 @@ static void libsais_partial_sorting_scan_right_to_left_32s_4k_block_gather(const { libsais_prefetchw(&SA[i + 2 * prefetch_distance]); - sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); libsais_prefetchw(&cache[i + prefetch_distance]); @@ -3305,8 +3305,8 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_gather(const { libsais_prefetchw(&SA[i + 2 * prefetch_distance]); - sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); libsais_prefetchw(&cache[i + prefetch_distance]); @@ -3662,7 +3662,7 @@ static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_4k(sa_sint_t fast_sint_t i, j, l; for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4) { - libsais_prefetch(&SA[i + prefetch_distance]); + libsais_prefetchr(&SA[i + prefetch_distance]); sa_sint_t s0 = SA[i + 0]; SA[l] = (s0 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s0 < 0); sa_sint_t s1 = SA[i + 1]; SA[l] = (s1 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s1 < 0); @@ -3685,7 +3685,7 @@ static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_1k(sa_sint_t fast_sint_t i, j, l; for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4) { - libsais_prefetch(&SA[i + prefetch_distance]); + libsais_prefetchr(&SA[i + prefetch_distance]); sa_sint_t s0 = SA[i + 0]; SA[l] = s0 & SAINT_MAX; l += (s0 < 0); sa_sint_t s1 = SA[i + 1]; SA[l] = s1 & SAINT_MAX; l += (s1 < 0); @@ -3859,7 +3859,7 @@ static sa_sint_t libsais_renumber_lms_suffixes_8u(sa_sint_t * RESTRICT SA, sa_si fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) { - libsais_prefetch(&SA[i + 2 * prefetch_distance]); + libsais_prefetchr(&SA[i + 2 * prefetch_distance]); libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]); libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]); @@ -3889,7 +3889,7 @@ static fast_sint_t libsais_gather_marked_suffixes_8u(sa_sint_t * RESTRICT SA, sa fast_sint_t i, j; for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j; i -= 4) { - libsais_prefetch(&SA[i - prefetch_distance]); + libsais_prefetchr(&SA[i - prefetch_distance]); sa_sint_t s0 = SA[i - 0]; SA[l] = s0 & SAINT_MAX; l -= s0 < 0; sa_sint_t s1 = SA[i - 1]; SA[l] = s1 & SAINT_MAX; l -= s1 < 0; @@ -4230,7 +4230,7 @@ static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(sa_s fast_sint_t i, j; for (i = (fast_sint_t)n - (fast_sint_t)m, j = (fast_sint_t)n - 1 - prefetch_distance - 3; i < j; i += 4) { - libsais_prefetch(&SA[i + 2 * prefetch_distance]); + libsais_prefetchr(&SA[i + 2 * prefetch_distance]); libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); @@ -4261,10 +4261,10 @@ static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(sa_s fast_sint_t i, j, p = SA[0], plen = SAm[p >> 1]; sa_sint_t pdiff = SAINT_MIN; for (i = 1, j = m - prefetch_distance - 1; i < j; i += 2) { - libsais_prefetch(&SA[i + 2 * prefetch_distance]); + libsais_prefetchr(&SA[i + 2 * prefetch_distance]); - libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); libsais_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 0])]); - libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); libsais_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 1])]); + libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); libsais_prefetchr(&T[((sa_uint_t)SA[i + prefetch_distance + 0])]); + libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); libsais_prefetchr(&T[((sa_uint_t)SA[i + prefetch_distance + 1])]); fast_sint_t q = SA[i + 0], qlen = SAm[q >> 1]; sa_sint_t qdiff = SAINT_MIN; if (plen == qlen) { fast_sint_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < qlen); qdiff = (sa_sint_t)(l - qlen) & SAINT_MIN; } @@ -4306,10 +4306,10 @@ static void libsais_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t { libsais_prefetchw(&SA[i + 2 * prefetch_distance]); - libsais_prefetch(&SAnm[SA[i + prefetch_distance + 0]]); - libsais_prefetch(&SAnm[SA[i + prefetch_distance + 1]]); - libsais_prefetch(&SAnm[SA[i + prefetch_distance + 2]]); - libsais_prefetch(&SAnm[SA[i + prefetch_distance + 3]]); + libsais_prefetchr(&SAnm[SA[i + prefetch_distance + 0]]); + libsais_prefetchr(&SAnm[SA[i + prefetch_distance + 1]]); + libsais_prefetchr(&SAnm[SA[i + prefetch_distance + 2]]); + libsais_prefetchr(&SAnm[SA[i + prefetch_distance + 3]]); SA[i + 0] = SAnm[SA[i + 0]]; SA[i + 1] = SAnm[SA[i + 1]]; @@ -4425,12 +4425,12 @@ static void libsais_place_lms_suffixes_interval_32s_1k(const sa_sint_t * RESTRIC sa_sint_t c = k - 1; fast_sint_t i, l = buckets[c]; for (i = (fast_sint_t)m - 1; i >= prefetch_distance + 3; i -= 4) { - libsais_prefetch(&SA[i - 2 * prefetch_distance]); + libsais_prefetchr(&SA[i - 2 * prefetch_distance]); - libsais_prefetch(&T[SA[i - prefetch_distance - 0]]); - libsais_prefetch(&T[SA[i - prefetch_distance - 1]]); - libsais_prefetch(&T[SA[i - prefetch_distance - 2]]); - libsais_prefetch(&T[SA[i - prefetch_distance - 3]]); + libsais_prefetchr(&T[SA[i - prefetch_distance - 0]]); + libsais_prefetchr(&T[SA[i - prefetch_distance - 1]]); + libsais_prefetchr(&T[SA[i - prefetch_distance - 2]]); + libsais_prefetchr(&T[SA[i - prefetch_distance - 3]]); sa_sint_t p0 = SA[i - 0]; if (T[p0] != c) { c = T[p0]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p0; sa_sint_t p1 = SA[i - 1]; if (T[p1] != c) { c = T[p1]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p1; @@ -4527,8 +4527,8 @@ static void libsais_final_bwt_scan_left_to_right_8u(const uint8_t * RESTRICT T, { libsais_prefetchw(&SA[i + 2 * prefetch_distance]); - sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } @@ -4549,8 +4549,8 @@ static void libsais_final_bwt_aux_scan_left_to_right_8u(const uint8_t * RESTRICT { libsais_prefetchw(&SA[i + 2 * prefetch_distance]); - sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); if ((p0 & rm) == 0) { I[p0 / (rm + 1)] = induction_bucket[T[p0]]; }} sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); if ((p1 & rm) == 0) { I[p1 / (rm + 1)] = induction_bucket[T[p1]]; }} @@ -4571,8 +4571,8 @@ static void libsais_final_sorting_scan_left_to_right_8u(const uint8_t * RESTRICT { libsais_prefetchw(&SA[i + 2 * prefetch_distance]); - sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } @@ -4593,10 +4593,10 @@ static void libsais_final_sorting_scan_left_to_right_32s(const sa_sint_t * RESTR { libsais_prefetchw(&SA[i + 3 * prefetch_distance]); - sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); - sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); } - sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); } + sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); + sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetchr(&T[s2] - 2); } + sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetchr(&T[s3] - 2); } sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } @@ -4621,8 +4621,8 @@ static fast_sint_t libsais_final_bwt_scan_left_to_right_8u_block_prepare(const u { libsais_prefetchw(&SA[i + 2 * prefetch_distance]); - sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } @@ -4647,8 +4647,8 @@ static fast_sint_t libsais_final_sorting_scan_left_to_right_8u_block_prepare(con { libsais_prefetchw(&SA[i + 2 * prefetch_distance]); - sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } @@ -4669,7 +4669,7 @@ static void libsais_final_order_scan_left_to_right_8u_block_place(sa_sint_t * RE fast_sint_t i, j; for (i = 0, j = count - 3; i < j; i += 4) { - libsais_prefetch(&cache[i + prefetch_distance]); + libsais_prefetchr(&cache[i + prefetch_distance]); SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index; SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index; @@ -4690,7 +4690,7 @@ static void libsais_final_bwt_aux_scan_left_to_right_8u_block_place(sa_sint_t * fast_sint_t i, j; for (i = 0, j = count - 3; i < j; i += 4) { - libsais_prefetch(&cache[i + prefetch_distance]); + libsais_prefetchr(&cache[i + prefetch_distance]); SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index; if ((cache[i + 0].index & rm) == 0) { I[(cache[i + 0].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 0].symbol]; } SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index; if ((cache[i + 1].index & rm) == 0) { I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 1].symbol]; } @@ -4713,8 +4713,8 @@ static void libsais_final_sorting_scan_left_to_right_32s_block_gather(const sa_s { libsais_prefetchw(&SA[i + 2 * prefetch_distance]); - sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); libsais_prefetchw(&cache[i + prefetch_distance]); @@ -5146,8 +5146,8 @@ static sa_sint_t libsais_final_bwt_scan_right_to_left_8u(const uint8_t * RESTRIC { libsais_prefetchw(&SA[i - 2 * prefetch_distance]); - sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); sa_sint_t p0 = SA[i - 0]; index = (p0 == 0) ? (sa_sint_t)(i - 0) : index; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; } @@ -5174,8 +5174,8 @@ static void libsais_final_bwt_aux_scan_right_to_left_8u(const uint8_t * RESTRICT { libsais_prefetchw(&SA[i - 2 * prefetch_distance]); - sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; if ((p0 & rm) == 0) { I[p0 / (rm + 1)] = induction_bucket[T[p0]] + 1; } } @@ -5200,8 +5200,8 @@ static void libsais_final_sorting_scan_right_to_left_8u(const uint8_t * RESTRICT { libsais_prefetchw(&SA[i - 2 * prefetch_distance]); - sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); } @@ -5222,10 +5222,10 @@ static void libsais_final_sorting_scan_right_to_left_32s(const sa_sint_t * RESTR { libsais_prefetchw(&SA[i - 3 * prefetch_distance]); - sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); - sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); } - sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); } + sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); + sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetchr(&T[s2] - 2); } + sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetchr(&T[s3] - 2); } sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); } @@ -5250,8 +5250,8 @@ static fast_sint_t libsais_final_bwt_scan_right_to_left_8u_block_prepare(const u { libsais_prefetchw(&SA[i - 2 * prefetch_distance]); - sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p0 : t; } sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p1 : t; } @@ -5276,8 +5276,8 @@ static fast_sint_t libsais_final_bwt_aux_scan_right_to_left_8u_block_prepare(con { libsais_prefetchw(&SA[i - 2 * prefetch_distance]); - sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p0 : t; cache[count + 1].index = p0; count += 2; } sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p1 : t; cache[count + 1].index = p1; count += 2; } @@ -5302,8 +5302,8 @@ static fast_sint_t libsais_final_sorting_scan_right_to_left_8u_block_prepare(con { libsais_prefetchw(&SA[i - 2 * prefetch_distance]); - sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); } @@ -5324,7 +5324,7 @@ static void libsais_final_order_scan_right_to_left_8u_block_place(sa_sint_t * RE fast_sint_t i, j; for (i = 0, j = count - 3; i < j; i += 4) { - libsais_prefetch(&cache[i + prefetch_distance]); + libsais_prefetchr(&cache[i + prefetch_distance]); SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index; SA[--buckets[cache[i + 1].symbol]] = cache[i + 1].index; @@ -5345,7 +5345,7 @@ static void libsais_final_bwt_aux_scan_right_to_left_8u_block_place(sa_sint_t * fast_sint_t i, j; for (i = 0, j = count - 6; i < j; i += 8) { - libsais_prefetch(&cache[i + prefetch_distance]); + libsais_prefetchr(&cache[i + prefetch_distance]); SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index; if ((cache[i + 1].index & rm) == 0) { I[cache[i + 1].index / (rm + 1)] = buckets[cache[i + 0].symbol] + 1; } SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index; if ((cache[i + 3].index & rm) == 0) { I[cache[i + 3].index / (rm + 1)] = buckets[cache[i + 2].symbol] + 1; } @@ -5368,8 +5368,8 @@ static void libsais_final_sorting_scan_right_to_left_32s_block_gather(const sa_s { libsais_prefetchw(&SA[i + 2 * prefetch_distance]); - sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetchr(s0 > 0 ? Ts0 : NULL); + sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetchr(s1 > 0 ? Ts1 : NULL); libsais_prefetchw(&cache[i + prefetch_distance]); @@ -5866,7 +5866,7 @@ static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s(sa_sint_ sa_sint_t i, j; for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 2 * (sa_sint_t)prefetch_distance - 3; i < j; i += 4) { - libsais_prefetch(&SA[i + 3 * prefetch_distance]); + libsais_prefetchr(&SA[i + 3 * prefetch_distance]); libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 0]) >> 1]); libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 1]) >> 1]); @@ -5902,7 +5902,7 @@ static void libsais_compact_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RE fast_sint_t i, j, l = *pl - 1, r = *pr - 1; for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j; i -= 4) { - libsais_prefetch(&SA[i - prefetch_distance]); + libsais_prefetchr(&SA[i - prefetch_distance]); sa_sint_t p0 = SA[i - 0]; SAl[l] = p0 & SAINT_MAX; l -= p0 < 0; SAr[r] = p0 - 1; r -= p0 > 0; sa_sint_t p1 = SA[i - 1]; SAl[l] = p1 & SAINT_MAX; l -= p1 < 0; SAr[r] = p1 - 1; r -= p1 > 0; @@ -5930,12 +5930,12 @@ static sa_sint_t libsais_count_unique_suffixes(sa_sint_t * RESTRICT SA, sa_sint_ fast_sint_t i, j; sa_sint_t f0 = 0, f1 = 0, f2 = 0, f3 = 0; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) { - libsais_prefetch(&SA[i + 2 * prefetch_distance]); + libsais_prefetchr(&SA[i + 2 * prefetch_distance]); - libsais_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); - libsais_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); - libsais_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]); - libsais_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]); + libsais_prefetchr(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); + libsais_prefetchr(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); + libsais_prefetchr(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]); + libsais_prefetchr(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]); f0 += SAm[((sa_uint_t)SA[i + 0]) >> 1] < 0; f1 += SAm[((sa_uint_t)SA[i + 1]) >> 1] < 0; @@ -6090,7 +6090,7 @@ static void libsais_merge_unique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sin sa_sint_t i, j; fast_sint_t tmp = *SAnm++; for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 6; i < j; i += 4) { - libsais_prefetch(&T[i + prefetch_distance]); + libsais_prefetchr(&T[i + prefetch_distance]); sa_sint_t c0 = T[i + 0]; if (c0 < 0) { T[i + 0] = c0 & SAINT_MAX; SA[tmp] = i + 0; i++; tmp = *SAnm++; } sa_sint_t c1 = T[i + 1]; if (c1 < 0) { T[i + 1] = c1 & SAINT_MAX; SA[tmp] = i + 1; i++; tmp = *SAnm++; } @@ -6113,7 +6113,7 @@ static void libsais_merge_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa fast_sint_t i, j; sa_sint_t tmp = *SAnm++; for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4) { - libsais_prefetch(&SA[i + prefetch_distance]); + libsais_prefetchr(&SA[i + prefetch_distance]); if (SA[i + 0] == 0) { SA[i + 0] = tmp; tmp = *SAnm++; } if (SA[i + 1] == 0) { SA[i + 1] = tmp; tmp = *SAnm++; } @@ -6539,7 +6539,7 @@ static void libsais_bwt_copy_8u(uint8_t * RESTRICT U, sa_sint_t * RESTRICT A, sa fast_sint_t i, j; for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8) { - libsais_prefetch(&A[i + prefetch_distance]); + libsais_prefetchr(&A[i + prefetch_distance]); U[i + 0] = (uint8_t)A[i + 0]; U[i + 1] = (uint8_t)A[i + 1]; @@ -6919,7 +6919,7 @@ static void libsais_unbwt_compute_histogram(const uint8_t * RESTRICT T, fast_sin for (; T_p < (uint8_t * )((ptrdiff_t)(T + n - 8) & (-64)); T_p += 64) { - libsais_prefetch(&T_p[prefetch_distance]); + libsais_prefetchr(&T_p[prefetch_distance]); fast_uint_t z = ((const uint32_t *)(const void *)T_p)[2], w = ((const uint32_t *)(const void *)T_p)[3]; copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++; @@ -7653,6 +7653,8 @@ static void libsais_compute_phi(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTR fast_sint_t i, j; sa_sint_t k = omp_block_start > 0 ? SA[omp_block_start - 1] : n; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) { + libsais_prefetchr(&SA[i + 2 * prefetch_distance]); + libsais_prefetchw(&PLCP[SA[i + prefetch_distance + 0]]); libsais_prefetchw(&PLCP[SA[i + prefetch_distance + 1]]); @@ -7702,7 +7704,8 @@ static void libsais_compute_plcp(const uint8_t * RESTRICT T, sa_sint_t * RESTRIC fast_sint_t i, j, l = 0; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance; i < j; i += 1) { - libsais_prefetch(&T[PLCP[i + prefetch_distance] + l]); + libsais_prefetchw(&PLCP[i + 2 * prefetch_distance]); + libsais_prefetchr(&T[PLCP[i + prefetch_distance] + l]); fast_sint_t k = PLCP[i], m = n - (i > k ? i : k); while (l < m && T[i + l] == T[k + l]) { l++; } @@ -7749,14 +7752,17 @@ static void libsais_compute_lcp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) { - libsais_prefetch(&PLCP[SA[i + prefetch_distance + 0]]); - libsais_prefetch(&PLCP[SA[i + prefetch_distance + 1]]); + libsais_prefetchr(&SA[i + 2 * prefetch_distance]); + libsais_prefetchw(&LCP[i + prefetch_distance]); + + libsais_prefetchr(&PLCP[SA[i + prefetch_distance + 0]]); + libsais_prefetchr(&PLCP[SA[i + prefetch_distance + 1]]); LCP[i + 0] = PLCP[SA[i + 0]]; LCP[i + 1] = PLCP[SA[i + 1]]; - libsais_prefetch(&PLCP[SA[i + prefetch_distance + 2]]); - libsais_prefetch(&PLCP[SA[i + prefetch_distance + 3]]); + libsais_prefetchr(&PLCP[SA[i + prefetch_distance + 2]]); + libsais_prefetchr(&PLCP[SA[i + prefetch_distance + 3]]); LCP[i + 2] = PLCP[SA[i + 2]]; LCP[i + 3] = PLCP[SA[i + 3]];