diff --git a/CHANGES b/CHANGES index 686ea50..beed505 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,6 @@ +Changes in 2.7.0 (April 12, 2022) +- Support for longest common prefix array (LCP) construction. + Changes in 2.6.5 (January 1, 2022) - Exposed functions to construct suffix array of a given integer array. - Improved detection of various compiler intrinsics. diff --git a/Makefile b/Makefile index 6950303..87de04c 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ PROJECT=sais PLIBNAME=lib$(PROJECT) -PVER=2.6.5 +PVER=2.7.0 PSOVER=2 ifeq ($(OS),Windows_NT) PLIBSTATIC=$(PROJECT).a diff --git a/README.md b/README.md index c6127e7..fd33895 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ # libsais -The libsais is a library for fast (see [Benchmarks](#benchmarks) below) linear time suffix array and Burrows-Wheeler transform construction based on induced sorting algorithm described in the following papers: -* Ge Nong, Sen Zhang and Wai Hong Chan *Two Efficient Algorithms for Linear Suffix Array Construction*, 2008 +The libsais is a library for fast (see [Benchmarks](#benchmarks) below) linear time suffix array, longest common prefix array and Burrows-Wheeler transform construction based on induced sorting algorithm described in the following papers: +* Ge Nong, Sen Zhang, Wai Hong Chan *Two Efficient Algorithms for Linear Suffix Array Construction*, 2008 +* Juha Karkkainen, Giovanni Manzini, Simon J. Puglisi *Permuted Longest-Common-Prefix Array*, 2009 * Nataliya Timoshevskaya, Wu-chun Feng *SAIS-OPT: On the characterization and optimization of the SA-IS algorithm for suffix array construction*, 2014 * Jing Yi Xie, Ge Nong, Bin Lao, Wentao Xu *Scalable Suffix Sorting on a Multicore Machine*, 2020 @@ -19,6 +20,8 @@ The libsais provides simple C99 API to construct suffix array and Burrows-Wheele The libsais is released under the [Apache License Version 2.0](LICENSE "Apache license") ## Changes +* April 12, 2022 (2.7.0) + * Support for longest common prefix array (LCP) construction. * January 1, 2022 (2.6.5) * Exposed functions to construct suffix array of a given integer array. * Improved detection of various compiler intrinsics. @@ -41,7 +44,7 @@ The libsais is released under the [Apache License Version 2.0](LICENSE "Apache l * Initial release. ## Versions of the libsais library -* [libsais.c](src/libsais.c) (and corresponding [libsais.h](src/libsais.h)) is for suffix array, forward BWT and reverse BWT construction over 8-bit inputs smaller than 2GB (2147483648 bytes). +* [libsais.c](src/libsais.c) (and corresponding [libsais.h](src/libsais.h)) is for suffix array, PLCP, LCP, forward BWT and reverse BWT construction over 8-bit inputs smaller than 2GB (2147483648 bytes). * This version of the library could also be used to construct suffix array of an integer array (with a caveat that input array must be mutable). * [libsais64.c](src/libsais64.c) (and corresponding [libsais64.h](src/libsais64.h)) is optional extension of the library for inputs larger or equlas to 2GB (2147483648 bytes). * [libsais16.c](src/libsais16.c) (and corresponding [libsais16.h](src/libsais16.h)) is independent version of the library for 16-bit inputs. diff --git a/VERSION b/VERSION index 6816713..9aa3464 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.6.5 \ No newline at end of file +2.7.0 \ No newline at end of file diff --git a/src/libsais.c b/src/libsais.c index 6820fa7..44cdc19 100644 --- a/src/libsais.c +++ b/src/libsais.c @@ -1,7 +1,7 @@ /*-- -This file is a part of libsais, a library for linear time -suffix array and burrows wheeler transform construction. +This file is a part of libsais, a library for linear time suffix array, +longest common prefix array and burrows wheeler transform construction. Copyright (c) 2021-2022 Ilya Grebnov @@ -105,7 +105,7 @@ typedef struct LIBSAIS_UNBWT_CONTEXT #if __has_builtin(__builtin_prefetch) #define HAS_BUILTIN_PREFECTCH #endif -#elif defined(__GNUC__) && ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 2)) || (__GNUC__ >= 4) +#elif defined(__GNUC__) && (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 2)) || (__GNUC__ >= 4)) #define HAS_BUILTIN_PREFECTCH #endif @@ -113,7 +113,7 @@ typedef struct LIBSAIS_UNBWT_CONTEXT #if __has_builtin(__builtin_bswap16) #define HAS_BUILTIN_BSWAP16 #endif -#elif defined(__GNUC__) && ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 8)) || (__GNUC__ >= 5) +#elif defined(__GNUC__) && (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 8)) || (__GNUC__ >= 5)) #define HAS_BUILTIN_BSWAP16 #endif @@ -7645,3 +7645,226 @@ int32_t libsais_unbwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32 } #endif + +static void libsais_compute_phi(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP, sa_sint_t n, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; sa_sint_t k = omp_block_start > 0 ? SA[omp_block_start - 1] : n; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetchw(&PLCP[SA[i + prefetch_distance + 0]]); + libsais_prefetchw(&PLCP[SA[i + prefetch_distance + 1]]); + + PLCP[SA[i + 0]] = k; k = SA[i + 0]; + PLCP[SA[i + 1]] = k; k = SA[i + 1]; + + libsais_prefetchw(&PLCP[SA[i + prefetch_distance + 2]]); + libsais_prefetchw(&PLCP[SA[i + prefetch_distance + 3]]); + + PLCP[SA[i + 2]] = k; k = SA[i + 2]; + PLCP[SA[i + 3]] = k; k = SA[i + 3]; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + PLCP[SA[i]] = k; k = SA[i]; + } +} + +static void libsais_compute_phi_omp(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP, sa_sint_t n, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + libsais_compute_phi(SA, PLCP, n, omp_block_start, omp_block_size); + } +} + +static void libsais_compute_plcp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, fast_sint_t n, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, l = 0; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance; i < j; i += 1) + { + libsais_prefetch(&T[PLCP[i + prefetch_distance] + l]); + + fast_sint_t k = PLCP[i], m = n - (i > k ? i : k); + while (l < m && T[i + l] == T[k + l]) { l++; } + + PLCP[i] = (sa_sint_t)l; l -= (l != 0); + } + + for (j += prefetch_distance; i < j; i += 1) + { + fast_sint_t k = PLCP[i], m = n - (i > k ? i : k); + while (l < m && T[i + l] == T[k + l]) { l++; } + + PLCP[i] = (sa_sint_t)l; l -= (l != 0); + } +} + +static void libsais_compute_plcp_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, sa_sint_t n, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + libsais_compute_plcp(T, PLCP, n, omp_block_start, omp_block_size); + } +} + +static void libsais_compute_lcp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT LCP, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetch(&PLCP[SA[i + prefetch_distance + 0]]); + libsais_prefetch(&PLCP[SA[i + prefetch_distance + 1]]); + + LCP[i + 0] = PLCP[SA[i + 0]]; + LCP[i + 1] = PLCP[SA[i + 1]]; + + libsais_prefetch(&PLCP[SA[i + prefetch_distance + 2]]); + libsais_prefetch(&PLCP[SA[i + prefetch_distance + 3]]); + + LCP[i + 2] = PLCP[SA[i + 2]]; + LCP[i + 3] = PLCP[SA[i + 3]]; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + LCP[i] = PLCP[SA[i]]; + } +} + +static void libsais_compute_lcp_omp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT LCP, sa_sint_t n, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + libsais_compute_lcp(PLCP, SA, LCP, omp_block_start, omp_block_size); + } +} + +int32_t libsais_plcp(const uint8_t * T, const int32_t * SA, int32_t * PLCP, int32_t n) +{ + if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { PLCP[0] = 0; } + return 0; + } + + libsais_compute_phi_omp(SA, PLCP, n, 1); + libsais_compute_plcp_omp(T, PLCP, n, 1); + + return 0; +} + +int32_t libsais_lcp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP, int32_t n) +{ + if ((PLCP == NULL) || (SA == NULL) || (LCP == NULL) || (n < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { LCP[0] = PLCP[SA[0]]; } + return 0; + } + + libsais_compute_lcp_omp(PLCP, SA, LCP, n, 1); + + return 0; +} + +#if defined(_OPENMP) + +int32_t libsais_plcp_omp(const uint8_t * T, const int32_t * SA, int32_t * PLCP, int32_t n, int32_t threads) +{ + if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0) || (threads < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { PLCP[0] = 0; } + return 0; + } + + threads = threads > 0 ? threads : omp_get_max_threads(); + + libsais_compute_phi_omp(SA, PLCP, n, threads); + libsais_compute_plcp_omp(T, PLCP, n, threads); + + return 0; +} + +int32_t libsais_lcp_omp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP, int32_t n, int32_t threads) +{ + if ((PLCP == NULL) || (SA == NULL) || (LCP == NULL) || (n < 0) || (threads < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { LCP[0] = PLCP[SA[0]]; } + return 0; + } + + threads = threads > 0 ? threads : omp_get_max_threads(); + + libsais_compute_lcp_omp(PLCP, SA, LCP, n, threads); + + return 0; +} + +#endif diff --git a/src/libsais.h b/src/libsais.h index cc3b50b..f0f6018 100644 --- a/src/libsais.h +++ b/src/libsais.h @@ -1,7 +1,7 @@ /*-- -This file is a part of libsais, a library for linear time -suffix array and burrows wheeler transform construction. +This file is a part of libsais, a library for linear time suffix array, +longest common prefix array and burrows wheeler transform construction. Copyright (c) 2021-2022 Ilya Grebnov @@ -116,7 +116,7 @@ extern "C" { #endif /** - * Constructs the burrows-wheeler transformed string of a given string. + * Constructs the burrows-wheeler transformed string (BWT) of a given string. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n-1+fs] The temporary array. @@ -128,7 +128,7 @@ extern "C" { int32_t libsais_bwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq); /** - * Constructs the burrows-wheeler transformed string of a given string with auxiliary indexes. + * Constructs the burrows-wheeler transformed string (BWT) of a given string with auxiliary indexes. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n-1+fs] The temporary array. @@ -142,7 +142,7 @@ extern "C" { int32_t libsais_bwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I); /** - * Constructs the burrows-wheeler transformed string of a given string using libsais context. + * Constructs the burrows-wheeler transformed string (BWT) of a given string using libsais context. * @param ctx The libsais context. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). @@ -155,7 +155,7 @@ extern "C" { int32_t libsais_bwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq); /** - * Constructs the burrows-wheeler transformed string of a given string with auxiliary indexes using libsais context. + * Constructs the burrows-wheeler transformed string (BWT) of a given string with auxiliary indexes using libsais context. * @param ctx The libsais context. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). @@ -171,7 +171,7 @@ extern "C" { #if defined(_OPENMP) /** - * Constructs the burrows-wheeler transformed string of a given string in parallel using OpenMP. + * Constructs the burrows-wheeler transformed string (BWT) of a given string in parallel using OpenMP. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n-1+fs] The temporary array. @@ -184,7 +184,7 @@ extern "C" { int32_t libsais_bwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t threads); /** - * Constructs the burrows-wheeler transformed string of a given string with auxiliary indexes in parallel using OpenMP. + * Constructs the burrows-wheeler transformed string (BWT) of a given string with auxiliary indexes in parallel using OpenMP. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n-1+fs] The temporary array. @@ -223,7 +223,7 @@ extern "C" { void libsais_unbwt_free_ctx(void * ctx); /** - * Constructs the original string from a given burrows-wheeler transformed string with primary index. + * Constructs the original string from a given burrows-wheeler transformed string (BWT) with primary index. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). @@ -235,7 +235,7 @@ extern "C" { int32_t libsais_unbwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i); /** - * Constructs the original string from a given burrows-wheeler transformed string with primary index using libsais reverse BWT context. + * Constructs the original string from a given burrows-wheeler transformed string (BWT) with primary index using libsais reverse BWT context. * @param ctx The libsais reverse BWT context. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). @@ -248,7 +248,7 @@ extern "C" { int32_t libsais_unbwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i); /** - * Constructs the original string from a given burrows-wheeler transformed string with auxiliary indexes. + * Constructs the original string from a given burrows-wheeler transformed string (BWT) with auxiliary indexes. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). @@ -261,7 +261,7 @@ extern "C" { int32_t libsais_unbwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I); /** - * Constructs the original string from a given burrows-wheeler transformed string with auxiliary indexes using libsais reverse BWT context. + * Constructs the original string from a given burrows-wheeler transformed string (BWT) with auxiliary indexes using libsais reverse BWT context. * @param ctx The libsais reverse BWT context. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). @@ -276,7 +276,7 @@ extern "C" { #if defined(_OPENMP) /** - * Constructs the original string from a given burrows-wheeler transformed string with primary index in parallel using OpenMP. + * Constructs the original string from a given burrows-wheeler transformed string (BWT) with primary index in parallel using OpenMP. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). @@ -289,7 +289,7 @@ extern "C" { int32_t libsais_unbwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i, int32_t threads); /** - * Constructs the original string from a given burrows-wheeler transformed string with auxiliary indexes in parallel using OpenMP. + * Constructs the original string from a given burrows-wheeler transformed string (BWT) with auxiliary indexes in parallel using OpenMP. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). @@ -303,6 +303,50 @@ extern "C" { int32_t libsais_unbwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I, int32_t threads); #endif + /** + * Constructs the permuted longest common prefix array (PLCP) of a given string and a suffix array. + * @param T [0..n-1] The input string. + * @param SA [0..n-1] The input suffix array. + * @param PLCP [0..n-1] The output permuted longest common prefix array. + * @param n The length of the string and the suffix array. + * @return 0 if no error occurred, -1 otherwise. + */ + int32_t libsais_plcp(const uint8_t * T, const int32_t * SA, int32_t * PLCP, int32_t n); + + /** + * Constructs the longest common prefix array (LCP) of a given permuted longest common prefix array (PLCP) and a suffix array. + * @param PLCP [0..n-1] The input permuted longest common prefix array. + * @param SA [0..n-1] The input suffix array. + * @param LCP [0..n-1] The output longest common prefix array (can be SA). + * @param n The length of the permuted longest common prefix array and the suffix array. + * @return 0 if no error occurred, -1 otherwise. + */ + int32_t libsais_lcp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP, int32_t n); + +#if defined(_OPENMP) + /** + * Constructs the permuted longest common prefix array (PLCP) of a given string and a suffix array in parallel using OpenMP. + * @param T [0..n-1] The input string. + * @param SA [0..n-1] The input suffix array. + * @param PLCP [0..n-1] The output permuted longest common prefix array. + * @param n The length of the string and the suffix array. + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return 0 if no error occurred, -1 otherwise. + */ + int32_t libsais_plcp_omp(const uint8_t * T, const int32_t * SA, int32_t * PLCP, int32_t n, int32_t threads); + + /** + * Constructs the longest common prefix array (LCP) of a given permuted longest common prefix array (PLCP) and a suffix array in parallel using OpenMP. + * @param PLCP [0..n-1] The input permuted longest common prefix array. + * @param SA [0..n-1] The input suffix array. + * @param LCP [0..n-1] The output longest common prefix array (can be SA). + * @param n The length of the permuted longest common prefix array and the suffix array. + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return 0 if no error occurred, -1 otherwise. + */ + int32_t libsais_lcp_omp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP, int32_t n, int32_t threads); +#endif + #ifdef __cplusplus } #endif diff --git a/src/libsais16.c b/src/libsais16.c index 3d09d30..49e6953 100644 --- a/src/libsais16.c +++ b/src/libsais16.c @@ -1,7 +1,7 @@ /*-- -This file is a part of libsais, a library for linear time -suffix array and burrows wheeler transform construction. +This file is a part of libsais, a library for linear time suffix array, +longest common prefix array and burrows wheeler transform construction. Copyright (c) 2021-2022 Ilya Grebnov @@ -105,7 +105,7 @@ typedef struct LIBSAIS_UNBWT_CONTEXT #if __has_builtin(__builtin_prefetch) #define HAS_BUILTIN_PREFECTCH #endif -#elif defined(__GNUC__) && ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 2)) || (__GNUC__ >= 4) +#elif defined(__GNUC__) && (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 2)) || (__GNUC__ >= 4)) #define HAS_BUILTIN_PREFECTCH #endif @@ -6915,7 +6915,7 @@ static void libsais16_unbwt_init_parallel(const uint16_t * RESTRICT T, sa_uint_t fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ALPHABET_SIZE - omp_block_start; - memset(bucket2 + omp_block_start, 0, omp_block_size * sizeof(sa_uint_t)); + memset(bucket2 + omp_block_start, 0, (size_t)omp_block_size * sizeof(sa_uint_t)); fast_sint_t t; for (t = 0; t < omp_num_threads; ++t, bucket2_temp += ALPHABET_SIZE) @@ -7350,3 +7350,226 @@ int32_t libsais16_unbwt_aux_omp(const uint16_t * T, uint16_t * U, int32_t * A, i } #endif + +static void libsais16_compute_phi(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP, sa_sint_t n, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; sa_sint_t k = omp_block_start > 0 ? SA[omp_block_start - 1] : n; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais16_prefetchw(&PLCP[SA[i + prefetch_distance + 0]]); + libsais16_prefetchw(&PLCP[SA[i + prefetch_distance + 1]]); + + PLCP[SA[i + 0]] = k; k = SA[i + 0]; + PLCP[SA[i + 1]] = k; k = SA[i + 1]; + + libsais16_prefetchw(&PLCP[SA[i + prefetch_distance + 2]]); + libsais16_prefetchw(&PLCP[SA[i + prefetch_distance + 3]]); + + PLCP[SA[i + 2]] = k; k = SA[i + 2]; + PLCP[SA[i + 3]] = k; k = SA[i + 3]; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + PLCP[SA[i]] = k; k = SA[i]; + } +} + +static void libsais16_compute_phi_omp(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP, sa_sint_t n, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + libsais16_compute_phi(SA, PLCP, n, omp_block_start, omp_block_size); + } +} + +static void libsais16_compute_plcp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, fast_sint_t n, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, l = 0; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance; i < j; i += 1) + { + libsais16_prefetch(&T[PLCP[i + prefetch_distance] + l]); + + fast_sint_t k = PLCP[i], m = n - (i > k ? i : k); + while (l < m && T[i + l] == T[k + l]) { l++; } + + PLCP[i] = (sa_sint_t)l; l -= (l != 0); + } + + for (j += prefetch_distance; i < j; i += 1) + { + fast_sint_t k = PLCP[i], m = n - (i > k ? i : k); + while (l < m && T[i + l] == T[k + l]) { l++; } + + PLCP[i] = (sa_sint_t)l; l -= (l != 0); + } +} + +static void libsais16_compute_plcp_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, sa_sint_t n, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + libsais16_compute_plcp(T, PLCP, n, omp_block_start, omp_block_size); + } +} + +static void libsais16_compute_lcp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT LCP, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais16_prefetch(&PLCP[SA[i + prefetch_distance + 0]]); + libsais16_prefetch(&PLCP[SA[i + prefetch_distance + 1]]); + + LCP[i + 0] = PLCP[SA[i + 0]]; + LCP[i + 1] = PLCP[SA[i + 1]]; + + libsais16_prefetch(&PLCP[SA[i + prefetch_distance + 2]]); + libsais16_prefetch(&PLCP[SA[i + prefetch_distance + 3]]); + + LCP[i + 2] = PLCP[SA[i + 2]]; + LCP[i + 3] = PLCP[SA[i + 3]]; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + LCP[i] = PLCP[SA[i]]; + } +} + +static void libsais16_compute_lcp_omp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT LCP, sa_sint_t n, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + libsais16_compute_lcp(PLCP, SA, LCP, omp_block_start, omp_block_size); + } +} + +int32_t libsais16_plcp(const uint16_t * T, const int32_t * SA, int32_t * PLCP, int32_t n) +{ + if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { PLCP[0] = 0; } + return 0; + } + + libsais16_compute_phi_omp(SA, PLCP, n, 1); + libsais16_compute_plcp_omp(T, PLCP, n, 1); + + return 0; +} + +int32_t libsais16_lcp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP, int32_t n) +{ + if ((PLCP == NULL) || (SA == NULL) || (LCP == NULL) || (n < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { LCP[0] = PLCP[SA[0]]; } + return 0; + } + + libsais16_compute_lcp_omp(PLCP, SA, LCP, n, 1); + + return 0; +} + +#if defined(_OPENMP) + +int32_t libsais16_plcp_omp(const uint16_t * T, const int32_t * SA, int32_t * PLCP, int32_t n, int32_t threads) +{ + if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0) || (threads < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { PLCP[0] = 0; } + return 0; + } + + threads = threads > 0 ? threads : omp_get_max_threads(); + + libsais16_compute_phi_omp(SA, PLCP, n, threads); + libsais16_compute_plcp_omp(T, PLCP, n, threads); + + return 0; +} + +int32_t libsais16_lcp_omp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP, int32_t n, int32_t threads) +{ + if ((PLCP == NULL) || (SA == NULL) || (LCP == NULL) || (n < 0) || (threads < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { LCP[0] = PLCP[SA[0]]; } + return 0; + } + + threads = threads > 0 ? threads : omp_get_max_threads(); + + libsais16_compute_lcp_omp(PLCP, SA, LCP, n, threads); + + return 0; +} + +#endif diff --git a/src/libsais16.h b/src/libsais16.h index 314c90f..910efd1 100644 --- a/src/libsais16.h +++ b/src/libsais16.h @@ -1,7 +1,7 @@ /*-- -This file is a part of libsais, a library for linear time -suffix array and burrows wheeler transform construction. +This file is a part of libsais, a library for linear time suffix array, +longest common prefix array and burrows wheeler transform construction. Copyright (c) 2021-2022 Ilya Grebnov @@ -91,7 +91,7 @@ extern "C" { #endif /** - * Constructs the burrows-wheeler transformed 16-bit string of a given 16-bit string. + * Constructs the burrows-wheeler transformed 16-bit string (BWT) of a given 16-bit string. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n-1+fs] The temporary array. @@ -103,7 +103,7 @@ extern "C" { int32_t libsais16_bwt(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq); /** - * Constructs the burrows-wheeler transformed 16-bit string of a given 16-bit string with auxiliary indexes. + * Constructs the burrows-wheeler transformed 16-bit string (BWT) of a given 16-bit string with auxiliary indexes. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n-1+fs] The temporary array. @@ -117,7 +117,7 @@ extern "C" { int32_t libsais16_bwt_aux(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I); /** - * Constructs the burrows-wheeler transformed 16-bit string of a given 16-bit string using libsais16 context. + * Constructs the burrows-wheeler transformed 16-bit string (BWT) of a given 16-bit string using libsais16 context. * @param ctx The libsais16 context. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). @@ -130,7 +130,7 @@ extern "C" { int32_t libsais16_bwt_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq); /** - * Constructs the burrows-wheeler transformed 16-bit string of a given 16-bit string with auxiliary indexes using libsais16 context. + * Constructs the burrows-wheeler transformed 16-bit string (BWT) of a given 16-bit string with auxiliary indexes using libsais16 context. * @param ctx The libsais16 context. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). @@ -146,7 +146,7 @@ extern "C" { #if defined(_OPENMP) /** - * Constructs the burrows-wheeler transformed 16-bit string of a given 16-bit string in parallel using OpenMP. + * Constructs the burrows-wheeler transformed 16-bit string (BWT) of a given 16-bit string in parallel using OpenMP. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n-1+fs] The temporary array. @@ -159,7 +159,7 @@ extern "C" { int32_t libsais16_bwt_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t threads); /** - * Constructs the burrows-wheeler transformed 16-bit string of a given 16-bit string with auxiliary indexes in parallel using OpenMP. + * Constructs the burrows-wheeler transformed 16-bit string (BWT) of a given 16-bit string with auxiliary indexes in parallel using OpenMP. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n-1+fs] The temporary array. @@ -198,7 +198,7 @@ extern "C" { void libsais16_unbwt_free_ctx(void * ctx); /** - * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string with primary index. + * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string (BWT) with primary index. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). @@ -210,7 +210,7 @@ extern "C" { int32_t libsais16_unbwt(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i); /** - * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string with primary index using libsais16 reverse BWT context. + * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string (BWT) with primary index using libsais16 reverse BWT context. * @param ctx The libsais16 reverse BWT context. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). @@ -223,7 +223,7 @@ extern "C" { int32_t libsais16_unbwt_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i); /** - * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string with auxiliary indexes. + * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string (BWT) with auxiliary indexes. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). @@ -236,7 +236,7 @@ extern "C" { int32_t libsais16_unbwt_aux(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I); /** - * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string with auxiliary indexes using libsais16 reverse BWT context. + * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string (BWT) with auxiliary indexes using libsais16 reverse BWT context. * @param ctx The libsais16 reverse BWT context. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). @@ -251,7 +251,7 @@ extern "C" { #if defined(_OPENMP) /** - * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string with primary index in parallel using OpenMP. + * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string (BWT) with primary index in parallel using OpenMP. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). @@ -264,7 +264,7 @@ extern "C" { int32_t libsais16_unbwt_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i, int32_t threads); /** - * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string with auxiliary indexes in parallel using OpenMP. + * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string (BWT) with auxiliary indexes in parallel using OpenMP. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). @@ -278,6 +278,50 @@ extern "C" { int32_t libsais16_unbwt_aux_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I, int32_t threads); #endif + /** + * Constructs the permuted longest common prefix array (PLCP) of a given 16-bit string and a suffix array. + * @param T [0..n-1] The input 16-bit string. + * @param SA [0..n-1] The input suffix array. + * @param PLCP [0..n-1] The output permuted longest common prefix array. + * @param n The length of the 16-bit string and the suffix array. + * @return 0 if no error occurred, -1 otherwise. + */ + int32_t libsais16_plcp(const uint16_t * T, const int32_t * SA, int32_t * PLCP, int32_t n); + + /** + * Constructs the longest common prefix array (LCP) of a given permuted longest common prefix array (PLCP) and a suffix array. + * @param PLCP [0..n-1] The input permuted longest common prefix array. + * @param SA [0..n-1] The input suffix array. + * @param LCP [0..n-1] The output longest common prefix array (can be SA). + * @param n The length of the permuted longest common prefix array and the suffix array. + * @return 0 if no error occurred, -1 otherwise. + */ + int32_t libsais16_lcp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP, int32_t n); + +#if defined(_OPENMP) + /** + * Constructs the permuted longest common prefix array (PLCP) of a given 16-bit string and a suffix array in parallel using OpenMP. + * @param T [0..n-1] The input 16-bit string. + * @param SA [0..n-1] The input suffix array. + * @param PLCP [0..n-1] The output permuted longest common prefix array. + * @param n The length of the 16-bit string and the suffix array. + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return 0 if no error occurred, -1 otherwise. + */ + int32_t libsais16_plcp_omp(const uint16_t * T, const int32_t * SA, int32_t * PLCP, int32_t n, int32_t threads); + + /** + * Constructs the longest common prefix array (LCP) of a given permuted longest common prefix array (PLCP) and a suffix array in parallel using OpenMP. + * @param PLCP [0..n-1] The input permuted longest common prefix array. + * @param SA [0..n-1] The input suffix array. + * @param LCP [0..n-1] The output longest common prefix array (can be SA). + * @param n The length of the permuted longest common prefix array and the suffix array. + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return 0 if no error occurred, -1 otherwise. + */ + int32_t libsais16_lcp_omp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP, int32_t n, int32_t threads); +#endif + #ifdef __cplusplus } #endif diff --git a/src/libsais64.c b/src/libsais64.c index fab14a4..6145a26 100644 --- a/src/libsais64.c +++ b/src/libsais64.c @@ -1,7 +1,7 @@ /*-- -This file is a part of libsais, a library for linear time -suffix array and burrows wheeler transform construction. +This file is a part of libsais, a library for linear time suffix array, +longest common prefix array and burrows wheeler transform construction. Copyright (c) 2021-2022 Ilya Grebnov @@ -106,7 +106,7 @@ typedef struct LIBSAIS_UNBWT_CONTEXT #if __has_builtin(__builtin_prefetch) #define HAS_BUILTIN_PREFECTCH #endif -#elif defined(__GNUC__) && ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 2)) || (__GNUC__ >= 4) +#elif defined(__GNUC__) && (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 2)) || (__GNUC__ >= 4)) #define HAS_BUILTIN_PREFECTCH #endif @@ -114,7 +114,7 @@ typedef struct LIBSAIS_UNBWT_CONTEXT #if __has_builtin(__builtin_bswap16) #define HAS_BUILTIN_BSWAP16 #endif -#elif defined(__GNUC__) && ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 8)) || (__GNUC__ >= 5) +#elif defined(__GNUC__) && (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 8)) || (__GNUC__ >= 5)) #define HAS_BUILTIN_BSWAP16 #endif @@ -7545,3 +7545,226 @@ int64_t libsais64_unbwt_aux_omp(const uint8_t * T, uint8_t * U, int64_t * A, int } #endif + +static void libsais64_compute_phi(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP, sa_sint_t n, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; sa_sint_t k = omp_block_start > 0 ? SA[omp_block_start - 1] : n; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais64_prefetchw(&PLCP[SA[i + prefetch_distance + 0]]); + libsais64_prefetchw(&PLCP[SA[i + prefetch_distance + 1]]); + + PLCP[SA[i + 0]] = k; k = SA[i + 0]; + PLCP[SA[i + 1]] = k; k = SA[i + 1]; + + libsais64_prefetchw(&PLCP[SA[i + prefetch_distance + 2]]); + libsais64_prefetchw(&PLCP[SA[i + prefetch_distance + 3]]); + + PLCP[SA[i + 2]] = k; k = SA[i + 2]; + PLCP[SA[i + 3]] = k; k = SA[i + 3]; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + PLCP[SA[i]] = k; k = SA[i]; + } +} + +static void libsais64_compute_phi_omp(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP, sa_sint_t n, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + libsais64_compute_phi(SA, PLCP, n, omp_block_start, omp_block_size); + } +} + +static void libsais64_compute_plcp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, fast_sint_t n, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j, l = 0; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance; i < j; i += 1) + { + libsais64_prefetch(&T[PLCP[i + prefetch_distance] + l]); + + fast_sint_t k = PLCP[i], m = n - (i > k ? i : k); + while (l < m && T[i + l] == T[k + l]) { l++; } + + PLCP[i] = (sa_sint_t)l; l -= (l != 0); + } + + for (j += prefetch_distance; i < j; i += 1) + { + fast_sint_t k = PLCP[i], m = n - (i > k ? i : k); + while (l < m && T[i + l] == T[k + l]) { l++; } + + PLCP[i] = (sa_sint_t)l; l -= (l != 0); + } +} + +static void libsais64_compute_plcp_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, sa_sint_t n, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + libsais64_compute_plcp(T, PLCP, n, omp_block_start, omp_block_size); + } +} + +static void libsais64_compute_lcp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT LCP, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +{ + const fast_sint_t prefetch_distance = 32; + + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) + { + libsais64_prefetch(&PLCP[SA[i + prefetch_distance + 0]]); + libsais64_prefetch(&PLCP[SA[i + prefetch_distance + 1]]); + + LCP[i + 0] = PLCP[SA[i + 0]]; + LCP[i + 1] = PLCP[SA[i + 1]]; + + libsais64_prefetch(&PLCP[SA[i + prefetch_distance + 2]]); + libsais64_prefetch(&PLCP[SA[i + prefetch_distance + 3]]); + + LCP[i + 2] = PLCP[SA[i + 2]]; + LCP[i + 3] = PLCP[SA[i + 3]]; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + LCP[i] = PLCP[SA[i]]; + } +} + +static void libsais64_compute_lcp_omp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT LCP, sa_sint_t n, sa_sint_t threads) +{ +#if defined(_OPENMP) + #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) +#endif + { +#if defined(_OPENMP) + fast_sint_t omp_thread_num = omp_get_thread_num(); + fast_sint_t omp_num_threads = omp_get_num_threads(); +#else + UNUSED(threads); + + fast_sint_t omp_thread_num = 0; + fast_sint_t omp_num_threads = 1; +#endif + fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); + fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; + fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; + + libsais64_compute_lcp(PLCP, SA, LCP, omp_block_start, omp_block_size); + } +} + +int64_t libsais64_plcp(const uint8_t * T, const int64_t * SA, int64_t * PLCP, int64_t n) +{ + if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { PLCP[0] = 0; } + return 0; + } + + libsais64_compute_phi_omp(SA, PLCP, n, 1); + libsais64_compute_plcp_omp(T, PLCP, n, 1); + + return 0; +} + +int64_t libsais64_lcp(const int64_t * PLCP, const int64_t * SA, int64_t * LCP, int64_t n) +{ + if ((PLCP == NULL) || (SA == NULL) || (LCP == NULL) || (n < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { LCP[0] = PLCP[SA[0]]; } + return 0; + } + + libsais64_compute_lcp_omp(PLCP, SA, LCP, n, 1); + + return 0; +} + +#if defined(_OPENMP) + +int64_t libsais64_plcp_omp(const uint8_t * T, const int64_t * SA, int64_t * PLCP, int64_t n, int64_t threads) +{ + if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0) || (threads < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { PLCP[0] = 0; } + return 0; + } + + threads = threads > 0 ? threads : omp_get_max_threads(); + + libsais64_compute_phi_omp(SA, PLCP, n, threads); + libsais64_compute_plcp_omp(T, PLCP, n, threads); + + return 0; +} + +int64_t libsais64_lcp_omp(const int64_t * PLCP, const int64_t * SA, int64_t * LCP, int64_t n, int64_t threads) +{ + if ((PLCP == NULL) || (SA == NULL) || (LCP == NULL) || (n < 0) || (threads < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { LCP[0] = PLCP[SA[0]]; } + return 0; + } + + threads = threads > 0 ? threads : omp_get_max_threads(); + + libsais64_compute_lcp_omp(PLCP, SA, LCP, n, threads); + + return 0; +} + +#endif diff --git a/src/libsais64.h b/src/libsais64.h index 17908ae..f909572 100644 --- a/src/libsais64.h +++ b/src/libsais64.h @@ -1,7 +1,7 @@ /*-- -This file is a part of libsais, a library for linear time -suffix array and burrows wheeler transform construction. +This file is a part of libsais, a library for linear time suffix array, +longest common prefix array and burrows wheeler transform construction. Copyright (c) 2021-2022 Ilya Grebnov @@ -56,7 +56,7 @@ extern "C" { #endif /** - * Constructs the burrows-wheeler transformed string of a given string. + * Constructs the burrows-wheeler transformed string (BWT) of a given string. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n-1+fs] The temporary array. @@ -68,7 +68,7 @@ extern "C" { int64_t libsais64_bwt(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq); /** - * Constructs the burrows-wheeler transformed string of a given string with auxiliary indexes. + * Constructs the burrows-wheeler transformed string (BWT) of a given string with auxiliary indexes. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n-1+fs] The temporary array. @@ -83,7 +83,7 @@ extern "C" { #if defined(_OPENMP) /** - * Constructs the burrows-wheeler transformed string of a given string in parallel using OpenMP. + * Constructs the burrows-wheeler transformed string (BWT) of a given string in parallel using OpenMP. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n-1+fs] The temporary array. @@ -96,7 +96,7 @@ extern "C" { int64_t libsais64_bwt_omp(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq, int64_t threads); /** - * Constructs the burrows-wheeler transformed string of a given string with auxiliary indexes in parallel using OpenMP. + * Constructs the burrows-wheeler transformed string (BWT) of a given string with auxiliary indexes in parallel using OpenMP. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n-1+fs] The temporary array. @@ -112,7 +112,7 @@ extern "C" { #endif /** - * Constructs the original string from a given burrows-wheeler transformed string with primary index. + * Constructs the original string from a given burrows-wheeler transformed string (BWT) with primary index. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). @@ -124,7 +124,7 @@ extern "C" { int64_t libsais64_unbwt(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t i); /** - * Constructs the original string from a given burrows-wheeler transformed string with auxiliary indexes. + * Constructs the original string from a given burrows-wheeler transformed string (BWT) with auxiliary indexes. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). @@ -138,7 +138,7 @@ extern "C" { #if defined(_OPENMP) /** - * Constructs the original string from a given burrows-wheeler transformed string with primary index in parallel using OpenMP. + * Constructs the original string from a given burrows-wheeler transformed string (BWT) with primary index in parallel using OpenMP. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). @@ -151,7 +151,7 @@ extern "C" { int64_t libsais64_unbwt_omp(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t i, int64_t threads); /** - * Constructs the original string from a given burrows-wheeler transformed string with auxiliary indexes in parallel using OpenMP. + * Constructs the original string from a given burrows-wheeler transformed string (BWT) with auxiliary indexes in parallel using OpenMP. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). @@ -165,6 +165,50 @@ extern "C" { int64_t libsais64_unbwt_aux_omp(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t r, const int64_t * I, int64_t threads); #endif + /** + * Constructs the permuted longest common prefix array (PLCP) of a given string and a suffix array. + * @param T [0..n-1] The input string. + * @param SA [0..n-1] The input suffix array. + * @param PLCP [0..n-1] The output permuted longest common prefix array. + * @param n The length of the string and the suffix array. + * @return 0 if no error occurred, -1 otherwise. + */ + int64_t libsais64_plcp(const uint8_t * T, const int64_t * SA, int64_t * PLCP, int64_t n); + + /** + * Constructs the longest common prefix array (LCP) of a given permuted longest common prefix array (PLCP) and a suffix array. + * @param PLCP [0..n-1] The input permuted longest common prefix array. + * @param SA [0..n-1] The input suffix array. + * @param LCP [0..n-1] The output longest common prefix array (can be SA). + * @param n The length of the permuted longest common prefix array and the suffix array. + * @return 0 if no error occurred, -1 otherwise. + */ + int64_t libsais64_lcp(const int64_t * PLCP, const int64_t * SA, int64_t * LCP, int64_t n); + +#if defined(_OPENMP) + /** + * Constructs the permuted longest common prefix array (PLCP) of a given string and a suffix array in parallel using OpenMP. + * @param T [0..n-1] The input string. + * @param SA [0..n-1] The input suffix array. + * @param PLCP [0..n-1] The output permuted longest common prefix array. + * @param n The length of the string and the suffix array. + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return 0 if no error occurred, -1 otherwise. + */ + int64_t libsais64_plcp_omp(const uint8_t * T, const int64_t * SA, int64_t * PLCP, int64_t n, int64_t threads); + + /** + * Constructs the longest common prefix array (LCP) of a given permuted longest common prefix array (PLCP) and a suffix array in parallel using OpenMP. + * @param PLCP [0..n-1] The input permuted longest common prefix array. + * @param SA [0..n-1] The input suffix array. + * @param LCP [0..n-1] The output longest common prefix array (can be SA). + * @param n The length of the permuted longest common prefix array and the suffix array. + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return 0 if no error occurred, -1 otherwise. + */ + int64_t libsais64_lcp_omp(const int64_t * PLCP, const int64_t * SA, int64_t * LCP, int64_t n, int64_t threads); +#endif + #ifdef __cplusplus } #endif