From 381e3e53042dcf8f839ca7ee72d1cea8073d48c2 Mon Sep 17 00:00:00 2001 From: Ilya Grebnov Date: Fri, 23 Feb 2024 20:08:16 -0800 Subject: [PATCH] Resolved strict aliasing violation resulted in invalid code generation by Intel compiler. --- CHANGES | 4 ++ CMakeLists.txt | 2 +- README.md | 5 +- VERSION | 2 +- include/libsais.h | 6 +-- include/libsais16.h | 64 ++++++++++++------------ include/libsais64.h | 42 ++++++++-------- src/libsais.c | 65 ++++++++++++++----------- src/libsais16.c | 61 +++++++++++++---------- src/libsais64.c | 115 ++++++++++++++++++++++++++++---------------- 10 files changed, 213 insertions(+), 153 deletions(-) diff --git a/CHANGES b/CHANGES index 22349af..64eebfb 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,7 @@ +Changes in 2.7.4 (February 23, 2024) +- Improved performance of suffix array and burrows wheeler transform construction on degenerate inputs. +- Resolved strict aliasing violation resulted in invalid code generation by Intel compiler. + Changes in 2.7.3 (April 21, 2023) - CMake script for library build and integration with other projects. diff --git a/CMakeLists.txt b/CMakeLists.txt index 310dca3..3bb5e26 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.10) -project(libsais VERSION 2.7.3 LANGUAGES C DESCRIPTION "libsais is a library for linear time suffix array, longest common prefix array and burrows wheeler transform construction based on induced sorting algorithm.") +project(libsais VERSION 2.7.4 LANGUAGES C DESCRIPTION "libsais is a library for linear time suffix array, longest common prefix array and burrows wheeler transform construction based on induced sorting algorithm.") set(CMAKE_C_STANDARD 99) set(CMAKE_C_STANDARD_REQUIRED ON) diff --git a/README.md b/README.md index ad5c6b4..0cd2256 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ The libsais is a library for fast (see [Benchmarks](#benchmarks) below) linear t * Nataliya Timoshevskaya, Wu-chun Feng *SAIS-OPT: On the characterization and optimization of the SA-IS algorithm for suffix array construction*, 2014 * Jing Yi Xie, Ge Nong, Bin Lao, Wentao Xu *Scalable Suffix Sorting on a Multicore Machine*, 2020 -Copyright (c) 2021-2022 Ilya Grebnov +Copyright (c) 2021-2024 Ilya Grebnov >The libsais is inspired by [libdivsufsort](https://github.com/y-256/libdivsufsort), [sais](https://sites.google.com/site/yuta256/sais) libraries by Yuta Mori and [msufsort](https://github.com/michaelmaniscalco/msufsort) by Michael Maniscalco. @@ -23,6 +23,9 @@ The libsais provides simple C99 API to construct suffix array and Burrows-Wheele The libsais is released under the [Apache License Version 2.0](LICENSE "Apache license") ## Changes +* February 23, 2024 (2.7.4) + * Improved performance of suffix array and burrows wheeler transform construction on degenerate inputs. + * Resolved strict aliasing violation resulted in invalid code generation by Intel compiler. * April 21, 2023 (2.7.3) * CMake script for library build and integration with other projects. * April 18, 2023 (2.7.2) diff --git a/VERSION b/VERSION index e2bdf6e..74500ce 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.7.3 \ No newline at end of file +2.7.4 \ No newline at end of file diff --git a/include/libsais.h b/include/libsais.h index 8d38774..5cff9f3 100644 --- a/include/libsais.h +++ b/include/libsais.h @@ -3,7 +3,7 @@ This file is a part of libsais, a library for linear time suffix array, longest common prefix array and burrows wheeler transform construction. - Copyright (c) 2021-2022 Ilya Grebnov + Copyright (c) 2021-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -26,8 +26,8 @@ Please see the file LICENSE for full copyright information. #define LIBSAIS_VERSION_MAJOR 2 #define LIBSAIS_VERSION_MINOR 7 -#define LIBSAIS_VERSION_PATCH 3 -#define LIBSAIS_VERSION_STRING "2.7.3" +#define LIBSAIS_VERSION_PATCH 4 +#define LIBSAIS_VERSION_STRING "2.7.4" #ifdef _WIN32 #ifdef LIBSAIS_SHARED diff --git a/include/libsais16.h b/include/libsais16.h index af7dee1..d603aa4 100644 --- a/include/libsais16.h +++ b/include/libsais16.h @@ -3,7 +3,7 @@ This file is a part of libsais, a library for linear time suffix array, longest common prefix array and burrows wheeler transform construction. - Copyright (c) 2021-2022 Ilya Grebnov + Copyright (c) 2021-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -26,21 +26,21 @@ Please see the file LICENSE for full copyright information. #define LIBSAIS16_VERSION_MAJOR 2 #define LIBSAIS16_VERSION_MINOR 7 -#define LIBSAIS16_VERSION_PATCH 3 -#define LIBSAIS16_VERSION_STRING "2.7.3" +#define LIBSAIS16_VERSION_PATCH 4 +#define LIBSAIS16_VERSION_STRING "2.7.4" #ifdef _WIN32 #ifdef LIBSAIS_SHARED #ifdef LIBSAIS_EXPORTS - #define LIBSAIS_API __declspec(dllexport) + #define LIBSAIS16_API __declspec(dllexport) #else - #define LIBSAIS_API __declspec(dllimport) + #define LIBSAIS16_API __declspec(dllimport) #endif #else - #define LIBSAIS_API + #define LIBSAIS16_API #endif #else - #define LIBSAIS_API + #define LIBSAIS16_API #endif #ifdef __cplusplus @@ -54,7 +54,7 @@ extern "C" { * In multi-threaded environments, use one context per thread for parallel executions. * @return the libsais16 context, NULL otherwise. */ - LIBSAIS_API void * libsais16_create_ctx(void); + LIBSAIS16_API void * libsais16_create_ctx(void); #if defined(LIBSAIS_OPENMP) /** @@ -63,14 +63,14 @@ extern "C" { * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return the libsais16 context, NULL otherwise. */ - LIBSAIS_API void * libsais16_create_ctx_omp(int32_t threads); + LIBSAIS16_API void * libsais16_create_ctx_omp(int32_t threads); #endif /** * Destroys the libsass context and free previusly allocated memory. * @param ctx The libsais16 context (can be NULL). */ - LIBSAIS_API void libsais16_free_ctx(void * ctx); + LIBSAIS16_API void libsais16_free_ctx(void * ctx); /** * Constructs the suffix array of a given 16-bit string. @@ -81,7 +81,7 @@ extern "C" { * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). * @return 0 if no error occurred, -1 or -2 otherwise. */ - LIBSAIS_API int32_t libsais16(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq); + LIBSAIS16_API int32_t libsais16(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq); /** * Constructs the suffix array of a given 16-bit string using libsais16 context. @@ -93,7 +93,7 @@ extern "C" { * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). * @return 0 if no error occurred, -1 or -2 otherwise. */ - LIBSAIS_API int32_t libsais16_ctx(const void * ctx, const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq); + LIBSAIS16_API int32_t libsais16_ctx(const void * ctx, const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq); #if defined(LIBSAIS_OPENMP) /** @@ -106,7 +106,7 @@ extern "C" { * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ - LIBSAIS_API int32_t libsais16_omp(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads); + LIBSAIS16_API int32_t libsais16_omp(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads); #endif /** @@ -119,7 +119,7 @@ extern "C" { * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). * @return The primary index if no error occurred, -1 or -2 otherwise. */ - LIBSAIS_API int32_t libsais16_bwt(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq); + LIBSAIS16_API int32_t libsais16_bwt(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq); /** * Constructs the burrows-wheeler transformed 16-bit string (BWT) of a given 16-bit string with auxiliary indexes. @@ -133,7 +133,7 @@ extern "C" { * @param I [0..(n-1)/r] The output auxiliary indexes. * @return 0 if no error occurred, -1 or -2 otherwise. */ - LIBSAIS_API int32_t libsais16_bwt_aux(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I); + LIBSAIS16_API int32_t libsais16_bwt_aux(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I); /** * Constructs the burrows-wheeler transformed 16-bit string (BWT) of a given 16-bit string using libsais16 context. @@ -146,7 +146,7 @@ extern "C" { * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). * @return The primary index if no error occurred, -1 or -2 otherwise. */ - LIBSAIS_API int32_t libsais16_bwt_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq); + LIBSAIS16_API int32_t libsais16_bwt_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq); /** * Constructs the burrows-wheeler transformed 16-bit string (BWT) of a given 16-bit string with auxiliary indexes using libsais16 context. @@ -161,7 +161,7 @@ extern "C" { * @param I [0..(n-1)/r] The output auxiliary indexes. * @return 0 if no error occurred, -1 or -2 otherwise. */ - LIBSAIS_API int32_t libsais16_bwt_aux_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I); + LIBSAIS16_API int32_t libsais16_bwt_aux_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I); #if defined(LIBSAIS_OPENMP) /** @@ -175,7 +175,7 @@ extern "C" { * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return The primary index if no error occurred, -1 or -2 otherwise. */ - LIBSAIS_API int32_t libsais16_bwt_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t threads); + LIBSAIS16_API int32_t libsais16_bwt_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t threads); /** * Constructs the burrows-wheeler transformed 16-bit string (BWT) of a given 16-bit string with auxiliary indexes in parallel using OpenMP. @@ -190,7 +190,7 @@ extern "C" { * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ - LIBSAIS_API int32_t libsais16_bwt_aux_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I, int32_t threads); + LIBSAIS16_API int32_t libsais16_bwt_aux_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I, int32_t threads); #endif /** @@ -198,7 +198,7 @@ extern "C" { * In multi-threaded environments, use one context per thread for parallel executions. * @return the libsais16 context, NULL otherwise. */ - LIBSAIS_API void * libsais16_unbwt_create_ctx(void); + LIBSAIS16_API void * libsais16_unbwt_create_ctx(void); #if defined(LIBSAIS_OPENMP) /** @@ -207,14 +207,14 @@ extern "C" { * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return the libsais16 context, NULL otherwise. */ - LIBSAIS_API void * libsais16_unbwt_create_ctx_omp(int32_t threads); + LIBSAIS16_API void * libsais16_unbwt_create_ctx_omp(int32_t threads); #endif /** * Destroys the libsass reverse BWT context and free previusly allocated memory. * @param ctx The libsais16 context (can be NULL). */ - LIBSAIS_API void libsais16_unbwt_free_ctx(void * ctx); + LIBSAIS16_API void libsais16_unbwt_free_ctx(void * ctx); /** * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string (BWT) with primary index. @@ -226,7 +226,7 @@ extern "C" { * @param i The primary index. * @return 0 if no error occurred, -1 or -2 otherwise. */ - LIBSAIS_API int32_t libsais16_unbwt(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i); + LIBSAIS16_API int32_t libsais16_unbwt(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i); /** * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string (BWT) with primary index using libsais16 reverse BWT context. @@ -239,7 +239,7 @@ extern "C" { * @param i The primary index. * @return 0 if no error occurred, -1 or -2 otherwise. */ - LIBSAIS_API int32_t libsais16_unbwt_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i); + LIBSAIS16_API int32_t libsais16_unbwt_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i); /** * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string (BWT) with auxiliary indexes. @@ -252,7 +252,7 @@ extern "C" { * @param I [0..(n-1)/r] The input auxiliary indexes. * @return 0 if no error occurred, -1 or -2 otherwise. */ - LIBSAIS_API int32_t libsais16_unbwt_aux(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I); + LIBSAIS16_API int32_t libsais16_unbwt_aux(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I); /** * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string (BWT) with auxiliary indexes using libsais16 reverse BWT context. @@ -266,7 +266,7 @@ extern "C" { * @param I [0..(n-1)/r] The input auxiliary indexes. * @return 0 if no error occurred, -1 or -2 otherwise. */ - LIBSAIS_API int32_t libsais16_unbwt_aux_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I); + LIBSAIS16_API int32_t libsais16_unbwt_aux_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I); #if defined(LIBSAIS_OPENMP) /** @@ -280,7 +280,7 @@ extern "C" { * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ - LIBSAIS_API int32_t libsais16_unbwt_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i, int32_t threads); + LIBSAIS16_API int32_t libsais16_unbwt_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i, int32_t threads); /** * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string (BWT) with auxiliary indexes in parallel using OpenMP. @@ -294,7 +294,7 @@ extern "C" { * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ - LIBSAIS_API int32_t libsais16_unbwt_aux_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I, int32_t threads); + LIBSAIS16_API int32_t libsais16_unbwt_aux_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I, int32_t threads); #endif /** @@ -305,7 +305,7 @@ extern "C" { * @param n The length of the 16-bit string and the suffix array. * @return 0 if no error occurred, -1 otherwise. */ - LIBSAIS_API int32_t libsais16_plcp(const uint16_t * T, const int32_t * SA, int32_t * PLCP, int32_t n); + LIBSAIS16_API int32_t libsais16_plcp(const uint16_t * T, const int32_t * SA, int32_t * PLCP, int32_t n); /** * Constructs the longest common prefix array (LCP) of a given permuted longest common prefix array (PLCP) and a suffix array. @@ -315,7 +315,7 @@ extern "C" { * @param n The length of the permuted longest common prefix array and the suffix array. * @return 0 if no error occurred, -1 otherwise. */ - LIBSAIS_API int32_t libsais16_lcp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP, int32_t n); + LIBSAIS16_API int32_t libsais16_lcp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP, int32_t n); #if defined(LIBSAIS_OPENMP) /** @@ -327,7 +327,7 @@ extern "C" { * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 otherwise. */ - LIBSAIS_API int32_t libsais16_plcp_omp(const uint16_t * T, const int32_t * SA, int32_t * PLCP, int32_t n, int32_t threads); + LIBSAIS16_API int32_t libsais16_plcp_omp(const uint16_t * T, const int32_t * SA, int32_t * PLCP, int32_t n, int32_t threads); /** * Constructs the longest common prefix array (LCP) of a given permuted longest common prefix array (PLCP) and a suffix array in parallel using OpenMP. @@ -338,7 +338,7 @@ extern "C" { * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 otherwise. */ - LIBSAIS_API int32_t libsais16_lcp_omp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP, int32_t n, int32_t threads); + LIBSAIS16_API int32_t libsais16_lcp_omp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP, int32_t n, int32_t threads); #endif #ifdef __cplusplus diff --git a/include/libsais64.h b/include/libsais64.h index 7176070..ace7cf9 100644 --- a/include/libsais64.h +++ b/include/libsais64.h @@ -3,7 +3,7 @@ This file is a part of libsais, a library for linear time suffix array, longest common prefix array and burrows wheeler transform construction. - Copyright (c) 2021-2022 Ilya Grebnov + Copyright (c) 2021-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -26,21 +26,21 @@ Please see the file LICENSE for full copyright information. #define LIBSAIS64_VERSION_MAJOR 2 #define LIBSAIS64_VERSION_MINOR 7 -#define LIBSAIS64_VERSION_PATCH 3 -#define LIBSAIS64_VERSION_STRING "2.7.3" +#define LIBSAIS64_VERSION_PATCH 4 +#define LIBSAIS64_VERSION_STRING "2.7.4" #ifdef _WIN32 #ifdef LIBSAIS_SHARED #ifdef LIBSAIS_EXPORTS - #define LIBSAIS_API __declspec(dllexport) + #define LIBSAIS64_API __declspec(dllexport) #else - #define LIBSAIS_API __declspec(dllimport) + #define LIBSAIS64_API __declspec(dllimport) #endif #else - #define LIBSAIS_API + #define LIBSAIS64_API #endif #else - #define LIBSAIS_API + #define LIBSAIS64_API #endif #ifdef __cplusplus @@ -58,7 +58,7 @@ extern "C" { * @param freq [0..255] The output symbol frequency table (can be NULL). * @return 0 if no error occurred, -1 or -2 otherwise. */ - LIBSAIS_API int64_t libsais64(const uint8_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq); + LIBSAIS64_API int64_t libsais64(const uint8_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq); #if defined(LIBSAIS_OPENMP) /** @@ -71,7 +71,7 @@ extern "C" { * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ - LIBSAIS_API int64_t libsais64_omp(const uint8_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq, int64_t threads); + LIBSAIS64_API int64_t libsais64_omp(const uint8_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq, int64_t threads); #endif /** @@ -84,7 +84,7 @@ extern "C" { * @param freq [0..255] The output symbol frequency table (can be NULL). * @return The primary index if no error occurred, -1 or -2 otherwise. */ - LIBSAIS_API int64_t libsais64_bwt(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq); + LIBSAIS64_API int64_t libsais64_bwt(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq); /** * Constructs the burrows-wheeler transformed string (BWT) of a given string with auxiliary indexes. @@ -98,7 +98,7 @@ extern "C" { * @param I [0..(n-1)/r] The output auxiliary indexes. * @return 0 if no error occurred, -1 or -2 otherwise. */ - LIBSAIS_API int64_t libsais64_bwt_aux(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq, int64_t r, int64_t * I); + LIBSAIS64_API int64_t libsais64_bwt_aux(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq, int64_t r, int64_t * I); #if defined(LIBSAIS_OPENMP) /** @@ -112,7 +112,7 @@ extern "C" { * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return The primary index if no error occurred, -1 or -2 otherwise. */ - LIBSAIS_API int64_t libsais64_bwt_omp(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq, int64_t threads); + LIBSAIS64_API int64_t libsais64_bwt_omp(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq, int64_t threads); /** * Constructs the burrows-wheeler transformed string (BWT) of a given string with auxiliary indexes in parallel using OpenMP. @@ -127,7 +127,7 @@ extern "C" { * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ - LIBSAIS_API int64_t libsais64_bwt_aux_omp(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq, int64_t r, int64_t * I, int64_t threads); + LIBSAIS64_API int64_t libsais64_bwt_aux_omp(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq, int64_t r, int64_t * I, int64_t threads); #endif /** @@ -140,7 +140,7 @@ extern "C" { * @param i The primary index. * @return 0 if no error occurred, -1 or -2 otherwise. */ - LIBSAIS_API int64_t libsais64_unbwt(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t i); + LIBSAIS64_API int64_t libsais64_unbwt(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t i); /** * Constructs the original string from a given burrows-wheeler transformed string (BWT) with auxiliary indexes. @@ -153,7 +153,7 @@ extern "C" { * @param I [0..(n-1)/r] The input auxiliary indexes. * @return 0 if no error occurred, -1 or -2 otherwise. */ - LIBSAIS_API int64_t libsais64_unbwt_aux(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t r, const int64_t * I); + LIBSAIS64_API int64_t libsais64_unbwt_aux(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t r, const int64_t * I); #if defined(LIBSAIS_OPENMP) /** @@ -167,7 +167,7 @@ extern "C" { * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ - LIBSAIS_API int64_t libsais64_unbwt_omp(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t i, int64_t threads); + LIBSAIS64_API int64_t libsais64_unbwt_omp(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t i, int64_t threads); /** * Constructs the original string from a given burrows-wheeler transformed string (BWT) with auxiliary indexes in parallel using OpenMP. @@ -181,7 +181,7 @@ extern "C" { * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ - LIBSAIS_API int64_t libsais64_unbwt_aux_omp(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t r, const int64_t * I, int64_t threads); + LIBSAIS64_API int64_t libsais64_unbwt_aux_omp(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t r, const int64_t * I, int64_t threads); #endif /** @@ -192,7 +192,7 @@ extern "C" { * @param n The length of the string and the suffix array. * @return 0 if no error occurred, -1 otherwise. */ - LIBSAIS_API int64_t libsais64_plcp(const uint8_t * T, const int64_t * SA, int64_t * PLCP, int64_t n); + LIBSAIS64_API int64_t libsais64_plcp(const uint8_t * T, const int64_t * SA, int64_t * PLCP, int64_t n); /** * Constructs the longest common prefix array (LCP) of a given permuted longest common prefix array (PLCP) and a suffix array. @@ -202,7 +202,7 @@ extern "C" { * @param n The length of the permuted longest common prefix array and the suffix array. * @return 0 if no error occurred, -1 otherwise. */ - LIBSAIS_API int64_t libsais64_lcp(const int64_t * PLCP, const int64_t * SA, int64_t * LCP, int64_t n); + LIBSAIS64_API int64_t libsais64_lcp(const int64_t * PLCP, const int64_t * SA, int64_t * LCP, int64_t n); #if defined(LIBSAIS_OPENMP) /** @@ -214,7 +214,7 @@ extern "C" { * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 otherwise. */ - LIBSAIS_API int64_t libsais64_plcp_omp(const uint8_t * T, const int64_t * SA, int64_t * PLCP, int64_t n, int64_t threads); + LIBSAIS64_API int64_t libsais64_plcp_omp(const uint8_t * T, const int64_t * SA, int64_t * PLCP, int64_t n, int64_t threads); /** * Constructs the longest common prefix array (LCP) of a given permuted longest common prefix array (PLCP) and a suffix array in parallel using OpenMP. @@ -225,7 +225,7 @@ extern "C" { * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 otherwise. */ - LIBSAIS_API int64_t libsais64_lcp_omp(const int64_t * PLCP, const int64_t * SA, int64_t * LCP, int64_t n, int64_t threads); + LIBSAIS64_API int64_t libsais64_lcp_omp(const int64_t * PLCP, const int64_t * SA, int64_t * LCP, int64_t n, int64_t threads); #endif #ifdef __cplusplus diff --git a/src/libsais.c b/src/libsais.c index 52036fc..155d39c 100644 --- a/src/libsais.c +++ b/src/libsais.c @@ -3,7 +3,7 @@ This file is a part of libsais, a library for linear time suffix array, longest common prefix array and burrows wheeler transform construction. - Copyright (c) 2021-2022 Ilya Grebnov + Copyright (c) 2021-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -53,6 +53,7 @@ typedef size_t fast_uint_t; #define BUCKETS_INDEX2(_c, _s) (((_c) << 1) + (_s)) #define BUCKETS_INDEX4(_c, _s) (((_c) << 2) + (_s)) +#define LIBSAIS_LOCAL_BUFFER_SIZE (1024) #define LIBSAIS_PER_THREAD_CACHE_SIZE (24576) typedef struct LIBSAIS_THREAD_CACHE @@ -232,7 +233,7 @@ static void libsais_free_thread_state(LIBSAIS_THREAD_STATE * thread_state) static LIBSAIS_CONTEXT * libsais_create_ctx_main(sa_sint_t threads) { LIBSAIS_CONTEXT * RESTRICT ctx = (LIBSAIS_CONTEXT *)libsais_alloc_aligned(sizeof(LIBSAIS_CONTEXT), 64); - sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); + sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais_alloc_aligned((size_t)8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais_alloc_thread_state(threads) : NULL; if (ctx != NULL && buckets != NULL && (thread_state != NULL || threads == 1)) @@ -691,7 +692,7 @@ static void libsais_count_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRI static sa_sint_t libsais_count_and_gather_lms_suffixes_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { - memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + memset(buckets, 0, (size_t)4 * ALPHABET_SIZE * sizeof(sa_sint_t)); fast_sint_t m = omp_block_start + omp_block_size - 1; @@ -777,7 +778,7 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_8u_omp(const uint8_t * RE #pragma omp master { - memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + memset(buckets, 0, (size_t)4 * ALPHABET_SIZE * sizeof(sa_sint_t)); fast_sint_t t; for (t = omp_num_threads - 1; t >= 0; --t) @@ -2144,7 +2145,7 @@ static void libsais_partial_sorting_scan_left_to_right_8u_block_prepare(const ui sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; - memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + memset(buckets, 0, (size_t)4 * ALPHABET_SIZE * sizeof(sa_sint_t)); fast_sint_t i, j, count = 0; sa_sint_t d = 1; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) @@ -2975,7 +2976,7 @@ static void libsais_partial_sorting_scan_right_to_left_8u_block_prepare(const ui sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; - memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + memset(buckets, 0, (size_t)4 * ALPHABET_SIZE * sizeof(sa_sint_t)); fast_sint_t i, j, count = 0; sa_sint_t d = 1; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) @@ -3805,7 +3806,7 @@ static void libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(sa_sint_t * R static void libsais_induce_partial_order_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { - memset(&buckets[2 * ALPHABET_SIZE], 0, 2 * ALPHABET_SIZE * sizeof(sa_sint_t)); + memset(&buckets[2 * ALPHABET_SIZE], 0, (size_t)2 * ALPHABET_SIZE * sizeof(sa_sint_t)); sa_sint_t d = libsais_partial_sorting_scan_left_to_right_8u_omp(T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state); libsais_partial_sorting_shift_markers_8u_omp(SA, n, buckets, threads); @@ -6259,14 +6260,15 @@ static void libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(sa_sint_t * RE } } -static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +static sa_sint_t libsais_main_32s_recursion(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state, sa_sint_t * RESTRICT local_buffer) { fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n); - if (k > 0 && fs / k >= 6) + if (k > 0 && ((fs / k >= 6) || (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 6))) { - sa_sint_t alignment = (fs - 1024) / k >= 6 ? 1024 : 16; + sa_sint_t alignment = (fs - 1024) / k >= 6 ? (sa_sint_t)1024 : (sa_sint_t)16; sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 6 ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 6 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 6 * (fast_sint_t)k]; + buckets = (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 6) ? local_buffer : buckets; sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state); if (m > 1) @@ -6289,7 +6291,7 @@ static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT S { sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); - if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + if (libsais_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0) { return -2; } @@ -6316,10 +6318,11 @@ static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT S return 0; } - else if (k > 0 && fs / k >= 4) + else if (k > 0 && ((fs / k >= 4) || (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 4))) { - sa_sint_t alignment = (fs - 1024) / k >= 4 ? 1024 : 16; + sa_sint_t alignment = (fs - 1024) / k >= 4 ? (sa_sint_t)1024 : (sa_sint_t)16; sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 4 ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 4 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 4 * (fast_sint_t)k]; + buckets = (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 4) ? local_buffer : buckets; sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); if (m > 1) @@ -6337,7 +6340,7 @@ static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT S { sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); - if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + if (libsais_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0) { return -2; } @@ -6360,10 +6363,11 @@ static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT S return 0; } - else if (k > 0 && fs / k >= 2) + else if (k > 0 && ((fs / k >= 2) || (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 2))) { - sa_sint_t alignment = (fs - 1024) / k >= 2 ? 1024 : 16; + sa_sint_t alignment = (fs - 1024) / k >= 2 ? (sa_sint_t)1024 : (sa_sint_t)16; sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 2 ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 2 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 2 * (fast_sint_t)k]; + buckets = (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 2) ? local_buffer : buckets; sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); if (m > 1) @@ -6381,7 +6385,7 @@ static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT S { sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); - if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + if (libsais_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0) { return -2; } @@ -6410,7 +6414,7 @@ static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT S { sa_sint_t * buffer = fs < k ? (sa_sint_t *)libsais_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096) : (sa_sint_t *)NULL; - sa_sint_t alignment = fs - 1024 >= k ? 1024 : 16; + sa_sint_t alignment = fs - 1024 >= k ? (sa_sint_t)1024 : (sa_sint_t)16; sa_sint_t * RESTRICT buckets = fs - alignment >= k ? (sa_sint_t *)libsais_align_up(&SA[n + fs - k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : fs >= k ? &SA[n + fs - k] : buffer; if (buckets == NULL) { return -2; } @@ -6432,7 +6436,7 @@ static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT S sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); - if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + if (libsais_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0) { return -2; } @@ -6455,6 +6459,13 @@ static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT S } } +static sa_sint_t libsais_main_32s_entry(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t local_buffer[LIBSAIS_LOCAL_BUFFER_SIZE]; + + return libsais_main_32s_recursion(T, SA, n, k, fs, threads, thread_state, local_buffer); +} + static sa_sint_t libsais_main_8u(const uint8_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t bwt, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n); @@ -6478,7 +6489,7 @@ static sa_sint_t libsais_main_8u(const uint8_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t names = libsais_renumber_and_gather_lms_suffixes_8u_omp(SA, n, m, fs, threads, thread_state); if (names < m) { - if (libsais_main_32s(SA + n + fs - m, SA, m, names, fs + n - 2 * m, threads, thread_state) != 0) + if (libsais_main_32s_entry(SA + n + fs - m, SA, m, names, fs + n - 2 * m, threads, thread_state) != 0) { return -2; } @@ -6500,7 +6511,7 @@ static sa_sint_t libsais_main_8u(const uint8_t * T, sa_sint_t * SA, sa_sint_t n, static sa_sint_t libsais_main(const uint8_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads) { LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais_alloc_thread_state(threads) : NULL; - sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); + sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais_alloc_aligned((size_t)8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); sa_sint_t index = buckets != NULL && (thread_state != NULL || threads == 1) ? libsais_main_8u(T, SA, n, buckets, bwt, r, I, fs, freq, threads, thread_state) @@ -6517,7 +6528,7 @@ static int32_t libsais_main_int(sa_sint_t * T, sa_sint_t * SA, sa_sint_t n, sa_s LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais_alloc_thread_state(threads) : NULL; sa_sint_t index = thread_state != NULL || threads == 1 - ? libsais_main_32s(T, SA, n, k, fs, threads, thread_state) + ? libsais_main_32s_entry(T, SA, n, k, fs, threads, thread_state) : -2; libsais_free_thread_state(thread_state); @@ -6906,7 +6917,7 @@ static void libsais_unbwt_compute_histogram(const uint8_t * RESTRICT T, fast_sin { sa_uint_t copy[4 * (ALPHABET_SIZE + 16)]; - memset(copy, 0, 4 * (ALPHABET_SIZE + 16) * sizeof(sa_uint_t)); + memset(copy, 0, (size_t)4 * (ALPHABET_SIZE + 16) * sizeof(sa_uint_t)); sa_uint_t * RESTRICT copy0 = copy + 0 * (ALPHABET_SIZE + 16); sa_uint_t * RESTRICT copy1 = copy + 1 * (ALPHABET_SIZE + 16); @@ -7090,7 +7101,7 @@ static void libsais_unbwt_init_single(const uint8_t * RESTRICT T, sa_uint_t * RE fast_uint_t index = I[0]; fast_uint_t lastc = T[0]; - fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + fast_uint_t shift = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; } if (freq != NULL) { @@ -7134,7 +7145,7 @@ static void libsais_unbwt_init_parallel(const uint8_t * RESTRICT T, sa_uint_t * fast_uint_t index = I[0]; fast_uint_t lastc = T[0]; - fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + fast_uint_t shift = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; } memset(bucket1, 0, ALPHABET_SIZE * sizeof(sa_uint_t)); memset(bucket2, 0, ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t)); @@ -7418,7 +7429,7 @@ static void libsais_unbwt_decode_8(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, static void libsais_unbwt_decode(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_sint_t blocks, fast_uint_t remainder) { - fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + fast_uint_t shift = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; } fast_uint_t offset = 0; while (blocks > 8) @@ -7530,7 +7541,7 @@ static sa_sint_t libsais_unbwt_core(const uint8_t * RESTRICT T, uint8_t * RESTRI static sa_sint_t libsais_unbwt_main(const uint8_t * T, uint8_t * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I, sa_sint_t threads) { - fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + fast_uint_t shift = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; } sa_uint_t * RESTRICT bucket2 = (sa_uint_t *)libsais_alloc_aligned(ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096); uint16_t * RESTRICT fastbits = (uint16_t *)libsais_alloc_aligned(((size_t)1 + (size_t)(n >> shift)) * sizeof(uint16_t), 4096); diff --git a/src/libsais16.c b/src/libsais16.c index d9ad40f..0a5315f 100644 --- a/src/libsais16.c +++ b/src/libsais16.c @@ -3,7 +3,7 @@ This file is a part of libsais, a library for linear time suffix array, longest common prefix array and burrows wheeler transform construction. - Copyright (c) 2021-2022 Ilya Grebnov + Copyright (c) 2021-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -53,6 +53,7 @@ typedef size_t fast_uint_t; #define BUCKETS_INDEX2(_c, _s) (((_c) << 1) + (_s)) #define BUCKETS_INDEX4(_c, _s) (((_c) << 2) + (_s)) +#define LIBSAIS_LOCAL_BUFFER_SIZE (1024) #define LIBSAIS_PER_THREAD_CACHE_SIZE (24576) typedef struct LIBSAIS_THREAD_CACHE @@ -210,7 +211,7 @@ static void libsais16_free_thread_state(LIBSAIS_THREAD_STATE * thread_state) static LIBSAIS_CONTEXT * libsais16_create_ctx_main(sa_sint_t threads) { LIBSAIS_CONTEXT * RESTRICT ctx = (LIBSAIS_CONTEXT *)libsais16_alloc_aligned(sizeof(LIBSAIS_CONTEXT), 64); - sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais16_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); + sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais16_alloc_aligned((size_t)8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais16_alloc_thread_state(threads) : NULL; if (ctx != NULL && buckets != NULL && (thread_state != NULL || threads == 1)) @@ -669,7 +670,7 @@ static void libsais16_count_compacted_lms_suffixes_32s_2k(const sa_sint_t * REST static sa_sint_t libsais16_count_and_gather_lms_suffixes_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { - memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + memset(buckets, 0, (size_t)4 * ALPHABET_SIZE * sizeof(sa_sint_t)); fast_sint_t m = omp_block_start + omp_block_size - 1; @@ -755,7 +756,7 @@ static sa_sint_t libsais16_count_and_gather_lms_suffixes_16u_omp(const uint16_t #pragma omp master { - memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + memset(buckets, 0, (size_t)4 * ALPHABET_SIZE * sizeof(sa_sint_t)); fast_sint_t t; for (t = omp_num_threads - 1; t >= 0; --t) @@ -2122,7 +2123,7 @@ static void libsais16_partial_sorting_scan_left_to_right_16u_block_prepare(const sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; - memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + memset(buckets, 0, (size_t)4 * ALPHABET_SIZE * sizeof(sa_sint_t)); fast_sint_t i, j, count = 0; sa_sint_t d = 1; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) @@ -2953,7 +2954,7 @@ static void libsais16_partial_sorting_scan_right_to_left_16u_block_prepare(const sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; - memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + memset(buckets, 0, (size_t)4 * ALPHABET_SIZE * sizeof(sa_sint_t)); fast_sint_t i, j, count = 0; sa_sint_t d = 1; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) @@ -3783,7 +3784,7 @@ static void libsais16_partial_sorting_gather_lms_suffixes_32s_1k_omp(sa_sint_t * static void libsais16_induce_partial_order_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { - memset(&buckets[2 * ALPHABET_SIZE], 0, 2 * ALPHABET_SIZE * sizeof(sa_sint_t)); + memset(&buckets[2 * ALPHABET_SIZE], 0, (size_t)2 * ALPHABET_SIZE * sizeof(sa_sint_t)); sa_sint_t d = libsais16_partial_sorting_scan_left_to_right_16u_omp(T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state); libsais16_partial_sorting_shift_markers_16u_omp(SA, n, buckets, threads); @@ -6237,14 +6238,15 @@ static void libsais16_reconstruct_compacted_lms_suffixes_32s_1k_omp(sa_sint_t * } } -static sa_sint_t libsais16_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +static sa_sint_t libsais16_main_32s_recursion(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state, sa_sint_t * RESTRICT local_buffer) { fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n); - if (k > 0 && fs / k >= 6) + if (k > 0 && ((fs / k >= 6) || (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 6))) { - sa_sint_t alignment = (fs - 1024) / k >= 6 ? 1024 : 16; + sa_sint_t alignment = (fs - 1024) / k >= 6 ? (sa_sint_t)1024 : (sa_sint_t)16; sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 6 ? (sa_sint_t *)libsais16_align_up(&SA[n + fs - 6 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 6 * (fast_sint_t)k]; + buckets = (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 6) ? local_buffer : buckets; sa_sint_t m = libsais16_count_and_gather_lms_suffixes_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state); if (m > 1) @@ -6267,7 +6269,7 @@ static sa_sint_t libsais16_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT { sa_sint_t f = libsais16_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); - if (libsais16_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + if (libsais16_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0) { return -2; } @@ -6294,10 +6296,11 @@ static sa_sint_t libsais16_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT return 0; } - else if (k > 0 && fs / k >= 4) + else if (k > 0 && ((fs / k >= 4) || (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 4))) { - sa_sint_t alignment = (fs - 1024) / k >= 4 ? 1024 : 16; + sa_sint_t alignment = (fs - 1024) / k >= 4 ? (sa_sint_t)1024 : (sa_sint_t)16; sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 4 ? (sa_sint_t *)libsais16_align_up(&SA[n + fs - 4 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 4 * (fast_sint_t)k]; + buckets = (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 4) ? local_buffer : buckets; sa_sint_t m = libsais16_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); if (m > 1) @@ -6315,7 +6318,7 @@ static sa_sint_t libsais16_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT { sa_sint_t f = libsais16_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); - if (libsais16_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + if (libsais16_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0) { return -2; } @@ -6338,10 +6341,11 @@ static sa_sint_t libsais16_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT return 0; } - else if (k > 0 && fs / k >= 2) + else if (k > 0 && ((fs / k >= 2) || (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 2))) { - sa_sint_t alignment = (fs - 1024) / k >= 2 ? 1024 : 16; + sa_sint_t alignment = (fs - 1024) / k >= 2 ? (sa_sint_t)1024 : (sa_sint_t)16; sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 2 ? (sa_sint_t *)libsais16_align_up(&SA[n + fs - 2 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 2 * (fast_sint_t)k]; + buckets = (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 2) ? local_buffer : buckets; sa_sint_t m = libsais16_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); if (m > 1) @@ -6359,7 +6363,7 @@ static sa_sint_t libsais16_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT { sa_sint_t f = libsais16_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); - if (libsais16_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + if (libsais16_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0) { return -2; } @@ -6388,7 +6392,7 @@ static sa_sint_t libsais16_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT { sa_sint_t * buffer = fs < k ? (sa_sint_t *)libsais16_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096) : (sa_sint_t *)NULL; - sa_sint_t alignment = fs - 1024 >= k ? 1024 : 16; + sa_sint_t alignment = fs - 1024 >= k ? (sa_sint_t)1024 : (sa_sint_t)16; sa_sint_t * RESTRICT buckets = fs - alignment >= k ? (sa_sint_t *)libsais16_align_up(&SA[n + fs - k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : fs >= k ? &SA[n + fs - k] : buffer; if (buckets == NULL) { return -2; } @@ -6410,7 +6414,7 @@ static sa_sint_t libsais16_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT sa_sint_t f = libsais16_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); - if (libsais16_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + if (libsais16_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0) { return -2; } @@ -6433,6 +6437,13 @@ static sa_sint_t libsais16_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT } } +static sa_sint_t libsais16_main_32s_entry(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t local_buffer[LIBSAIS_LOCAL_BUFFER_SIZE]; + + return libsais16_main_32s_recursion(T, SA, n, k, fs, threads, thread_state, local_buffer); +} + static sa_sint_t libsais16_main_16u(const uint16_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t bwt, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n); @@ -6456,7 +6467,7 @@ static sa_sint_t libsais16_main_16u(const uint16_t * T, sa_sint_t * SA, sa_sint_ sa_sint_t names = libsais16_renumber_and_gather_lms_suffixes_16u_omp(SA, n, m, fs, threads, thread_state); if (names < m) { - if (libsais16_main_32s(SA + n + fs - m, SA, m, names, fs + n - 2 * m, threads, thread_state) != 0) + if (libsais16_main_32s_entry(SA + n + fs - m, SA, m, names, fs + n - 2 * m, threads, thread_state) != 0) { return -2; } @@ -6478,7 +6489,7 @@ static sa_sint_t libsais16_main_16u(const uint16_t * T, sa_sint_t * SA, sa_sint_ static sa_sint_t libsais16_main(const uint16_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads) { LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais16_alloc_thread_state(threads) : NULL; - sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais16_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); + sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais16_alloc_aligned((size_t)8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); sa_sint_t index = buckets != NULL && (thread_state != NULL || threads == 1) ? libsais16_main_16u(T, SA, n, buckets, bwt, r, I, fs, freq, threads, thread_state) @@ -6863,7 +6874,7 @@ static void libsais16_unbwt_calculate_P(const uint16_t * RESTRICT T, sa_uint_t * static void libsais16_unbwt_init_single(const uint16_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits) { fast_uint_t index = I[0]; - fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + fast_uint_t shift = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; } if (freq != NULL) { @@ -6884,7 +6895,7 @@ static void libsais16_unbwt_init_single(const uint16_t * RESTRICT T, sa_uint_t * static void libsais16_unbwt_init_parallel(const uint16_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_uint_t * RESTRICT buckets, sa_sint_t threads) { fast_uint_t index = I[0]; - fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + fast_uint_t shift = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; } #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) { @@ -7126,7 +7137,7 @@ static void libsais16_unbwt_decode_8(uint16_t * RESTRICT U, sa_uint_t * RESTRICT static void libsais16_unbwt_decode(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_sint_t blocks, fast_uint_t remainder) { - fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + fast_uint_t shift = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; } fast_uint_t offset = 0; while (blocks > 8) @@ -7235,7 +7246,7 @@ static sa_sint_t libsais16_unbwt_core(const uint16_t * RESTRICT T, uint16_t * RE static sa_sint_t libsais16_unbwt_main(const uint16_t * T, uint16_t * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I, sa_sint_t threads) { - fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + fast_uint_t shift = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; } sa_uint_t * RESTRICT bucket2 = (sa_uint_t *)libsais16_alloc_aligned(ALPHABET_SIZE * sizeof(sa_uint_t), 4096); uint16_t * RESTRICT fastbits = (uint16_t *)libsais16_alloc_aligned(((size_t)1 + (size_t)(n >> shift)) * sizeof(uint16_t), 4096); diff --git a/src/libsais64.c b/src/libsais64.c index 882cae2..8d262b5 100644 --- a/src/libsais64.c +++ b/src/libsais64.c @@ -3,7 +3,7 @@ This file is a part of libsais, a library for linear time suffix array, longest common prefix array and burrows wheeler transform construction. - Copyright (c) 2021-2022 Ilya Grebnov + Copyright (c) 2021-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -54,6 +54,7 @@ typedef uint64_t fast_uint_t; #define BUCKETS_INDEX2(_c, _s) ((((fast_sint_t)_c) << 1) + (fast_sint_t)(_s)) #define BUCKETS_INDEX4(_c, _s) ((((fast_sint_t)_c) << 2) + (fast_sint_t)(_s)) +#define LIBSAIS_LOCAL_BUFFER_SIZE (1024) #define LIBSAIS_PER_THREAD_CACHE_SIZE (24576) typedef struct LIBSAIS_THREAD_CACHE @@ -661,7 +662,7 @@ static void libsais64_count_compacted_lms_suffixes_32s_2k(const sa_sint_t * REST static sa_sint_t libsais64_count_and_gather_lms_suffixes_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { - memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + memset(buckets, 0, (size_t)4 * ALPHABET_SIZE * sizeof(sa_sint_t)); fast_sint_t m = omp_block_start + omp_block_size - 1; @@ -747,7 +748,7 @@ static sa_sint_t libsais64_count_and_gather_lms_suffixes_8u_omp(const uint8_t * #pragma omp master { - memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + memset(buckets, 0, (size_t)4 * ALPHABET_SIZE * sizeof(sa_sint_t)); fast_sint_t t; for (t = omp_num_threads - 1; t >= 0; --t) @@ -2114,7 +2115,7 @@ static void libsais64_partial_sorting_scan_left_to_right_8u_block_prepare(const sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; - memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + memset(buckets, 0, (size_t)4 * ALPHABET_SIZE * sizeof(sa_sint_t)); fast_sint_t i, j, count = 0; sa_sint_t d = 1; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) @@ -2945,7 +2946,7 @@ static void libsais64_partial_sorting_scan_right_to_left_8u_block_prepare(const sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; - memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t)); + memset(buckets, 0, (size_t)4 * ALPHABET_SIZE * sizeof(sa_sint_t)); fast_sint_t i, j, count = 0; sa_sint_t d = 1; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) @@ -3775,7 +3776,7 @@ static void libsais64_partial_sorting_gather_lms_suffixes_32s_1k_omp(sa_sint_t * static void libsais64_induce_partial_order_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { - memset(&buckets[2 * ALPHABET_SIZE], 0, 2 * ALPHABET_SIZE * sizeof(sa_sint_t)); + memset(&buckets[2 * ALPHABET_SIZE], 0, (size_t)2 * ALPHABET_SIZE * sizeof(sa_sint_t)); sa_sint_t d = libsais64_partial_sorting_scan_left_to_right_8u_omp(T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state); libsais64_partial_sorting_shift_markers_8u_omp(SA, n, buckets, threads); @@ -6229,22 +6230,42 @@ static void libsais64_reconstruct_compacted_lms_suffixes_32s_1k_omp(sa_sint_t * } } -static void libsais64_convert_right_to_left_32u_to_64u(uint32_t * S, uint64_t * D, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +static void libsais64_convert_32u_to_64u(uint32_t * RESTRICT S, uint64_t * RESTRICT D, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { - fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start; i >= j; i -= 1) { D[i] = (uint64_t)S[i]; } + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size; i < j; i += 1) + { + D[i] = (uint64_t)S[i]; + } } -static void libsais64_convert_left_to_right_32u_to_64u(uint32_t * RESTRICT S, uint64_t * RESTRICT D, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +static void libsais64_convert_inplace_32u_to_64u(uint32_t * V, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { - fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size; i < j; i += 1) { D[i] = (uint64_t)S[i]; } + fast_sint_t i, j; + for (i = omp_block_start + omp_block_size - 1, j = omp_block_start; i >= j; i -= 1) + { +#if defined(__LITTLE_ENDIAN__) + V[i + i + 0] = V[i]; V[i + i + 1] = 0; +#else + V[i + i + 0] = 0; V[i + i + 1] = V[i]; +#endif + } } -static void libsais64_convert_inplace_64u_to_32u(uint64_t * S, uint32_t * D, fast_sint_t omp_block_start, fast_sint_t omp_block_size) +static void libsais64_convert_inplace_64u_to_32u(uint64_t * V, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { - fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size; i < j; i += 1) { D[i] = (uint32_t)S[i]; } + fast_sint_t i, j; + for (i = omp_block_start, j = omp_block_start + omp_block_size; i < j; i += 1) + { +#if defined(__LITTLE_ENDIAN__) + V[i] = V[i + i + 0]; +#else + V[i] = V[i + i + 1]; +#endif + } } -static void libsais64_convert_inplace_32u_to_64u_omp(uint32_t * S, uint64_t * D, sa_sint_t n, sa_sint_t threads) +static void libsais64_convert_inplace_32u_to_64u_omp(uint32_t * V, sa_sint_t n, sa_sint_t threads) { while (n >= 65536) { @@ -6267,23 +6288,23 @@ static void libsais64_convert_inplace_32u_to_64u_omp(uint32_t * S, uint64_t * D, fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; - libsais64_convert_left_to_right_32u_to_64u(S, D, n + omp_block_start, omp_block_size); + libsais64_convert_32u_to_64u((uint32_t *)V, (uint64_t *)V, n + omp_block_start, omp_block_size); } } - libsais64_convert_right_to_left_32u_to_64u(S, D, 0, n); + libsais64_convert_inplace_32u_to_64u(V, 0, n); } -static sa_sint_t libsais64_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +static sa_sint_t libsais64_main_32s_recursion(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state, sa_sint_t * RESTRICT local_buffer) { fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n); if (n <= INT32_MAX) { sa_sint_t new_fs = (fs + fs + n + n) <= INT32_MAX ? (fs + fs + n) : INT32_MAX - n; - if (new_fs / k >= 4 || (new_fs >= fs)) + if ((new_fs / k >= 4) || (new_fs >= fs)) { - libsais64_convert_inplace_64u_to_32u((uint64_t *)T, (uint32_t *)T, 0, n); + libsais64_convert_inplace_64u_to_32u((uint64_t *)T, 0, n); #if defined(LIBSAIS_OPENMP) sa_sint_t index = libsais_int_omp((int32_t *)T, (int32_t *)SA, (int32_t)n, (int32_t)k, (int32_t)new_fs, (int32_t)threads); @@ -6292,17 +6313,18 @@ static sa_sint_t libsais64_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT #endif if (index >= 0) { - libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)SA, (uint64_t *)SA, n, threads); + libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)SA, n, threads); } return index; } } - if (k > 0 && fs / k >= 6) + if (k > 0 && ((fs / k >= 6) || (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 6))) { - sa_sint_t alignment = (fs - 1024) / k >= 6 ? 1024 : 16; + sa_sint_t alignment = (fs - 1024) / k >= 6 ? (sa_sint_t)1024 : (sa_sint_t)16; sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 6 ? (sa_sint_t *)libsais64_align_up(&SA[n + fs - 6 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 6 * (fast_sint_t)k]; + buckets = (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 6) ? local_buffer : buckets; sa_sint_t m = libsais64_count_and_gather_lms_suffixes_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state); if (m > 1) @@ -6325,7 +6347,7 @@ static sa_sint_t libsais64_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT { sa_sint_t f = libsais64_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); - if (libsais64_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + if (libsais64_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0) { return -2; } @@ -6352,10 +6374,11 @@ static sa_sint_t libsais64_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT return 0; } - else if (k > 0 && fs / k >= 4) + else if (k > 0 && ((fs / k >= 4) || (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 4))) { - sa_sint_t alignment = (fs - 1024) / k >= 4 ? 1024 : 16; + sa_sint_t alignment = (fs - 1024) / k >= 4 ? (sa_sint_t)1024 : (sa_sint_t)16; sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 4 ? (sa_sint_t *)libsais64_align_up(&SA[n + fs - 4 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 4 * (fast_sint_t)k]; + buckets = (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 4) ? local_buffer : buckets; sa_sint_t m = libsais64_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); if (m > 1) @@ -6373,7 +6396,7 @@ static sa_sint_t libsais64_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT { sa_sint_t f = libsais64_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); - if (libsais64_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + if (libsais64_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0) { return -2; } @@ -6396,10 +6419,11 @@ static sa_sint_t libsais64_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT return 0; } - else if (k > 0 && fs / k >= 2) + else if (k > 0 && ((fs / k >= 2) || (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 2))) { - sa_sint_t alignment = (fs - 1024) / k >= 2 ? 1024 : 16; + sa_sint_t alignment = (fs - 1024) / k >= 2 ? (sa_sint_t)1024 : (sa_sint_t)16; sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 2 ? (sa_sint_t *)libsais64_align_up(&SA[n + fs - 2 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 2 * (fast_sint_t)k]; + buckets = (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 2) ? local_buffer : buckets; sa_sint_t m = libsais64_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); if (m > 1) @@ -6417,7 +6441,7 @@ static sa_sint_t libsais64_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT { sa_sint_t f = libsais64_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); - if (libsais64_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + if (libsais64_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0) { return -2; } @@ -6446,7 +6470,7 @@ static sa_sint_t libsais64_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT { sa_sint_t * buffer = fs < k ? (sa_sint_t *)libsais64_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096) : (sa_sint_t *)NULL; - sa_sint_t alignment = fs - 1024 >= k ? 1024 : 16; + sa_sint_t alignment = fs - 1024 >= k ? (sa_sint_t)1024 : (sa_sint_t)16; sa_sint_t * RESTRICT buckets = fs - alignment >= k ? (sa_sint_t *)libsais64_align_up(&SA[n + fs - k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : fs >= k ? &SA[n + fs - k] : buffer; if (buckets == NULL) { return -2; } @@ -6468,7 +6492,7 @@ static sa_sint_t libsais64_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT sa_sint_t f = libsais64_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); - if (libsais64_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0) + if (libsais64_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0) { return -2; } @@ -6491,6 +6515,13 @@ static sa_sint_t libsais64_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT } } +static sa_sint_t libsais64_main_32s_entry(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) +{ + sa_sint_t local_buffer[LIBSAIS_LOCAL_BUFFER_SIZE]; + + return libsais64_main_32s_recursion(T, SA, n, k, fs, threads, thread_state, local_buffer); +} + static sa_sint_t libsais64_main_8u(const uint8_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t bwt, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n); @@ -6514,7 +6545,7 @@ static sa_sint_t libsais64_main_8u(const uint8_t * T, sa_sint_t * SA, sa_sint_t sa_sint_t names = libsais64_renumber_and_gather_lms_suffixes_8u_omp(SA, n, m, fs, threads, thread_state); if (names < m) { - if (libsais64_main_32s(SA + n + fs - m, SA, m, names, fs + n - 2 * m, threads, thread_state) != 0) + if (libsais64_main_32s_entry(SA + n + fs - m, SA, m, names, fs + n - 2 * m, threads, thread_state) != 0) { return -2; } @@ -6536,7 +6567,7 @@ static sa_sint_t libsais64_main_8u(const uint8_t * T, sa_sint_t * SA, sa_sint_t static sa_sint_t libsais64_main(const uint8_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads) { LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais64_alloc_thread_state(threads) : NULL; - sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais64_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); + sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais64_alloc_aligned((size_t)8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); sa_sint_t index = buckets != NULL && (thread_state != NULL || threads == 1) ? libsais64_main_8u(T, SA, n, buckets, bwt, r, I, fs, freq, threads, thread_state) @@ -6620,8 +6651,8 @@ int64_t libsais64(const uint8_t * T, int64_t * SA, int64_t n, int64_t fs, int64_ if (index >= 0) { - libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)SA, (uint64_t *)SA, n, 1); - if (freq != NULL) { libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, (uint64_t *)freq, ALPHABET_SIZE, 1); } + libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)SA, n, 1); + if (freq != NULL) { libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, ALPHABET_SIZE, 1); } } return index; @@ -6650,7 +6681,7 @@ int64_t libsais64_bwt(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, in if (index >= 0) { - if (freq != NULL) { libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, (uint64_t *)freq, ALPHABET_SIZE, 1); } + if (freq != NULL) { libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, ALPHABET_SIZE, 1); } } return index; @@ -6690,8 +6721,8 @@ int64_t libsais64_bwt_aux(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n if (index >= 0) { - libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)I, (uint64_t *)I, 1 + ((n - 1) / r), 1); - if (freq != NULL) { libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, (uint64_t *)freq, ALPHABET_SIZE, 1); } + libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)I, 1 + ((n - 1) / r), 1); + if (freq != NULL) { libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, ALPHABET_SIZE, 1); } } return index; @@ -6838,7 +6869,7 @@ static void libsais64_unbwt_compute_histogram(const uint8_t * RESTRICT T, fast_s { sa_uint_t copy[4 * (ALPHABET_SIZE + 16)]; - memset(copy, 0, 4 * (ALPHABET_SIZE + 16) * sizeof(sa_uint_t)); + memset(copy, 0, (size_t)4 * (ALPHABET_SIZE + 16) * sizeof(sa_uint_t)); sa_uint_t * RESTRICT copy0 = copy + 0 * (ALPHABET_SIZE + 16); sa_uint_t * RESTRICT copy1 = copy + 1 * (ALPHABET_SIZE + 16); @@ -7022,7 +7053,7 @@ static void libsais64_unbwt_init_single(const uint8_t * RESTRICT T, sa_uint_t * fast_uint_t index = I[0]; fast_uint_t lastc = T[0]; - fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + fast_uint_t shift = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; } if (freq != NULL) { @@ -7066,7 +7097,7 @@ static void libsais64_unbwt_init_parallel(const uint8_t * RESTRICT T, sa_uint_t fast_uint_t index = I[0]; fast_uint_t lastc = T[0]; - fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + fast_uint_t shift = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; } memset(bucket1, 0, ALPHABET_SIZE * sizeof(sa_uint_t)); memset(bucket2, 0, ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t)); @@ -7350,7 +7381,7 @@ static void libsais64_unbwt_decode_8(uint8_t * RESTRICT U, sa_uint_t * RESTRICT static void libsais64_unbwt_decode(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_sint_t blocks, fast_uint_t remainder) { - fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + fast_uint_t shift = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; } fast_uint_t offset = 0; while (blocks > 8) @@ -7462,7 +7493,7 @@ static sa_sint_t libsais64_unbwt_core(const uint8_t * RESTRICT T, uint8_t * REST static sa_sint_t libsais64_unbwt_main(const uint8_t * T, uint8_t * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I, sa_sint_t threads) { - fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; } + fast_uint_t shift = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; } sa_uint_t * RESTRICT bucket2 = (sa_uint_t *)libsais64_alloc_aligned(ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096); uint16_t * RESTRICT fastbits = (uint16_t *)libsais64_alloc_aligned(((size_t)1 + (size_t)(n >> shift)) * sizeof(uint16_t), 4096);