From f8c7124deecf9fd3aa9fd53a25afdbeaf27753af Mon Sep 17 00:00:00 2001
From: Ilya Grebnov <Ilya.Grebnov@gmail.com>
Date: Wed, 12 Jun 2024 21:53:10 -0700
Subject: [PATCH] Implemented suffix array construction of a long 16-bit array
 (libsais16x64).

---
 CHANGES                |    5 +-
 CMakeLists.txt         |    4 +-
 README.md              |   15 +-
 VERSION                |    2 +-
 include/libsais.h      |    4 +-
 include/libsais16.h    |   29 +-
 include/libsais16x64.h |  260 ++
 include/libsais64.h    |    4 +-
 src/libsais16.c        |   45 +
 src/libsais16x64.c     | 7605 ++++++++++++++++++++++++++++++++++++++++
 10 files changed, 7958 insertions(+), 15 deletions(-)
 create mode 100644 include/libsais16x64.h
 create mode 100644 src/libsais16x64.c

diff --git a/CHANGES b/CHANGES
index 2436787..2e52af3 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,5 +1,8 @@
+Changes in 2.8.3 (June 11, 2024)
+- Implemented suffix array construction of a long 16-bit array (libsais16x64).
+
 Changes in 2.8.2 (May 27, 2024)
-- Implemented suffix array construction of a long integer array (libsais64).
+- Implemented suffix array construction of a long 64-bit array (libsais64).
 
 Changes in 2.8.1 (April 5, 2024)
 - Fixed out-of-bound memory access issue for large inputs (libsais64).
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b08fa32..dbe6008 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.10)
 
-project(libsais VERSION 2.8.2 LANGUAGES C DESCRIPTION "libsais is a library for linear time suffix array, longest common prefix array and burrows wheeler transform construction based on induced sorting algorithm.")
+project(libsais VERSION 2.8.3 LANGUAGES C DESCRIPTION "libsais is a library for linear time suffix array, longest common prefix array and burrows wheeler transform construction based on induced sorting algorithm.")
 
 set(CMAKE_C_STANDARD 99)
 set(CMAKE_C_STANDARD_REQUIRED ON)
@@ -20,9 +20,11 @@ add_library(libsais ${LIBSAIS_LIBRARY_TYPE})
 target_sources(libsais PRIVATE
     include/libsais.h
     include/libsais16.h
+    include/libsais16x64.h
     include/libsais64.h
     src/libsais.c
     src/libsais16.c
+    src/libsais16x64.c
     src/libsais64.c
 )
 
diff --git a/README.md b/README.md
index edf3bfd..9e28e9e 100644
--- a/README.md
+++ b/README.md
@@ -23,8 +23,10 @@ The libsais provides simple C99 API to construct suffix array and Burrows-Wheele
 The libsais is released under the [Apache License Version 2.0](LICENSE "Apache license")
 
 ## Changes
+* June 11, 2024 (2.8.3)
+  * Implemented suffix array construction of a long 16-bit array (libsais16x64).
 * May 27, 2024 (2.8.2)
-  * Implemented suffix array construction of a long integer array (libsais64).
+  * Implemented suffix array construction of a long 64-bit array (libsais64).
 * April 5, 2024 (2.8.1)
   * Fixed out-of-bound memory access issue for large inputs (libsais64).
 * March 3, 2024 (2.8.0)
@@ -65,11 +67,12 @@ The libsais is released under the [Apache License Version 2.0](LICENSE "Apache l
 
 ## Versions of the libsais
 * [libsais.c](src/libsais.c) (and corresponding [libsais.h](include/libsais.h)) is for suffix array, PLCP, LCP, forward BWT and reverse BWT construction over 8-bit inputs smaller than 2GB (2147483648 bytes).
-  * This version of the library could also be used to construct suffix array of an integer array (with a caveat that input array must be mutable).
-* [libsais64.c](src/libsais64.c) (and corresponding [libsais64.h](include/libsais64.h)) is optional extension of the library for inputs larger or equlas to 2GB (2147483648 bytes).
-* [libsais16.c](src/libsais16.c) (and corresponding [libsais16.h](include/libsais16.h)) is independent version of the library for 16-bit inputs.
+  * [libsais64.c](src/libsais64.c) (and corresponding [libsais64.h](include/libsais64.h)) is an optional extension of the library for inputs larger than or equal to 2GB (2147483648 bytes).
+  * These versions of the library could also be used to construct the suffix array of an integer array (with a caveat that the input array must be mutable).
+* [libsais16.c](src/libsais16.c) + [libsais16x64.c](src/libsais16x64.c) (and corresponding [libsais16.h](include/libsais16.h) + [libsais16x64.h](include/libsais16x64.h)) are an independent version of the library for 16-bit inputs.
+  * This version of the library could also be used to construct suffix array and BWT of a set of strings by adding a unique end-of-string symbol to each string and then computing the result for the concatenated string.
 
-## Examples of APIs (see [libsais.h](include/libsais.h), [libsais16.h](include/libsais16.h) and [libsais64.h](include/libsais64.h) for complete APIs list)
+## Examples of APIs (see [libsais.h](include/libsais.h), [libsais16.h](include/libsais16.h), [libsais16x64.h](include/libsais16x64.h) and [libsais64.h](include/libsais64.h) for complete APIs list)
 ```c
     /**
     * Constructs the suffix array of a given string.
@@ -124,7 +127,7 @@ The libsais is released under the [Apache License Version 2.0](LICENSE "Apache l
 CPMAddPackage(
   NAME libsais
   GITHUB_REPOSITORY IlyaGrebnov/libsais
-  GIT_TAG v2.8.1
+  GIT_TAG v2.8.3
   OPTIONS
     "LIBSAIS_USE_OPENMP OFF"
     "LIBSAIS_BUILD_SHARED_LIB OFF"
diff --git a/VERSION b/VERSION
index 1817afe..9f8d8a9 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.8.2
+2.8.3
diff --git a/include/libsais.h b/include/libsais.h
index 960e181..e2d14d4 100644
--- a/include/libsais.h
+++ b/include/libsais.h
@@ -26,8 +26,8 @@ Please see the file LICENSE for full copyright information.
 
 #define LIBSAIS_VERSION_MAJOR   2
 #define LIBSAIS_VERSION_MINOR   8
-#define LIBSAIS_VERSION_PATCH   2
-#define LIBSAIS_VERSION_STRING  "2.8.2"
+#define LIBSAIS_VERSION_PATCH   3
+#define LIBSAIS_VERSION_STRING  "2.8.3"
 
 #ifdef _WIN32
     #ifdef LIBSAIS_SHARED
diff --git a/include/libsais16.h b/include/libsais16.h
index daff42f..091ce71 100644
--- a/include/libsais16.h
+++ b/include/libsais16.h
@@ -26,8 +26,8 @@ Please see the file LICENSE for full copyright information.
 
 #define LIBSAIS16_VERSION_MAJOR   2
 #define LIBSAIS16_VERSION_MINOR   8
-#define LIBSAIS16_VERSION_PATCH   2
-#define LIBSAIS16_VERSION_STRING  "2.8.2"
+#define LIBSAIS16_VERSION_PATCH   3
+#define LIBSAIS16_VERSION_STRING  "2.8.3"
 
 #ifdef _WIN32
     #ifdef LIBSAIS_SHARED
@@ -83,6 +83,18 @@ extern "C" {
     */
     LIBSAIS16_API int32_t libsais16(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq);
 
+    /**
+    * Constructs the suffix array of a given integer array.
+    * Note, during construction input array will be modified, but restored at the end if no errors occurred.
+    * @param T [0..n-1] The input integer array.
+    * @param SA [0..n-1+fs] The output array of suffixes.
+    * @param n The length of the integer array.
+    * @param k The alphabet size of the input integer array.
+    * @param fs Extra space available at the end of SA array (can be 0, but 4k or better 6k is recommended for optimal performance).
+    * @return 0 if no error occurred, -1 or -2 otherwise.
+    */
+    LIBSAIS16_API int32_t libsais16_int(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs);
+
     /**
     * Constructs the suffix array of a given 16-bit string using libsais16 context.
     * @param ctx The libsais16 context.
@@ -107,6 +119,19 @@ extern "C" {
     * @return 0 if no error occurred, -1 or -2 otherwise.
     */
     LIBSAIS16_API int32_t libsais16_omp(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads);
+
+    /**
+    * Constructs the suffix array of a given integer array in parallel using OpenMP.
+    * Note, during construction input array will be modified, but restored at the end if no errors occurred.
+    * @param T [0..n-1] The input integer array.
+    * @param SA [0..n-1+fs] The output array of suffixes.
+    * @param n The length of the integer array.
+    * @param k The alphabet size of the input integer array.
+    * @param fs Extra space available at the end of SA array (can be 0, but 4k or better 6k is recommended for optimal performance).
+    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+    * @return 0 if no error occurred, -1 or -2 otherwise.
+    */
+    LIBSAIS16_API int32_t libsais16_int_omp(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs, int32_t threads);
 #endif
 
     /**
diff --git a/include/libsais16x64.h b/include/libsais16x64.h
new file mode 100644
index 0000000..2811353
--- /dev/null
+++ b/include/libsais16x64.h
@@ -0,0 +1,260 @@
+/*--
+
+This file is a part of libsais, a library for linear time suffix array,
+longest common prefix array and burrows wheeler transform construction.
+
+   Copyright (c) 2021-2024 Ilya Grebnov <ilya.grebnov@gmail.com>
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+Please see the file LICENSE for full copyright information.
+
+--*/
+
+#ifndef LIBSAIS16X64_H
+#define LIBSAIS16X64_H 1
+
+#define LIBSAIS16X64_VERSION_MAJOR   2
+#define LIBSAIS16X64_VERSION_MINOR   8
+#define LIBSAIS16X64_VERSION_PATCH   3
+#define LIBSAIS16X64_VERSION_STRING  "2.8.3"
+
+#ifdef _WIN32
+    #ifdef LIBSAIS_SHARED
+        #ifdef LIBSAIS_EXPORTS
+            #define LIBSAIS16X64_API __declspec(dllexport)
+        #else
+            #define LIBSAIS16X64_API __declspec(dllimport)
+        #endif
+    #else
+        #define LIBSAIS16X64_API
+    #endif
+#else
+    #define LIBSAIS16X64_API
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+    #include <stdint.h>
+
+    /**
+    * Constructs the suffix array of a given 16-bit string.
+    * @param T [0..n-1] The input 16-bit string.
+    * @param SA [0..n-1+fs] The output array of suffixes.
+    * @param n The length of the given 16-bit string.
+    * @param fs The extra space available at the end of SA array (0 should be enough for most cases).
+    * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL).
+    * @return 0 if no error occurred, -1 or -2 otherwise.
+    */
+    LIBSAIS16X64_API int64_t libsais16x64(const uint16_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq);
+
+    /**
+    * Constructs the suffix array of a given integer array.
+    * Note, during construction input array will be modified, but restored at the end if no errors occurred.
+    * @param T [0..n-1] The input integer array.
+    * @param SA [0..n-1+fs] The output array of suffixes.
+    * @param n The length of the integer array.
+    * @param k The alphabet size of the input integer array.
+    * @param fs Extra space available at the end of SA array (can be 0, but 4k or better 6k is recommended for optimal performance).
+    * @return 0 if no error occurred, -1 or -2 otherwise.
+    */
+    LIBSAIS16X64_API int64_t libsais16x64_long(int64_t * T, int64_t * SA, int64_t n, int64_t k, int64_t fs);
+
+#if defined(LIBSAIS_OPENMP)
+    /**
+    * Constructs the suffix array of a given 16-bit string in parallel using OpenMP.
+    * @param T [0..n-1] The input 16-bit string.
+    * @param SA [0..n-1+fs] The output array of suffixes.
+    * @param n The length of the given 16-bit string.
+    * @param fs The extra space available at the end of SA array (0 should be enough for most cases).
+    * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL).
+    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+    * @return 0 if no error occurred, -1 or -2 otherwise.
+    */
+    LIBSAIS16X64_API int64_t libsais16x64_omp(const uint16_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq, int64_t threads);
+
+    /**
+    * Constructs the suffix array of a given integer array in parallel using OpenMP.
+    * Note, during construction input array will be modified, but restored at the end if no errors occurred.
+    * @param T [0..n-1] The input integer array.
+    * @param SA [0..n-1+fs] The output array of suffixes.
+    * @param n The length of the integer array.
+    * @param k The alphabet size of the input integer array.
+    * @param fs Extra space available at the end of SA array (can be 0, but 4k or better 6k is recommended for optimal performance).
+    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+    * @return 0 if no error occurred, -1 or -2 otherwise.
+    */
+    LIBSAIS16X64_API int64_t libsais16x64_long_omp(int64_t * T, int64_t * SA, int64_t n, int64_t k, int64_t fs, int64_t threads);
+#endif
+
+    /**
+    * Constructs the burrows-wheeler transformed 16-bit string (BWT) of a given 16-bit string.
+    * @param T [0..n-1] The input 16-bit string.
+    * @param U [0..n-1] The output 16-bit string (can be T).
+    * @param A [0..n-1+fs] The temporary array.
+    * @param n The length of the given 16-bit string.
+    * @param fs The extra space available at the end of A array (0 should be enough for most cases).
+    * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL).
+    * @return The primary index if no error occurred, -1 or -2 otherwise.
+    */
+    LIBSAIS16X64_API int64_t libsais16x64_bwt(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq);
+
+    /**
+    * Constructs the burrows-wheeler transformed 16-bit string (BWT) of a given 16-bit string with auxiliary indexes.
+    * @param T [0..n-1] The input 16-bit string.
+    * @param U [0..n-1] The output 16-bit string (can be T).
+    * @param A [0..n-1+fs] The temporary array.
+    * @param n The length of the given 16-bit string.
+    * @param fs The extra space available at the end of A array (0 should be enough for most cases).
+    * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL).
+    * @param r The sampling rate for auxiliary indexes (must be power of 2).
+    * @param I [0..(n-1)/r] The output auxiliary indexes.
+    * @return 0 if no error occurred, -1 or -2 otherwise.
+    */
+    LIBSAIS16X64_API int64_t libsais16x64_bwt_aux(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq, int64_t r, int64_t * I);
+
+#if defined(LIBSAIS_OPENMP)
+    /**
+    * Constructs the burrows-wheeler transformed 16-bit string (BWT) of a given 16-bit string in parallel using OpenMP.
+    * @param T [0..n-1] The input 16-bit string.
+    * @param U [0..n-1] The output 16-bit string (can be T).
+    * @param A [0..n-1+fs] The temporary array.
+    * @param n The length of the given 16-bit string.
+    * @param fs The extra space available at the end of A array (0 should be enough for most cases).
+    * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL).
+    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+    * @return The primary index if no error occurred, -1 or -2 otherwise.
+    */
+    LIBSAIS16X64_API int64_t libsais16x64_bwt_omp(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq, int64_t threads);
+
+    /**
+    * Constructs the burrows-wheeler transformed 16-bit string (BWT) of a given 16-bit string with auxiliary indexes in parallel using OpenMP.
+    * @param T [0..n-1] The input 16-bit string.
+    * @param U [0..n-1] The output 16-bit string (can be T).
+    * @param A [0..n-1+fs] The temporary array.
+    * @param n The length of the given 16-bit string.
+    * @param fs The extra space available at the end of A array (0 should be enough for most cases).
+    * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL).
+    * @param r The sampling rate for auxiliary indexes (must be power of 2).
+    * @param I [0..(n-1)/r] The output auxiliary indexes.
+    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+    * @return 0 if no error occurred, -1 or -2 otherwise.
+    */
+    LIBSAIS16X64_API int64_t libsais16x64_bwt_aux_omp(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq, int64_t r, int64_t * I, int64_t threads);
+#endif
+
+    /**
+    * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string (BWT) with primary index.
+    * @param T [0..n-1] The input 16-bit string.
+    * @param U [0..n-1] The output 16-bit string (can be T).
+    * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
+    * @param n The length of the given 16-bit string.
+    * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL).
+    * @param i The primary index.
+    * @return 0 if no error occurred, -1 or -2 otherwise.
+    */
+    LIBSAIS16X64_API int64_t libsais16x64_unbwt(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t i);
+
+    /**
+    * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string (BWT) with auxiliary indexes.
+    * @param T [0..n-1] The input 16-bit string.
+    * @param U [0..n-1] The output 16-bit string (can be T).
+    * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
+    * @param n The length of the given 16-bit string.
+    * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL).
+    * @param r The sampling rate for auxiliary indexes (must be power of 2).
+    * @param I [0..(n-1)/r] The input auxiliary indexes.
+    * @return 0 if no error occurred, -1 or -2 otherwise.
+    */
+    LIBSAIS16X64_API int64_t libsais16x64_unbwt_aux(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t r, const int64_t * I);
+
+#if defined(LIBSAIS_OPENMP)
+    /**
+    * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string (BWT) with primary index in parallel using OpenMP.
+    * @param T [0..n-1] The input 16-bit string.
+    * @param U [0..n-1] The output 16-bit string (can be T).
+    * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
+    * @param n The length of the given 16-bit string.
+    * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL).
+    * @param i The primary index.
+    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+    * @return 0 if no error occurred, -1 or -2 otherwise.
+    */
+    LIBSAIS16X64_API int64_t libsais16x64_unbwt_omp(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t i, int64_t threads);
+
+    /**
+    * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string (BWT) with auxiliary indexes in parallel using OpenMP.
+    * @param T [0..n-1] The input 16-bit string.
+    * @param U [0..n-1] The output 16-bit string (can be T).
+    * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
+    * @param n The length of the given 16-bit string.
+    * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL).
+    * @param r The sampling rate for auxiliary indexes (must be power of 2).
+    * @param I [0..(n-1)/r] The input auxiliary indexes.
+    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+    * @return 0 if no error occurred, -1 or -2 otherwise.
+    */
+    LIBSAIS16X64_API int64_t libsais16x64_unbwt_aux_omp(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t r, const int64_t * I, int64_t threads);
+#endif
+
+    /**
+    * Constructs the permuted longest common prefix array (PLCP) of a given 16-bit string and a suffix array.
+    * @param T [0..n-1] The input 16-bit string.
+    * @param SA [0..n-1] The input suffix array.
+    * @param PLCP [0..n-1] The output permuted longest common prefix array.
+    * @param n The length of the 16-bit string and the suffix array.
+    * @return 0 if no error occurred, -1 otherwise.
+    */
+    LIBSAIS16X64_API int64_t libsais16x64_plcp(const uint16_t * T, const int64_t * SA, int64_t * PLCP, int64_t n);
+
+    /**
+    * Constructs the longest common prefix array (LCP) of a given permuted longest common prefix array (PLCP) and a suffix array.
+    * @param PLCP [0..n-1] The input permuted longest common prefix array.
+    * @param SA [0..n-1] The input suffix array.
+    * @param LCP [0..n-1] The output longest common prefix array (can be SA).
+    * @param n The length of the permuted longest common prefix array and the suffix array.
+    * @return 0 if no error occurred, -1 otherwise.
+    */
+    LIBSAIS16X64_API int64_t libsais16x64_lcp(const int64_t * PLCP, const int64_t * SA, int64_t * LCP, int64_t n);
+
+#if defined(LIBSAIS_OPENMP)
+    /**
+    * Constructs the permuted longest common prefix array (PLCP) of a given 16-bit string and a suffix array in parallel using OpenMP.
+    * @param T [0..n-1] The input 16-bit string.
+    * @param SA [0..n-1] The input suffix array.
+    * @param PLCP [0..n-1] The output permuted longest common prefix array.
+    * @param n The length of the 16-bit string and the suffix array.
+    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+    * @return 0 if no error occurred, -1 otherwise.
+    */
+    LIBSAIS16X64_API int64_t libsais16x64_plcp_omp(const uint16_t * T, const int64_t * SA, int64_t * PLCP, int64_t n, int64_t threads);
+
+    /**
+    * Constructs the longest common prefix array (LCP) of a given permuted longest common prefix array (PLCP) and a suffix array in parallel using OpenMP.
+    * @param PLCP [0..n-1] The input permuted longest common prefix array.
+    * @param SA [0..n-1] The input suffix array.
+    * @param LCP [0..n-1] The output longest common prefix array (can be SA).
+    * @param n The length of the permuted longest common prefix array and the suffix array.
+    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
+    * @return 0 if no error occurred, -1 otherwise.
+    */
+    LIBSAIS16X64_API int64_t libsais16x64_lcp_omp(const int64_t * PLCP, const int64_t * SA, int64_t * LCP, int64_t n, int64_t threads);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/libsais64.h b/include/libsais64.h
index 37e7ec5..8dc3b90 100644
--- a/include/libsais64.h
+++ b/include/libsais64.h
@@ -26,8 +26,8 @@ Please see the file LICENSE for full copyright information.
 
 #define LIBSAIS64_VERSION_MAJOR   2
 #define LIBSAIS64_VERSION_MINOR   8
-#define LIBSAIS64_VERSION_PATCH   2
-#define LIBSAIS64_VERSION_STRING  "2.8.2"
+#define LIBSAIS64_VERSION_PATCH   3
+#define LIBSAIS64_VERSION_STRING  "2.8.3"
 
 #ifdef _WIN32
     #ifdef LIBSAIS_SHARED
diff --git a/src/libsais16.c b/src/libsais16.c
index b1e8a61..960bafb 100644
--- a/src/libsais16.c
+++ b/src/libsais16.c
@@ -6506,6 +6506,19 @@ static sa_sint_t libsais16_main(const uint16_t * T, sa_sint_t * SA, sa_sint_t n,
     return index;
 }
 
+static sa_sint_t libsais16_main_int(sa_sint_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads)
+{   /* Internal driver for integer-array input: obtains per-thread state when needed, runs libsais16_main_32s_entry, then releases the state. */
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais16_alloc_thread_state(threads) : NULL; /* thread state is only allocated when more than one thread is requested */
+
+    sa_sint_t index = thread_state != NULL || threads == 1 /* run when single-threaded, or when the thread state was obtained */
+        ? libsais16_main_32s_entry(T, SA, n, k, fs, threads, thread_state)
+        : -2; /* no thread state and threads != 1 (e.g. the allocation above failed) */
+
+    libsais16_free_thread_state(thread_state); /* also reached with NULL on the single-threaded path */
+
+    return index;
+}
+
 static sa_sint_t libsais16_main_ctx(const LIBSAIS_CONTEXT * ctx, const uint16_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq)
 {
     return ctx != NULL && (ctx->buckets != NULL && (ctx->thread_state != NULL || ctx->threads == 1))
@@ -6591,6 +6604,21 @@ int32_t libsais16(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32
     return libsais16_main(T, SA, n, 0, 0, NULL, fs, freq, 1);
 }
 
+int32_t libsais16_int(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs)
+{   /* Public single-threaded entry point: validates arguments, answers n < 2 directly, otherwise delegates to libsais16_main_int. */
+    if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) /* NULL buffers and negative sizes yield -1; k is passed through unchecked */
+    {
+        return -1;
+    }
+    else if (n < 2)
+    {
+        if (n == 1) { SA[0] = 0; } /* a one-element input has a single suffix starting at 0 */
+        return 0; /* for n == 0 nothing is written to SA */
+    }
+
+    return libsais16_main_int(T, SA, n, k, fs, 1); /* last argument is the thread count: always 1 here */
+}
+
 int32_t libsais16_ctx(const void * ctx, const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq)
 {
     if ((ctx == NULL) || (T == NULL) || (SA == NULL) || (n < 0) || (fs < 0))
@@ -6751,6 +6779,23 @@ int32_t libsais16_omp(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, i
     return libsais16_main(T, SA, n, 0, 0, NULL, fs, freq, threads);
 }
 
+int32_t libsais16_int_omp(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs, int32_t threads)
+{   /* Multi-threaded counterpart of libsais16_int: same argument checks plus a thread count, then delegates to libsais16_main_int. */
+    if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0) || (threads < 0)) /* a negative thread count is rejected with -1; k is not validated here */
+    {
+        return -1;
+    }
+    else if (n < 2)
+    {
+        if (n == 1) { SA[0] = 0; } /* the only suffix of a one-element input starts at 0 */
+        return 0;
+    }
+
+    threads = threads > 0 ? threads : omp_get_max_threads(); /* threads == 0 means "use the OpenMP maximum" */
+
+    return libsais16_main_int(T, SA, n, k, fs, threads);
+}
+
 int32_t libsais16_bwt_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t threads)
 {
     if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (threads < 0))
diff --git a/src/libsais16x64.c b/src/libsais16x64.c
new file mode 100644
index 0000000..ffa39b7
--- /dev/null
+++ b/src/libsais16x64.c
@@ -0,0 +1,7605 @@
+/*--
+
+This file is a part of libsais, a library for linear time suffix array,
+longest common prefix array and burrows wheeler transform construction.
+
+   Copyright (c) 2021-2024 Ilya Grebnov <ilya.grebnov@gmail.com>
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+Please see the file LICENSE for full copyright information.
+
+--*/
+
+#include "libsais16.h"
+#include "libsais16x64.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+#if defined(LIBSAIS_OPENMP)
+    #include <omp.h>
+#else
+    #define UNUSED(_x)                  (void)(_x)
+#endif
+
+typedef int64_t                         sa_sint_t; /* signed 64-bit element type of this build */
+typedef uint64_t                        sa_uint_t; /* unsigned counterpart of sa_sint_t */
+typedef int64_t                         fast_sint_t; /* signed 64-bit type used for loop counters and sizes */
+typedef uint64_t                        fast_uint_t; /* unsigned counterpart of fast_sint_t */
+
+#define SAINT_BIT                       (64) /* bit width of sa_sint_t */
+#define SAINT_MAX                       INT64_MAX
+#define SAINT_MIN                       INT64_MIN
+
+#define ALPHABET_SIZE                   (1 << CHAR_BIT << CHAR_BIT) /* 65536 when CHAR_BIT is 8, i.e. one slot per 16-bit symbol value */
+#define UNBWT_FASTBITS                  (17)
+
+#define SUFFIX_GROUP_BIT                (SAINT_BIT - 1) /* 63 */
+#define SUFFIX_GROUP_MARKER             (((sa_sint_t)1) << (SUFFIX_GROUP_BIT - 1)) /* single bit set at position 62 */
+
+#define BUCKETS_INDEX2(_c, _s)          ((((fast_sint_t)_c) << 1) + (fast_sint_t)(_s)) /* flat index 2 * _c + _s */
+#define BUCKETS_INDEX4(_c, _s)          ((((fast_sint_t)_c) << 2) + (fast_sint_t)(_s)) /* flat index 4 * _c + _s */
+
+#define LIBSAIS_LOCAL_BUFFER_SIZE       (1024)
+#define LIBSAIS_PER_THREAD_CACHE_SIZE   (24576) /* LIBSAIS_THREAD_CACHE entries reserved per thread by libsais16x64_alloc_thread_state */
+
+typedef struct LIBSAIS_THREAD_CACHE
+{   /* One (symbol, index) pair; libsais16x64_alloc_thread_state reserves LIBSAIS_PER_THREAD_CACHE_SIZE of these per thread. */
+        sa_sint_t                       symbol; /* NOTE(review): presumably the symbol associated with the cached entry -- confirm against the code that fills the cache */
+        sa_sint_t                       index;  /* NOTE(review): presumably a suffix position or SA slot -- confirm against the code that fills the cache */
+} LIBSAIS_THREAD_CACHE;
+
+typedef union LIBSAIS_THREAD_STATE
+{   /* Per-thread working state; overlaying it with padding[64] makes every array element at least 64 bytes. */
+    struct
+    {
+        fast_sint_t                     position;
+        fast_sint_t                     count;
+
+        fast_sint_t                     m;
+        fast_sint_t                     last_lms_suffix;
+
+        sa_sint_t *                     buckets; /* this thread's slice of 4 * ALPHABET_SIZE sa_sint_t slots, assigned in libsais16x64_alloc_thread_state */
+        LIBSAIS_THREAD_CACHE *          cache;   /* this thread's slice of LIBSAIS_PER_THREAD_CACHE_SIZE entries, assigned in libsais16x64_alloc_thread_state */
+    } state;
+
+    uint8_t padding[64]; /* NOTE(review): presumably one cache line per thread to avoid false sharing -- confirm */
+} LIBSAIS_THREAD_STATE;
+
+typedef struct LIBSAIS_CONTEXT
+{   /* Groups a bucket array, per-thread state and a thread count. NOTE(review): its users are outside this view -- presumably backs a reusable-context API; confirm. */
+    sa_sint_t *                         buckets;
+    LIBSAIS_THREAD_STATE *              thread_state;
+    fast_sint_t                         threads;
+} LIBSAIS_CONTEXT;
+
+typedef struct LIBSAIS_UNBWT_CONTEXT
+{   /* Working buffers plus a thread count. NOTE(review): its users are outside this view -- presumably scratch state for the inverse BWT routines; confirm. */
+    sa_uint_t *                         bucket2;
+    uint16_t *                          fastbits; /* NOTE(review): presumably a lookup table sized from UNBWT_FASTBITS -- confirm */
+    sa_uint_t *                         buckets;
+    fast_sint_t                         threads;
+} LIBSAIS_UNBWT_CONTEXT;
+
+#if defined(__GNUC__) || defined(__clang__)
+    #define RESTRICT __restrict__
+#elif defined(_MSC_VER) || defined(__INTEL_COMPILER)
+    #define RESTRICT __restrict
+#else
+    #error Your compiler, configuration or platform is not supported.
+#endif
+
+#if defined(__has_builtin)
+    #if __has_builtin(__builtin_prefetch)
+        #define HAS_BUILTIN_PREFETCH
+    #endif
+#elif defined(__GNUC__) && (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 2)) || (__GNUC__ >= 4))
+    #define HAS_BUILTIN_PREFETCH
+#endif 
+
+#if defined(HAS_BUILTIN_PREFETCH)
+    #define libsais16x64_prefetchr(address) __builtin_prefetch((const void *)(address), 0, 3)
+    #define libsais16x64_prefetchw(address) __builtin_prefetch((const void *)(address), 1, 3)
+#elif defined (_M_IX86) || defined (_M_AMD64)
+    #include <intrin.h>
+    #define libsais16x64_prefetchr(address) _mm_prefetch((const void *)(address), _MM_HINT_T0)
+    #define libsais16x64_prefetchw(address) _m_prefetchw((const void *)(address))
+#elif defined (_M_ARM)
+    #include <intrin.h>
+    #define libsais16x64_prefetchr(address) __prefetch((const void *)(address))
+    #define libsais16x64_prefetchw(address) __prefetchw((const void *)(address))
+#elif defined (_M_ARM64)
+    #include <intrin.h>
+    #define libsais16x64_prefetchr(address) __prefetch2((const void *)(address), 0)
+    #define libsais16x64_prefetchw(address) __prefetch2((const void *)(address), 16)
+#else
+    #error Your compiler, configuration or platform is not supported.
+#endif
+
+#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)
+    #if defined(_LITTLE_ENDIAN) \
+            || (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && BYTE_ORDER == LITTLE_ENDIAN) \
+            || (defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && _BYTE_ORDER == _LITTLE_ENDIAN) \
+            || (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) \
+            || (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+        #define __LITTLE_ENDIAN__
+    #elif defined(_BIG_ENDIAN) \
+            || (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN) \
+            || (defined(_BYTE_ORDER) && defined(_BIG_ENDIAN) && _BYTE_ORDER == _BIG_ENDIAN) \
+            || (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) \
+            || (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+        #define __BIG_ENDIAN__
+    #elif defined(_WIN32)
+        #define __LITTLE_ENDIAN__
+    #endif
+#endif
+
+static void * libsais16x64_align_up(const void * address, size_t alignment)
+{   /* Rounds address up to the next multiple of alignment; the add-then-mask form is only correct when alignment is a power of two. */
+    return (void *)((((ptrdiff_t)address) + ((ptrdiff_t)alignment) - 1) & (-((ptrdiff_t)alignment))); /* -alignment acts as the mask that clears the low bits */
+}
+
+static void * libsais16x64_alloc_aligned(size_t size, size_t alignment)
+{   /* malloc-based aligned allocator: returns a pointer aligned to alignment, or NULL if malloc fails; release with libsais16x64_free_aligned. */
+    void * address = malloc(size + sizeof(short) + alignment - 1); /* over-allocate: room for a short offset header plus worst-case alignment padding */
+    if (address != NULL)
+    {
+        void * aligned_address = libsais16x64_align_up((void *)((ptrdiff_t)address + (ptrdiff_t)(sizeof(short))), alignment); /* skip the header first so it always fits below the aligned pointer */
+        ((short *)aligned_address)[-1] = (short)((ptrdiff_t)aligned_address - (ptrdiff_t)address); /* remember the distance back to the malloc pointer; it is at most sizeof(short) + alignment - 1 and must fit in a short */
+
+        return aligned_address;
+    }
+
+    return NULL;
+}
+
+static void libsais16x64_free_aligned(void * aligned_address) /* Frees a pointer produced by libsais16x64_alloc_aligned; NULL is ignored. */
+{
+    if (aligned_address == NULL) { return; }
+    /* The short stored just before the payload is the distance back to the pointer malloc returned. */
+    const short back_offset = ((short *)aligned_address)[-1];
+    free((void *)((ptrdiff_t)aligned_address - back_offset));
+}
+
+static LIBSAIS_THREAD_STATE * libsais16x64_alloc_thread_state(sa_sint_t threads)
+{
+    /* Three 4096-byte-aligned arrays, each sized for `threads` workers: the state records, the bucket pool and the cache pool. */
+    LIBSAIS_THREAD_STATE *  RESTRICT states      = (LIBSAIS_THREAD_STATE *)libsais16x64_alloc_aligned((size_t)threads * sizeof(LIBSAIS_THREAD_STATE), 4096);
+    sa_sint_t *             RESTRICT bucket_pool = (sa_sint_t *)libsais16x64_alloc_aligned((size_t)threads * 4 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
+    LIBSAIS_THREAD_CACHE *  RESTRICT cache_pool  = (LIBSAIS_THREAD_CACHE *)libsais16x64_alloc_aligned((size_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE * sizeof(LIBSAIS_THREAD_CACHE), 4096);
+
+    if (states == NULL || bucket_pool == NULL || cache_pool == NULL)
+    {
+        /* Partial failure: give back whatever was obtained (the aligned free ignores NULL) and report failure. */
+        libsais16x64_free_aligned(cache_pool);
+        libsais16x64_free_aligned(bucket_pool);
+        libsais16x64_free_aligned(states);
+        return NULL;
+    }
+    /* Thread t receives the t-th slice of each pool, so slot 0 keeps the pool base pointers. */
+    fast_sint_t t; for (t = 0; t < threads; t += 1)
+    {
+        states[t].state.buckets = bucket_pool + t * (4 * ALPHABET_SIZE);
+        states[t].state.cache   = cache_pool + t * (LIBSAIS_PER_THREAD_CACHE_SIZE);
+    }
+    return states;
+}
+
+static void libsais16x64_free_thread_state(LIBSAIS_THREAD_STATE * thread_state) /* Releases everything obtained by libsais16x64_alloc_thread_state; NULL is ignored. */
+{
+    if (thread_state == NULL) { return; }
+
+    /* Slot 0 carries the base pointers of the shared cache and bucket pools; the state array itself goes last. */
+    libsais16x64_free_aligned(thread_state[0].state.cache);
+    libsais16x64_free_aligned(thread_state[0].state.buckets);
+    libsais16x64_free_aligned(thread_state);
+}
+
+#if defined(LIBSAIS_OPENMP)
+
+static sa_sint_t libsais16x64_count_negative_marked_suffixes(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    /* Returns how many entries of SA[omp_block_start .. omp_block_start + omp_block_size) are negative. */
+    const fast_sint_t block_end = omp_block_start + omp_block_size;
+    sa_sint_t negatives = 0;
+    fast_sint_t pos; for (pos = omp_block_start; pos < block_end; pos += 1) { if (SA[pos] < 0) { negatives += 1; } }
+    return negatives;
+}
+
+static sa_sint_t libsais16x64_count_zero_marked_suffixes(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    /* Returns how many entries of SA[omp_block_start .. omp_block_start + omp_block_size) are exactly zero. */
+    const fast_sint_t block_end = omp_block_start + omp_block_size;
+    sa_sint_t zeros = 0;
+    fast_sint_t pos; for (pos = omp_block_start; pos < block_end; pos += 1) { if (SA[pos] == 0) { zeros += 1; } }
+    return zeros;
+}
+
+static void libsais16x64_place_cached_suffixes(sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+    const fast_sint_t block_end         = omp_block_start + omp_block_size;
+    const fast_sint_t unrolled_end      = block_end - prefetch_distance - 3;
+    /* For every cache entry in the block, in ascending order: SA[entry.symbol] = entry.index. Main loop handles four entries per pass with prefetching. */
+    fast_sint_t pos; for (pos = omp_block_start; pos < unrolled_end; pos += 4)
+    {
+        libsais16x64_prefetchr(&cache[pos + 2 * prefetch_distance]);
+        libsais16x64_prefetchw(&SA[cache[pos + prefetch_distance + 0].symbol]);
+        libsais16x64_prefetchw(&SA[cache[pos + prefetch_distance + 1].symbol]);
+        libsais16x64_prefetchw(&SA[cache[pos + prefetch_distance + 2].symbol]);
+        libsais16x64_prefetchw(&SA[cache[pos + prefetch_distance + 3].symbol]);
+
+        SA[cache[pos + 0].symbol] = cache[pos + 0].index;
+        SA[cache[pos + 1].symbol] = cache[pos + 1].index;
+        SA[cache[pos + 2].symbol] = cache[pos + 2].index;
+        SA[cache[pos + 3].symbol] = cache[pos + 3].index;
+    }
+    /* Remaining entries (the last prefetch_distance + 3 at most, plus any short block) one at a time, without prefetching. */
+    for (; pos < block_end; pos += 1)
+    {
+        SA[cache[pos].symbol] = cache[pos].index;
+    }
+}
+
+static void libsais16x64_compact_and_place_cached_suffixes(sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+    const fast_sint_t block_end         = omp_block_start + omp_block_size;
+    /* In-place compaction: each entry is copied to slot dst, and dst only advances past entries whose symbol is non-negative, so the others get overwritten. */
+    fast_sint_t src, dst = omp_block_start;
+    for (src = omp_block_start; src < block_end - 3; src += 4)
+    {
+        libsais16x64_prefetchw(&cache[src + prefetch_distance]);
+        cache[dst] = cache[src + 0]; dst += (cache[dst].symbol >= 0);
+        cache[dst] = cache[src + 1]; dst += (cache[dst].symbol >= 0);
+        cache[dst] = cache[src + 2]; dst += (cache[dst].symbol >= 0);
+        cache[dst] = cache[src + 3]; dst += (cache[dst].symbol >= 0);
+    }
+    /* Up to three leftover entries. */
+    for (; src < block_end; src += 1)
+    {
+        cache[dst] = cache[src]; dst += (cache[dst].symbol >= 0);
+    }
+
+    libsais16x64_place_cached_suffixes(SA, cache, omp_block_start, dst - omp_block_start);
+}
+
+static void libsais16x64_accumulate_counts_s32_2(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) /* Adds the bucket located one stride before bucket00 into bucket00, element by element. */
+{
+    const sa_sint_t * RESTRICT prev1 = bucket00 - bucket_stride;
+    fast_sint_t idx = 0; while (idx < bucket_size) { bucket00[idx] = bucket00[idx] + prev1[idx]; idx += 1; }
+}
+
+static void libsais16x64_accumulate_counts_s32_3(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) /* Adds the two buckets located one and two strides before bucket00 into bucket00, element by element. */
+{
+    const sa_sint_t * RESTRICT prev1 = bucket00 - bucket_stride;
+    const sa_sint_t * RESTRICT prev2 = prev1 - bucket_stride;
+    fast_sint_t idx = 0; while (idx < bucket_size) { bucket00[idx] = bucket00[idx] + prev1[idx] + prev2[idx]; idx += 1; }
+}
+
+static void libsais16x64_accumulate_counts_s32_4(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) /* Adds the three buckets located one to three strides before bucket00 into bucket00, element by element. */
+{
+    const sa_sint_t * RESTRICT prev1 = bucket00 - bucket_stride;
+    const sa_sint_t * RESTRICT prev2 = prev1 - bucket_stride;
+    const sa_sint_t * RESTRICT prev3 = prev2 - bucket_stride;
+    fast_sint_t idx = 0; while (idx < bucket_size) { bucket00[idx] = bucket00[idx] + prev1[idx] + prev2[idx] + prev3[idx]; idx += 1; }
+}
+
+static void libsais16x64_accumulate_counts_s32_5(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) /* Adds the four buckets located one to four strides before bucket00 into bucket00, element by element. */
+{
+    const sa_sint_t * RESTRICT prev1 = bucket00 - bucket_stride;
+    const sa_sint_t * RESTRICT prev2 = prev1 - bucket_stride;
+    const sa_sint_t * RESTRICT prev3 = prev2 - bucket_stride;
+    const sa_sint_t * RESTRICT prev4 = prev3 - bucket_stride;
+    fast_sint_t idx = 0; while (idx < bucket_size) { bucket00[idx] = bucket00[idx] + prev1[idx] + prev2[idx] + prev3[idx] + prev4[idx]; idx += 1; }
+}
+
+static void libsais16x64_accumulate_counts_s32_6(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) /* Adds the five buckets located one to five strides before bucket00 into bucket00, element by element. */
+{
+    const sa_sint_t * RESTRICT prev1 = bucket00 - bucket_stride;
+    const sa_sint_t * RESTRICT prev2 = prev1 - bucket_stride;
+    const sa_sint_t * RESTRICT prev3 = prev2 - bucket_stride;
+    const sa_sint_t * RESTRICT prev4 = prev3 - bucket_stride;
+    const sa_sint_t * RESTRICT prev5 = prev4 - bucket_stride;
+    fast_sint_t idx = 0; while (idx < bucket_size) { bucket00[idx] = bucket00[idx] + prev1[idx] + prev2[idx] + prev3[idx] + prev4[idx] + prev5[idx]; idx += 1; }
+}
+
+static void libsais16x64_accumulate_counts_s32_7(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) /* Adds the six buckets located one to six strides before bucket00 into bucket00, element by element. */
+{
+    const sa_sint_t * RESTRICT prev1 = bucket00 - bucket_stride;
+    const sa_sint_t * RESTRICT prev2 = prev1 - bucket_stride;
+    const sa_sint_t * RESTRICT prev3 = prev2 - bucket_stride;
+    const sa_sint_t * RESTRICT prev4 = prev3 - bucket_stride;
+    const sa_sint_t * RESTRICT prev5 = prev4 - bucket_stride;
+    const sa_sint_t * RESTRICT prev6 = prev5 - bucket_stride;
+    fast_sint_t idx = 0; while (idx < bucket_size) { bucket00[idx] = bucket00[idx] + prev1[idx] + prev2[idx] + prev3[idx] + prev4[idx] + prev5[idx] + prev6[idx]; idx += 1; }
+}
+
+static void libsais16x64_accumulate_counts_s32_8(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) /* Adds the seven buckets located one to seven strides before bucket00 into bucket00, element by element. */
+{
+    const sa_sint_t * RESTRICT prev1 = bucket00 - bucket_stride;
+    const sa_sint_t * RESTRICT prev2 = prev1 - bucket_stride;
+    const sa_sint_t * RESTRICT prev3 = prev2 - bucket_stride;
+    const sa_sint_t * RESTRICT prev4 = prev3 - bucket_stride;
+    const sa_sint_t * RESTRICT prev5 = prev4 - bucket_stride;
+    const sa_sint_t * RESTRICT prev6 = prev5 - bucket_stride;
+    const sa_sint_t * RESTRICT prev7 = prev6 - bucket_stride;
+    fast_sint_t idx = 0; while (idx < bucket_size) { bucket00[idx] = bucket00[idx] + prev1[idx] + prev2[idx] + prev3[idx] + prev4[idx] + prev5[idx] + prev6[idx] + prev7[idx]; idx += 1; }
+}
+
+static void libsais16x64_accumulate_counts_s32_9(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) /* Adds the eight buckets located one to eight strides before bucket00 into bucket00, element by element. */
+{
+    const sa_sint_t * RESTRICT prev1 = bucket00 - bucket_stride;
+    const sa_sint_t * RESTRICT prev2 = prev1 - bucket_stride;
+    const sa_sint_t * RESTRICT prev3 = prev2 - bucket_stride;
+    const sa_sint_t * RESTRICT prev4 = prev3 - bucket_stride;
+    const sa_sint_t * RESTRICT prev5 = prev4 - bucket_stride;
+    const sa_sint_t * RESTRICT prev6 = prev5 - bucket_stride;
+    const sa_sint_t * RESTRICT prev7 = prev6 - bucket_stride;
+    const sa_sint_t * RESTRICT prev8 = prev7 - bucket_stride;
+    fast_sint_t idx = 0; while (idx < bucket_size) { bucket00[idx] = bucket00[idx] + prev1[idx] + prev2[idx] + prev3[idx] + prev4[idx] + prev5[idx] + prev6[idx] + prev7[idx] + prev8[idx]; idx += 1; }
+}
+
+static void libsais16x64_accumulate_counts_s32(sa_sint_t * RESTRICT buckets, fast_sint_t bucket_size, fast_sint_t bucket_stride, fast_sint_t num_buckets)
+{
+    /* While nine or more buckets remain, fold the nine farthest ones into one (eight fewer afterwards); then a single fixed-width call finishes into `buckets`. */
+    for (; num_buckets >= 9; num_buckets -= 8)
+    {
+        libsais16x64_accumulate_counts_s32_9(buckets - (num_buckets - 9) * bucket_stride, bucket_size, bucket_stride);
+    }
+    switch (num_buckets)
+    {
+        case 8: libsais16x64_accumulate_counts_s32_8(buckets, bucket_size, bucket_stride); break;
+        case 7: libsais16x64_accumulate_counts_s32_7(buckets, bucket_size, bucket_stride); break;
+        case 6: libsais16x64_accumulate_counts_s32_6(buckets, bucket_size, bucket_stride); break;
+        case 5: libsais16x64_accumulate_counts_s32_5(buckets, bucket_size, bucket_stride); break;
+        case 4: libsais16x64_accumulate_counts_s32_4(buckets, bucket_size, bucket_stride); break;
+        case 3: libsais16x64_accumulate_counts_s32_3(buckets, bucket_size, bucket_stride); break;
+        case 2: libsais16x64_accumulate_counts_s32_2(buckets, bucket_size, bucket_stride); break;
+        default: break; /* a single bucket (or none) needs no accumulation */
+    }
+}
+
+#endif
+
+static void libsais16x64_gather_lms_suffixes_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, fast_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) /* Scans T[omp_block_start .. omp_block_start + omp_block_size) right to left and records the selected (LMS) positions in SA, filling downward from slot m. */
+{
+    if (omp_block_size > 0)
+    {
+        const fast_sint_t prefetch_distance = 128;
+        /* c0 = last symbol of the block. After the loop below c1 is the first symbol right of the block that differs from c0, c0 itself if the run reaches n, or -1 if the block ends at n. */
+        fast_sint_t i, j = omp_block_start + omp_block_size, c0 = T[omp_block_start + omp_block_size - 1], c1 = -1;
+
+        while (j < n && (c1 = T[j]) == c0) { ++j; }
+        /* s is a bit history, newest bit lowest: a step shifts in 1 when the symbol is greater than its right neighbour; on a tie the previous bit is repeated. */
+        fast_uint_t s = c0 >= c1;
+        /* Four positions per pass. Each store to SA[m] is tentative: m only moves down when the low two bits of s are 01, otherwise the next store overwrites the slot. */
+        for (i = omp_block_start + omp_block_size - 2, j = omp_block_start + 3; i >= j; i -= 4)
+        {
+            libsais16x64_prefetchr(&T[i - prefetch_distance]);
+
+            c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+            c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1);
+            c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1);
+            c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1);
+        }
+        /* Same step, one position at a time, down to the first position of the block. */
+        for (j -= 3; i >= j; i -= 1)
+        {
+            c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+        }
+        /* Here i == omp_block_start - 1: the block's first position is stored unconditionally in the current slot, without moving m. */
+        SA[m] = (sa_sint_t)(i + 1);
+    }
+}
+
+static void libsais16x64_gather_lms_suffixes_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) /* Gathers the selected positions of T[0..n) into the tail of SA (ending at slot n - 1), splitting the text into per-thread blocks when OpenMP is active. */
+{
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && omp_get_dynamic() == 0)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16); /* equal share per thread, rounded down to a multiple of 16 */
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; /* the last thread also takes the remainder */
+        /* Single thread: one block covering the whole text, output starting at slot n - 1. */
+        if (omp_num_threads == 1)
+        {
+            libsais16x64_gather_lms_suffixes_16u(T, SA, n, (fast_sint_t)n - 1, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            fast_sint_t t, m = 0; for (t = omp_num_threads - 1; t > omp_thread_num; --t) { m += thread_state[t].state.m; }
+            /* m = sum of state.m over the higher-numbered threads, so this thread's output starts at slot n - 1 - m. state.m is only read here; it must have been filled in before this call. */
+            libsais16x64_gather_lms_suffixes_16u(T, SA, n, (fast_sint_t)n - 1 - m, omp_block_start, omp_block_size);
+
+            #pragma omp barrier
+            /* Once every thread is done, rewrite this thread's first slot from the saved last_lms_suffix - presumably because a neighbouring thread's final unconditional store can land on it (TODO confirm). */
+            if (thread_state[omp_thread_num].state.m > 0)
+            {
+                SA[(fast_sint_t)n - 1 - m] = (sa_sint_t)thread_state[omp_thread_num].state.last_lms_suffix;
+            }
+        }
+#endif
+    }
+}
+
+static sa_sint_t libsais16x64_gather_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n) /* Scans T[0..n) right to left, records the selected (LMS) positions in SA downward from slot n - 1, and returns how many slots were kept. */
+{
+    const fast_sint_t prefetch_distance = 32;
+    /* s is a bit history (newest bit lowest) and starts at 1 for the last symbol T[n - 1]; m is the next output slot. */
+    sa_sint_t             i   = n - 2;
+    sa_sint_t             m   = n - 1;
+    fast_uint_t           s   = 1;
+    fast_sint_t           c0  = T[n - 1];
+    fast_sint_t           c1  = 0;
+    /* Each step shifts in 1 when the symbol is greater than its right neighbour (a tie repeats the previous bit); the tentative store to SA[m] is kept only when the low two bits are 01. */
+    for (; i >= 3; i -= 4)
+    {
+        libsais16x64_prefetchr(&T[i - prefetch_distance]);
+
+        c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1);
+        c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 0; m -= ((s & 3) == 1);
+        c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i - 1; m -= ((s & 3) == 1);
+        c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 2; m -= ((s & 3) == 1);
+    }
+    /* Remaining positions (at most three), one per step. */
+    for (; i >= 0; i -= 1)
+    {
+        c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1);
+    }
+    /* m started at n - 1 and dropped once per kept position. */
+    return n - 1 - m;
+}
+
+static sa_sint_t libsais16x64_gather_compacted_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n) /* Variant of the 32s gather whose keep-test also depends on the sign of the symbol at the candidate position; returns how many slots were kept. */
+{
+    const fast_sint_t prefetch_distance = 32;
+    /* s is a bit history (newest bit lowest) and starts at 1 for the last symbol T[n - 1]; m is the next output slot, moving down from n - 1. */
+    sa_sint_t             i   = n - 2;
+    sa_sint_t             m   = n - 1;
+    fast_uint_t           s   = 1;
+    fast_sint_t           c0  = T[n - 1];
+    fast_sint_t           c1  = 0;
+    /* A tentative store is kept when (s & 3) equals (c >= 0), c being the symbol at the stored position: pattern 01 for a non-negative symbol, pattern 00 for a negative one. */
+    for (; i >= 3; i -= 4)
+    {
+        libsais16x64_prefetchr(&T[i - prefetch_distance]);
+
+        c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
+        c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 0; m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+        c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i - 1; m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
+        c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 2; m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+    }
+    /* Remaining positions (at most three), one per step. */
+    for (; i >= 0; i -= 1)
+    {
+        c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+    }
+    /* m started at n - 1 and dropped once per kept position. */
+    return n - 1 - m;
+}
+
+#if defined(LIBSAIS_OPENMP)
+
+static void libsais16x64_count_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) /* Zeroes the 4 * k counters in buckets, then counts every position of T[0..n) under BUCKETS_INDEX4(symbol, low two bits of the type history). */
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t));
+    /* s is a bit history (newest bit lowest) and starts at 1 for the last symbol T[n - 1]; the scan runs right to left. */
+    sa_sint_t             i   = n - 2;
+    fast_uint_t           s   = 1;
+    fast_sint_t           c0  = T[n - 1];
+    fast_sint_t           c1  = 0;
+    /* The bound i >= prefetch_distance + 3 keeps the reads of T[i - prefetch_distance - 3 ..] used for bucket prefetching at non-negative indices. */
+    for (; i >= prefetch_distance + 3; i -= 4)
+    {
+        libsais16x64_prefetchr(&T[i - 2 * prefetch_distance]);
+
+        libsais16x64_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]);
+        libsais16x64_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]);
+        libsais16x64_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]);
+        libsais16x64_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]);
+        /* Each step shifts the bit for the newly read symbol into s, then counts the symbol one position to its right under the two newest bits. */
+        c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+        buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
+
+        c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+
+        c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+        buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
+
+        c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+    }
+    /* Remaining positions, one per step and without prefetching. */
+    for (; i >= 0; i -= 1)
+    {
+        c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+    }
+    /* Position 0 has no left neighbour: it is counted with a 0 shifted in as the newest bit. */
+    buckets[BUCKETS_INDEX4((fast_uint_t)c0, (s << 1) & 3)]++;
+}
+
+#endif
+
+static void libsais16x64_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) /* Zeroes the 2 * k counters in buckets, then counts every position of T[0..n) under BUCKETS_INDEX2(symbol, flag), where flag is 1 for the 01 (LMS) bit pattern and 0 otherwise. */
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
+    /* s is a bit history (newest bit lowest) and starts at 1 for the last symbol T[n - 1]; the scan runs right to left. */
+    sa_sint_t             i   = n - 2;
+    fast_uint_t           s   = 1;
+    fast_sint_t           c0  = T[n - 1];
+    fast_sint_t           c1  = 0;
+    /* The bound i >= prefetch_distance + 3 keeps the reads of T[i - prefetch_distance - 3 ..] used for bucket prefetching at non-negative indices. */
+    for (; i >= prefetch_distance + 3; i -= 4)
+    {
+        libsais16x64_prefetchr(&T[i - 2 * prefetch_distance]);
+
+        libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]);
+        libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]);
+        libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]);
+        libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]);
+        /* Each step shifts the bit for the newly read symbol into s, then counts the symbol one position to its right. */
+        c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+        buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+        c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+
+        c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+        buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+        c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+    }
+    /* Remaining positions, one per step and without prefetching. */
+    for (; i >= 0; i -= 1)
+    {
+        c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+    }
+    /* Position 0 is always counted with flag 0. */
+    buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++;
+}
+
+#if defined(LIBSAIS_OPENMP)
+
+static void libsais16x64_count_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) /* Same counting as the plain 2k version, except that every symbol is masked with SAINT_MAX before it is used as a bucket index (the unmasked value is still used for the comparisons). */
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
+    /* s is a bit history (newest bit lowest) and starts at 1 for the last symbol T[n - 1]; the scan runs right to left. */
+    sa_sint_t             i   = n - 2;
+    fast_uint_t           s   = 1;
+    fast_sint_t           c0  = T[n - 1];
+    fast_sint_t           c1  = 0;
+    /* The bound i >= prefetch_distance + 3 keeps the reads of T[i - prefetch_distance - 3 ..] used for bucket prefetching at non-negative indices. */
+    for (; i >= prefetch_distance + 3; i -= 4)
+    {
+        libsais16x64_prefetchr(&T[i - 2 * prefetch_distance]);
+
+        libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]);
+        libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]);
+        libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]);
+        libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]);
+        /* Order matters: the comparison uses the raw right-hand symbol first; only afterwards is that symbol masked (presumably clearing a marker bit - TODO confirm SAINT_MAX) and counted. */
+        c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+        c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+        c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+
+        c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+        c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+        c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+    }
+    /* Remaining positions, one per step and without prefetching. */
+    for (; i >= 0; i -= 1)
+    {
+        c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+    }
+    /* Position 0 is always counted with flag 0. */
+    c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++;
+}
+
+#endif
+
+static sa_sint_t libsais16x64_count_and_gather_lms_suffixes_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) /* One right-to-left pass over the block that both counts its positions into buckets (4 * ALPHABET_SIZE counters, zeroed first) and gathers the selected (LMS) positions into SA; returns the number gathered. */
+{
+    memset(buckets, 0, (size_t)4 * ALPHABET_SIZE * sizeof(sa_sint_t));
+    /* Output slots are the block's own SA range, filled downward from its last slot. */
+    fast_sint_t m = omp_block_start + omp_block_size - 1;
+
+    if (omp_block_size > 0)
+    {
+        const fast_sint_t prefetch_distance = 128;
+        /* c0 = last symbol of the block. After the loop below c1 is the first symbol right of the block that differs from c0, c0 itself if the run reaches n, or -1 if the block ends at n. */
+        fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;
+
+        while (j < n && (c1 = T[j]) == c0) { ++j; }
+        /* s is a bit history, newest bit lowest: a step shifts in 1 when the symbol is greater than its right neighbour; on a tie the previous bit is repeated. */
+        fast_uint_t s = c0 >= c1;
+        /* Per step: tentative store to SA[m] (kept only for bit pattern 01), then the right-hand symbol is counted under the two newest bits. */
+        for (i = m - 1, j = omp_block_start + 3; i >= j; i -= 4)
+        {
+            libsais16x64_prefetchr(&T[i - prefetch_distance]);
+
+            c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+            buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
+
+            c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1);
+            buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+
+            c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1);
+            buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
+
+            c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1);
+            buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+        }
+        /* Same step, one position at a time, down to the first position of the block. */
+        for (j -= 3; i >= j; i -= 1)
+        {
+            c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+            buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+        }
+        /* The block's first position is classified against the symbol just left of the block, or against -1 when the block starts at index 0. */
+        c1 = (i >= 0) ? T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+        buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
+    }
+    /* m dropped once per kept position. */
+    return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
+}
+
+static sa_sint_t libsais16x64_count_and_gather_lms_suffixes_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) /* Counts and gathers over T[0..n), per-thread when OpenMP is active; returns the total number of gathered positions, which end up contiguous at the tail of SA. */
+{
+    sa_sint_t m = 0;
+
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && omp_get_dynamic() == 0)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16); /* equal share per thread, rounded down to a multiple of 16 */
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; /* the last thread also takes the remainder */
+        /* Single thread: one block covering the whole text, counted straight into the caller's buckets. */
+        if (omp_num_threads == 1)
+        {
+            m = libsais16x64_count_and_gather_lms_suffixes_16u(T, SA, n, buckets, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            {
+                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
+                thread_state[omp_thread_num].state.m = libsais16x64_count_and_gather_lms_suffixes_16u(T, SA, n, thread_state[omp_thread_num].state.buckets, omp_block_start, omp_block_size);
+                /* Each thread works on its own block with its private buckets; position is the block end. The value in the block's last SA slot is saved as last_lms_suffix. */
+                if (thread_state[omp_thread_num].state.m > 0)
+                {
+                    thread_state[omp_thread_num].state.last_lms_suffix = SA[thread_state[omp_thread_num].state.position - 1];
+                }
+            }
+
+            #pragma omp barrier
+            /* Merge step, executed by the master thread only, after every thread has finished its block. */
+            #pragma omp master
+            {
+                memset(buckets, 0, (size_t)4 * ALPHABET_SIZE * sizeof(sa_sint_t));
+                /* Threads are visited from last to first while m accumulates the gathered total. */
+                fast_sint_t t;
+                for (t = omp_num_threads - 1; t >= 0; --t)
+                {
+                    m += (sa_sint_t)thread_state[t].state.m;
+                    /* The last thread's run already ends at SA[n - 1]; every other run is copied so that it starts at SA[n - m], directly below the runs placed so far. */
+                    if (t != omp_num_threads - 1 && thread_state[t].state.m > 0)
+                    {
+                        memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.m], (size_t)thread_state[t].state.m * sizeof(sa_sint_t));
+                    }
+                    /* buckets[] becomes the sum over all threads; each private array is overwritten with the running total of the threads visited before it (the higher-numbered ones). */
+                    {
+                        sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
+                        fast_sint_t s; for (s = 0; s < 4 * ALPHABET_SIZE; s += 1) { sa_sint_t A = buckets[s], B = temp_bucket[s]; buckets[s] = A + B; temp_bucket[s] = A; }
+                    }
+                }
+            }
+        }
+#endif
+    }
+
+    return m;
+}
+
+static sa_sint_t libsais16x64_count_and_gather_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) /* One right-to-left pass over the block that counts its positions into buckets (4 * k counters, zeroed first) and gathers the selected (LMS) positions into SA; returns the number gathered. */
+{
+    memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t));
+    /* Output slots are the block's own SA range, filled downward from its last slot. */
+    fast_sint_t m = omp_block_start + omp_block_size - 1;
+
+    if (omp_block_size > 0)
+    {
+        const fast_sint_t prefetch_distance = 32;
+        /* c0 = last symbol of the block. After the loop below c1 is the first symbol right of the block that differs from c0, c0 itself if the run reaches n, or -1 if the block ends at n. */
+        fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;
+
+        while (j < n && (c1 = T[j]) == c0) { ++j; }
+        /* s is a bit history, newest bit lowest: a step shifts in 1 when the symbol is greater than its right neighbour; on a tie the previous bit is repeated. */
+        fast_uint_t s = c0 >= c1;
+        /* The unrolled loop stops prefetch_distance + 3 positions above the block start, so the T reads used for bucket prefetching stay inside the block. */
+        for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
+        {
+            libsais16x64_prefetchr(&T[i - 2 * prefetch_distance]);
+
+            libsais16x64_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]);
+            libsais16x64_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]);
+            libsais16x64_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]);
+            libsais16x64_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]);
+            /* Per step: tentative store to SA[m] (kept only for bit pattern 01), then the right-hand symbol is counted under the two newest bits. */
+            c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+            buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
+
+            c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1);
+            buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+
+            c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1);
+            buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
+
+            c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1);
+            buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+        }
+        /* Same step, one position at a time and without prefetching, down to the first position of the block. */
+        for (j -= prefetch_distance + 3; i >= j; i -= 1)
+        {
+            c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+            buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+        }
+        /* The block's first position is classified against the symbol just left of the block, or against -1 when the block starts at index 0. */
+        c1 = (i >= 0) ? T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+        buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
+    }
+    /* m dropped once per kept position. */
+    return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
+}
+
+static sa_sint_t libsais16x64_count_and_gather_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) /* One right-to-left pass over the block that counts its positions into buckets (2 * k counters, zeroed first; flag 1 for the LMS bit pattern, else 0) and gathers the selected positions into SA; returns the number gathered. */
+{
+    memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
+    /* Output slots are the block's own SA range, filled downward from its last slot. */
+    fast_sint_t m = omp_block_start + omp_block_size - 1;
+
+    if (omp_block_size > 0)
+    {
+        const fast_sint_t prefetch_distance = 32;
+        /* c0 = last symbol of the block. After the loop below c1 is the first symbol right of the block that differs from c0, c0 itself if the run reaches n, or -1 if the block ends at n. */
+        fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;
+
+        while (j < n && (c1 = T[j]) == c0) { ++j; }
+        /* s is a bit history, newest bit lowest: a step shifts in 1 when the symbol is greater than its right neighbour; on a tie the previous bit is repeated. */
+        fast_uint_t s = c0 >= c1;
+        /* The unrolled loop stops prefetch_distance + 3 positions above the block start, so the T reads used for bucket prefetching stay inside the block. */
+        for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
+        {
+            libsais16x64_prefetchr(&T[i - 2 * prefetch_distance]);
+
+            libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]);
+            libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]);
+            libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]);
+            libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]);
+            /* Per step: tentative store to SA[m] (kept only for bit pattern 01), then the right-hand symbol is counted with flag ((s & 3) == 1). */
+            c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+            buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+            c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1);
+            buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+
+            c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1);
+            buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+            c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1);
+            buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+        }
+        /* Same step, one position at a time and without prefetching, down to the first position of the block. */
+        for (j -= prefetch_distance + 3; i >= j; i -= 1)
+        {
+            c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+            buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+        }
+        /* The block's first position is classified against the symbol just left of the block, or against -1 when the block starts at index 0. */
+        c1 = (i >= 0) ? T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+        buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+    }
+    /* m dropped once per kept position. */
+    return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
+}
+
+/*
+ * Counts symbols and gathers suffix positions for the block
+ * [omp_block_start, omp_block_start + omp_block_size) of a 32-bit text.
+ *
+ * The first 2 * k entries of buckets are zeroed, then the block is scanned
+ * from its last position towards its first. Candidate positions are written
+ * into SA starting at the last index of the block and moving downwards.
+ * Symbols are masked with SAINT_MAX before they are used as bucket indices
+ * (presumably because the top bit is used as a mark in the "compacted"
+ * representation - confirm against the code that produces T).
+ *
+ * Returns the number of positions kept in SA for this block.
+ */
 static sa_sint_t libsais16x64_count_and_gather_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
+
+    /* m is the SA slot that receives the next candidate; it starts at the last index of the block. */
+    fast_sint_t m = omp_block_start + omp_block_size - 1;
+
+    if (omp_block_size > 0)
+    {
+        const fast_sint_t prefetch_distance = 32;
+
+        fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;
+
+        /* Skip forward over symbols equal to T[m]; c1 keeps the last symbol read (or -1 if none was read). */
+        while (j < n && (c1 = T[j]) == c0) { ++j; }
+
+        /* Bit 0 of s holds the latest comparison result; each step below shifts the history left by one. */
+        fast_uint_t s = c0 >= c1;
+
+        /*
+         * Main loop, unrolled four times. Each step reads the next symbol to the
+         * left, pushes a comparison bit into s, stores a candidate index at SA[m]
+         * and moves m down only when (s & 3) equals (c >= 0); otherwise the same
+         * slot is overwritten by the following step. The bucket counter selected
+         * by the masked symbol and by ((s & 3) == 1) is then incremented.
+         */
+        for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
+        {
+            libsais16x64_prefetchr(&T[i - 2 * prefetch_distance]);
+
+            libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]);
+            libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]);
+            libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]);
+            libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]);
+
+            c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
+            c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+            c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+            c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+
+            c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
+            c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+            c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+            c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+        }
+
+        /* Remaining positions down to omp_block_start, one symbol per iteration. */
+        for (j -= prefetch_distance + 3; i >= j; i -= 1)
+        {
+            c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+            c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+        }
+
+        /* Last step: compare against the symbol just before the block, or -1 when i is negative. */
+        c1 = (i >= 0) ? T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
+        c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+    }
+
+    /* Distance m travelled from its starting slot, i.e. the number of kept entries. */
+    return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
+}
+
+#if defined(LIBSAIS_OPENMP)
+
+/*
+ * Chooses the spacing between consecutive per-thread bucket arrays.
+ *
+ * The free space available per gap, free_space / (num_buckets - 1), is
+ * compared with bucket_size rounded up to a multiple of 1024 and then to a
+ * multiple of 16. The first rounded size that fits is returned; if neither
+ * fits, bucket_size itself is returned. num_buckets must not be 1.
+ */
+static fast_sint_t libsais16x64_get_bucket_stride(fast_sint_t free_space, fast_sint_t bucket_size, fast_sint_t num_buckets)
+{
+    const fast_sint_t space_per_gap   = free_space / (num_buckets - 1);
+    const fast_sint_t rounded_to_1024 = (bucket_size + 1023) & (-1024);
+    const fast_sint_t rounded_to_16   = (bucket_size + 15) & (-16);
+
+    if (space_per_gap >= rounded_to_1024)
+    {
+        return rounded_to_1024;
+    }
+
+    if (space_per_gap >= rounded_to_16)
+    {
+        return rounded_to_16;
+    }
+
+    return bucket_size;
+}
+
+/*
+ * Multi-threaded driver around libsais16x64_count_and_gather_lms_suffixes_32s_4k.
+ *
+ * T is split into per-thread blocks whose stride is a multiple of 16; the last
+ * thread also takes the remainder. With a team of one the helper is called
+ * directly on buckets. Otherwise every thread counts into its own bucket array
+ * located omp_thread_num * bucket_stride entries below buckets. After a
+ * barrier the last thread packs the gathered entries at the end of SA[0..n)
+ * while the remaining threads call libsais16x64_accumulate_counts_s32 on
+ * disjoint slices of the bucket array (presumably summing the per-thread
+ * copies - confirm against that helper).
+ *
+ * Returns the total number of gathered entries.
+ */
 static sa_sint_t libsais16x64_count_and_gather_lms_suffixes_32s_4k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+    sa_sint_t m = 0;
+
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        /* Block stride is rounded down to a multiple of 16; the last thread covers everything up to n. */
+        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+
+        if (omp_num_threads == 1)
+        {
+            m = libsais16x64_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            /* The free space handed to the stride helper is the distance from &SA[n] to buckets. */
+            fast_sint_t bucket_size       = 4 * (fast_sint_t)k;
+            fast_sint_t bucket_stride     = libsais16x64_get_bucket_stride(buckets - &SA[n], bucket_size, omp_num_threads);
+
+            {
+                /* Record where this thread's block ends and how many entries it gathered. */
+                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
+                thread_state[omp_thread_num].state.count = libsais16x64_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size);
+            }
+
+            #pragma omp barrier
+
+            if (omp_thread_num == omp_num_threads - 1)
+            {
+                /* Last thread: walk the threads from last to first, copying each earlier thread's entries so they end at SA[n - m + count). */
+                fast_sint_t t;
+                for (t = omp_num_threads - 1; t >= 0; --t)
+                {
+                    m += (sa_sint_t)thread_state[t].state.count;
+
+                    /* The last thread's own entries are not copied. */
+                    if (t != omp_num_threads - 1 && thread_state[t].state.count > 0)
+                    {
+                        memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.count], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
+                    }
+                }
+            }
+            else
+            {
+                /* Other threads: split the bucket index range among (team size - 1) workers and accumulate counts. */
+                omp_num_threads     = omp_num_threads - 1;
+                omp_block_stride    = (bucket_size / omp_num_threads) & (-16);
+                omp_block_start     = omp_thread_num * omp_block_stride;
+                omp_block_size      = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : bucket_size - omp_block_start;
+
+                libsais16x64_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads + 1);
+            }
+        }
+#endif
+    }
+
+    return m;
+}
+
+/*
+ * Multi-threaded driver around libsais16x64_count_and_gather_lms_suffixes_32s_2k.
+ *
+ * T is split into per-thread blocks whose stride is a multiple of 16; the last
+ * thread also takes the remainder. With a team of one the helper is called
+ * directly on buckets. Otherwise every thread counts into its own bucket array
+ * (2 * k entries) located omp_thread_num * bucket_stride entries below
+ * buckets. After a barrier the last thread packs the gathered entries at the
+ * end of SA[0..n) while the remaining threads call
+ * libsais16x64_accumulate_counts_s32 on disjoint slices of the bucket array
+ * (presumably summing the per-thread copies - confirm against that helper).
+ *
+ * Returns the total number of gathered entries.
+ */
 static sa_sint_t libsais16x64_count_and_gather_lms_suffixes_32s_2k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+    sa_sint_t m = 0;
+
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        /* Block stride is rounded down to a multiple of 16; the last thread covers everything up to n. */
+        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+
+        if (omp_num_threads == 1)
+        {
+            m = libsais16x64_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            /* The free space handed to the stride helper is the distance from &SA[n] to buckets. */
+            fast_sint_t bucket_size       = 2 * (fast_sint_t)k;
+            fast_sint_t bucket_stride     = libsais16x64_get_bucket_stride(buckets - &SA[n], bucket_size, omp_num_threads);
+
+            {
+                /* Record where this thread's block ends and how many entries it gathered. */
+                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
+                thread_state[omp_thread_num].state.count = libsais16x64_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size);
+            }
+
+            #pragma omp barrier
+
+            if (omp_thread_num == omp_num_threads - 1)
+            {
+                /* Last thread: walk the threads from last to first, copying each earlier thread's entries so they end at SA[n - m + count). */
+                fast_sint_t t;
+                for (t = omp_num_threads - 1; t >= 0; --t)
+                {
+                    m += (sa_sint_t)thread_state[t].state.count;
+
+                    /* The last thread's own entries are not copied. */
+                    if (t != omp_num_threads - 1 && thread_state[t].state.count > 0)
+                    {
+                        memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.count], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
+                    }
+                }
+            }
+            else
+            {
+                /* Other threads: split the bucket index range among (team size - 1) workers and accumulate counts. */
+                omp_num_threads     = omp_num_threads - 1;
+                omp_block_stride    = (bucket_size / omp_num_threads) & (-16);
+                omp_block_start     = omp_thread_num * omp_block_stride;
+                omp_block_size      = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : bucket_size - omp_block_start;
+
+                libsais16x64_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads + 1);
+            }
+        }
+#endif
+    }
+
+    return m;
+}
+
+/*
+ * Multi-threaded driver around
+ * libsais16x64_count_and_gather_compacted_lms_suffixes_32s_2k.
+ *
+ * With a team of one the helper runs directly on SA and buckets. Otherwise
+ * each thread gathers its block into the region starting at SA + n and counts
+ * into its own bucket array located omp_thread_num * bucket_stride entries
+ * below buckets. After a barrier every thread copies its own gathered entries
+ * into SA[0..n) and then calls libsais16x64_accumulate_counts_s32 on its slice
+ * of the bucket array (presumably summing the per-thread copies - confirm
+ * against that helper). Unlike the non-compacted drivers, nothing is returned.
+ */
 static void libsais16x64_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        /* Block stride is rounded down to a multiple of 16; the last thread covers everything up to n. */
+        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+
+        if (omp_num_threads == 1)
+        {
+            libsais16x64_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            /* Here the free space is measured from &SA[2 * n], because SA[n..2n) is used as the gather area below. */
+            fast_sint_t bucket_size       = 2 * (fast_sint_t)k;
+            fast_sint_t bucket_stride     = libsais16x64_get_bucket_stride(buckets - &SA[(fast_sint_t)n + (fast_sint_t)n], bucket_size, omp_num_threads);
+
+            {
+                /* Gather into SA + n and remember the block end and the number of gathered entries. */
+                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
+                thread_state[omp_thread_num].state.count = libsais16x64_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA + n, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size);
+            }
+
+            #pragma omp barrier
+
+            {
+                /* m = entries gathered by this thread and all later threads; this thread's entries are copied to start at SA[n - m]. */
+                fast_sint_t t, m = 0; for (t = omp_num_threads - 1; t >= omp_thread_num; --t) { m += (sa_sint_t)thread_state[t].state.count; }
+
+                if (thread_state[omp_thread_num].state.count > 0)
+                {
+                    memcpy(&SA[n - m], &SA[n + thread_state[omp_thread_num].state.position - thread_state[omp_thread_num].state.count], (size_t)thread_state[omp_thread_num].state.count * sizeof(sa_sint_t));
+                }
+            }
+
+            {
+                /* All threads take a slice of the bucket index range and accumulate counts across the team. */
+                omp_block_stride    = (bucket_size / omp_num_threads) & (-16);
+                omp_block_start     = omp_thread_num * omp_block_stride;
+                omp_block_size      = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : bucket_size - omp_block_start;
+
+                libsais16x64_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads);
+            }
+        }
+#endif
+    }
+}
+
+#endif
+
+/*
+ * Counts and gathers over the whole text using at most two threads.
+ *
+ * With a team of one, the combined helper
+ * libsais16x64_count_and_gather_lms_suffixes_32s_4k is run on [0, n).
+ * With two threads, thread 0 fills buckets through
+ * libsais16x64_count_lms_suffixes_32s_4k while the other thread gathers into
+ * SA through libsais16x64_gather_lms_suffixes_32s.
+ *
+ * Returns the value produced by whichever call performed the gathering.
+ */
 static sa_sint_t libsais16x64_count_and_gather_lms_suffixes_32s_4k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads)
+{
+    sa_sint_t m = 0;
+
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads);
+
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        if (omp_num_threads == 1)
+        {
+            m = libsais16x64_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, 0, n);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else if (omp_thread_num == 0)
+        {
+            /* Thread 0 only counts; it does not touch m. */
+            libsais16x64_count_lms_suffixes_32s_4k(T, n, k, buckets);
+        }
+        else
+        {
+            /* The other thread only gathers and is the sole writer of m. */
+            m = libsais16x64_gather_lms_suffixes_32s(T, SA, n);
+        }
+#endif
+    }
+
+    return m;
+}
+
+/*
+ * Counts and gathers over the whole text using at most two threads.
+ *
+ * With a team of one, the combined helper
+ * libsais16x64_count_and_gather_lms_suffixes_32s_2k is run on [0, n).
+ * With two threads, thread 0 fills buckets through
+ * libsais16x64_count_lms_suffixes_32s_2k while the other thread gathers into
+ * SA through libsais16x64_gather_lms_suffixes_32s.
+ *
+ * Returns the value produced by whichever call performed the gathering.
+ */
 static sa_sint_t libsais16x64_count_and_gather_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads)
+{
+    sa_sint_t m = 0;
+
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads);
+
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        if (omp_num_threads == 1)
+        {
+            m = libsais16x64_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else if (omp_thread_num == 0)
+        {
+            /* Thread 0 only counts; it does not touch m. */
+            libsais16x64_count_lms_suffixes_32s_2k(T, n, k, buckets);
+        }
+        else
+        {
+            /* The other thread only gathers and is the sole writer of m. */
+            m = libsais16x64_gather_lms_suffixes_32s(T, SA, n);
+        }
+#endif
+    }
+
+    return m;
+}
+
+/*
+ * Counts and gathers over the whole (compacted) text using at most two threads.
+ *
+ * With a team of one, the combined helper
+ * libsais16x64_count_and_gather_compacted_lms_suffixes_32s_2k is run on [0, n).
+ * With two threads, thread 0 fills buckets through
+ * libsais16x64_count_compacted_lms_suffixes_32s_2k while the other thread
+ * gathers into SA through libsais16x64_gather_compacted_lms_suffixes_32s.
+ *
+ * Returns the value produced by whichever call performed the gathering.
+ */
 static sa_sint_t libsais16x64_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads)
+{
+    sa_sint_t m = 0;
+
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads);
+
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        if (omp_num_threads == 1)
+        {
+            m = libsais16x64_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else if (omp_thread_num == 0)
+        {
+            /* Thread 0 only counts; it does not touch m. */
+            libsais16x64_count_compacted_lms_suffixes_32s_2k(T, n, k, buckets);
+        }
+        else
+        {
+            /* The other thread only gathers and is the sole writer of m. */
+            m = libsais16x64_gather_compacted_lms_suffixes_32s(T, SA, n);
+        }
+#endif
+    }
+
+    return m;
+}
+
+/*
+ * Dispatches between the two 4k count-and-gather drivers.
+ *
+ * When OpenMP is enabled, the number of usable threads is limited by how many
+ * 16-aligned bucket arrays of 4 * k entries fit between &SA[n] and buckets,
+ * and by threads. If more than one such thread is possible, n >= 65536 and
+ * n / k >= 2, the *_fs_omp driver is used (with at least two threads);
+ * otherwise the *_nofs_omp driver is used.
+ *
+ * Returns the value produced by the selected driver.
+ */
 static sa_sint_t libsais16x64_count_and_gather_lms_suffixes_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+    sa_sint_t m;
+
+#if defined(LIBSAIS_OPENMP)
+    /* Free space (in entries) divided by the bucket size rounded up to a multiple of 16, capped at threads. */
+    sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n]) / ((4 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; }
+    if (max_threads > 1 && n >= 65536 && n / k >= 2)
+    {
+        /* Further cap by n / 16 / k; the call below still requests no fewer than two threads. */
+        if (max_threads > n / 16 / k) { max_threads = n / 16 / k; }
+        m = libsais16x64_count_and_gather_lms_suffixes_32s_4k_fs_omp(T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state);
+    }
+    else
+#else
+    UNUSED(thread_state);
+#endif
+    {
+        m = libsais16x64_count_and_gather_lms_suffixes_32s_4k_nofs_omp(T, SA, n, k, buckets, threads);
+    }
+
+    return m;
+}
+
+/*
+ * Dispatches between the two 2k count-and-gather drivers.
+ *
+ * When OpenMP is enabled, the number of usable threads is limited by how many
+ * 16-aligned bucket arrays of 2 * k entries fit between &SA[n] and buckets,
+ * and by threads. If more than one such thread is possible, n >= 65536 and
+ * n / k >= 2, the *_fs_omp driver is used (with at least two threads);
+ * otherwise the *_nofs_omp driver is used.
+ *
+ * Returns the value produced by the selected driver.
+ */
 static sa_sint_t libsais16x64_count_and_gather_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+    sa_sint_t m;
+
+#if defined(LIBSAIS_OPENMP)
+    /* Free space (in entries) divided by the bucket size rounded up to a multiple of 16, capped at threads. */
+    sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n]) / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; }
+    if (max_threads > 1 && n >= 65536 && n / k >= 2)
+    {
+        /* Further cap by n / 8 / k; the call below still requests no fewer than two threads. */
+        if (max_threads > n / 8 / k) { max_threads = n / 8 / k; }
+        m = libsais16x64_count_and_gather_lms_suffixes_32s_2k_fs_omp(T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state);
+    }
+    else
+#else
+    UNUSED(thread_state);
+#endif
+    {
+        m = libsais16x64_count_and_gather_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads);
+    }
+
+    return m;
+}
+
+/*
+ * Dispatches between the two compacted 2k count-and-gather drivers.
+ *
+ * When OpenMP is enabled, the number of usable threads is limited by how many
+ * 16-aligned bucket arrays of 2 * k entries fit between &SA[2 * n] and
+ * buckets, and by threads. If more than one such thread is possible,
+ * n >= 65536 and n / k >= 2, the *_fs_omp driver is used (with at least two
+ * threads); otherwise the *_nofs_omp driver is used and its return value is
+ * discarded.
+ */
 static void libsais16x64_count_and_gather_compacted_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(LIBSAIS_OPENMP)
+    /* Free space is measured from &SA[2 * n] here, then divided by the 16-aligned bucket size and capped at threads. */
+    sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[(fast_sint_t)n + (fast_sint_t)n]) / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; }
+    if (max_threads > 1 && n >= 65536 && n / k >= 2)
+    {
+        /* Further cap by n / 8 / k; the call below still requests no fewer than two threads. */
+        if (max_threads > n / 8 / k) { max_threads = n / 8 / k; }
+        libsais16x64_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state);
+    }
+    else
+#else
+    UNUSED(thread_state);
+#endif
+    {
+        libsais16x64_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads);
+    }
+}
+
+/*
+ * Builds a histogram of the symbols of T.
+ *
+ * The first k entries of buckets are cleared, then buckets[T[i]] is
+ * incremented for every i in [0, n). Symbols are processed eight at a time
+ * with a read prefetch issued ahead of the current position; the leftover
+ * symbols are handled one by one.
+ */
+static void libsais16x64_count_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+    const fast_sint_t prefetch_distance = 32;
+    const fast_sint_t length            = (fast_sint_t)n;
+    const fast_sint_t unrolled_limit    = length - 7;
+
+    fast_sint_t pos = 0;
+
+    memset(buckets, 0, (size_t)k * sizeof(sa_sint_t));
+
+    /* Groups of eight symbols. */
+    while (pos < unrolled_limit)
+    {
+        libsais16x64_prefetchr(&T[pos + prefetch_distance]);
+
+        buckets[T[pos + 0]]++;
+        buckets[T[pos + 1]]++;
+        buckets[T[pos + 2]]++;
+        buckets[T[pos + 3]]++;
+        buckets[T[pos + 4]]++;
+        buckets[T[pos + 5]]++;
+        buckets[T[pos + 6]]++;
+        buckets[T[pos + 7]]++;
+
+        pos += 8;
+    }
+
+    /* Tail: fewer than eight symbols remain. */
+    while (pos < length)
+    {
+        buckets[T[pos]]++;
+        pos += 1;
+    }
+}
+
+/*
+ * Turns the four per-symbol counters at the front of buckets into bucket
+ * boundaries.
+ *
+ * For each of the ALPHABET_SIZE symbols, the running total before the symbol
+ * is stored at buckets[6 * ALPHABET_SIZE + symbol] and the running total after
+ * it at buckets[7 * ALPHABET_SIZE + symbol]. When freq is not NULL, the
+ * per-symbol total is also written to freq[symbol].
+ */
+static void libsais16x64_initialize_buckets_start_and_end_16u(sa_sint_t * RESTRICT buckets, sa_sint_t * RESTRICT freq)
+{
+    sa_sint_t * RESTRICT bucket_start = &buckets[6 * ALPHABET_SIZE];
+    sa_sint_t * RESTRICT bucket_end   = &buckets[7 * ALPHABET_SIZE];
+
+    fast_sint_t slot    = BUCKETS_INDEX4(0, 0);
+    fast_sint_t symbol  = 0;
+    sa_sint_t   running = 0;
+
+    if (freq == NULL)
+    {
+        /* No frequency table requested: only the boundaries are produced. */
+        while (slot <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0))
+        {
+            bucket_start[symbol] = running;
+            running += buckets[slot + BUCKETS_INDEX4(0, 0)] + buckets[slot + BUCKETS_INDEX4(0, 1)] + buckets[slot + BUCKETS_INDEX4(0, 2)] + buckets[slot + BUCKETS_INDEX4(0, 3)];
+            bucket_end[symbol] = running;
+
+            slot   += BUCKETS_INDEX4(1, 0);
+            symbol += 1;
+        }
+    }
+    else
+    {
+        /* Same walk, additionally recording each symbol's total in freq. */
+        while (slot <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0))
+        {
+            bucket_start[symbol] = running;
+
+            const sa_sint_t symbol_total = buckets[slot + BUCKETS_INDEX4(0, 0)] + buckets[slot + BUCKETS_INDEX4(0, 1)] + buckets[slot + BUCKETS_INDEX4(0, 2)] + buckets[slot + BUCKETS_INDEX4(0, 3)];
+
+            freq[symbol] = symbol_total;
+            running     += symbol_total;
+
+            bucket_end[symbol] = running;
+
+            slot   += BUCKETS_INDEX4(1, 0);
+            symbol += 1;
+        }
+    }
+}
+
+/*
+ * Derives bucket boundaries from the four per-symbol counters stored at the
+ * front of buckets.
+ *
+ * For each of the k symbols, the running total before the symbol is written
+ * to buckets[4 * k + symbol] and the running total after it to
+ * buckets[5 * k + symbol].
+ */
+static void libsais16x64_initialize_buckets_start_and_end_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+    sa_sint_t * RESTRICT bucket_start = &buckets[4 * (fast_sint_t)k];
+    sa_sint_t * RESTRICT bucket_end   = &buckets[5 * (fast_sint_t)k];
+
+    fast_sint_t slot    = BUCKETS_INDEX4(0, 0);
+    fast_sint_t symbol  = 0;
+    sa_sint_t   running = 0;
+
+    while (slot <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0))
+    {
+        bucket_start[symbol] = running;
+        running += buckets[slot + BUCKETS_INDEX4(0, 0)] + buckets[slot + BUCKETS_INDEX4(0, 1)] + buckets[slot + BUCKETS_INDEX4(0, 2)] + buckets[slot + BUCKETS_INDEX4(0, 3)];
+        bucket_end[symbol] = running;
+
+        slot   += BUCKETS_INDEX4(1, 0);
+        symbol += 1;
+    }
+}
+
+/*
+ * Derives bucket boundaries from the two per-symbol counters stored at the
+ * front of buckets.
+ *
+ * For each of the k symbols, the running total before the symbol is written
+ * to buckets[2 * k + symbol] and the running total after it to
+ * buckets[3 * k + symbol].
+ */
+static void libsais16x64_initialize_buckets_start_and_end_32s_4k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+    sa_sint_t * RESTRICT bucket_start = &buckets[2 * (fast_sint_t)k];
+    sa_sint_t * RESTRICT bucket_end   = &buckets[3 * (fast_sint_t)k];
+
+    fast_sint_t slot    = BUCKETS_INDEX2(0, 0);
+    fast_sint_t symbol  = 0;
+    sa_sint_t   running = 0;
+
+    while (slot <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0))
+    {
+        bucket_start[symbol] = running;
+        running += buckets[slot + BUCKETS_INDEX2(0, 0)] + buckets[slot + BUCKETS_INDEX2(0, 1)];
+        bucket_end[symbol] = running;
+
+        slot   += BUCKETS_INDEX2(1, 0);
+        symbol += 1;
+    }
+}
+
+/*
+ * Replaces the first counter of every symbol with an inclusive running total.
+ *
+ * For each of the k symbols, both of its counters are added to the running
+ * total, which is then stored back into the symbol's first counter. The
+ * second counter is left untouched.
+ */
+static void libsais16x64_initialize_buckets_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+    fast_sint_t slot    = BUCKETS_INDEX2(0, 0);
+    sa_sint_t   running = 0;
+
+    while (slot <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0))
+    {
+        running += buckets[slot + BUCKETS_INDEX2(0, 0)] + buckets[slot + BUCKETS_INDEX2(0, 1)];
+        buckets[slot + BUCKETS_INDEX2(0, 0)] = running;
+
+        slot += BUCKETS_INDEX2(1, 0);
+    }
+}
+
+/*
+ * Repacks the first counter of every symbol into two contiguous tables.
+ *
+ * The first counter of each of the k symbols is copied to buckets[symbol].
+ * Then buckets[k] is set to 0 and the first k - 1 packed values are copied to
+ * buckets[k + 1 ..], so the second table is the first one shifted by one
+ * entry.
+ */
+static void libsais16x64_initialize_buckets_start_and_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+    fast_sint_t src = BUCKETS_INDEX2(0, 0);
+    fast_sint_t dst = 0;
+
+    /* Pack the first counter of every symbol into the front of the array. */
+    while (src <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0))
+    {
+        buckets[dst] = buckets[src];
+
+        src += BUCKETS_INDEX2(1, 0);
+        dst += 1;
+    }
+
+    /* Second table: a zero followed by the first k - 1 packed values. */
+    buckets[k] = 0;
+    memcpy(&buckets[k + 1], buckets, ((size_t)k - 1) * sizeof(sa_sint_t));
+}
+
+/*
+ * Converts the k counters in buckets into an exclusive prefix sum in place:
+ * each entry becomes the sum of all entries that preceded it.
+ */
+static void libsais16x64_initialize_buckets_start_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+    const fast_sint_t symbols = (fast_sint_t)k;
+
+    sa_sint_t   running = 0;
+    fast_sint_t c;
+
+    for (c = 0; c < symbols; ++c)
+    {
+        const sa_sint_t count = buckets[c];
+
+        buckets[c] = running;
+        running   += count;
+    }
+}
+
+/*
+ * Converts the k counters in buckets into an inclusive prefix sum in place:
+ * each entry becomes the sum of itself and all entries that preceded it.
+ */
+static void libsais16x64_initialize_buckets_end_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{
+    const fast_sint_t symbols = (fast_sint_t)k;
+
+    sa_sint_t   running = 0;
+    fast_sint_t c;
+
+    for (c = 0; c < symbols; ++c)
+    {
+        running   += buckets[c];
+        buckets[c] = running;
+    }
+}
+
+/*
+ * Prepares the bucket tables used by the 16-bit radix sort.
+ *
+ * Step 1 rescans T from first_lms_suffix down to position 0 and decrements
+ * the four-way counter selected by each symbol and the low two bits of the
+ * comparison history s, i.e. it removes the contribution of that prefix from
+ * the counters.
+ *
+ * Step 2 walks all ALPHABET_SIZE symbols and fills the two-way table at
+ * buckets[4 * ALPHABET_SIZE] with running totals of counters 1 and 3 of each
+ * symbol: the total before the symbol goes to the second slot, the total
+ * after it to the first slot.
+ *
+ * Returns the final running total.
+ */
 static sa_sint_t libsais16x64_initialize_buckets_for_lms_suffixes_radix_sort_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
+{
+    {
+        fast_uint_t     s = 0;
+        fast_sint_t     c0 = T[first_lms_suffix];
+        fast_sint_t     c1 = 0;
+
+        /* Walk left from first_lms_suffix - 1 to 0, shifting one comparison bit into s per symbol. */
+        for (; --first_lms_suffix >= 0; )
+        {
+            c1 = c0; c0 = T[first_lms_suffix]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+            buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]--;
+        }
+
+        /* The symbol at position 0 has no left neighbour: shift in a zero bit for it. */
+        buckets[BUCKETS_INDEX4((fast_uint_t)c0, (s << 1) & 3)]--;
+    }
+
+    {
+        sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];
+
+        fast_sint_t i, j; sa_sint_t sum = 0;
+        for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0))
+        { 
+            /* Second slot receives the total before this symbol, first slot the total after it. */
+            temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum; sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum;
+        }
+
+        return sum;
+    }
+}
+
+/*
+ * Prepares the two-way bucket table for the 2k radix sort.
+ *
+ * First the entry for the symbol at first_lms_suffix is moved from the second
+ * counter to the first (first counter +1, second counter -1). Then, for each
+ * of the k symbols, the first counter is replaced by the running total of
+ * both counters and the second counter by the running total of second
+ * counters only.
+ */
+static void libsais16x64_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
+{
+    fast_sint_t slot          = BUCKETS_INDEX2(0, 0);
+    sa_sint_t   total_both    = 0;
+    sa_sint_t   total_second  = 0;
+
+    buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++;
+    buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--;
+
+    while (slot <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0))
+    {
+        const sa_sint_t first_count  = buckets[slot + BUCKETS_INDEX2(0, 0)];
+        const sa_sint_t second_count = buckets[slot + BUCKETS_INDEX2(0, 1)];
+
+        total_both   += first_count + second_count;
+        total_second += second_count;
+
+        buckets[slot + BUCKETS_INDEX2(0, 0)] = total_both;
+        buckets[slot + BUCKETS_INDEX2(0, 1)] = total_second;
+
+        slot += BUCKETS_INDEX2(1, 0);
+    }
+}
+
+/*
+ * Prepares the bucket tables used by the 32-bit (6k) radix sort.
+ *
+ * Step 1 rescans T from first_lms_suffix down to position 0 and decrements
+ * the four-way counter selected by each symbol and the low two bits of the
+ * comparison history s, i.e. it removes the contribution of that prefix from
+ * the counters.
+ *
+ * Step 2 walks all k symbols and stores, at buckets[4 * k + symbol], the
+ * inclusive running total of counters 1 and 3 of each symbol.
+ *
+ * Returns the final running total.
+ */
 static sa_sint_t libsais16x64_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
+{
+    {
+        fast_uint_t     s = 0;
+        fast_sint_t     c0 = T[first_lms_suffix];
+        fast_sint_t     c1 = 0;
+
+        /* Walk left from first_lms_suffix - 1 to 0, shifting one comparison bit into s per symbol. */
+        for (; --first_lms_suffix >= 0; )
+        {
+            c1 = c0; c0 = T[first_lms_suffix]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+            buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]--;
+        }
+
+        /* The symbol at position 0 has no left neighbour: shift in a zero bit for it. */
+        buckets[BUCKETS_INDEX4((fast_uint_t)c0, (s << 1) & 3)]--;
+    }
+
+    {
+        sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k];
+
+        fast_sint_t i, j; sa_sint_t sum = 0;
+        for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1)
+        { 
+            /* Inclusive running total of counters 1 and 3. */
+            sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum;
+        }
+
+        return sum;
+    }
+}
+
+/*
+ * Prepares the 4k bucket layout for radix sorting and partial sorting.
+ *
+ * First the entry for the symbol at first_lms_suffix is moved from the second
+ * counter to the first (first counter +1, second counter -1). Then, for each
+ * of the k symbols:
+ *   - buckets[2 * k + symbol] receives the running total of both counters
+ *     before the symbol,
+ *   - the symbol's second counter is replaced by the inclusive running total
+ *     of second counters,
+ *   - buckets[3 * k + symbol] receives the running total of both counters
+ *     after the symbol.
+ * The first counter of each symbol is left unchanged by the loop.
+ */
+static void libsais16x64_initialize_buckets_for_radix_and_partial_sorting_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
+{
+    sa_sint_t * RESTRICT bucket_start = &buckets[2 * (fast_sint_t)k];
+    sa_sint_t * RESTRICT bucket_end   = &buckets[3 * (fast_sint_t)k];
+
+    fast_sint_t slot         = BUCKETS_INDEX2(0, 0);
+    fast_sint_t symbol       = 0;
+    sa_sint_t   total_second = 0;
+    sa_sint_t   total_both   = 0;
+
+    buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++;
+    buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--;
+
+    while (slot <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0))
+    {
+        bucket_start[symbol] = total_both;
+
+        /* Read both counters before the second one is overwritten below. */
+        const sa_sint_t first_count  = buckets[slot + BUCKETS_INDEX2(0, 0)];
+        const sa_sint_t second_count = buckets[slot + BUCKETS_INDEX2(0, 1)];
+
+        total_second += second_count;
+        total_both   += first_count + second_count;
+
+        buckets[slot + BUCKETS_INDEX2(0, 1)] = total_second;
+
+        bucket_end[symbol] = total_both;
+
+        slot   += BUCKETS_INDEX2(1, 0);
+        symbol += 1;
+    }
+}
+
+/*
+ * Distributes the entries of SA[omp_block_start .. omp_block_start +
+ * omp_block_size) into buckets keyed by the symbol each entry points at.
+ *
+ * The block is walked from its last entry to its first. For every entry p the
+ * bucket cursor induction_bucket[BUCKETS_INDEX2(T[p], 0)] is pre-decremented
+ * and p is stored in SA at the resulting position. Reads and writes both go
+ * through SA, so the statement order inside the loops matters.
+ */
 static void libsais16x64_radix_sort_lms_suffixes_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    /* Main loop, four entries per iteration, with prefetches issued ahead of the current position. */
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
+    {
+        libsais16x64_prefetchr(&SA[i - 2 * prefetch_distance]);
+
+        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 0]]);
+        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 1]]);
+        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 2]]);
+        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 3]]);
+
+        sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0;
+        sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1;
+        sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2;
+        sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3;
+    }
+
+    /* Remaining entries down to omp_block_start, one at a time and without prefetching. */
+    for (j -= prefetch_distance + 3; i >= j; i -= 1)
+    {
+        sa_sint_t p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p;
+    }
+}
+
+/*
+ * Runs libsais16x64_radix_sort_lms_suffixes_16u on one or several threads.
+ *
+ * With a team of one, the sort covers m - 1 entries starting at SA[n - m + 1]
+ * and uses the table at buckets[4 * ALPHABET_SIZE].
+ *
+ * With several threads (only when dynamic thread adjustment is off, and both
+ * n and m are at least 65536), each thread first derives its private bucket
+ * cursors from the shared table and the values already stored in its
+ * thread_state buckets, then sorts its own slice of SA. The slice sizes come
+ * from thread_state[].state.m, which must have been filled in by earlier code
+ * outside this function.
+ */
 static void libsais16x64_radix_sort_lms_suffixes_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && m >= 65536 && omp_get_dynamic() == 0)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        if (omp_num_threads == 1)
+        {
+            libsais16x64_radix_sort_lms_suffixes_16u(T, SA, &buckets[4 * ALPHABET_SIZE], (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            {
+                sa_sint_t * RESTRICT src_bucket = &buckets[4 * ALPHABET_SIZE];
+                sa_sint_t * RESTRICT dst_bucket = thread_state[omp_thread_num].state.buckets;
+
+                /* Per symbol: private cursor = shared value minus the value held in this thread's four-way slot 1. */
+                fast_sint_t i, j;
+                for (i = BUCKETS_INDEX2(0, 0), j = BUCKETS_INDEX4(0, 1); i <= BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX2(1, 0), j += BUCKETS_INDEX4(1, 0))
+                {
+                    dst_bucket[i] = src_bucket[i] - dst_bucket[j];
+                }
+            }
+
+            {
+                /* omp_block_start becomes the sum of state.m over this thread and all later threads. */
+                fast_sint_t t, omp_block_start = 0, omp_block_size = thread_state[omp_thread_num].state.m;
+                for (t = omp_num_threads - 1; t >= omp_thread_num; --t) omp_block_start += thread_state[t].state.m;
+
+                /* When the slice would reach back to SA[n - m], drop one entry so it starts at SA[n - m + 1], matching the single-thread range. */
+                if (omp_block_start == (fast_sint_t)m && omp_block_size > 0)
+                {
+                    omp_block_start -= 1; omp_block_size -= 1;
+                }
+
+                libsais16x64_radix_sort_lms_suffixes_16u(T, SA, thread_state[omp_thread_num].state.buckets, (fast_sint_t)n - omp_block_start, omp_block_size);
+            }
+        }
+#endif
+    }
+}
+
+/*
+ * Distributes the entries of SA[omp_block_start .. omp_block_start +
+ * omp_block_size) into buckets keyed by the symbol each entry points at.
+ *
+ * The block is walked from its last entry to its first. For every entry p the
+ * bucket cursor induction_bucket[T[p]] is pre-decremented and p is stored in
+ * SA at the resulting position. Reads and writes both go through SA, so the
+ * statement order inside the loops matters.
+ */
 static void libsais16x64_radix_sort_lms_suffixes_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    /* Main loop, four entries per iteration, with three prefetch stages: SA, then T[SA[..]], then the bucket cursor. */
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4)
+    {
+        libsais16x64_prefetchr(&SA[i - 3 * prefetch_distance]);
+        
+        libsais16x64_prefetchr(&T[SA[i - 2 * prefetch_distance - 0]]);
+        libsais16x64_prefetchr(&T[SA[i - 2 * prefetch_distance - 1]]);
+        libsais16x64_prefetchr(&T[SA[i - 2 * prefetch_distance - 2]]);
+        libsais16x64_prefetchr(&T[SA[i - 2 * prefetch_distance - 3]]);
+
+        libsais16x64_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 0]]]);
+        libsais16x64_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 1]]]);
+        libsais16x64_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 2]]]);
+        libsais16x64_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 3]]]);
+
+        sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[T[p0]]] = p0;
+        sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[T[p1]]] = p1;
+        sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[T[p2]]] = p2;
+        sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[T[p3]]] = p3;
+    }
+
+    /* Remaining entries down to omp_block_start, one at a time and without prefetching. */
+    for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1)
+    {
+        sa_sint_t p = SA[i]; SA[--induction_bucket[T[p]]] = p;
+    }
+}
+
+/*
+ * Distributes the entries of SA[omp_block_start .. omp_block_start +
+ * omp_block_size) into buckets keyed by the symbol each entry points at.
+ *
+ * The block is walked from its last entry to its first. For every entry p the
+ * bucket cursor induction_bucket[BUCKETS_INDEX2(T[p], 0)] is pre-decremented
+ * and p is stored in SA at the resulting position. Reads and writes both go
+ * through SA, so the statement order inside the loops matters.
+ */
 static void libsais16x64_radix_sort_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    /* Main loop, four entries per iteration, with three prefetch stages: SA, then T[SA[..]], then the bucket cursor. */
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4)
+    {
+        libsais16x64_prefetchr(&SA[i - 3 * prefetch_distance]);
+        
+        libsais16x64_prefetchr(&T[SA[i - 2 * prefetch_distance - 0]]);
+        libsais16x64_prefetchr(&T[SA[i - 2 * prefetch_distance - 1]]);
+        libsais16x64_prefetchr(&T[SA[i - 2 * prefetch_distance - 2]]);
+        libsais16x64_prefetchr(&T[SA[i - 2 * prefetch_distance - 3]]);
+
+        libsais16x64_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 0]], 0)]);
+        libsais16x64_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 1]], 0)]);
+        libsais16x64_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 2]], 0)]);
+        libsais16x64_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 3]], 0)]);
+
+        sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0;
+        sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1;
+        sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2;
+        sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3;
+    }
+
+    /* Remaining entries down to omp_block_start, one at a time and without prefetching. */
+    for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1)
+    {
+        sa_sint_t p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p;
+    }
+}
+
+#if defined(LIBSAIS_OPENMP)
+
+static void libsais16x64_radix_sort_lms_suffixes_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Forward pass over i in [omp_block_start, omp_block_start + omp_block_size): fills cache[i].index = SA[i] and cache[i].symbol = T[SA[i]]; SA and T are only read. */
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) /* 4x unrolled; the bound j keeps SA[i + prefetch_distance + 3] inside the block */
+    {
+        libsais16x64_prefetchr(&SA[i + 2 * prefetch_distance]); /* only the address is formed here; it can lie past the end of the block */
+
+        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 0]]);
+        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 1]]);
+        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 2]]);
+        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 3]]);
+
+        libsais16x64_prefetchw(&cache[i + prefetch_distance]);
+
+        cache[i + 0].symbol = T[cache[i + 0].index = SA[i + 0]]; /* the inner assignment stores the SA entry, the outer one stores the symbol at that text position */
+        cache[i + 1].symbol = T[cache[i + 1].index = SA[i + 1]];
+        cache[i + 2].symbol = T[cache[i + 2].index = SA[i + 2]];
+        cache[i + 3].symbol = T[cache[i + 3].index = SA[i + 3]];
+    }
+
+    for (j += prefetch_distance + 3; i < j; i += 1) /* j is reset to the end of the block; remaining entries one at a time */
+    {
+        cache[i].symbol = T[cache[i].index = SA[i]];
+    }
+}
+
+static void libsais16x64_radix_sort_lms_suffixes_32s_6k_block_sort(sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Backward pass over cache[omp_block_start .. omp_block_start + omp_block_size - 1]: replaces each .symbol with the pre-decremented induction_bucket[.symbol]; SA is not touched. */
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) /* 4x unrolled; the bound j keeps cache[i - prefetch_distance - 3] at or above omp_block_start */
+    {
+        libsais16x64_prefetchw(&cache[i - 2 * prefetch_distance]); /* only the address is formed here; it can lie below omp_block_start */
+
+        libsais16x64_prefetchw(&induction_bucket[cache[i - prefetch_distance - 0].symbol]);
+        libsais16x64_prefetchw(&induction_bucket[cache[i - prefetch_distance - 1].symbol]);
+        libsais16x64_prefetchw(&induction_bucket[cache[i - prefetch_distance - 2].symbol]);
+        libsais16x64_prefetchw(&induction_bucket[cache[i - prefetch_distance - 3].symbol]);
+
+        cache[i - 0].symbol = --induction_bucket[cache[i - 0].symbol]; /* .symbol is read as the bucket index and then overwritten with the new counter value */
+        cache[i - 1].symbol = --induction_bucket[cache[i - 1].symbol];
+        cache[i - 2].symbol = --induction_bucket[cache[i - 2].symbol];
+        cache[i - 3].symbol = --induction_bucket[cache[i - 3].symbol];
+    }
+
+    for (j -= prefetch_distance + 3; i >= j; i -= 1) /* j is reset to omp_block_start; remaining entries one at a time */
+    {
+        cache[i].symbol = --induction_bucket[cache[i].symbol];
+    }
+}
+
+static void libsais16x64_radix_sort_lms_suffixes_32s_2k_block_sort(sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Same backward pass as the 6k block sort, except the bucket counter is addressed as induction_bucket[BUCKETS_INDEX2(.symbol, 0)]; SA is not touched. */
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) /* 4x unrolled; the bound j keeps cache[i - prefetch_distance - 3] at or above omp_block_start */
+    {
+        libsais16x64_prefetchw(&cache[i - 2 * prefetch_distance]); /* only the address is formed here; it can lie below omp_block_start */
+
+        libsais16x64_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 0].symbol, 0)]);
+        libsais16x64_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 1].symbol, 0)]);
+        libsais16x64_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 2].symbol, 0)]);
+        libsais16x64_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 3].symbol, 0)]);
+
+        cache[i - 0].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 0].symbol, 0)]; /* .symbol is read as the bucket key and then overwritten with the new counter value */
+        cache[i - 1].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 1].symbol, 0)];
+        cache[i - 2].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 2].symbol, 0)];
+        cache[i - 3].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 3].symbol, 0)];
+    }
+
+    for (j -= prefetch_distance + 3; i >= j; i -= 1) /* j is reset to omp_block_start; remaining entries one at a time */
+    {
+        cache[i].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i].symbol, 0)];
+    }
+}
+
+static void libsais16x64_radix_sort_lms_suffixes_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{ /* Processes SA[block_start .. block_start + block_size - 1] in three phases: per-thread gather into cache, bucket assignment on the master thread, per-thread placement. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+    { /* the if clause above requests a parallel team only when threads > 1 and block_size >= 16384 */
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else /* not compiled in practice: this function already sits inside an outer LIBSAIS_OPENMP guard */
+        UNUSED(threads); UNUSED(cache);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16); /* per-thread share, rounded down to a multiple of 16 */
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; /* the last thread also takes the remainder */
+
+        omp_block_start += block_start; /* from block-relative offset to SA index */
+
+        if (omp_num_threads == 1)
+        {
+            libsais16x64_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, omp_block_start, omp_block_size); /* single thread: sort the block directly, cache is unused */
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            {
+                libsais16x64_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); /* cache - block_start: SA index block_start maps to the first cache element */
+            }
+
+            #pragma omp barrier
+
+            #pragma omp master
+            {
+                libsais16x64_radix_sort_lms_suffixes_32s_6k_block_sort(induction_bucket, cache - block_start, block_start, block_size); /* one thread walks the whole block, so the shared bucket counters are updated sequentially */
+            }
+
+            #pragma omp barrier
+
+            {
+                libsais16x64_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); /* defined outside this chunk; presumably stores cache[i].index at SA[cache[i].symbol] - verify */
+            }
+        }
+#endif
+    }
+}
+
+static void libsais16x64_radix_sort_lms_suffixes_32s_2k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{ /* 2k-bucket-layout counterpart of the 6k block driver: per-thread gather into cache, bucket assignment on the master thread, per-thread placement. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+    { /* the if clause above requests a parallel team only when threads > 1 and block_size >= 16384 */
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else /* not compiled in practice: this function already sits inside an outer LIBSAIS_OPENMP guard */
+        UNUSED(threads); UNUSED(cache);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16); /* per-thread share, rounded down to a multiple of 16 */
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; /* the last thread also takes the remainder */
+
+        omp_block_start += block_start; /* from block-relative offset to SA index */
+
+        if (omp_num_threads == 1)
+        {
+            libsais16x64_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, omp_block_start, omp_block_size); /* single thread: sort the block directly, cache is unused */
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            {
+                libsais16x64_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); /* cache - block_start: SA index block_start maps to the first cache element */
+            }
+
+            #pragma omp barrier
+
+            #pragma omp master
+            {
+                libsais16x64_radix_sort_lms_suffixes_32s_2k_block_sort(induction_bucket, cache - block_start, block_start, block_size); /* one thread walks the whole block, so the shared bucket counters are updated sequentially */
+            }
+
+            #pragma omp barrier
+
+            {
+                libsais16x64_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); /* defined outside this chunk; presumably stores cache[i].index at SA[cache[i].symbol] - verify */
+            }
+        }
+#endif
+    }
+}
+
+#endif
+
+static void libsais16x64_radix_sort_lms_suffixes_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Entry point for the 6k radix pass over the m - 1 entries SA[n - m + 1 .. n - 1]; picks the sequential routine or the blocked OpenMP driver. */
+    if (threads == 1 || m < 65536)
+    {
+        libsais16x64_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1); /* start index n - m + 1, length m - 1 */
+    }
+#if defined(LIBSAIS_OPENMP)
+    else
+    {
+        fast_sint_t block_start, block_end; /* both are distances measured back from index n, not SA indices */
+        for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end)
+        {
+            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end >= m) { block_end = (fast_sint_t)m - 1; } /* clamp so that m - 1 entries are covered in total */
+
+            libsais16x64_radix_sort_lms_suffixes_32s_6k_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end, block_end - block_start, threads); /* SA range [n - block_end, n - block_start): blocks advance from the high end of SA downwards */
+        }
+    }
+#else
+    UNUSED(thread_state);
+#endif
+}
+
+static void libsais16x64_radix_sort_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Entry point for the 2k radix pass over the m - 1 entries SA[n - m + 1 .. n - 1]; picks the sequential routine or the blocked OpenMP driver. */
+    if (threads == 1 || m < 65536)
+    {
+        libsais16x64_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1); /* start index n - m + 1, length m - 1 */
+    }
+#if defined(LIBSAIS_OPENMP)
+    else
+    {
+        fast_sint_t block_start, block_end; /* both are distances measured back from index n, not SA indices */
+        for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end)
+        {
+            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end >= m) { block_end = (fast_sint_t)m - 1; } /* clamp so that m - 1 entries are covered in total */
+
+            libsais16x64_radix_sort_lms_suffixes_32s_2k_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end, block_end - block_start, threads); /* SA range [n - block_end, n - block_start): blocks advance from the high end of SA downwards */
+        }
+    }
+#else
+    UNUSED(thread_state);
+#endif
+}
+
+static sa_sint_t libsais16x64_radix_sort_lms_suffixes_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets)
+{ /* Scans T from position n - 2 down to 0, classifies positions on the fly and stores each detected LMS position at SA[--buckets[symbol]]; returns how many were stored. */
+    const fast_sint_t prefetch_distance = 32;
+
+    sa_sint_t             i = n - 2;
+    sa_sint_t             m = 0; /* number of positions stored so far; this is the return value */
+    fast_uint_t           s = 1; /* shift register of per-position type bits, newest in bit 0; seeded with 1 for position n - 1 */
+    fast_sint_t           c0 = T[n - 1];
+    fast_sint_t           c1 = 0;
+    fast_sint_t           c2 = 0; /* symbol whose bucket received the most recent store */
+
+    for (; i >= prefetch_distance + 3; i -= 4) /* 4x unrolled; c0 and c1 alternate as "current" and "right neighbour" symbol */
+    {
+        libsais16x64_prefetchr(&T[i - 2 * prefetch_distance]); /* only the address is formed here; it can lie below T[0] */
+
+        libsais16x64_prefetchw(&buckets[T[i - prefetch_distance - 0]]);
+        libsais16x64_prefetchw(&buckets[T[i - prefetch_distance - 1]]);
+        libsais16x64_prefetchw(&buckets[T[i - prefetch_distance - 2]]);
+        libsais16x64_prefetchw(&buckets[T[i - prefetch_distance - 3]]);
+
+        c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); /* new bit is 1 when T[i] > T[i + 1], or when they are equal and the previous bit was 1 */
+        if ((s & 3) == 1) { SA[--buckets[c2 = c0]] = i + 1; m++; } /* bit for i is 1 and bit for i + 1 is 0: store position i + 1 in the bucket of its symbol */
+        
+        c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); 
+        if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i - 0; m++; }
+
+        c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); 
+        if ((s & 3) == 1) { SA[--buckets[c2 = c0]] = i - 1; m++; }
+
+        c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); 
+        if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i - 2; m++; }
+    }
+
+    for (; i >= 0; i -= 1) /* remaining positions one at a time, same update rule */
+    {
+        c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); 
+        if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i + 1; m++; }
+    }
+
+    if (m > 1)
+    {
+        SA[buckets[c2]] = 0; /* buckets[c2] is the slot of the last store, so this overwrites the leftmost stored position with 0; NOTE(review): the reason is not visible in this chunk */
+    }
+
+    return m;
+}
+
+static void libsais16x64_radix_sort_set_markers_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* For every i in [omp_block_start, omp_block_start + omp_block_size), ORs SAINT_MIN into SA[induction_bucket[i]]; induction_bucket is only read. */
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) /* 4x unrolled; the bound j keeps induction_bucket[i + prefetch_distance + 3] inside the block */
+    {
+        libsais16x64_prefetchr(&induction_bucket[i + 2 * prefetch_distance]); /* only the address is formed here; it can lie past the end of the block */
+
+        libsais16x64_prefetchw(&SA[induction_bucket[i + prefetch_distance + 0]]);
+        libsais16x64_prefetchw(&SA[induction_bucket[i + prefetch_distance + 1]]);
+        libsais16x64_prefetchw(&SA[induction_bucket[i + prefetch_distance + 2]]);
+        libsais16x64_prefetchw(&SA[induction_bucket[i + prefetch_distance + 3]]);
+
+        SA[induction_bucket[i + 0]] |= SAINT_MIN; /* the bucket value is used as an index into SA */
+        SA[induction_bucket[i + 1]] |= SAINT_MIN;
+        SA[induction_bucket[i + 2]] |= SAINT_MIN;
+        SA[induction_bucket[i + 3]] |= SAINT_MIN;
+    }
+
+    for (j += prefetch_distance + 3; i < j; i += 1) /* j is reset to the end of the block; remaining entries one at a time */
+    {
+        SA[induction_bucket[i]] |= SAINT_MIN;
+    }
+}
+
+static void libsais16x64_radix_sort_set_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* For every i in [omp_block_start, omp_block_start + omp_block_size), ORs SUFFIX_GROUP_MARKER into SA[induction_bucket[BUCKETS_INDEX2(i, 0)]]; induction_bucket is only read. */
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) /* 4x unrolled; the bound j keeps key i + prefetch_distance + 3 inside the block */
+    {
+        libsais16x64_prefetchr(&induction_bucket[BUCKETS_INDEX2(i + 2 * prefetch_distance, 0)]); /* only the address is formed here; the key can lie past the end of the block */
+
+        libsais16x64_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 0, 0)]]);
+        libsais16x64_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 1, 0)]]);
+        libsais16x64_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 2, 0)]]);
+        libsais16x64_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 3, 0)]]);
+
+        SA[induction_bucket[BUCKETS_INDEX2(i + 0, 0)]] |= SUFFIX_GROUP_MARKER; /* the bucket value is used as an index into SA */
+        SA[induction_bucket[BUCKETS_INDEX2(i + 1, 0)]] |= SUFFIX_GROUP_MARKER;
+        SA[induction_bucket[BUCKETS_INDEX2(i + 2, 0)]] |= SUFFIX_GROUP_MARKER;
+        SA[induction_bucket[BUCKETS_INDEX2(i + 3, 0)]] |= SUFFIX_GROUP_MARKER;
+    }
+
+    for (j += prefetch_distance + 3; i < j; i += 1) /* j is reset to the end of the block; remaining entries one at a time */
+    {
+        SA[induction_bucket[BUCKETS_INDEX2(i, 0)]] |= SUFFIX_GROUP_MARKER;
+    }
+}
+
+static void libsais16x64_radix_sort_set_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads)
+{ /* Applies the 6k marker pass to bucket indices 0 .. k - 2, split across OpenMP threads when available. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && k >= 65536)
+#endif
+    { /* the if clause above requests a parallel team only when threads > 1 and k >= 65536 */
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+        fast_sint_t omp_block_stride  = (((fast_sint_t)k - 1) / omp_num_threads) & (-16); /* per-thread share of the k - 1 entries, rounded down to a multiple of 16 */
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : (fast_sint_t)k - 1 - omp_block_start; /* the last thread also takes the remainder */
+#else
+        UNUSED(threads);
+
+        fast_sint_t omp_block_start   = 0;
+        fast_sint_t omp_block_size    = (fast_sint_t)k - 1; /* without OpenMP the whole range is one block */
+#endif
+
+        libsais16x64_radix_sort_set_markers_32s_6k(SA, induction_bucket, omp_block_start, omp_block_size);
+    }
+}
+
+static void libsais16x64_radix_sort_set_markers_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads)
+{ /* Applies the 4k marker pass to keys 0 .. k - 2, split across OpenMP threads when available. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && k >= 65536)
+#endif
+    { /* the if clause above requests a parallel team only when threads > 1 and k >= 65536 */
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+        fast_sint_t omp_block_stride  = (((fast_sint_t)k - 1) / omp_num_threads) & (-16); /* per-thread share of the k - 1 keys, rounded down to a multiple of 16 */
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : (fast_sint_t)k - 1 - omp_block_start; /* the last thread also takes the remainder */
+#else
+        UNUSED(threads);
+
+        fast_sint_t omp_block_start   = 0;
+        fast_sint_t omp_block_size    = (fast_sint_t)k - 1; /* without OpenMP the whole range is one block */
+#endif
+
+        libsais16x64_radix_sort_set_markers_32s_4k(SA, induction_bucket, omp_block_start, omp_block_size);
+    }
+}
+
+static void libsais16x64_initialize_buckets_for_partial_sorting_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count)
+{ /* Turns the per-symbol counts stored in 4-wide slots of buckets into running sums: two per symbol written back into buckets in 2-wide slots, plus one per symbol in temp_bucket. */
+    sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE]; /* second output table, located inside the same buckets array */
+
+    buckets[BUCKETS_INDEX4((fast_uint_t)T[first_lms_suffix], 1)]++; /* adds one to the slot-1 count of the symbol at first_lms_suffix before summing */
+
+    fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0; /* i steps through 4-wide slots, j through 2-wide slots, one symbol per iteration */
+    for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0))
+    { 
+        temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0; /* value of sum0 before this symbol's counts are added */
+
+        sum0 += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 2)];
+        sum1 += buckets[i + BUCKETS_INDEX4(0, 1)];
+
+        buckets[j + BUCKETS_INDEX2(0, 0)] = sum0; /* in-place output after this symbol's counts were read; presumably the 2-wide slots never run ahead of unread 4-wide slots - macros are defined outside this chunk */
+        buckets[j + BUCKETS_INDEX2(0, 1)] = sum1;
+    }
+}
+
+static void libsais16x64_initialize_buckets_for_partial_sorting_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count)
+{ /* Replaces the four per-symbol counts (SS, LS, SL, LL) in buckets with running sums and zeroes, and writes two further running sums per symbol into temp_bucket. */
+    sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k]; /* output table placed after the k 4-wide slots */
+
+    fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0, sum2 = 0;
+    for (first_lms_suffix = T[first_lms_suffix], i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4((fast_sint_t)first_lms_suffix - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) /* the parameter is reused to hold the symbol T[first_lms_suffix]; this loop covers the symbols below it */
+    {
+        sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)];
+        sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)];
+        sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)];
+        sa_sint_t LL = buckets[i + BUCKETS_INDEX4(0, 3)];
+
+        buckets[i + BUCKETS_INDEX4(0, 0)] = sum0; /* sums as they stood before this symbol's counts are added */
+        buckets[i + BUCKETS_INDEX4(0, 1)] = sum2;
+        buckets[i + BUCKETS_INDEX4(0, 2)] = 0;
+        buckets[i + BUCKETS_INDEX4(0, 3)] = 0;
+
+        sum0 += SS + SL; sum1 += LS; sum2 += LS + LL;
+
+        temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0; /* sums including this symbol's counts */
+        temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1;
+    }
+
+    for (sum1 += 1; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) /* same body for the remaining symbols up to k - 1, with sum1 raised by one first */
+    { 
+        sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)];
+        sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)];
+        sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)];
+        sa_sint_t LL = buckets[i + BUCKETS_INDEX4(0, 3)];
+
+        buckets[i + BUCKETS_INDEX4(0, 0)] = sum0;
+        buckets[i + BUCKETS_INDEX4(0, 1)] = sum2;
+        buckets[i + BUCKETS_INDEX4(0, 2)] = 0;
+        buckets[i + BUCKETS_INDEX4(0, 3)] = 0;
+
+        sum0 += SS + SL; sum1 += LS; sum2 += LS + LL;
+
+        temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0;
+        temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1;
+    }
+}
+
+static sa_sint_t libsais16x64_partial_sorting_scan_left_to_right_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Left-to-right pass over SA[omp_block_start .. omp_block_start + omp_block_size - 1]: for each entry p, appends p - 1 to the bucket chosen from T[p - 1] and T[p - 2]; returns the updated counter d. */
+    const fast_sint_t prefetch_distance = 32;
+
+    sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE]; /* per-bucket write cursor into SA */
+    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE]; /* per-bucket value of d at the last write */
+
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) /* 2x unrolled; the bound j keeps SA[i + prefetch_distance + 1] inside the block */
+    {
+        libsais16x64_prefetchr(&SA[i + 2 * prefetch_distance]);
+
+        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
+        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
+        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
+        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);
+
+        sa_sint_t p0 = SA[i + 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); /* a negative entry advances d; NOTE(review): T[p0 - 1] and T[p0 - 2] are read without a bounds check */
+        SA[induction_bucket[v0]++] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; /* bit SAINT_BIT - 1 is set when this bucket last saw a different d; this store can land inside the range still being scanned */
+
+        sa_sint_t p1 = SA[i + 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]);
+        SA[induction_bucket[v1]++] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1) /* j is reset to the end of the block; remaining entries one at a time */
+    {
+        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
+        SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
+    }
+
+    return d;
+}
+
+#if defined(LIBSAIS_OPENMP)
+
+static void libsais16x64_partial_sorting_scan_left_to_right_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size, LIBSAIS_THREAD_STATE * RESTRICT state)
+{ /* Read-only pass over one SA block: records every entry and its bucket key in cache, counts entries per bucket and tracks a block-local d; SA is not written. */
+    const fast_sint_t prefetch_distance = 32;
+
+    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; /* used here as per-bucket entry counts */
+    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE]; /* block-local d at the last entry of each bucket; stays 0 for buckets not touched */
+
+    memset(buckets, 0, (size_t)4 * ALPHABET_SIZE * sizeof(sa_sint_t)); /* clears both tables above */
+
+    fast_sint_t i, j, count = 0; sa_sint_t d = 1; /* local d starts at 1, so a recorded name is never 0 */
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) /* 2x unrolled; the bound j keeps SA[i + prefetch_distance + 1] inside the block */
+    {
+        libsais16x64_prefetchr(&SA[i + 2 * prefetch_distance]);
+
+        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
+        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
+        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
+        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);
+
+        sa_sint_t p0 = cache[count].index = SA[i + 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); induction_bucket[v0]++; distinct_names[v0] = d; /* cache keeps the unmasked SA entry in .index and the bucket key in .symbol */
+        sa_sint_t p1 = cache[count].index = SA[i + 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); induction_bucket[v1]++; distinct_names[v1] = d;
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1) /* j is reset to the end of the block; remaining entries one at a time */
+    {
+        sa_sint_t p = cache[count].index = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]); induction_bucket[v]++; distinct_names[v] = d;
+    }
+
+    state[0].state.position   = (fast_sint_t)d - 1; /* number of negative SA entries seen in this block */
+    state[0].state.count      = count; /* number of cache entries written */
+}
+
+static void libsais16x64_partial_sorting_scan_left_to_right_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d)
+{ /* Replays cache[0 .. count - 1] and writes each entry into SA at its bucket cursor, starting from the counter value d passed in; T is not needed because the bucket key is cached. */
+    const fast_sint_t prefetch_distance = 32;
+
+    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; /* per-bucket write cursor into SA */
+    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE]; /* per-bucket value of d at the last write */
+
+    fast_sint_t i, j;
+    for (i = 0, j = count - 1; i < j; i += 2) /* 2x unrolled */
+    {
+        libsais16x64_prefetchr(&cache[i + prefetch_distance]); /* only the address is formed here; it can lie past cache[count - 1] */
+
+        sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol; /* p0 is not masked with SAINT_MAX here, unlike in the sequential scan */
+        SA[induction_bucket[v0]++] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;
+
+        sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol;
+        SA[induction_bucket[v1]++] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
+    }
+
+    for (j += 1; i < j; i += 1) /* j is reset to count; handles the last entry when count is odd */
+    {
+        sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol;
+        SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
+    }
+}
+
+static sa_sint_t libsais16x64_partial_sorting_scan_left_to_right_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Runs the left-to-right scan over SA[block_start .. block_start + block_size - 1] in three phases: per-thread prepare, merge on the master thread, per-thread place; returns the updated d. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
+#endif
+    { /* the if clause above requests a parallel team only when threads > 1, block_size >= 64 * ALPHABET_SIZE and omp_get_dynamic() returns 0 */
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else /* not compiled in practice: this function already sits inside an outer LIBSAIS_OPENMP guard */
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16); /* per-thread share, rounded down to a multiple of 16 */
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; /* the last thread also takes the remainder */
+
+        omp_block_start += block_start; /* from block-relative offset to SA index */
+
+        if (omp_num_threads == 1)
+        {
+            d = libsais16x64_partial_sorting_scan_left_to_right_16u(T, SA, buckets, d, omp_block_start, omp_block_size); /* single thread: plain sequential scan on the shared buckets */
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            {
+                libsais16x64_partial_sorting_scan_left_to_right_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]); /* each thread fills its own buckets, cache and state */
+            }
+
+            #pragma omp barrier
+
+            #pragma omp master
+            { /* merge: visit the threads in order and turn their local counts and names into global start values */
+                sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
+                sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];
+
+                fast_sint_t t;
+                for (t = 0; t < omp_num_threads; ++t)
+                {
+                    sa_sint_t * RESTRICT temp_induction_bucket    = &thread_state[t].state.buckets[0 * ALPHABET_SIZE];
+                    sa_sint_t * RESTRICT temp_distinct_names      = &thread_state[t].state.buckets[2 * ALPHABET_SIZE];
+
+                    fast_sint_t c; 
+                    for (c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A + B; temp_induction_bucket[c] = A; } /* global cursor advances by the thread's count; the thread receives the cursor value from before the advance */
+
+                    for (d -= 1, c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ? D : A; temp_distinct_names[c] = A; } /* B > 0 means the thread touched bucket c; B + (d - 1) rebases its local name onto the global d; the thread receives the previous global name */
+                    d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position; /* d grows by the thread's negative-entry count; position becomes the d value this thread starts from */
+                }
+            }
+
+            #pragma omp barrier
+
+            {
+                libsais16x64_partial_sorting_scan_left_to_right_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position); /* each thread replays its cache using the start values computed by the merge */
+            }
+        }
+#endif
+    }
+
+    return d;
+}
+
+#endif
+
+static sa_sint_t libsais16x64_partial_sorting_scan_left_to_right_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Entry point of the 16u left-to-right scan: seeds SA with position n - 1, then scans SA[0 .. left_suffixes_count - 1] sequentially or in OpenMP blocks; returns the updated d. */
+    sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
+    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];
+
+    SA[induction_bucket[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN; /* seed entry for position n - 1, stored with SAINT_MIN ORed in */
+    distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])] = ++d; /* the seed gets a fresh value of d */
+
+    if (threads == 1 || left_suffixes_count < 65536)
+    {
+        d = libsais16x64_partial_sorting_scan_left_to_right_16u(T, SA, buckets, d, 0, left_suffixes_count);
+    }
+#if defined(LIBSAIS_OPENMP)
+    else
+    {
+        fast_sint_t block_start;
+        for (block_start = 0; block_start < left_suffixes_count; )
+        {
+            if (SA[block_start] == 0)
+            {
+                block_start++; /* zero entries are skipped in this branch */
+            }
+            else
+            {
+                fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > left_suffixes_count) { block_max_end = left_suffixes_count;} /* upper limit for one block, clamped to the scan range */
+                fast_sint_t block_end     = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; } /* extend the block over the run of non-zero entries */
+                fast_sint_t block_size    = block_end - block_start;
+
+                if (block_size < 32)
+                { /* short run: process inline with the same per-entry step as the sequential scan */
+                    for (; block_start < block_end; block_start += 1)
+                    {
+                        sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
+                        SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
+                    }
+                }
+                else
+                {
+                    d = libsais16x64_partial_sorting_scan_left_to_right_16u_block_omp(T, SA, buckets, d, block_start, block_size, threads, thread_state);
+                    block_start = block_end;
+                }
+            }
+        }
+    }
+#else
+    UNUSED(thread_state);
+#endif
+
+    return d;
+}
+
+static sa_sint_t libsais16x64_partial_sorting_scan_left_to_right_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Left-to-right pass over SA[omp_block_start .. omp_block_start + omp_block_size - 1] using 4-wide bucket slots: buckets[v] is the write cursor and buckets[2 + v] the last d for key v; returns the updated d. */
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) /* 2x unrolled; the bound j keeps SA[i + 2 * prefetch_distance + 1] inside the block */
+    {
+        libsais16x64_prefetchr(&SA[i + 3 * prefetch_distance]); /* only the address is formed here; it can lie past the end of the block */
+
+        libsais16x64_prefetchr(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 1);
+        libsais16x64_prefetchr(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 2);
+        libsais16x64_prefetchr(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 1);
+        libsais16x64_prefetchr(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 2);
+
+        sa_sint_t p0 = SA[i + prefetch_distance + 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais16x64_prefetchw(&buckets[v0]); /* look-ahead used only to prefetch the bucket; p0 - (p0 > 0) keeps the T index at 0 when p0 is 0 */
+        sa_sint_t p1 = SA[i + prefetch_distance + 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais16x64_prefetchw(&buckets[v1]);
+
+        sa_sint_t p2 = SA[i + 0]; d += (p2 < 0); p2 &= SAINT_MAX; sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] >= T[p2 - 1]); /* a negative entry advances d; NOTE(review): T[p2 - 1] and T[p2 - 2] are read without a bounds check */
+        SA[buckets[v2]++] = (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); buckets[2 + v2] = d; /* bit SAINT_BIT - 1 is set when this key last saw a different d */
+
+        sa_sint_t p3 = SA[i + 1]; d += (p3 < 0); p3 &= SAINT_MAX; sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] >= T[p3 - 1]);
+        SA[buckets[v3]++] = (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1)); buckets[2 + v3] = d;
+    }
+
+    for (j += 2 * prefetch_distance + 1; i < j; i += 1) /* j is reset to the end of the block; remaining entries one at a time */
+    {
+        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]);
+        SA[buckets[v]++] = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
+    }
+
+    return d;
+}
+
+static sa_sint_t libsais16x64_partial_sorting_scan_left_to_right_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{   /* Left-to-right induction scan of SA[omp_block_start, omp_block_start + omp_block_size) that consumes positive entries and appends their predecessors to per-symbol buckets. Returns the updated counter d. */
+    const fast_sint_t prefetch_distance = 32;
+    /* induction_bucket (indexed by symbol) starts at buckets[2 * k]; distinct_names (indexed by BUCKETS_INDEX2) starts at buckets[0]. */
+    sa_sint_t * RESTRICT induction_bucket = &buckets[2 * (fast_sint_t)k];
+    sa_sint_t * RESTRICT distinct_names   = &buckets[0 * (fast_sint_t)k];
+
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16x64_prefetchw(&SA[i + 3 * prefetch_distance]);
+        /* Look-ahead: T is prefetched for entries 2 * prefetch_distance ahead and the bucket slots for entries prefetch_distance ahead; non-positive entries pass NULL or skip the prefetch. */
+        sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais16x64_prefetchw(&induction_bucket[Ts2]); libsais16x64_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); }
+        sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais16x64_prefetchw(&induction_bucket[Ts3]); libsais16x64_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); }
+        /* Entry i is written back masked with SAINT_MAX. If it was positive: the slot is zeroed, d grows by p >> (SUFFIX_GROUP_BIT - 1), the group marker is stripped, and p - 1 is appended to induction_bucket[T[p - 1]] with T[p - 2] < T[p - 1] at bit SAINT_BIT - 1 and distinct_names[v] != d at bit SUFFIX_GROUP_BIT - 1. */
+        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX;
+        if (p0 > 0)
+        {
+            SA[i + 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]);
+            SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;
+        }
+        /* Entry i + 1: same steps as above. */
+        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX;
+        if (p1 > 0)
+        {
+            SA[i + 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]);
+            SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
+        }
+    }
+    /* Tail: remaining entries up to the end of the block, one at a time, without look-ahead. */
+    for (j += 2 * prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX;
+        if (p > 0)
+        {
+            SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]);
+            SA[induction_bucket[T[p - 1]]++] = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
+        }
+    }
+
+    return d;
+}
+
+static void libsais16x64_partial_sorting_scan_left_to_right_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{   /* Left-to-right induction scan of SA[omp_block_start, omp_block_start + omp_block_size) using a single per-symbol bucket array; no counter is tracked and nothing is returned. */
+    const fast_sint_t prefetch_distance = 32;
+    /* Main loop: two entries per iteration with look-ahead at 1x, 2x and 3x prefetch_distance. */
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16x64_prefetchw(&SA[i + 3 * prefetch_distance]);
+        /* Look-ahead: only positive entries trigger a real prefetch; otherwise NULL is passed or the prefetch is skipped. */
+        sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais16x64_prefetchw(&induction_bucket[T[s2 - 1]]); libsais16x64_prefetchr(&T[s2] - 2); }
+        sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais16x64_prefetchw(&induction_bucket[T[s3 - 1]]); libsais16x64_prefetchr(&T[s3] - 2); }
+        /* Each entry is written back masked with SAINT_MAX; a positive entry is instead zeroed and p - 1 is appended to induction_bucket[T[p - 1]] with T[p - 2] < T[p - 1] stored at bit SAINT_BIT - 1. */
+        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { SA[i + 0] = 0; SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)); }
+        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { SA[i + 1] = 0; SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)); }
+    }
+    /* Tail: remaining entries up to the end of the block, one at a time, without look-ahead. */
+    for (j += 2 * prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { SA[i] = 0; SA[induction_bucket[T[p - 1]]++] = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)); }
+    }
+}
+
+#if defined(LIBSAIS_OPENMP)
+
+static void libsais16x64_partial_sorting_scan_left_to_right_32s_6k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{   /* Gather step: for every i in [omp_block_start, omp_block_start + omp_block_size) copies SA[i] into cache[i].index and stores the bucket key derived from T in cache[i].symbol. SA is only read here. */
+    const fast_sint_t prefetch_distance = 32;
+    /* Main loop: two entries per iteration with look-ahead at 1x and 2x prefetch_distance. */
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16x64_prefetchr(&SA[i + 2 * prefetch_distance]);
+        /* Prefetch the two T elements preceding each position stored prefetch_distance ahead. */
+        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
+        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
+        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
+        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);
+
+        libsais16x64_prefetchw(&cache[i + prefetch_distance]);
+        /* The unmasked SA value goes to cache[i].index; the key is BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]) for the masked position p, or 0 when p is 0. */
+        sa_sint_t p0 = cache[i + 0].index = SA[i + 0]; sa_sint_t symbol0 = 0; p0 &= SAINT_MAX; if (p0 != 0) { symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); } cache[i + 0].symbol = symbol0;
+        sa_sint_t p1 = cache[i + 1].index = SA[i + 1]; sa_sint_t symbol1 = 0; p1 &= SAINT_MAX; if (p1 != 0) { symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); } cache[i + 1].symbol = symbol1;
+    }
+    /* Tail: remaining entries up to the end of the block, one at a time, without look-ahead. */
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t p = cache[i].index = SA[i]; sa_sint_t symbol = 0; p &= SAINT_MAX; if (p != 0) { symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]); } cache[i].symbol = symbol;
+    }
+}
+
+static void libsais16x64_partial_sorting_scan_left_to_right_32s_4k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{   /* Gather step: moves every positive SA entry of the block into cache[i] (index plus BUCKETS_INDEX2 key) and zeroes its SA slot; other entries get cache[i].symbol = SAINT_MIN and are written back masked with SAINT_MAX. */
+    const fast_sint_t prefetch_distance = 32;
+    /* Main loop: two entries per iteration with look-ahead at 1x and 2x prefetch_distance. */
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16x64_prefetchw(&SA[i + 2 * prefetch_distance]);
+        /* T is prefetched only for positive look-ahead entries; otherwise NULL is passed to the prefetch helper. */
+        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL);
+
+        libsais16x64_prefetchw(&cache[i + prefetch_distance]);
+        /* cache[i].index keeps the entry including its group marker; the key is computed from the position with the marker stripped. cache[i].index is left untouched for non-positive entries. */
+        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { cache[i + 0].index = p0; p0 &= ~SUFFIX_GROUP_MARKER; symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]); p0 = 0; } cache[i + 0].symbol = symbol0; SA[i + 0] = p0 & SAINT_MAX;
+        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { cache[i + 1].index = p1; p1 &= ~SUFFIX_GROUP_MARKER; symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]); p1 = 0; } cache[i + 1].symbol = symbol1; SA[i + 1] = p1 & SAINT_MAX;
+    }
+    /* Tail: remaining entries up to the end of the block, one at a time, without look-ahead. */
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { cache[i].index = p; p &= ~SUFFIX_GROUP_MARKER; symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]); p = 0; } cache[i].symbol = symbol; SA[i] = p & SAINT_MAX;
+    }
+}
+
+static void libsais16x64_partial_sorting_scan_left_to_right_32s_1k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{   /* Gather step: for every positive SA entry p of the block stores the already-induced value (p - 1 plus flag) in cache[i].index and the symbol T[p - 1] in cache[i].symbol, then zeroes the SA slot; other entries get symbol SAINT_MIN and are written back masked with SAINT_MAX. */
+    const fast_sint_t prefetch_distance = 32;
+    /* Main loop: two entries per iteration with look-ahead at 1x and 2x prefetch_distance. */
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16x64_prefetchw(&SA[i + 2 * prefetch_distance]);
+        /* T is prefetched only for positive look-ahead entries; otherwise NULL is passed to the prefetch helper. */
+        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL);
+
+        libsais16x64_prefetchw(&cache[i + prefetch_distance]);
+        /* The flag stored at bit SAINT_BIT - 1 of cache[i].index is T[p - 2] < T[p - 1]. cache[i].index is left untouched for non-positive entries. */
+        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)); symbol0 = T[p0 - 1]; p0 = 0; } cache[i + 0].symbol = symbol0; SA[i + 0] = p0 & SAINT_MAX;
+        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)); symbol1 = T[p1 - 1]; p1 = 0; } cache[i + 1].symbol = symbol1; SA[i + 1] = p1 & SAINT_MAX;
+    }
+    /* Tail: remaining entries up to the end of the block, one at a time, without look-ahead. */
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { cache[i].index = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)); symbol = T[p - 1]; p = 0; } cache[i].symbol = symbol; SA[i] = p & SAINT_MAX;
+    }
+}
+
+static sa_sint_t libsais16x64_partial_sorting_scan_left_to_right_32s_6k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{   /* Sequential pass over the gathered cache entries of one block: each entry's symbol is replaced by its destination position (buckets[v]++) and its index by the induced value. Returns the updated counter d. */
+    const fast_sint_t prefetch_distance = 32;
+    /* Main loop: two entries per iteration; cache and bucket slots are prefetched ahead of i. */
+    fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
+    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16x64_prefetchw(&cache[i + 2 * prefetch_distance]);
+
+        libsais16x64_prefetchw(&buckets[cache[i + prefetch_distance + 0].symbol]);
+        libsais16x64_prefetchw(&buckets[cache[i + prefetch_distance + 1].symbol]);
+        /* A negative index bumps d; the new index is p - 1 with bit SAINT_BIT - 1 set when buckets[2 + v] != d. If the destination lies before omp_block_end, the value and its recomputed key are also written into cache[destination] (presumably so a later iteration of this loop picks it up). */
+        sa_sint_t v0 = cache[i + 0].symbol, p0 = cache[i + 0].index; d += (p0 < 0); cache[i + 0].symbol = buckets[v0]++; cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1)); buckets[2 + v0] = d;
+        if (cache[i + 0].symbol < omp_block_end) { sa_sint_t s = cache[i + 0].symbol, q = (cache[s].index = cache[i + 0].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); }
+
+        sa_sint_t v1 = cache[i + 1].symbol, p1 = cache[i + 1].index; d += (p1 < 0); cache[i + 1].symbol = buckets[v1]++; cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1)); buckets[2 + v1] = d;
+        if (cache[i + 1].symbol < omp_block_end) { sa_sint_t s = cache[i + 1].symbol, q = (cache[s].index = cache[i + 1].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); }
+    }
+    /* Tail: remaining entries up to omp_block_end, one at a time, without look-ahead. */
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t v = cache[i].symbol, p = cache[i].index; d += (p < 0); cache[i].symbol = buckets[v]++; cache[i].index = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
+        if (cache[i].symbol < omp_block_end) { sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); }
+    }
+
+    return d;
+}
+
+static sa_sint_t libsais16x64_partial_sorting_scan_left_to_right_32s_4k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{   /* Sequential pass over the gathered cache entries of one block; entries with a negative symbol are skipped. Returns the updated counter d. */
+    const fast_sint_t prefetch_distance = 32;
+    /* induction_bucket starts at buckets[2 * k] and is indexed by v >> 1; distinct_names starts at buckets[0] and is indexed by the full key v. */
+    sa_sint_t * RESTRICT induction_bucket = &buckets[2 * (fast_sint_t)k];
+    sa_sint_t * RESTRICT distinct_names   = &buckets[0 * (fast_sint_t)k];
+
+    fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
+    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16x64_prefetchw(&cache[i + 2 * prefetch_distance]);
+
+        sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 >> 1]; libsais16x64_prefetchw(s0 >= 0 ? Is0 : NULL); const sa_sint_t * Ds0 = &distinct_names[s0]; libsais16x64_prefetchw(s0 >= 0 ? Ds0 : NULL); 
+        sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 >> 1]; libsais16x64_prefetchw(s1 >= 0 ? Is1 : NULL); const sa_sint_t * Ds1 = &distinct_names[s1]; libsais16x64_prefetchw(s1 >= 0 ? Ds1 : NULL);
+        /* For a valid entry: d grows by p >> (SUFFIX_GROUP_BIT - 1); symbol becomes the destination induction_bucket[v >> 1]++; index becomes p - 1 with v shifted to bit SAINT_BIT - 1 and distinct_names[v] != d at bit SUFFIX_GROUP_BIT - 1. If the destination lies before omp_block_end and the new index is positive, it is forwarded with a fresh key into cache[destination] and cache[i].index is zeroed; otherwise cache[i].index is masked with SAINT_MAX. */
+        sa_sint_t v0 = cache[i + 0].symbol;
+        if (v0 >= 0)
+        {
+            sa_sint_t p0 = cache[i + 0].index; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); cache[i + 0].symbol = induction_bucket[v0 >> 1]++; cache[i + 0].index = (p0 - 1) | (v0 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;
+            if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i + 0].index = np & SAINT_MAX; }
+        }
+
+        sa_sint_t v1 = cache[i + 1].symbol;
+        if (v1 >= 0)
+        {
+            sa_sint_t p1 = cache[i + 1].index; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); cache[i + 1].symbol = induction_bucket[v1 >> 1]++; cache[i + 1].index = (p1 - 1) | (v1 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
+            if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i + 1].index = np & SAINT_MAX; }
+        }
+    }
+    /* Tail: remaining entries up to omp_block_end, one at a time, without look-ahead. */
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t v = cache[i].symbol;
+        if (v >= 0)
+        {
+            sa_sint_t p = cache[i].index; d += (p >> (SUFFIX_GROUP_BIT - 1)); cache[i].symbol = induction_bucket[v >> 1]++; cache[i].index = (p - 1) | (v << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
+            if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i].index = np & SAINT_MAX; }
+        }
+    }
+
+    return d;
+}
+
+static void libsais16x64_partial_sorting_scan_left_to_right_32s_1k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{   /* Sequential pass over the gathered cache entries of one block: each valid entry's symbol is replaced by its destination position induction_bucket[v]++; entries with a negative symbol are skipped. */
+    const fast_sint_t prefetch_distance = 32;
+    /* Main loop: two entries per iteration; bucket slots are prefetched only for non-negative look-ahead symbols. */
+    fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
+    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16x64_prefetchw(&cache[i + 2 * prefetch_distance]);
+
+        sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais16x64_prefetchw(s0 >= 0 ? Is0 : NULL);
+        sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais16x64_prefetchw(s1 >= 0 ? Is1 : NULL);
+        /* If the destination lies before omp_block_end and the stored index np is positive, the next induced value (np - 1 plus the T[np - 2] < T[np - 1] flag) and its symbol are written into cache[destination] and cache[i].index is zeroed; otherwise cache[i].index is masked with SAINT_MAX. */
+        sa_sint_t v0 = cache[i + 0].symbol;
+        if (v0 >= 0)
+        {
+            cache[i + 0].symbol = induction_bucket[v0]++;
+            if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; if (np > 0) { cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i + 0].index = np & SAINT_MAX; }
+        }
+
+        sa_sint_t v1 = cache[i + 1].symbol;
+        if (v1 >= 0)
+        {
+            cache[i + 1].symbol = induction_bucket[v1]++;
+            if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; if (np > 0) { cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i + 1].index = np & SAINT_MAX; }
+        }
+    }
+    /* Tail: remaining entries up to omp_block_end, one at a time, without look-ahead. */
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t v = cache[i].symbol;
+        if (v >= 0)
+        {
+            cache[i].symbol = induction_bucket[v]++;
+            if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i].index = np & SAINT_MAX; }
+        }
+    }
+}
+
+static sa_sint_t libsais16x64_partial_sorting_scan_left_to_right_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{   /* Processes SA[block_start, block_start + block_size) for the 6k scan. With one thread it calls the sequential scan; otherwise it runs gather (all threads), sort (master thread only) and place (all threads), separated by barriers. Returns the updated counter d. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(cache);
+        /* Without OpenMP the body runs once as a single thread. */
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        /* The per-thread stride is rounded down to a multiple of 16 and the last thread takes the remainder; the start is then made absolute. */
+        omp_block_start += block_start;
+
+        if (omp_num_threads == 1)
+        {
+            d = libsais16x64_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            {
+                libsais16x64_partial_sorting_scan_left_to_right_32s_6k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+            }
+            /* cache - block_start lets the helpers index the cache with absolute SA positions. */
+            #pragma omp barrier
+
+            #pragma omp master
+            {
+                d = libsais16x64_partial_sorting_scan_left_to_right_32s_6k_block_sort(T, buckets, d, cache - block_start, block_start, block_size);
+            }
+            /* Only the master thread writes d; the barrier below orders the sort before the per-thread placement. */
+            #pragma omp barrier
+
+            {
+                libsais16x64_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+            }
+        }
+#endif
+    }
+
+    return d;
+}
+
+static sa_sint_t libsais16x64_partial_sorting_scan_left_to_right_32s_4k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{   /* Processes SA[block_start, block_start + block_size) for the 4k scan. With one thread it calls the sequential scan; otherwise it runs gather (all threads), sort (master thread only) and compact-and-place (all threads), separated by barriers. Returns the updated counter d. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(cache);
+        /* Without OpenMP the body runs once as a single thread. */
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        /* The per-thread stride is rounded down to a multiple of 16 and the last thread takes the remainder; the start is then made absolute. */
+        omp_block_start += block_start;
+
+        if (omp_num_threads == 1)
+        {
+            d = libsais16x64_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            {
+                libsais16x64_partial_sorting_scan_left_to_right_32s_4k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+            }
+            /* cache - block_start lets the helpers index the cache with absolute SA positions. */
+            #pragma omp barrier
+
+            #pragma omp master
+            {
+                d = libsais16x64_partial_sorting_scan_left_to_right_32s_4k_block_sort(T, k, buckets, d, cache - block_start, block_start, block_size);
+            }
+            /* Only the master thread writes d; the barrier below orders the sort before the per-thread placement. */
+            #pragma omp barrier
+
+            {
+                libsais16x64_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+            }
+        }
+#endif
+    }
+
+    return d;
+}
+
+static void libsais16x64_partial_sorting_scan_left_to_right_32s_1k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{   /* Processes SA[block_start, block_start + block_size) for the 1k scan. With one thread it calls the sequential scan; otherwise it runs gather (all threads), sort (master thread only) and compact-and-place (all threads), separated by barriers. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(cache);
+        /* Without OpenMP the body runs once as a single thread. */
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        /* The per-thread stride is rounded down to a multiple of 16 and the last thread takes the remainder; the start is then made absolute. */
+        omp_block_start += block_start;
+
+        if (omp_num_threads == 1)
+        {
+            libsais16x64_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            {
+                libsais16x64_partial_sorting_scan_left_to_right_32s_1k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+            }
+            /* cache - block_start lets the helpers index the cache with absolute SA positions. */
+            #pragma omp barrier
+
+            #pragma omp master
+            {
+                libsais16x64_partial_sorting_scan_left_to_right_32s_1k_block_sort(T, buckets, cache - block_start, block_start, block_size);
+            }
+            /* The barrier below orders the master-only sort before the per-thread placement. */
+            #pragma omp barrier
+
+            {
+                libsais16x64_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+            }
+        }
+#endif
+    }
+}
+
+#endif
+
+static sa_sint_t libsais16x64_partial_sorting_scan_left_to_right_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{   /* Seeds the 6k left-to-right scan with position n - 1 (tagged with SAINT_MIN, recorded under ++d), then scans SA[0, left_suffixes_count) either sequentially or in cache-sized blocks. Returns the updated counter d. */
+    const sa_sint_t seed = BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1]);
+    SA[buckets[seed]++] = (n - 1) | SAINT_MIN; buckets[2 + seed] = ++d;
+
+    if (threads == 1 || left_suffixes_count < 65536)
+    {
+        d = libsais16x64_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, 0, left_suffixes_count);
+    }
+#if defined(LIBSAIS_OPENMP)
+    else
+    {
+        const fast_sint_t span = (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+        fast_sint_t from = 0;
+        while (from < left_suffixes_count)
+        {
+            const fast_sint_t to = (from + span > left_suffixes_count) ? left_suffixes_count : from + span; /* clamp the last block */
+            d = libsais16x64_partial_sorting_scan_left_to_right_32s_6k_block_omp(T, SA, buckets, d, thread_state[0].state.cache, from, to - from, threads); from = to;
+        }
+    }
+#else
+    UNUSED(thread_state);
+#endif
+
+    return d;
+}
+
+static sa_sint_t libsais16x64_partial_sorting_scan_left_to_right_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{   /* Seeds the 4k left-to-right scan with position n - 1 (tagged with SUFFIX_GROUP_MARKER, recorded under ++d), then scans SA[0, n) either sequentially or in cache-sized blocks. Returns the updated counter d. */
+    sa_sint_t * RESTRICT heads = &buckets[2 * (fast_sint_t)k];
+    sa_sint_t * RESTRICT names = &buckets[0 * (fast_sint_t)k];
+    const sa_sint_t last_symbol = T[n - 1], last_flag = (sa_sint_t)(T[n - 2] < T[n - 1]);
+
+    SA[heads[last_symbol]++] = (n - 1) | (last_flag << (SAINT_BIT - 1)) | SUFFIX_GROUP_MARKER; names[BUCKETS_INDEX2(last_symbol, last_flag)] = ++d;
+
+    if (threads == 1 || n < 65536)
+    {
+        d = libsais16x64_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, 0, n);
+    }
+#if defined(LIBSAIS_OPENMP)
+    else
+    {
+        const fast_sint_t span = (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+        fast_sint_t from = 0;
+        while (from < n)
+        {
+            const fast_sint_t to = (from + span > n) ? n : from + span; /* clamp the last block */
+            d = libsais16x64_partial_sorting_scan_left_to_right_32s_4k_block_omp(T, SA, k, buckets, d, thread_state[0].state.cache, from, to - from, threads); from = to;
+        }
+    }
+#else
+    UNUSED(thread_state);
+#endif
+
+    return d;
+}
+
+static void libsais16x64_partial_sorting_scan_left_to_right_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{   /* Seeds the 1k left-to-right scan with position n - 1 (flagged at bit SAINT_BIT - 1 when T[n - 2] < T[n - 1]), then scans SA[0, n) either sequentially or in cache-sized blocks. */
+    const sa_sint_t last_symbol = T[n - 1], last_flag = (sa_sint_t)(T[n - 2] < T[n - 1]); SA[buckets[last_symbol]++] = (n - 1) | (last_flag << (SAINT_BIT - 1));
+
+    if (threads == 1 || n < 65536)
+    {
+        libsais16x64_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, 0, n);
+    }
+#if defined(LIBSAIS_OPENMP)
+    else
+    {
+        const fast_sint_t span = (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+        fast_sint_t from = 0;
+        while (from < n)
+        {
+            const fast_sint_t to = (from + span > n) ? n : from + span; /* clamp the last block */
+            libsais16x64_partial_sorting_scan_left_to_right_32s_1k_block_omp(T, SA, buckets, thread_state[0].state.cache, from, to - from, threads); from = to;
+        }
+    }
+#else
+    UNUSED(thread_state);
+#endif
+}
+
+static void libsais16x64_partial_sorting_shift_markers_16u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, const sa_sint_t * RESTRICT buckets, sa_sint_t threads)
+{   /* For each key c, walks one SA range from high index to low and shifts the SAINT_MIN marker bits by one entry: every entry receives the marker bit of the entry visited just before it, and the first visited entry receives a set bit. */
+    const fast_sint_t prefetch_distance = 32;
+    /* Range upper bounds are read from the area starting at buckets[4 * ALPHABET_SIZE]; lower bounds from the front of buckets. */
+    const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];
+
+    fast_sint_t c;
+    /* Iterations touch separate [j, i] ranges and keep their own carry s, so they are distributed across threads when n >= 65536. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && n >= 65536)
+#else
+    UNUSED(threads); UNUSED(n);
+#endif
+    for (c = BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); c >= BUCKETS_INDEX2(1, 0); c -= BUCKETS_INDEX2(1, 0))
+    {
+        fast_sint_t i, j; sa_sint_t s = SAINT_MIN;
+        for (i = (fast_sint_t)temp_bucket[c] - 1, j = (fast_sint_t)buckets[c - BUCKETS_INDEX2(1, 0)] + 3; i >= j; i -= 4)
+        {
+            libsais16x64_prefetchw(&SA[i - prefetch_distance]);
+            /* q is the difference between the entry's marker bit and the carry s; XOR-ing it into both swaps them: s takes the entry's old bit and the entry takes the old s. */
+            sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0;
+            sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1;
+            sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2;
+            sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3;
+        }
+        /* Tail: j is restored to the real lower bound and the remaining entries are handled one at a time. */
+        for (j -= 3; i >= j; i -= 1)
+        {
+            sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q;
+        }
+    }
+}
+
+static void libsais16x64_partial_sorting_shift_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, const sa_sint_t * RESTRICT buckets, sa_sint_t threads)
+{   /* For each symbol c from k - 1 down to 1, walks one SA range from high index to low and shifts the SAINT_MIN marker bits by one entry: every entry receives the marker bit of the entry visited just before it, and the first visited entry receives a set bit. */
+    const fast_sint_t prefetch_distance = 32;
+    /* Range upper bounds come from buckets[BUCKETS_INDEX4(c, 0)]; lower bounds from the area starting at buckets[4 * k]. */
+    const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k];
+    
+    fast_sint_t c;
+    /* Iterations keep their own carry s and are distributed across threads when k >= 65536. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && k >= 65536)
+#else
+    UNUSED(threads);
+#endif
+    for (c = (fast_sint_t)k - 1; c >= 1; c -= 1)
+    {
+        fast_sint_t i, j; sa_sint_t s = SAINT_MIN;
+        for (i = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 0)] - 1, j = (fast_sint_t)temp_bucket[BUCKETS_INDEX2(c - 1, 0)] + 3; i >= j; i -= 4)
+        {
+            libsais16x64_prefetchw(&SA[i - prefetch_distance]);
+            /* q is the difference between the entry's marker bit and the carry s; XOR-ing it into both swaps them: s takes the entry's old bit and the entry takes the old s. */
+            sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0;
+            sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1;
+            sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2;
+            sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3;
+        }
+        /* Tail: j is restored to the real lower bound and the remaining entries are handled one at a time. */
+        for (j -= 3; i >= j; i -= 1)
+        {
+            sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q;
+        }
+    }
+}
+
+static void libsais16x64_partial_sorting_shift_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n)
+{   /* Walks SA[0, n) from the last entry to the first and shifts the SUFFIX_GROUP_MARKER bits by one positive entry: each positive entry receives the marker bit of the previously visited positive entry (the first one receives a set bit); non-positive entries are left unchanged and do not alter the carry. */
+    const fast_sint_t prefetch_distance = 32;
+    /* NOTE(review): the mask (p > 0) << (SUFFIX_GROUP_BIT - 1) presumably selects the same bit as SUFFIX_GROUP_MARKER; confirm against the macro definitions. */
+    fast_sint_t i; sa_sint_t s = SUFFIX_GROUP_MARKER;
+    for (i = (fast_sint_t)n - 1; i >= 3; i -= 4)
+    {
+        libsais16x64_prefetchw(&SA[i - prefetch_distance]);
+        /* q is the difference between the entry's marker bit and the carry s, forced to 0 for non-positive entries; XOR-ing it into both swaps them. */
+        sa_sint_t p0 = SA[i - 0], q0 = ((p0 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p0 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q0; SA[i - 0] = p0 ^ q0;
+        sa_sint_t p1 = SA[i - 1], q1 = ((p1 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p1 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q1; SA[i - 1] = p1 ^ q1;
+        sa_sint_t p2 = SA[i - 2], q2 = ((p2 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p2 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q2; SA[i - 2] = p2 ^ q2;
+        sa_sint_t p3 = SA[i - 3], q3 = ((p3 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p3 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q3; SA[i - 3] = p3 ^ q3;
+    }
+    /* Tail: the remaining (at most three) lowest entries. */
+    for (; i >= 0; i -= 1)
+    {
+        sa_sint_t p = SA[i], q = ((p & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q; SA[i] = p ^ q;
+    }
+}
+
+static void libsais16x64_partial_sorting_shift_buckets_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
+{   /* Copies, for every symbol, the value pair kept at BUCKETS_INDEX2 stride in the area starting at buckets[4 * k] into slots BUCKETS_INDEX4(0, 0) and BUCKETS_INDEX4(0, 1) relative to buckets[2 * c]. */
+    const sa_sint_t * RESTRICT saved = &buckets[4 * (fast_sint_t)k];
+    const fast_sint_t stop = BUCKETS_INDEX2((fast_sint_t)k - 1, 0), step = BUCKETS_INDEX2(1, 0);
+    fast_sint_t c;
+    for (c = BUCKETS_INDEX2(0, 0); c <= stop; c += step)
+    {
+        sa_sint_t * group = &buckets[2 * c]; const sa_sint_t * pair = &saved[c];
+        group[BUCKETS_INDEX4(0, 0)] = pair[BUCKETS_INDEX2(0, 0)]; group[BUCKETS_INDEX4(0, 1)] = pair[BUCKETS_INDEX2(0, 1)];
+    }
+}
+
+static sa_sint_t libsais16x64_partial_sorting_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{   /* Right-to-left induction scan of SA[omp_block_start, omp_block_start + omp_block_size) over 16-bit text: each entry's predecessor is written at the decremented tail position of its bucket. Returns the updated counter d. */
+    const fast_sint_t prefetch_distance = 32;
+    /* induction_bucket starts at buckets[0] and distinct_names at buckets[2 * ALPHABET_SIZE]; both are indexed by the BUCKETS_INDEX2 key v. */
+    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
+    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];
+
+    fast_sint_t i, j;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+    {
+        libsais16x64_prefetchr(&SA[i - 2 * prefetch_distance]);
+        /* Prefetch the two T elements preceding each position stored prefetch_distance behind i. */
+        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1);
+        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2);
+        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1);
+        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2);
+        /* Process entry i: a negative entry bumps d; after masking with SAINT_MAX, p - 1 is stored at SA[--induction_bucket[v]] with bit SAINT_BIT - 1 set when distinct_names[v] differs from d, then distinct_names[v] is updated to d. */
+        sa_sint_t p0 = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
+        SA[--induction_bucket[v0]] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;
+        /* Entry i - 1: same steps as above. */
+        sa_sint_t p1 = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
+        SA[--induction_bucket[v1]] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
+    }
+    /* Tail: j is restored to omp_block_start and the remaining entries are processed one at a time without look-ahead. */
+    for (j -= prefetch_distance + 1; i >= j; i -= 1)
+    {
+        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
+        SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
+    }
+
+    return d;
+}
+
+#if defined(LIBSAIS_OPENMP)
+
+static void libsais16x64_partial_sorting_scan_right_to_left_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size, LIBSAIS_THREAD_STATE * RESTRICT state)
+{ /* Counting pass over SA[omp_block_start .. omp_block_start + omp_block_size), highest index first: records every entry and its bucket index in cache, counts entries per bucket and remembers the latest local d per bucket. SA is only read here. */
+    const fast_sint_t prefetch_distance = 32;
+    /* Here induction_bucket[v] is used as a per-bucket counter and distinct_names[v] as the latest local d for bucket v. */
+    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
+    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];
+    /* Start from all-zero counters and names (4 * ALPHABET_SIZE entries). */
+    memset(buckets, 0, (size_t)4 * ALPHABET_SIZE * sizeof(sa_sint_t));
+    /* d starts at 1, so a bucket touched by this slice always ends with a non-zero name; count is the number of cache records written so far. */
+    fast_sint_t i, j, count = 0; sa_sint_t d = 1;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+    {
+        libsais16x64_prefetchr(&SA[i - 2 * prefetch_distance]);
+
+        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1);
+        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2);
+        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1);
+        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2);
+        /* The raw entry (sign included) goes to cache[].index; only the masked value indexes T. The computed bucket index goes to cache[].symbol. */
+        sa_sint_t p0 = cache[count].index = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); induction_bucket[v0]++; distinct_names[v0] = d;
+        sa_sint_t p1 = cache[count].index = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); induction_bucket[v1]++; distinct_names[v1] = d;
+    }
+
+    for (j -= prefetch_distance + 1; i >= j; i -= 1)
+    {
+        sa_sint_t p = cache[count].index = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); induction_bucket[v]++; distinct_names[v] = d;
+    }
+    /* position = number of negative entries met in this slice (d minus its initial value 1); count = number of cached records. */
+    state[0].state.position   = (fast_sint_t)d - 1;
+    state[0].state.count      = count;
+}
+
+static void libsais16x64_partial_sorting_scan_right_to_left_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d)
+{ /* Replays cache[0 .. count) in order, starting from the given d, and performs the SA writes using the bucket positions and names found in buckets. T is not read; the bucket index comes from the cached symbol. */
+    const fast_sint_t prefetch_distance = 32;
+    /* Same layout as the sequential scan: write positions at offset 0, last d per bucket at offset 2 * ALPHABET_SIZE. */
+    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
+    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];
+    /* Two records per iteration while at least two remain; the second loop handles a possible last record. */
+    fast_sint_t i, j;
+    for (i = 0, j = count - 1; i < j; i += 2)
+    {
+        libsais16x64_prefetchr(&cache[i + prefetch_distance]);
+        /* cache[].index is used unmasked: its sign advances d, and p - 1 is combined with the flag in bit SAINT_BIT - 1. */
+        sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol;
+        SA[--induction_bucket[v0]] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;
+
+        sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol;
+        SA[--induction_bucket[v1]] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
+    }
+
+    for (j += 1; i < j; i += 1)
+    {
+        sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol;
+        SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
+    }
+}
+
+static sa_sint_t libsais16x64_partial_sorting_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Processes SA[block_start .. block_start + block_size) right to left. The parallel clause is active only when threads > 1, block_size >= 64 * ALPHABET_SIZE and omp_get_dynamic() == 0. With a single thread the sequential scan is called; otherwise each thread runs block_prepare on its slice, the master merges the per-thread bucket state, and each thread runs block_place. Returns the updated d. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+        /* NOTE(review): this function sits inside an outer #if defined(LIBSAIS_OPENMP) block, so this #else branch is never compiled. */
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        /* The stride is rounded down to a multiple of 16; the last thread takes whatever remains. The start is relative until block_start is added. */
+        omp_block_start += block_start;
+
+        if (omp_num_threads == 1)
+        {
+            d = libsais16x64_partial_sorting_scan_right_to_left_16u(T, SA, buckets, d, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            {
+                libsais16x64_partial_sorting_scan_right_to_left_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]);
+            }
+            /* Every thread must have finished counting before the master merges. */
+            #pragma omp barrier
+
+            #pragma omp master
+            {
+                sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
+                sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];
+                /* Threads are visited from the last slice to the first, i.e. in the order a sequential right-to-left scan would consume SA. */
+                fast_sint_t t;
+                for (t = omp_num_threads - 1; t >= 0; --t)
+                {
+                    sa_sint_t * RESTRICT temp_induction_bucket    = &thread_state[t].state.buckets[0 * ALPHABET_SIZE];
+                    sa_sint_t * RESTRICT temp_distinct_names      = &thread_state[t].state.buckets[2 * ALPHABET_SIZE];
+                    /* Positions: thread t receives the current global position A; the global position drops by the count B that t accumulated. */
+                    fast_sint_t c; 
+                    for (c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A - B; temp_induction_bucket[c] = A; }
+                    /* Names: the thread-local d values (which started at 1) are rebased by d - 1; buckets the thread never touched (B == 0) keep the global name. The thread receives the previous global names, and its position becomes the d value in effect at the start of its slice. */
+                    for (d -= 1, c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ? D : A; temp_distinct_names[c] = A; }
+                    d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position;
+                }
+            }
+            /* Threads wait for the merged state before writing to SA. */
+            #pragma omp barrier
+
+            {
+                libsais16x64_partial_sorting_scan_right_to_left_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position);
+            }
+        }
+#endif
+    }
+
+    return d;
+}
+
+#endif
+
+static void libsais16x64_partial_sorting_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Driver for the right-to-left pass over SA[left_suffixes_count + 1 .. n - first_lms_suffix) on 16-bit input. d is taken by value and the final counter is not returned to the caller. */
+    fast_sint_t scan_start    = (fast_sint_t)left_suffixes_count + 1;
+    fast_sint_t scan_end      = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;
+    /* Single-threaded runs and ranges shorter than 65536 entries use one sequential pass. */
+    if (threads == 1 || (scan_end - scan_start) < 65536)
+    {
+        libsais16x64_partial_sorting_scan_right_to_left_16u(T, SA, buckets, d, scan_start, scan_end - scan_start);
+    }
+#if defined(LIBSAIS_OPENMP)
+    else
+    {
+        sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
+        sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];
+        /* Walk the range from its top: zero entries are skipped one at a time; each run of non-zero entries is cut into blocks of at most threads * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads) entries, clamped at scan_start. */
+        fast_sint_t block_start;
+        for (block_start = scan_end - 1; block_start >= scan_start; )
+        {
+            if (SA[block_start] == 0)
+            {
+                block_start--;
+            }
+            else
+            {
+                fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < scan_start) { block_max_end = scan_start - 1; }
+                fast_sint_t block_end     = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
+                fast_sint_t block_size    = block_start - block_end;
+                /* The block is SA[block_end + 1 .. block_start]. Blocks shorter than 32 entries are processed inline with the same per-entry step as the sequential scan; longer ones go to the threaded block routine. */
+                if (block_size < 32)
+                {
+                    for (; block_start > block_end; block_start -= 1)
+                    {
+                        sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
+                        SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
+                    }
+                }
+                else
+                {
+                    d = libsais16x64_partial_sorting_scan_right_to_left_16u_block_omp(T, SA, buckets, d, block_end + 1, block_size, threads, thread_state);
+                    block_start = block_end;
+                }
+            }
+        }
+    }
+#else
+    UNUSED(thread_state); /* NOTE(review): in this build nothing runs when the condition above is false; presumably callers always pass threads == 1 without OpenMP - confirm. */
+#endif
+}
+
+static sa_sint_t libsais16x64_partial_sorting_scan_right_to_left_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Sequential right-to-left scan of SA[omp_block_start .. omp_block_start + omp_block_size) for an sa_sint_t text. For each entry p, p - 1 (with a flag in bit SAINT_BIT - 1) is stored at the pre-decremented position buckets[v]; buckets[2 + v] keeps the last d seen for that bucket. Returns the updated d. */
+    const fast_sint_t prefetch_distance = 32;
+    /* Staged look-ahead: the SA line at 3 * prefetch_distance, the T cells at 2 * prefetch_distance, the bucket slot at 1 * prefetch_distance. The unrolled loop stops while i >= omp_block_start + 2 * prefetch_distance + 1 so the SA reads used for prefetching stay inside the block. */
+    fast_sint_t i, j;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2)
+    {
+        libsais16x64_prefetchr(&SA[i - 3 * prefetch_distance]);
+
+        libsais16x64_prefetchr(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 1);
+        libsais16x64_prefetchr(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 2);
+        libsais16x64_prefetchr(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 1);
+        libsais16x64_prefetchr(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 2);
+        /* p - (p > 0) keeps the T index at 0 instead of -1 when the look-ahead entry is 0. */
+        sa_sint_t p0 = SA[i - prefetch_distance - 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais16x64_prefetchw(&buckets[v0]);
+        sa_sint_t p1 = SA[i - prefetch_distance - 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais16x64_prefetchw(&buckets[v1]);
+        /* Actual work for entries i and i - 1: d advances on negative entries; v is BUCKETS_INDEX4 of T[p - 1] and the comparison T[p - 2] > T[p - 1]. */
+        sa_sint_t p2 = SA[i - 0]; d += (p2 < 0); p2 &= SAINT_MAX; sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] > T[p2 - 1]);
+        SA[--buckets[v2]] = (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); buckets[2 + v2] = d;
+
+        sa_sint_t p3 = SA[i - 1]; d += (p3 < 0); p3 &= SAINT_MAX; sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] > T[p3 - 1]);
+        SA[--buckets[v3]] = (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1)); buckets[2 + v3] = d;
+    }
+    /* Tail: remaining entries down to omp_block_start, one at a time, without prefetching. */
+    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1)
+    {
+        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]);
+        SA[--buckets[v]] = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
+    }
+
+    return d;
+}
+
+static sa_sint_t libsais16x64_partial_sorting_scan_right_to_left_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Sequential right-to-left scan of SA[omp_block_start .. omp_block_start + omp_block_size) for an sa_sint_t text. Only positive entries are processed; each processed slot is cleared to 0 and a new entry is written at a pre-decremented bucket position. Returns the updated d. */
+    const fast_sint_t prefetch_distance = 32;
+    /* induction_bucket starts at buckets[3 * k] and is indexed by the symbol T[p - 1]; distinct_names starts at buckets[0] and is indexed by BUCKETS_INDEX2(symbol, flag). */
+    sa_sint_t * RESTRICT induction_bucket = &buckets[3 * (fast_sint_t)k];
+    sa_sint_t * RESTRICT distinct_names   = &buckets[0 * (fast_sint_t)k];
+    /* Unrolled by two with look-ahead; the second loop finishes the entries nearest omp_block_start without prefetching. */
+    fast_sint_t i, j;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2)
+    {
+        libsais16x64_prefetchw(&SA[i - 3 * prefetch_distance]);
+        /* Prefetch addresses are only used for positive look-ahead entries; otherwise NULL is handed to the prefetch helper (s0, s1) or the prefetch is skipped (s2, s3). */
+        sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais16x64_prefetchw(&induction_bucket[Ts2]); libsais16x64_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); }
+        sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais16x64_prefetchw(&induction_bucket[Ts3]); libsais16x64_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); }
+        /* For a positive entry: clear the slot, add p >> (SUFFIX_GROUP_BIT - 1) to d (presumably 0 or 1 - confirm SUFFIX_GROUP_BIT), strip SUFFIX_GROUP_MARKER, then store p - 1 with the T[p - 2] > T[p - 1] flag in bit SAINT_BIT - 1 and a "bucket last saw a different d" flag in bit SUFFIX_GROUP_BIT - 1. */
+        sa_sint_t p0 = SA[i - 0];
+        if (p0 > 0)
+        {
+            SA[i - 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
+            SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;
+        }
+
+        sa_sint_t p1 = SA[i - 1];
+        if (p1 > 0)
+        {
+            SA[i - 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
+            SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
+        }
+    }
+
+    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1)
+    {
+        sa_sint_t p = SA[i];
+        if (p > 0)
+        {
+            SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
+            SA[--induction_bucket[T[p - 1]]] = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
+        }
+    }
+
+    return d;
+}
+
+static void libsais16x64_partial_sorting_scan_right_to_left_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Sequential right-to-left scan of SA[omp_block_start .. omp_block_start + omp_block_size) using a single induction_bucket array indexed by symbol. Each positive entry p is cleared to 0 and p - 1 is written at the pre-decremented position for T[p - 1], with the T[p - 2] > T[p - 1] flag in bit SAINT_BIT - 1. No d counter is kept. */
+    const fast_sint_t prefetch_distance = 32;
+    /* Unrolled by two with look-ahead; the second loop finishes the entries nearest omp_block_start without prefetching. */
+    fast_sint_t i, j;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2)
+    {
+        libsais16x64_prefetchw(&SA[i - 3 * prefetch_distance]);
+        /* Look-ahead: T cells for the entries 2 * prefetch_distance below, bucket slots for the entries 1 * prefetch_distance below (positive entries only). */
+        sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais16x64_prefetchw(&induction_bucket[T[s2 - 1]]); libsais16x64_prefetchr(&T[s2] - 2); }
+        sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais16x64_prefetchw(&induction_bucket[T[s3 - 1]]); libsais16x64_prefetchr(&T[s3] - 2); }
+
+        sa_sint_t p0 = SA[i - 0]; if (p0 > 0) { SA[i - 0] = 0; SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)); }
+        sa_sint_t p1 = SA[i - 1]; if (p1 > 0) { SA[i - 1] = 0; SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)); }
+    }
+
+    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1)
+    {
+        sa_sint_t p = SA[i]; if (p > 0) { SA[i] = 0; SA[--induction_bucket[T[p - 1]]] = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)); }
+    }
+}
+
+#if defined(LIBSAIS_OPENMP)
+
+static void libsais16x64_partial_sorting_scan_right_to_left_32s_6k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Gather step: for every i in [omp_block_start, omp_block_start + omp_block_size) copies SA[i] into cache[i].index and stores the BUCKETS_INDEX4 value computed from T in cache[i].symbol (0 when the masked entry is 0). cache is indexed with the same i as SA; SA is only read. */
+    const fast_sint_t prefetch_distance = 32;
+    /* Forward pass, two entries per iteration; it stops while i + prefetch_distance + 1 is still inside the slice, and the second loop covers the remaining entries. */
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16x64_prefetchr(&SA[i + 2 * prefetch_distance]);
+
+        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
+        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
+        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
+        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);
+
+        libsais16x64_prefetchw(&cache[i + prefetch_distance]);
+        /* The raw entry (top bit included) is cached; only the value masked with SAINT_MAX is used to index T, and T is not touched when that value is 0. */
+        sa_sint_t p0 = cache[i + 0].index = SA[i + 0]; sa_sint_t symbol0 = 0; p0 &= SAINT_MAX; if (p0 != 0) { symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); } cache[i + 0].symbol = symbol0;
+        sa_sint_t p1 = cache[i + 1].index = SA[i + 1]; sa_sint_t symbol1 = 0; p1 &= SAINT_MAX; if (p1 != 0) { symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); } cache[i + 1].symbol = symbol1;
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t p = cache[i].index = SA[i]; sa_sint_t symbol = 0; p &= SAINT_MAX; if (p != 0) { symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]); } cache[i].symbol = symbol;
+    }
+}
+
+static void libsais16x64_partial_sorting_scan_right_to_left_32s_4k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Gather step: each positive SA[i] in [omp_block_start, omp_block_start + omp_block_size) is moved into cache[i].index (SA[i] becomes 0) and its BUCKETS_INDEX2 value is stored in cache[i].symbol. Non-positive entries get symbol SAINT_MIN and their cache index is left untouched. */
+    const fast_sint_t prefetch_distance = 32;
+    /* Forward pass, two entries per iteration with look-ahead; the second loop covers the final entries of the slice. */
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16x64_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL);
+
+        libsais16x64_prefetchw(&cache[i + prefetch_distance]);
+        /* The cached index keeps the entry as read (before SUFFIX_GROUP_MARKER is stripped); the stripped value is only used to index T. */
+        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { SA[i + 0] = 0; cache[i + 0].index = p0; p0 &= ~SUFFIX_GROUP_MARKER; symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); } cache[i + 0].symbol = symbol0;
+        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { SA[i + 1] = 0; cache[i + 1].index = p1; p1 &= ~SUFFIX_GROUP_MARKER; symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); } cache[i + 1].symbol = symbol1;
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { SA[i] = 0; cache[i].index = p; p &= ~SUFFIX_GROUP_MARKER; symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); } cache[i].symbol = symbol;
+    }
+}
+
+static void libsais16x64_partial_sorting_scan_right_to_left_32s_1k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Gather step: each positive SA[i] in [omp_block_start, omp_block_start + omp_block_size) is cleared to 0; cache[i].index receives p - 1 with the T[p - 2] > T[p - 1] flag in bit SAINT_BIT - 1 and cache[i].symbol receives T[p - 1]. Non-positive entries get symbol SAINT_MIN and their cache index is left untouched. */
+    const fast_sint_t prefetch_distance = 32;
+    /* Forward pass, two entries per iteration with look-ahead; the second loop covers the final entries of the slice. */
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16x64_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL);
+
+        libsais16x64_prefetchw(&cache[i + prefetch_distance]);
+        /* Unlike the 4k/6k gathers, the value to be stored later is computed here, so the cached index is already the final entry. */
+        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { SA[i + 0] = 0; cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)); symbol0 = T[p0 - 1]; } cache[i + 0].symbol = symbol0;
+        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { SA[i + 1] = 0; cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)); symbol1 = T[p1 - 1]; } cache[i + 1].symbol = symbol1;
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { SA[i] = 0; cache[i].index = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)); symbol = T[p - 1]; } cache[i].symbol = symbol;
+    }
+}
+
+static sa_sint_t libsais16x64_partial_sorting_scan_right_to_left_32s_6k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Walks cache[omp_block_start .. omp_block_start + omp_block_size) from the highest index down and rewrites each record in place: .symbol becomes the pre-decremented bucket position and .index becomes p - 1 with the flag in bit SAINT_BIT - 1. Returns the updated d. */
+    const fast_sint_t prefetch_distance = 32;
+    /* Two records per iteration with look-ahead on cache and buckets; the second loop finishes the records nearest omp_block_start. */
+    fast_sint_t i, j;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+    {
+        libsais16x64_prefetchw(&cache[i - 2 * prefetch_distance]);
+
+        libsais16x64_prefetchw(&buckets[cache[i - prefetch_distance - 0].symbol]);
+        libsais16x64_prefetchw(&buckets[cache[i - prefetch_distance - 1].symbol]);
+        /* If the new position is at or above omp_block_start, the new value is also copied into cache[position] together with its own bucket index - presumably so the scan processes it when it reaches that record. */
+        sa_sint_t v0 = cache[i - 0].symbol, p0 = cache[i - 0].index; d += (p0 < 0); cache[i - 0].symbol = --buckets[v0]; cache[i - 0].index = (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1)); buckets[2 + v0] = d;
+        if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t s = cache[i - 0].symbol, q = (cache[s].index = cache[i - 0].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); }
+
+        sa_sint_t v1 = cache[i - 1].symbol, p1 = cache[i - 1].index; d += (p1 < 0); cache[i - 1].symbol = --buckets[v1]; cache[i - 1].index = (p1 - 1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1)); buckets[2 + v1] = d;
+        if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t s = cache[i - 1].symbol, q = (cache[s].index = cache[i - 1].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); }
+    }
+
+    for (j -= prefetch_distance + 1; i >= j; i -= 1)
+    {
+        sa_sint_t v = cache[i].symbol, p = cache[i].index; d += (p < 0); cache[i].symbol = --buckets[v]; cache[i].index = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
+        if (cache[i].symbol >= omp_block_start) { sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); }
+    }
+
+    return d;
+}
+
+static sa_sint_t libsais16x64_partial_sorting_scan_right_to_left_32s_4k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Walks cache[omp_block_start .. omp_block_start + omp_block_size) from the highest index down. Records with a negative symbol are skipped; for the others .symbol becomes the pre-decremented bucket position and .index the value to store. Returns the updated d. */
+    const fast_sint_t prefetch_distance = 32;
+    /* induction_bucket (from buckets[3 * k]) is indexed by v >> 1 and distinct_names (from buckets[0]) by v, where v is the cached BUCKETS_INDEX2 value. */
+    sa_sint_t * RESTRICT induction_bucket = &buckets[3 * (fast_sint_t)k];
+    sa_sint_t * RESTRICT distinct_names   = &buckets[0 * (fast_sint_t)k];
+    /* Two records per iteration with look-ahead; the second loop finishes the records nearest omp_block_start. */
+    fast_sint_t i, j;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+    {
+        libsais16x64_prefetchw(&cache[i - 2 * prefetch_distance]);
+
+        sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 >> 1]; libsais16x64_prefetchw(s0 >= 0 ? Is0 : NULL); const sa_sint_t * Ds0 = &distinct_names[s0]; libsais16x64_prefetchw(s0 >= 0 ? Ds0 : NULL); 
+        sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 >> 1]; libsais16x64_prefetchw(s1 >= 0 ? Is1 : NULL); const sa_sint_t * Ds1 = &distinct_names[s1]; libsais16x64_prefetchw(s1 >= 0 ? Ds1 : NULL);
+        /* v << (SAINT_BIT - 1) presumably leaves only the low (flag) bit of v in the top bit - confirm SAINT_BIT. When the destination is at or above omp_block_start and the new value is positive, it is moved into cache[destination] (this record's index becomes 0) with a freshly computed symbol. */
+        sa_sint_t v0 = cache[i - 0].symbol;
+        if (v0 >= 0)
+        {
+            sa_sint_t p0 = cache[i - 0].index; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); cache[i - 0].symbol = --induction_bucket[v0 >> 1]; cache[i - 0].index = (p0 - 1) | (v0 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;
+            if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; if (np > 0) { cache[i - 0].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } }
+        }
+
+        sa_sint_t v1 = cache[i - 1].symbol;
+        if (v1 >= 0)
+        {
+            sa_sint_t p1 = cache[i - 1].index; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); cache[i - 1].symbol = --induction_bucket[v1 >> 1]; cache[i - 1].index = (p1 - 1) | (v1 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
+            if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; if (np > 0) { cache[i - 1].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } }
+        }
+    }
+
+    for (j -= prefetch_distance + 1; i >= j; i -= 1)
+    {
+        sa_sint_t v = cache[i].symbol;
+        if (v >= 0)
+        {
+            sa_sint_t p = cache[i].index; d += (p >> (SUFFIX_GROUP_BIT - 1)); cache[i].symbol = --induction_bucket[v >> 1]; cache[i].index = (p - 1) | (v << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
+            if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[i].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } }
+        }
+    }
+
+    return d;
+}
+
+static void libsais16x64_partial_sorting_scan_right_to_left_32s_1k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Walks cache[omp_block_start .. omp_block_start + omp_block_size) from the highest index down. For each record with a non-negative symbol, .symbol is replaced by the pre-decremented induction_bucket position; .index is left as is unless the record is forwarded (see below). */
+    const fast_sint_t prefetch_distance = 32;
+    /* Two records per iteration with look-ahead; the second loop finishes the records nearest omp_block_start. */
+    fast_sint_t i, j;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+    {
+        libsais16x64_prefetchw(&cache[i - 2 * prefetch_distance]);
+
+        sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais16x64_prefetchw(s0 >= 0 ? Is0 : NULL);
+        sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais16x64_prefetchw(s1 >= 0 ? Is1 : NULL);
+        /* Forwarding: if the destination is at or above omp_block_start and the cached value np is positive, this record's index is cleared and cache[destination] receives np - 1 (with the T[np - 2] > T[np - 1] flag in bit SAINT_BIT - 1) and the symbol T[np - 1]. */
+        sa_sint_t v0 = cache[i - 0].symbol;
+        if (v0 >= 0)
+        {
+            cache[i - 0].symbol = --induction_bucket[v0];
+            if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; if (np > 0) { cache[i - 0].index = 0; cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; } }
+        }
+
+        sa_sint_t v1 = cache[i - 1].symbol;
+        if (v1 >= 0)
+        {
+            cache[i - 1].symbol = --induction_bucket[v1];
+            if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; if (np > 0) { cache[i - 1].index = 0; cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; }}
+        }
+    }
+
+    for (j -= prefetch_distance + 1; i >= j; i -= 1)
+    {
+        sa_sint_t v = cache[i].symbol;
+        if (v >= 0)
+        {
+            cache[i].symbol = --induction_bucket[v];
+            if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[i].index = 0; cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; } }
+        }
+    }
+}
+
+static sa_sint_t libsais16x64_partial_sorting_scan_right_to_left_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{ /* 6k scan of SA[block_start .. block_start + block_size). The parallel clause is active only when threads > 1 and block_size >= 16384. With one thread the sequential scan runs; otherwise all threads gather their slice into cache, the master runs block_sort over the whole block, and all threads call libsais16x64_place_cached_suffixes on their slice. Returns the updated d. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(cache);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        /* The stride is rounded down to a multiple of 16; the last thread takes whatever remains. */
+        omp_block_start += block_start;
+        /* In the threaded path the cache is passed as cache - block_start so helpers index it with the same absolute positions as SA. NOTE(review): if cache is the start of its allocation this forms a pointer before the array, which ISO C does not define - confirm this is acceptable for the supported compilers. */
+        if (omp_num_threads == 1)
+        {
+            d = libsais16x64_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            {
+                libsais16x64_partial_sorting_scan_right_to_left_32s_6k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+            }
+            /* Gathering must be complete on every thread before the single-threaded sort step. */
+            #pragma omp barrier
+
+            #pragma omp master
+            {
+                d = libsais16x64_partial_sorting_scan_right_to_left_32s_6k_block_sort(T, buckets, d, cache - block_start, block_start, block_size);
+            }
+            /* Placement starts only after the master has finished rewriting the cache. */
+            #pragma omp barrier
+
+            {
+                libsais16x64_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+            }
+        }
+#endif
+    }
+
+    return d;
+}
+
+static sa_sint_t libsais16x64_partial_sorting_scan_right_to_left_32s_4k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{ /* 4k scan of SA[block_start .. block_start + block_size). The parallel clause is active only when threads > 1 and block_size >= 16384. With one thread the sequential scan runs; otherwise all threads gather their slice into cache, the master runs block_sort over the whole block, and all threads call libsais16x64_compact_and_place_cached_suffixes on their slice. Returns the updated d. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(cache);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        /* The stride is rounded down to a multiple of 16; the last thread takes whatever remains. */
+        omp_block_start += block_start;
+        /* In the threaded path the cache is passed as cache - block_start so helpers index it with the same absolute positions as SA. NOTE(review): if cache is the start of its allocation this forms a pointer before the array, which ISO C does not define - confirm this is acceptable for the supported compilers. */
+        if (omp_num_threads == 1)
+        {
+            d = libsais16x64_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            {
+                libsais16x64_partial_sorting_scan_right_to_left_32s_4k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+            }
+            /* Gathering must be complete on every thread before the single-threaded sort step. */
+            #pragma omp barrier
+
+            #pragma omp master
+            {
+                d = libsais16x64_partial_sorting_scan_right_to_left_32s_4k_block_sort(T, k, buckets, d, cache - block_start, block_start, block_size);
+            }
+            /* Placement starts only after the master has finished rewriting the cache. */
+            #pragma omp barrier
+
+            {
+                libsais16x64_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+            }
+        }
+#endif
+    }
+
+    return d;
+}
+
+static void libsais16x64_partial_sorting_scan_right_to_left_32s_1k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{ /* Runs the 32s_1k right-to-left partial sorting scan over the block [block_start, block_start + block_size), splitting it across OpenMP threads when threads > 1 and block_size >= 16384. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(cache);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16); /* per-thread share, rounded down to a multiple of 16 */
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; /* the last thread also takes the remainder */
+        /* Turn the block-relative offset into an absolute position. */
+        omp_block_start += block_start;
+
+        if (omp_num_threads == 1)
+        {
+            libsais16x64_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            { /* Step 1 (all threads): gather this thread's slice into the shared cache. */
+                libsais16x64_partial_sorting_scan_right_to_left_32s_1k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+            }
+
+            #pragma omp barrier
+            /* Step 2 (master only): sort pass over the whole block. */
+            #pragma omp master
+            {
+                libsais16x64_partial_sorting_scan_right_to_left_32s_1k_block_sort(T, buckets, cache - block_start, block_start, block_size);
+            }
+
+            #pragma omp barrier
+
+            { /* Step 3 (all threads): compact and place this thread's slice of the cache. */
+                libsais16x64_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+            }
+        }
+#endif
+    }
+}
+
+#endif
+
+static sa_sint_t libsais16x64_partial_sorting_scan_right_to_left_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Runs the 32s_6k right-to-left scan over [left_suffixes_count + 1, n - first_lms_suffix), either serially or block by block through the _block_omp helper; returns the updated d. */
+    fast_sint_t scan_start    = (fast_sint_t)left_suffixes_count + 1;
+    fast_sint_t scan_end      = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;
+
+    if (threads == 1 || (scan_end - scan_start) < 65536)
+    {
+        d = libsais16x64_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, scan_start, scan_end - scan_start);
+    }
+#if defined(LIBSAIS_OPENMP)
+    else
+    {
+        fast_sint_t block_start, block_end; /* block_start: highest index of the current block; block_end: index just below its lowest */
+        for (block_start = scan_end - 1; block_start >= scan_start; block_start = block_end)
+        {
+            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < scan_start) { block_end = scan_start - 1; }
+            /* The helper takes (first index, length); every block reuses thread 0's cache. */
+            d = libsais16x64_partial_sorting_scan_right_to_left_32s_6k_block_omp(T, SA, buckets, d, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads);
+        }
+    }
+#else
+    UNUSED(thread_state);
+#endif
+
+    return d;
+}
+
+static sa_sint_t libsais16x64_partial_sorting_scan_right_to_left_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Runs the 32s_4k right-to-left scan over [0, n), serially when threads == 1 or n < 65536, otherwise block by block (highest block first) through the _block_omp helper; returns the updated d. */
+    if (threads == 1 || n < 65536)
+    {
+        d = libsais16x64_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, 0, n);
+    }
+#if defined(LIBSAIS_OPENMP)
+    else
+    {
+        fast_sint_t block_start, block_end; /* block_start: highest index of the current block; block_end: index just below its lowest (-1 for the final block) */
+        for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end)
+        {
+            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; }
+            /* The helper takes (first index, length); every block reuses thread 0's cache. */
+            d = libsais16x64_partial_sorting_scan_right_to_left_32s_4k_block_omp(T, SA, k, buckets, d, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads);
+        }
+    }
+#else
+    UNUSED(thread_state);
+#endif
+
+    return d;
+}
+
+static void libsais16x64_partial_sorting_scan_right_to_left_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Runs the 32s_1k right-to-left scan over [0, n), serially when threads == 1 or n < 65536, otherwise block by block (highest block first) through the _block_omp helper. */
+    if (threads == 1 || n < 65536)
+    {
+        libsais16x64_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, 0, n);
+    }
+#if defined(LIBSAIS_OPENMP)
+    else
+    {
+        fast_sint_t block_start, block_end; /* block_start: highest index of the current block; block_end: index just below its lowest (-1 for the final block) */
+        for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end)
+        {
+            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; }
+            /* The helper takes (first index, length); every block reuses thread 0's cache. */
+            libsais16x64_partial_sorting_scan_right_to_left_32s_1k_block_omp(T, SA, buckets, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads);
+        }
+    }
+#else
+    UNUSED(thread_state);
+#endif
+}
+
+static fast_sint_t libsais16x64_partial_sorting_gather_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Compacts the negative entries of SA[omp_block_start, omp_block_start + omp_block_size) to the front of that range, storing each as (s - SUFFIX_GROUP_MARKER) & ~SUFFIX_GROUP_MARKER; returns the index one past the last kept entry. */
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j, l; /* i: read cursor, j: loop bound, l: write cursor (never ahead of i) */
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4)
+    {
+        libsais16x64_prefetchr(&SA[i + prefetch_distance]);
+        /* Each entry is always stored at SA[l], but l advances only for negative entries, so the others get overwritten by the next store. */
+        sa_sint_t s0 = SA[i + 0]; SA[l] = (s0 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s0 < 0);
+        sa_sint_t s1 = SA[i + 1]; SA[l] = (s1 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s1 < 0);
+        sa_sint_t s2 = SA[i + 2]; SA[l] = (s2 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s2 < 0);
+        sa_sint_t s3 = SA[i + 3]; SA[l] = (s3 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s3 < 0);
+    }
+    /* Tail: the fewer-than-four entries left over by the unrolled loop. */
+    for (j += 3; i < j; i += 1)
+    {
+        sa_sint_t s = SA[i]; SA[l] = (s - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s < 0);
+    }
+
+    return l;
+}
+
+static fast_sint_t libsais16x64_partial_sorting_gather_lms_suffixes_32s_1k(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Compacts the negative entries of SA[omp_block_start, omp_block_start + omp_block_size) to the front of that range, storing each masked with SAINT_MAX; returns the index one past the last kept entry. */
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j, l; /* i: read cursor, j: loop bound, l: write cursor (never ahead of i) */
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4)
+    {
+        libsais16x64_prefetchr(&SA[i + prefetch_distance]);
+        /* Each entry is always stored at SA[l], but l advances only for negative entries, so the others get overwritten by the next store. */
+        sa_sint_t s0 = SA[i + 0]; SA[l] = s0 & SAINT_MAX; l += (s0 < 0);
+        sa_sint_t s1 = SA[i + 1]; SA[l] = s1 & SAINT_MAX; l += (s1 < 0);
+        sa_sint_t s2 = SA[i + 2]; SA[l] = s2 & SAINT_MAX; l += (s2 < 0);
+        sa_sint_t s3 = SA[i + 3]; SA[l] = s3 & SAINT_MAX; l += (s3 < 0);
+    }
+    /* Tail: the fewer-than-four entries left over by the unrolled loop. */
+    for (j += 3; i < j; i += 1)
+    {
+        sa_sint_t s = SA[i]; SA[l] = s & SAINT_MAX; l += (s < 0);
+    }
+
+    return l;
+}
+
+static void libsais16x64_partial_sorting_gather_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Applies the 32s_4k gather to SA[0, n): every thread compacts its own slice in place, then the master thread moves the per-thread results so they sit back to back starting at SA[0]. Parallel only when threads > 1 and n >= 65536. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16); /* per-thread share, rounded down to a multiple of 16 */
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; /* the last thread also takes the remainder */
+
+        if (omp_num_threads == 1)
+        {
+            libsais16x64_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            { /* Record where this thread's compacted run starts and how many entries it holds. */
+                thread_state[omp_thread_num].state.position = omp_block_start;
+                thread_state[omp_thread_num].state.count = libsais16x64_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size) - omp_block_start;
+            }
+            /* All per-thread runs must be complete before they are concatenated. */
+            #pragma omp barrier
+
+            #pragma omp master
+            {
+                fast_sint_t t, position = 0; /* position: next free index of the concatenated output */
+                for (t = 0; t < omp_num_threads; ++t)
+                { 
+                    if (t > 0 && thread_state[t].state.count > 0) /* thread 0's run already starts at SA[0] */
+                    {
+                        memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
+                    }
+
+                    position += thread_state[t].state.count;
+                }
+            }
+        }
+#endif
+    }
+}
+
+static void libsais16x64_partial_sorting_gather_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Applies the 32s_1k gather to SA[0, n): every thread compacts its own slice in place, then the master thread moves the per-thread results so they sit back to back starting at SA[0]. Parallel only when threads > 1 and n >= 65536. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16); /* per-thread share, rounded down to a multiple of 16 */
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; /* the last thread also takes the remainder */
+
+        if (omp_num_threads == 1)
+        {
+            libsais16x64_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            { /* Record where this thread's compacted run starts and how many entries it holds. */
+                thread_state[omp_thread_num].state.position = omp_block_start;
+                thread_state[omp_thread_num].state.count = libsais16x64_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size) - omp_block_start;
+            }
+            /* All per-thread runs must be complete before they are concatenated. */
+            #pragma omp barrier
+
+            #pragma omp master
+            {
+                fast_sint_t t, position = 0; /* position: next free index of the concatenated output */
+                for (t = 0; t < omp_num_threads; ++t)
+                { 
+                    if (t > 0 && thread_state[t].state.count > 0) /* thread 0's run already starts at SA[0] */
+                    {
+                        memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
+                    }
+
+                    position += thread_state[t].state.count;
+                }
+            }
+        }
+#endif
+    }
+}
+
+static void libsais16x64_induce_partial_order_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* 16u driver: left-to-right scan, marker shift, then right-to-left scan, in that order. */
+    memset(&buckets[2 * ALPHABET_SIZE], 0, (size_t)2 * ALPHABET_SIZE * sizeof(sa_sint_t)); /* zero buckets[2 * ALPHABET_SIZE, 4 * ALPHABET_SIZE) */
+    /* The left-to-right scan starts with d = 0; the d it returns seeds the right-to-left scan. */
+    sa_sint_t d = libsais16x64_partial_sorting_scan_left_to_right_16u_omp(T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state);
+    libsais16x64_partial_sorting_shift_markers_16u_omp(SA, n, buckets, threads);
+    libsais16x64_partial_sorting_scan_right_to_left_16u_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state);
+}
+
+static void libsais16x64_induce_partial_order_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* 32s_6k driver: left-to-right scan (starting with d = 0), marker shift, bucket shift, then right-to-left scan seeded with the d returned by the first scan. */
+    sa_sint_t d = libsais16x64_partial_sorting_scan_left_to_right_32s_6k_omp(T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state);
+    libsais16x64_partial_sorting_shift_markers_32s_6k_omp(SA, k, buckets, threads);
+    libsais16x64_partial_sorting_shift_buckets_32s_6k(k, buckets);
+    libsais16x64_partial_sorting_scan_right_to_left_32s_6k_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state);
+}
+
+static void libsais16x64_induce_partial_order_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* 32s_4k driver: left-to-right scan, marker shift, right-to-left scan, then the 4k gather over SA[0, n). */
+    memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); /* zero the first 2 * k bucket entries */
+    /* The left-to-right scan starts with d = 0; the d it returns seeds the right-to-left scan. */
+    sa_sint_t d = libsais16x64_partial_sorting_scan_left_to_right_32s_4k_omp(T, SA, n, k, buckets, 0, threads, thread_state);
+    libsais16x64_partial_sorting_shift_markers_32s_4k(SA, n);
+    libsais16x64_partial_sorting_scan_right_to_left_32s_4k_omp(T, SA, n, k, buckets, d, threads, thread_state);
+    libsais16x64_partial_sorting_gather_lms_suffixes_32s_4k_omp(SA, n, threads, thread_state);
+}
+
+static void libsais16x64_induce_partial_order_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* 32s_2k driver: reuses the 1k scans and the 1k gather, giving each scan its own k-entry half of buckets. */
+    libsais16x64_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, &buckets[1 * (fast_sint_t)k], threads, thread_state); /* uses buckets[k, ...) */
+    libsais16x64_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, &buckets[0 * (fast_sint_t)k], threads, thread_state); /* uses buckets[0, ...) */
+    libsais16x64_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
+}
+
+static void libsais16x64_induce_partial_order_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* 32s_1k driver: the same buckets array serves both scans, so it is recounted and re-initialised before each one; the 1k gather runs last. */
+    libsais16x64_count_suffixes_32s(T, n, k, buckets);
+    libsais16x64_initialize_buckets_start_32s_1k(k, buckets);
+    libsais16x64_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, buckets, threads, thread_state);
+    /* Second pass: recount, then use the "end" initialisation for the right-to-left scan. */
+    libsais16x64_count_suffixes_32s(T, n, k, buckets);
+    libsais16x64_initialize_buckets_end_32s_1k(k, buckets);
+    libsais16x64_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, buckets, threads, thread_state);
+
+    libsais16x64_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
+}
+
+static sa_sint_t libsais16x64_renumber_lms_suffixes_16u(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* For every entry p of SA[omp_block_start, omp_block_start + omp_block_size), stores (name | SAINT_MIN) at SA[m + ((p & SAINT_MAX) >> 1)] and then increments name if p is negative; returns the final name. */
+    const fast_sint_t prefetch_distance = 32;
+
+    sa_sint_t * RESTRICT SAm = &SA[m];
+
+    fast_sint_t i, j; /* the unrolled loop stops prefetch_distance + 3 entries early so its SA[i + prefetch_distance + 3] reads stay inside the range */
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
+    {
+        libsais16x64_prefetchr(&SA[i + 2 * prefetch_distance]);
+        /* Prefetch the SAm slots that the entries prefetch_distance ahead will write to. */
+        libsais16x64_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
+        libsais16x64_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
+        libsais16x64_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
+        libsais16x64_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);
+        /* The store uses the name as it was before this entry's own increment. */
+        sa_sint_t p0 = SA[i + 0]; SAm[(p0 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p0 < 0;
+        sa_sint_t p1 = SA[i + 1]; SAm[(p1 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p1 < 0;
+        sa_sint_t p2 = SA[i + 2]; SAm[(p2 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p2 < 0;
+        sa_sint_t p3 = SA[i + 3]; SAm[(p3 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p3 < 0;
+    }
+    /* Tail: same update for the remaining entries, without prefetching. */
+    for (j += prefetch_distance + 3; i < j; i += 1)
+    {
+        sa_sint_t p = SA[i]; SAm[(p & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p < 0;
+    }
+
+    return name;
+}
+
+static fast_sint_t libsais16x64_gather_marked_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Walks SA[m + omp_block_start, m + omp_block_start + omp_block_size) from high to low and packs its negative entries (masked with SAINT_MAX) so that they end just below index l; returns the index of the lowest packed entry. */
+    const fast_sint_t prefetch_distance = 32;
+    /* From here on l is the slot the next kept entry goes into (one below the exclusive end passed in). */
+    l -= 1;
+
+    fast_sint_t i, j;
+    for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j; i -= 4)
+    {
+        libsais16x64_prefetchr(&SA[i - prefetch_distance]);
+        /* Each entry is always stored at SA[l], but l moves down only for negative entries, so the others get overwritten by the next store. */
+        sa_sint_t s0 = SA[i - 0]; SA[l] = s0 & SAINT_MAX; l -= s0 < 0;
+        sa_sint_t s1 = SA[i - 1]; SA[l] = s1 & SAINT_MAX; l -= s1 < 0;
+        sa_sint_t s2 = SA[i - 2]; SA[l] = s2 & SAINT_MAX; l -= s2 < 0;
+        sa_sint_t s3 = SA[i - 3]; SA[l] = s3 & SAINT_MAX; l -= s3 < 0;
+    }
+    /* Tail: the fewer-than-four entries left over by the unrolled loop. */
+    for (j -= 3; i >= j; i -= 1)
+    {
+        sa_sint_t s = SA[i]; SA[l] = s & SAINT_MAX; l -= s < 0;
+    }
+    /* Step back up from the next free slot to the lowest slot actually kept. */
+    l += 1;
+
+    return l;
+}
+
+static sa_sint_t libsais16x64_renumber_lms_suffixes_16u_omp(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Runs libsais16x64_renumber_lms_suffixes_16u over SA[0, m) with names starting at 0, optionally split across OpenMP threads (threads > 1 and m >= 65536); returns the final name value. */
+    sa_sint_t name = 0;
+
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (m / omp_num_threads) & (-16); /* per-thread share, rounded down to a multiple of 16 */
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start; /* the last thread also takes the remainder */
+
+        if (omp_num_threads == 1)
+        {
+            name = libsais16x64_renumber_lms_suffixes_16u(SA, m, 0, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            { /* Pass 1: per-slice count; presumably the number of negative entries, i.e. how far this slice advances name - confirm against the helper. */
+                thread_state[omp_thread_num].state.count = libsais16x64_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size);
+            }
+            /* Every count must be published before any thread sums its predecessors. */
+            #pragma omp barrier
+
+            { /* Pass 2: the starting name of a slice is the sum of the counts of all lower-numbered threads. */
+                fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }
+
+                if (omp_thread_num == omp_num_threads - 1) /* only the last thread knows the grand total and writes the shared result */
+                {
+                    name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
+                }
+
+                libsais16x64_renumber_lms_suffixes_16u(SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
+            }
+        }
+#endif
+    }
+
+    return name;
+}
+
+static void libsais16x64_gather_marked_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Gathers the negative entries among the (n >> 1) entries starting at SA[m] so that they end at index n + fs (exclusive), optionally split across OpenMP threads (threads > 1 and n >= 131072). */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); /* per-thread share of the n >> 1 entries, rounded down to a multiple of 16 */
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; /* the last thread also takes the remainder */
+
+        if (omp_num_threads == 1)
+        {
+            libsais16x64_gather_marked_lms_suffixes(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            { /* state.position = lowest index of this thread's packed run, state.count = its length. */
+                if (omp_thread_num < omp_num_threads - 1) /* non-last threads pack towards the end of their own slice */
+                {
+                    thread_state[omp_thread_num].state.position = libsais16x64_gather_marked_lms_suffixes(SA, m, (fast_sint_t)m + omp_block_start + omp_block_size, omp_block_start, omp_block_size);
+                    thread_state[omp_thread_num].state.count = (fast_sint_t)m + omp_block_start + omp_block_size - thread_state[omp_thread_num].state.position;
+                }
+                else /* the last thread packs straight to the final destination ending at n + fs */
+                {
+                    thread_state[omp_thread_num].state.position = libsais16x64_gather_marked_lms_suffixes(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size);
+                    thread_state[omp_thread_num].state.count = (fast_sint_t)n + (fast_sint_t)fs - thread_state[omp_thread_num].state.position;
+                }
+            }
+            /* All per-thread runs must be complete before they are concatenated. */
+            #pragma omp barrier
+
+            #pragma omp master
+            {
+                fast_sint_t t, position = (fast_sint_t)n + (fast_sint_t)fs;
+                /* Walk the threads from last to first, stacking each run directly below the previous one. */
+                for (t = omp_num_threads - 1; t >= 0; --t)
+                { 
+                    position -= thread_state[t].state.count;
+                    if (t != omp_num_threads - 1 && thread_state[t].state.count > 0) /* the last thread's run is already in place */
+                    {
+                        memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
+                    }
+                }
+            }
+        }
+#endif
+    }
+}
+
+static sa_sint_t libsais16x64_renumber_and_gather_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Zeroes the (n >> 1) entries starting at SA[m], renumbers SA[0, m) into them, then either gathers the marked entries (name < m) or just strips the marks from SA[0, m); returns name. */
+    memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));
+
+    sa_sint_t name = libsais16x64_renumber_lms_suffixes_16u_omp(SA, m, threads, thread_state);
+    if (name < m)
+    {
+        libsais16x64_gather_marked_lms_suffixes_omp(SA, n, m, fs, threads, thread_state);
+    }
+    else
+    {
+        fast_sint_t i; for (i = 0; i < m; i += 1) { SA[i] &= SAINT_MAX; } /* no gather needed in this case: only mask SA[0, m) with SAINT_MAX */
+    }
+
+    return name;
+}
+
+static sa_sint_t libsais16x64_renumber_distinct_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* For every entry p of SA[omp_block_start, omp_block_start + omp_block_size): rewrites SA[i] as p & SAINT_MAX, stores name (flagged with SAINT_MIN only if both p and the previous entry were negative) at SA[m + (SA[i] >> 1)], then increments name if p was negative; returns the final name. */
+    const fast_sint_t prefetch_distance = 32;
+
+    sa_sint_t * RESTRICT SAm = &SA[m];
+
+    fast_sint_t i, j; sa_sint_t p0, p1, p2, p3 = 0; /* p3 doubles as "previous entry" across iterations; starting at 0 means the first entry of the range never gets the SAINT_MIN flag */
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
+    {
+        libsais16x64_prefetchw(&SA[i + 2 * prefetch_distance]);
+        /* Prefetch the SAm slots that the entries prefetch_distance ahead will write to. */
+        libsais16x64_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
+        libsais16x64_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
+        libsais16x64_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
+        libsais16x64_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);
+        /* pN keeps the original (possibly negative) value; SA[i + N] itself is overwritten with the masked value. */
+        p0 = SA[i + 0]; SAm[(SA[i + 0] = p0 & SAINT_MAX) >> 1] = name | (p0 & p3 & SAINT_MIN); name += p0 < 0;
+        p1 = SA[i + 1]; SAm[(SA[i + 1] = p1 & SAINT_MAX) >> 1] = name | (p1 & p0 & SAINT_MIN); name += p1 < 0;
+        p2 = SA[i + 2]; SAm[(SA[i + 2] = p2 & SAINT_MAX) >> 1] = name | (p2 & p1 & SAINT_MIN); name += p2 < 0;
+        p3 = SA[i + 3]; SAm[(SA[i + 3] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN); name += p3 < 0;
+    }
+    /* Tail: same update one entry at a time, with p2 holding the previous entry. */
+    for (j += prefetch_distance + 3; i < j; i += 1)
+    {
+        p2 = p3; p3 = SA[i]; SAm[(SA[i] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN); name += p3 < 0;
+    }
+
+    return name;
+}
+
+static void libsais16x64_mark_distinct_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* For every entry of SA[m + omp_block_start, m + omp_block_start + omp_block_size): keeps its SAINT_MIN bit only if the most recent nonzero entry before it (within this call) also had that bit set; all other bits are left untouched. */
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j; sa_sint_t p0, p1, p2, p3 = 0; /* p3 carries the last nonzero original value across iterations; it starts at 0 */
+    for (i = (fast_sint_t)m + omp_block_start, j = (fast_sint_t)m + omp_block_start + omp_block_size - 3; i < j; i += 4)
+    {
+        libsais16x64_prefetchw(&SA[i + prefetch_distance]);
+        /* After each store, a zero entry inherits the previous carry so it does not interrupt the chain. */
+        p0 = SA[i + 0]; SA[i + 0] = p0 & (p3 | SAINT_MAX); p0 = (p0 == 0) ? p3 : p0;
+        p1 = SA[i + 1]; SA[i + 1] = p1 & (p0 | SAINT_MAX); p1 = (p1 == 0) ? p0 : p1;
+        p2 = SA[i + 2]; SA[i + 2] = p2 & (p1 | SAINT_MAX); p2 = (p2 == 0) ? p1 : p2;
+        p3 = SA[i + 3]; SA[i + 3] = p3 & (p2 | SAINT_MAX); p3 = (p3 == 0) ? p2 : p3;
+    }
+    /* Tail: same update one entry at a time, with p2 holding the carry. */
+    for (j += 3; i < j; i += 1)
+    {
+        p2 = p3; p3 = SA[i]; SA[i] = p3 & (p2 | SAINT_MAX); p3 = (p3 == 0) ? p2 : p3;
+    }
+}
+
+static void libsais16x64_clamp_lms_suffixes_length_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* For every entry of SA[m + omp_block_start, m + omp_block_start + omp_block_size): negative values are masked with SAINT_MAX, all other values become 0. */
+    const fast_sint_t prefetch_distance = 32;
+
+    sa_sint_t * RESTRICT SAm = &SA[m];
+
+    fast_sint_t i, j; /* indices are relative to SAm, i.e. offset by m into SA */
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4)
+    {
+        libsais16x64_prefetchw(&SAm[i + prefetch_distance]);
+
+        SAm[i + 0] = (SAm[i + 0] < 0 ? SAm[i + 0] : 0) & SAINT_MAX;
+        SAm[i + 1] = (SAm[i + 1] < 0 ? SAm[i + 1] : 0) & SAINT_MAX;
+        SAm[i + 2] = (SAm[i + 2] < 0 ? SAm[i + 2] : 0) & SAINT_MAX;
+        SAm[i + 3] = (SAm[i + 3] < 0 ? SAm[i + 3] : 0) & SAINT_MAX;
+    }
+    /* Tail: the fewer-than-four entries left over by the unrolled loop. */
+    for (j += 3; i < j; i += 1)
+    {
+        SAm[i] = (SAm[i] < 0 ? SAm[i] : 0) & SAINT_MAX;
+    }
+}
+
+static sa_sint_t libsais16x64_renumber_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Runs libsais16x64_renumber_distinct_lms_suffixes_32s_4k over SA[0, m) with names starting at 1, optionally split across OpenMP threads (threads > 1 and m >= 65536); returns the final name value minus 1. */
+    sa_sint_t name = 0;
+
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (m / omp_num_threads) & (-16); /* per-thread share, rounded down to a multiple of 16 */
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start; /* the last thread also takes the remainder */
+
+        if (omp_num_threads == 1)
+        {
+            name = libsais16x64_renumber_distinct_lms_suffixes_32s_4k(SA, m, 1, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            { /* Pass 1: per-slice count; presumably the number of negative entries, i.e. how far this slice advances name - confirm against the helper. */
+                thread_state[omp_thread_num].state.count = libsais16x64_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size);
+            }
+            /* Every count must be published before any thread sums its predecessors. */
+            #pragma omp barrier
+
+            { /* Pass 2: the starting name of a slice is 1 plus the counts of all lower-numbered threads. */
+                fast_sint_t t, count = 1; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }
+
+                if (omp_thread_num == omp_num_threads - 1) /* only the last thread knows the grand total and writes the shared result */
+                {
+                    name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
+                }
+
+                libsais16x64_renumber_distinct_lms_suffixes_32s_4k(SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
+            }
+        }
+#endif
+    }
+
+    return name - 1;
+}
+
+static void libsais16x64_mark_distinct_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads)
+{ /* Applies libsais16x64_mark_distinct_lms_suffixes_32s to the (n >> 1) entries starting at SA[m], one contiguous slice per OpenMP thread (parallel only when threads > 1 and n >= 131072). */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+        fast_sint_t omp_block_stride  = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); /* per-thread share, rounded down to a multiple of 16 */
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; /* the last thread also takes the remainder */
+#else
+        UNUSED(threads);
+
+        fast_sint_t omp_block_start   = 0;
+        fast_sint_t omp_block_size    = (fast_sint_t)n >> 1;
+#endif /* NOTE(review): the helper restarts its carry at 0 for each slice, so the first nonzero entry of every slice loses its SAINT_MIN bit regardless of the preceding slice - confirm this is intended. */
+        libsais16x64_mark_distinct_lms_suffixes_32s(SA, m, omp_block_start, omp_block_size);
+    }
+}
+
+static void libsais16x64_clamp_lms_suffixes_length_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads)
+{ /* Applies libsais16x64_clamp_lms_suffixes_length_32s to the (n >> 1) entries starting at SA[m], one contiguous slice per OpenMP thread (parallel only when threads > 1 and n >= 131072). */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+        fast_sint_t omp_block_stride  = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); /* per-thread share, rounded down to a multiple of 16 */
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; /* the last thread also takes the remainder */
+#else
+        UNUSED(threads);
+
+        fast_sint_t omp_block_start   = 0;
+        fast_sint_t omp_block_size    = (fast_sint_t)n >> 1;
+#endif
+        libsais16x64_clamp_lms_suffixes_length_32s(SA, m, omp_block_start, omp_block_size); /* each entry is updated independently, so slices need no coordination */
+    }
+}
+
+static sa_sint_t libsais16x64_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Zeroes the (n >> 1) entries starting at SA[m], renumbers SA[0, m) into them and, only when name < m, runs the distinct-marking pass over them; returns name. */
+    memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));
+
+    sa_sint_t name = libsais16x64_renumber_distinct_lms_suffixes_32s_4k_omp(SA, m, threads, thread_state);
+    if (name < m) /* the marking pass is skipped when name >= m */
+    {
+        libsais16x64_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads);
+    }
+
+    return name;
+}
+
+static sa_sint_t libsais16x64_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads)
+{
+    const fast_sint_t prefetch_distance = 32;
+    /* Names the m positions listed in SA[0..m-1] by comparing neighbouring entries as substrings of T; returns how many names were handed out. */
+    sa_sint_t * RESTRICT SAm = &SA[m];
+    /* Phase 1: for each position listed in SA[n-m..n-1], store a tagged distance to the next listed position at SAm[position >> 1]. */
+    {
+        libsais16x64_gather_lms_suffixes_32s(T, SA, n);
+        /* NOTE(review): the helper above is defined elsewhere; the loops below read SA[n-m..n-1], so it presumably fills that range - confirm. */
+        memset(&SA[m], 0, ((size_t)n - (size_t)m - (size_t)m) * sizeof(sa_sint_t));
+
+        fast_sint_t i, j;
+        for (i = (fast_sint_t)n - (fast_sint_t)m, j = (fast_sint_t)n - 1 - prefetch_distance - 3; i < j; i += 4)
+        {
+            libsais16x64_prefetchr(&SA[i + 2 * prefetch_distance]);
+
+            libsais16x64_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
+            libsais16x64_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
+            libsais16x64_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]);
+            libsais16x64_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]);
+            /* Stored value: (next position - this position + 1) with SAINT_MIN added, so every written slot is non-zero. */
+            SAm[((sa_uint_t)SA[i + 0]) >> 1] = SA[i + 1] - SA[i + 0] + 1 + SAINT_MIN;
+            SAm[((sa_uint_t)SA[i + 1]) >> 1] = SA[i + 2] - SA[i + 1] + 1 + SAINT_MIN;
+            SAm[((sa_uint_t)SA[i + 2]) >> 1] = SA[i + 3] - SA[i + 2] + 1 + SAINT_MIN;
+            SAm[((sa_uint_t)SA[i + 3]) >> 1] = SA[i + 4] - SA[i + 3] + 1 + SAINT_MIN;
+        }
+
+        for (j += prefetch_distance + 3; i < j; i += 1)
+        {
+            SAm[((sa_uint_t)SA[i]) >> 1] = SA[i + 1] - SA[i] + 1 + SAINT_MIN;
+        }
+        /* The last listed position has no successor in the list; it gets the value 1 + SAINT_MIN. */
+        SAm[((sa_uint_t)SA[n - 1]) >> 1] = 1 + SAINT_MIN;
+    }
+    /* Phase 2: post-process the stored values (helper defined elsewhere; presumably removes the SAINT_MIN tag and bounds the lengths - confirm). */
+    {
+        libsais16x64_clamp_lms_suffixes_length_32s_omp(SA, n, m, threads);
+    }
+
+    sa_sint_t name = 1;
+    /* Phase 3: walk SA[0..m-1]; p is the previous entry and q the current one, with plen/qlen read from SAm before the slot is overwritten. */
+    {
+        fast_sint_t i, j, p = SA[0], plen = SAm[p >> 1]; sa_sint_t pdiff = SAINT_MIN;
+        for (i = 1, j = m - prefetch_distance - 1; i < j; i += 2)
+        {
+            libsais16x64_prefetchr(&SA[i + 2 * prefetch_distance]);
+
+            libsais16x64_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); libsais16x64_prefetchr(&T[((sa_uint_t)SA[i + prefetch_distance + 0])]);
+            libsais16x64_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); libsais16x64_prefetchr(&T[((sa_uint_t)SA[i + prefetch_distance + 1])]);
+            /* qdiff stays SAINT_MIN unless both lengths match and all qlen symbols compare equal, in which case it becomes 0. */
+            fast_sint_t q = SA[i + 0], qlen = SAm[q >> 1]; sa_sint_t qdiff = SAINT_MIN;
+            if (plen == qlen) { fast_sint_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < qlen); qdiff = (sa_sint_t)(l - qlen) & SAINT_MIN; }
+            SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0);
+            /* Second half of the unrolled step: p and q swap roles, so no copy from q to p is needed. */
+            p = SA[i + 1]; plen = SAm[p >> 1]; pdiff = SAINT_MIN;
+            if (qlen == plen) { fast_sint_t l = 0; do { if (T[q + l] != T[p + l]) { break; } } while (++l < plen); pdiff = (sa_sint_t)(l - plen) & SAINT_MIN; }
+            SAm[q >> 1] = name | (qdiff & pdiff); name += (pdiff < 0);
+        }
+
+        for (j += prefetch_distance + 1; i < j; i += 1)
+        {
+            fast_sint_t q = SA[i], qlen = SAm[q >> 1]; sa_sint_t qdiff = SAINT_MIN;
+            if (plen == qlen) { fast_sint_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < plen); qdiff = (sa_sint_t)(l - plen) & SAINT_MIN; }
+            SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0);
+
+            p = q; plen = qlen; pdiff = qdiff;
+        }
+        /* Final entry: its top bit depends only on whether it differed from its predecessor. */
+        SAm[p >> 1] = name | pdiff; name++;
+    }
+    /* name is now one past the last name used; the marking pass runs only when name <= m. */
+    if (name <= m)
+    {
+        libsais16x64_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads);
+    }
+
+    return name - 1;
+}
+
+static void libsais16x64_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+    /* Replaces every SA[i] in [omp_block_start, omp_block_start + omp_block_size) by the entry it indexes in the table that starts at SA[n - m]. */
+    const sa_sint_t * RESTRICT SAnm = &SA[n - m];
+    /* Main loop handles four entries per iteration and issues prefetch hints prefetch_distance entries ahead. */
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
+    {
+        libsais16x64_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+        libsais16x64_prefetchr(&SAnm[SA[i + prefetch_distance + 0]]);
+        libsais16x64_prefetchr(&SAnm[SA[i + prefetch_distance + 1]]);
+        libsais16x64_prefetchr(&SAnm[SA[i + prefetch_distance + 2]]);
+        libsais16x64_prefetchr(&SAnm[SA[i + prefetch_distance + 3]]);
+
+        SA[i + 0] = SAnm[SA[i + 0]];
+        SA[i + 1] = SAnm[SA[i + 1]];
+        SA[i + 2] = SAnm[SA[i + 2]];
+        SA[i + 3] = SAnm[SA[i + 3]];
+    }
+    /* Tail: j is moved back to the end of the block and the remaining entries are translated one at a time, without prefetching. */
+    for (j += prefetch_distance + 3; i < j; i += 1)
+    {
+        SA[i] = SAnm[SA[i]];
+    }
+}
+
+static void libsais16x64_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads)
+{ /* Applies libsais16x64_reconstruct_lms_suffixes to SA[0..m-1]; with OpenMP the range is split across threads when threads > 1 and m >= 65536. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+        fast_sint_t omp_block_stride  = (m / omp_num_threads) & (-16); /* per-thread share, rounded down to a multiple of 16 */
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start; /* last thread takes the remainder */
+#else
+        UNUSED(threads);
+        /* Without OpenMP the whole range [0, m) is one block. */
+        fast_sint_t omp_block_start   = 0;
+        fast_sint_t omp_block_size    = m;
+#endif
+        /* Each thread translates only its own [omp_block_start, omp_block_start + omp_block_size) slice. */
+        libsais16x64_reconstruct_lms_suffixes(SA, n, m, omp_block_start, omp_block_size);
+    }
+}
+
+static void libsais16x64_place_lms_suffixes_interval_16u(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
+{
+    /* Moves runs from the packed prefix of SA (which ends at index m) to just below each bucket end, zero-filling every gap in between. */
+    const sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE];
+    fast_sint_t symbol, zero_end = n;
+
+    for (symbol = ALPHABET_SIZE - 2; symbol >= 0; symbol -= 1)
+    {
+        const fast_sint_t lower = (fast_sint_t)buckets[BUCKETS_INDEX2(symbol, 1)];
+        const fast_sint_t count = (fast_sint_t)buckets[BUCKETS_INDEX2(symbol, 1) + BUCKETS_INDEX2(1, 0)] - lower;
+        if (count <= 0) { continue; }
+        /* Clear everything between this bucket's end and the previously placed run (or n when nothing has been placed yet). */
+        const fast_sint_t end = bucket_end[symbol];
+        if (zero_end - end > 0) { memset(&SA[end], 0, (size_t)(zero_end - end) * sizeof(sa_sint_t)); }
+
+        /* Take the last `count` entries of the packed prefix and park them directly below `end`. */
+        zero_end = end - count; m -= (sa_sint_t)count;
+        memmove(&SA[zero_end], &SA[m], (size_t)count * sizeof(sa_sint_t));
+    }
+
+    memset(&SA[0], 0, (size_t)zero_end * sizeof(sa_sint_t));
+}
+
+static void libsais16x64_place_lms_suffixes_interval_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
+{
+    /* Moves runs from the packed prefix of SA (which ends at index m) to just below each bucket end, zero-filling every gap in between. */
+    const sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k];
+    fast_sint_t symbol, zero_end = n;
+
+    for (symbol = (fast_sint_t)k - 2; symbol >= 0; symbol -= 1)
+    {
+        const fast_sint_t lower = (fast_sint_t)buckets[BUCKETS_INDEX2(symbol, 1)];
+        const fast_sint_t count = (fast_sint_t)buckets[BUCKETS_INDEX2(symbol, 1) + BUCKETS_INDEX2(1, 0)] - lower;
+        if (count <= 0) { continue; }
+        /* Clear everything between this bucket's end and the previously placed run (or n when nothing has been placed yet). */
+        const fast_sint_t end = bucket_end[symbol];
+        if (zero_end - end > 0) { memset(&SA[end], 0, (size_t)(zero_end - end) * sizeof(sa_sint_t)); }
+
+        /* Take the last `count` entries of the packed prefix and park them directly below `end`. */
+        zero_end = end - count; m -= (sa_sint_t)count;
+        memmove(&SA[zero_end], &SA[m], (size_t)count * sizeof(sa_sint_t));
+    }
+
+    memset(&SA[0], 0, (size_t)zero_end * sizeof(sa_sint_t));
+}
+
+static void libsais16x64_place_lms_suffixes_interval_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
+{
+    /* Same placement as the other interval variants, but the bucket end is read from buckets[slot] itself and the walk is skipped when k <= 1. */
+    fast_sint_t zero_end = n;
+
+    if (k > 1)
+    {
+        fast_sint_t slot = BUCKETS_INDEX2((fast_sint_t)k - 2, 0);
+        while (slot >= BUCKETS_INDEX2(0, 0))
+        {
+            const fast_sint_t count = (fast_sint_t)buckets[slot + BUCKETS_INDEX2(1, 1)] - (fast_sint_t)buckets[slot + BUCKETS_INDEX2(0, 1)];
+            if (count > 0)
+            {
+                const fast_sint_t end = buckets[slot];
+                if (zero_end - end > 0) { memset(&SA[end], 0, (size_t)(zero_end - end) * sizeof(sa_sint_t)); }
+
+                zero_end = end - count; m -= (sa_sint_t)count;
+                memmove(&SA[zero_end], &SA[m], (size_t)count * sizeof(sa_sint_t));
+            }
+            slot -= BUCKETS_INDEX2(1, 0);
+        }
+    }
+
+    memset(&SA[0], 0, (size_t)zero_end * sizeof(sa_sint_t));
+}
+
+static void libsais16x64_place_lms_suffixes_interval_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t m, sa_sint_t * RESTRICT buckets)
+{
+    const fast_sint_t prefetch_distance = 32;
+    /* Walks SA[m-1..0] downwards and re-places each entry p just below buckets[T[p]], zeroing the gaps; c is the current symbol, l the write cursor. */
+    sa_sint_t c = k - 1; fast_sint_t i, l = buckets[c];
+    for (i = (fast_sint_t)m - 1; i >= prefetch_distance + 3; i -= 4)
+    {
+        libsais16x64_prefetchr(&SA[i - 2 * prefetch_distance]);
+
+        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 0]]);
+        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 1]]);
+        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 2]]);
+        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 3]]);
+        /* On a symbol change: zero SA[buckets[c]..l-1] for the new c and restart the cursor at buckets[c]; then store p at SA[--l]. */
+        sa_sint_t p0 = SA[i - 0]; if (T[p0] != c) { c = T[p0]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p0;
+        sa_sint_t p1 = SA[i - 1]; if (T[p1] != c) { c = T[p1]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p1;
+        sa_sint_t p2 = SA[i - 2]; if (T[p2] != c) { c = T[p2]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p2;
+        sa_sint_t p3 = SA[i - 3]; if (T[p3] != c) { c = T[p3]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p3;
+    }
+    /* Tail: the remaining low indices are handled one at a time, without prefetching. */
+    for (; i >= 0; i -= 1)
+    {
+        sa_sint_t p = SA[i]; if (T[p] != c) { c = T[p]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p;
+    }
+    /* Everything below the final cursor position is cleared. */
+    memset(&SA[0], 0, (size_t)l * sizeof(sa_sint_t));
+}
+
+static void libsais16x64_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
+{
+    /* Moves runs from the packed prefix of SA (which ends at index m) to just below each bucket end; run lengths come from buckets[BUCKETS_INDEX4(c, 1)]. */
+    const sa_sint_t * RESTRICT bucket_end = &buckets[5 * (fast_sint_t)k];
+    fast_sint_t symbol, zero_end = n;
+
+    for (symbol = (fast_sint_t)k - 2; symbol >= 0; symbol -= 1)
+    {
+        const fast_sint_t count = (fast_sint_t)buckets[BUCKETS_INDEX4(symbol, 1)];
+        if (count <= 0) { continue; }
+
+        /* Clear everything between this bucket's end and the previously placed run (or n when nothing has been placed yet). */
+        const fast_sint_t end = bucket_end[symbol];
+        if (zero_end - end > 0) { memset(&SA[end], 0, (size_t)(zero_end - end) * sizeof(sa_sint_t)); }
+
+        /* Take the last `count` entries of the packed prefix and park them directly below `end`. */
+        zero_end = end - count; m -= (sa_sint_t)count;
+        memmove(&SA[zero_end], &SA[m], (size_t)count * sizeof(sa_sint_t));
+    }
+
+    memset(&SA[0], 0, (size_t)zero_end * sizeof(sa_sint_t));
+}
+
+static void libsais16x64_place_lms_suffixes_histogram_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
+{
+    /* Moves runs from the packed prefix of SA (which ends at index m) to just below each bucket end; run lengths come from buckets[BUCKETS_INDEX2(c, 1)]. */
+    const sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k];
+    fast_sint_t symbol, zero_end = n;
+
+    for (symbol = (fast_sint_t)k - 2; symbol >= 0; symbol -= 1)
+    {
+        const fast_sint_t count = (fast_sint_t)buckets[BUCKETS_INDEX2(symbol, 1)];
+        if (count <= 0) { continue; }
+
+        /* Clear everything between this bucket's end and the previously placed run (or n when nothing has been placed yet). */
+        const fast_sint_t end = bucket_end[symbol];
+        if (zero_end - end > 0) { memset(&SA[end], 0, (size_t)(zero_end - end) * sizeof(sa_sint_t)); }
+
+        /* Take the last `count` entries of the packed prefix and park them directly below `end`. */
+        zero_end = end - count; m -= (sa_sint_t)count;
+        memmove(&SA[zero_end], &SA[m], (size_t)count * sizeof(sa_sint_t));
+    }
+
+    memset(&SA[0], 0, (size_t)zero_end * sizeof(sa_sint_t));
+}
+
+static void libsais16x64_place_lms_suffixes_histogram_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
+{
+    /* Histogram variant: the run length is buckets[slot + BUCKETS_INDEX2(0, 1)] and the bucket end is buckets[slot]; the walk is skipped when k <= 1. */
+    fast_sint_t zero_end = n;
+
+    if (k > 1)
+    {
+        fast_sint_t slot = BUCKETS_INDEX2((fast_sint_t)k - 2, 0);
+        while (slot >= BUCKETS_INDEX2(0, 0))
+        {
+            const fast_sint_t count = (fast_sint_t)buckets[slot + BUCKETS_INDEX2(0, 1)];
+            if (count > 0)
+            {
+                const fast_sint_t end = buckets[slot];
+                if (zero_end - end > 0) { memset(&SA[end], 0, (size_t)(zero_end - end) * sizeof(sa_sint_t)); }
+
+                zero_end = end - count; m -= (sa_sint_t)count;
+                memmove(&SA[zero_end], &SA[m], (size_t)count * sizeof(sa_sint_t));
+            }
+            slot -= BUCKETS_INDEX2(1, 0);
+        }
+    }
+
+    memset(&SA[0], 0, (size_t)zero_end * sizeof(sa_sint_t));
+}
+
+static void libsais16x64_final_bwt_scan_left_to_right_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+    /* Left-to-right pass over SA[omp_block_start .. omp_block_start + omp_block_size): each positive entry p appends p - 1 to the bucket of T[p - 1]. */
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16x64_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL);
+        /* SA[i] becomes p & SAINT_MAX, or T[p - 1] | SAINT_MIN when p > 0; the appended value has its top bit set iff T[p - 2] < T[p - 1] (index clamped at 0). */
+        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); }
+        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); }
+    }
+    /* Tail: the last prefetch_distance + 1 (or fewer) entries are processed one at a time, without prefetching. */
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
+    }
+}
+
+static void libsais16x64_final_bwt_aux_scan_left_to_right_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+    /* Same pass as the plain BWT left-to-right scan, plus: when (p - 1) & rm == 0 the post-increment bucket cursor is saved in I[(p - 1) / (rm + 1)]. */
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16x64_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL);
+        /* NOTE(review): rm is used as a bit mask and rm + 1 as the divisor, so rm is presumably of the form 2^x - 1 - confirm against the caller. */
+        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); if ((p0 & rm) == 0) { I[p0 / (rm + 1)] = induction_bucket[T[p0]]; }}
+        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); if ((p1 & rm) == 0) { I[p1 / (rm + 1)] = induction_bucket[T[p1]]; }}
+    }
+    /* Tail: the last prefetch_distance + 1 (or fewer) entries are processed one at a time, without prefetching. */
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]]; } }
+    }
+}
+
+static void libsais16x64_final_sorting_scan_left_to_right_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+    /* Left-to-right pass over the block: every SA[i] has its top bit flipped, and each positive entry p appends p - 1 to the bucket of T[p - 1]. */
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16x64_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL);
+        /* The appended value is p - 1 with its top bit set iff T[p - 2] < T[p - 1]; `p - (p > 0)` keeps the index at 0 when p - 1 is 0. */
+        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); }
+        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); }
+    }
+    /* Tail: the last prefetch_distance + 1 (or fewer) entries are processed one at a time, without prefetching. */
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
+    }
+}
+
+static void libsais16x64_final_sorting_scan_left_to_right_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+    /* Same pass as the 16u final sorting scan but over an sa_sint_t text, with a two-stage prefetch pipeline (2x and 1x prefetch_distance ahead). */
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16x64_prefetchw(&SA[i + 3 * prefetch_distance]);
+        /* Far stage prefetches T[s - 1]; near stage reads T[s - 1] to prefetch its induction_bucket slot and also prefetches T[s - 2]. */
+        sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais16x64_prefetchw(&induction_bucket[T[s2 - 1]]); libsais16x64_prefetchr(&T[s2] - 2); }
+        sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais16x64_prefetchw(&induction_bucket[T[s3 - 1]]); libsais16x64_prefetchr(&T[s3] - 2); }
+        /* SA[i] has its top bit flipped; a positive p appends p - 1 (top bit set iff T[p - 2] < T[p - 1]) at the cursor of bucket T[p - 1]. */
+        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); }
+        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); }
+    }
+    /* Tail: the last 2 * prefetch_distance + 1 (or fewer) entries are processed one at a time, without prefetching. */
+    for (j += 2 * prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
+    }
+}
+
+#if defined(LIBSAIS_OPENMP)
+
+static fast_sint_t libsais16x64_final_bwt_scan_left_to_right_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+   const fast_sint_t prefetch_distance = 32;
+   /* Rewrites the block's SA entries like the BWT scan, but queues the induced (symbol, index) pairs in cache instead of storing them; returns the pair count. */
+   memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
+   /* buckets[] starts at zero here and ends up holding, per symbol, how many pairs this block queued. */
+   fast_sint_t i, j, count = 0;
+   for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+   {
+       libsais16x64_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+       sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL);
+       sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL);
+       /* cache[count].symbol = T[p - 1]; cache[count].index = p - 1 with its top bit set iff T[p - 2] < T[p - 1]; count advances per queued pair. */
+       sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); }
+       sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); }
+   }
+   /* Tail: the last prefetch_distance + 1 (or fewer) entries are processed one at a time, without prefetching. */
+   for (j += prefetch_distance + 1; i < j; i += 1)
+   {
+       sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
+   }
+
+   return count;
+}
+
+static fast_sint_t libsais16x64_final_sorting_scan_left_to_right_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+   const fast_sint_t prefetch_distance = 32;
+   /* Flips the top bit of every SA entry in the block and queues the induced (symbol, index) pairs in cache instead of storing them; returns the pair count. */
+   memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
+   /* buckets[] starts at zero here and ends up holding, per symbol, how many pairs this block queued. */
+   fast_sint_t i, j, count = 0;
+   for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+   {
+       libsais16x64_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+       sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL);
+       sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL);
+       /* cache[count].symbol = T[p - 1]; cache[count].index = p - 1 with its top bit set iff T[p - 2] < T[p - 1]; count advances per queued pair. */
+       sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); }
+       sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); }
+   }
+   /* Tail: the last prefetch_distance + 1 (or fewer) entries are processed one at a time, without prefetching. */
+   for (j += prefetch_distance + 1; i < j; i += 1)
+   {
+       sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
+   }
+
+   return count;
+}
+
+static void libsais16x64_final_order_scan_left_to_right_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count)
+{
+    const fast_sint_t prefetch_distance = 32;
+    /* Writes the first `count` cached pairs into SA in cache order: pair i goes to SA[buckets[symbol]], and that bucket cursor is then advanced. */
+    fast_sint_t i, j;
+    for (i = 0, j = count - 3; i < j; i += 4)
+    {
+        libsais16x64_prefetchr(&cache[i + prefetch_distance]);
+
+        SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index;
+        SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index;
+        SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index;
+        SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index;
+    }
+    /* Tail: up to three leftover pairs after the four-way unrolled loop. */
+    for (j += 3; i < j; i += 1)
+    {
+        SA[buckets[cache[i].symbol]++] = cache[i].index;
+    }
+}
+
+static void libsais16x64_final_bwt_aux_scan_left_to_right_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count)
+{
+    const fast_sint_t prefetch_distance = 32;
+    /* Places cached pairs like the plain block_place; when (index & rm) == 0 it also saves the advanced cursor in I[(index & SAINT_MAX) / (rm + 1)]. */
+    fast_sint_t i, j;
+    for (i = 0, j = count - 3; i < j; i += 4)
+    {
+        libsais16x64_prefetchr(&cache[i + prefetch_distance]);
+        /* The `& SAINT_MAX` clears the top bit of the cached index before the division. */
+        SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index; if ((cache[i + 0].index & rm) == 0) { I[(cache[i + 0].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 0].symbol]; }
+        SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index; if ((cache[i + 1].index & rm) == 0) { I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 1].symbol]; }
+        SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index; if ((cache[i + 2].index & rm) == 0) { I[(cache[i + 2].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 2].symbol]; }
+        SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index; if ((cache[i + 3].index & rm) == 0) { I[(cache[i + 3].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 3].symbol]; }
+    }
+    /* Tail: up to three leftover pairs after the four-way unrolled loop. */
+    for (j += 3; i < j; i += 1)
+    {
+        SA[buckets[cache[i].symbol]++] = cache[i].index; if ((cache[i].index & rm) == 0) { I[(cache[i].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol]; }
+    }
+}
+
+static void libsais16x64_final_sorting_scan_left_to_right_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+    /* For each i in the block: flips the top bit of SA[i] and fills cache[i]; cache is indexed with the same i as SA (not block-relative). */
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16x64_prefetchw(&SA[i + 2 * prefetch_distance]);
+
+        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL);
+
+        libsais16x64_prefetchw(&cache[i + prefetch_distance]);
+        /* cache[i].symbol is T[p - 1] when p > 0 and SAINT_MIN otherwise; cache[i].index is written only in the p > 0 case. */
+        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; cache[i + 0].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); symbol0 = T[p0]; } cache[i + 0].symbol = symbol0;
+        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; cache[i + 1].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); symbol1 = T[p1]; } cache[i + 1].symbol = symbol1;
+    }
+    /* Tail: the last prefetch_distance + 1 (or fewer) entries are processed one at a time, without prefetching. */
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t symbol = SAINT_MIN, p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; cache[i].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); symbol = T[p]; } cache[i].symbol = symbol;
+    }
+}
+
+static void libsais16x64_final_sorting_scan_left_to_right_32s_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t prefetch_distance = 32;
+    /* Turns each non-negative cache[i].symbol into a destination index taken from induction_bucket; destinations inside the block are induced in cache. */
+    fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
+    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16x64_prefetchw(&cache[i + 2 * prefetch_distance]);
+
+        sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais16x64_prefetchw(s0 >= 0 ? Is0 : NULL);
+        sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais16x64_prefetchw(s1 >= 0 ? Is1 : NULL);
+        /* When the destination ni is below omp_block_end, cache[i].index gets its top bit flipped and cache[ni] receives the next induced pair. */
+        sa_sint_t v0 = cache[i + 0].symbol;
+        if (v0 >= 0)
+        {
+            cache[i + 0].symbol = induction_bucket[v0]++;
+            if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; cache[i + 0].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
+        }
+
+        sa_sint_t v1 = cache[i + 1].symbol;
+        if (v1 >= 0)
+        {
+            cache[i + 1].symbol = induction_bucket[v1]++;
+            if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; cache[i + 1].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
+        }
+    }
+    /* Tail: the last prefetch_distance + 1 (or fewer) cache entries are processed one at a time, without prefetching. */
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t v = cache[i].symbol;
+        if (v >= 0)
+        {
+            cache[i].symbol = induction_bucket[v]++;
+            if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; cache[i].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
+        }
+    }
+}
+
+static void libsais16x64_final_bwt_scan_left_to_right_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Runs the BWT left-to-right scan over [block_start, block_start + block_size): sequentially with one thread, else in prepare / merge / place phases. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        /* The stride is rounded down to a multiple of 16 and the last thread takes the remainder; the offset is then made absolute. */
+        omp_block_start += block_start;
+
+        if (omp_num_threads == 1)
+        {
+            libsais16x64_final_bwt_scan_left_to_right_16u(T, SA, induction_bucket, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            {
+                thread_state[omp_thread_num].state.count = libsais16x64_final_bwt_scan_left_to_right_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
+            }
+
+            #pragma omp barrier
+            /* Master only: in thread order, each thread's per-symbol count is swapped for its starting cursor and the shared cursor moves past it. */
+            #pragma omp master
+            {
+                fast_sint_t t;
+                for (t = 0; t < omp_num_threads; ++t)
+                {
+                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
+                    fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; }
+                }
+            }
+
+            #pragma omp barrier
+            /* After the second barrier every thread writes its queued pairs into SA using its own cursors. */
+            {
+                libsais16x64_final_order_scan_left_to_right_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
+            }
+        }
+#endif
+    }
+}
+
+static void libsais16x64_final_bwt_aux_scan_left_to_right_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Runs the BWT-with-aux-indexes left-to-right scan over [block_start, block_start + block_size): sequentially with one thread, else in three phases. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        /* The stride is rounded down to a multiple of 16 and the last thread takes the remainder; the offset is then made absolute. */
+        omp_block_start += block_start;
+
+        if (omp_num_threads == 1)
+        {
+            libsais16x64_final_bwt_aux_scan_left_to_right_16u(T, SA, rm, I, induction_bucket, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            {
+                thread_state[omp_thread_num].state.count = libsais16x64_final_bwt_scan_left_to_right_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
+            }
+
+            #pragma omp barrier
+            /* Master only: in thread order, each thread's per-symbol count is swapped for its starting cursor and the shared cursor moves past it. */
+            #pragma omp master
+            {
+                fast_sint_t t;
+                for (t = 0; t < omp_num_threads; ++t)
+                {
+                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
+                    fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; }
+                }
+            }
+
+            #pragma omp barrier
+            /* The prepare step is the plain BWT one; only this place step differs, since it also records the sampled cursors in I. */
+            {
+                libsais16x64_final_bwt_aux_scan_left_to_right_16u_block_place(SA, rm, I, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
+            }
+        }
+#endif
+    }
+}
+
+static void libsais16x64_final_sorting_scan_left_to_right_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Runs the left-to-right final sorting scan over SA[block_start, block_start + block_size), splitting the block across OpenMP threads when the parallel region below is active. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        /* The stride is rounded down to a multiple of 16 and the last thread takes the remainder; the slice start is block-relative until the next line rebases it. */
+        omp_block_start += block_start;
+
+        if (omp_num_threads == 1)
+        {
+            libsais16x64_final_sorting_scan_left_to_right_16u(T, SA, induction_bucket, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            { /* Phase 1: every thread runs the prepare step on its own slice with its private buckets and cache, and stores the returned count. */
+                thread_state[omp_thread_num].state.count = libsais16x64_final_sorting_scan_left_to_right_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
+            }
+
+            #pragma omp barrier
+            /* Phase 2 (master only): visiting threads in ascending order, each thread's buckets[c] is replaced by the current induction_bucket[c], which is then advanced by the old buckets[c] (presumably that thread's per-symbol count - confirm against prepare). */
+            #pragma omp master
+            {
+                fast_sint_t t;
+                for (t = 0; t < omp_num_threads; ++t)
+                {
+                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
+                    fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; }
+                }
+            }
+
+            #pragma omp barrier
+            /* Phase 3: every thread runs the place step with its own rebased buckets, cache and count. */
+            {
+                libsais16x64_final_order_scan_left_to_right_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
+            }
+        }
+#endif
+    }
+}
+
+static void libsais16x64_final_sorting_scan_left_to_right_32s_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{ /* Runs the left-to-right final sorting scan (sa_sint_t text) over SA[block_start, block_start + block_size); with several OpenMP threads it goes through gather / sort / place phases that share one cache. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(cache);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        /* The stride is rounded down to a multiple of 16 and the last thread takes the remainder; the slice start is block-relative until the next line rebases it. */
+        omp_block_start += block_start;
+
+        if (omp_num_threads == 1)
+        {
+            libsais16x64_final_sorting_scan_left_to_right_32s(T, SA, buckets, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            { /* Phase 1: each thread gathers its slice; the cache pointer is shifted by -block_start so that index block_start refers to cache[0] (presumably the helpers index the cache by absolute SA position - confirm). */
+                libsais16x64_final_sorting_scan_left_to_right_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+            }
+
+            #pragma omp barrier
+            /* Phase 2 (master only): a single sort call covering the whole block rather than one slice. */
+            #pragma omp master
+            {
+                libsais16x64_final_sorting_scan_left_to_right_32s_block_sort(T, buckets, cache - block_start, block_start, block_size);
+            }
+
+            #pragma omp barrier
+            /* Phase 3: each thread passes its own slice to the compact-and-place helper. */
+            {
+                libsais16x64_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+            }
+        }
+#endif
+    }
+}
+
+#endif
+
+static void libsais16x64_final_bwt_scan_left_to_right_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Driver for the left-to-right final BWT scan over SA[0, n). It reads T[n - 2], so n must be at least 2. thread_state is only used in OpenMP builds. */
+    SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
+    /* The statement above seeds the scan with position n - 1 in the bucket of T[n - 1]; bit SAINT_BIT - 1 is set when T[n - 2] < T[n - 1]. */
+    if (threads == 1 || n < 65536)
+    {
+        libsais16x64_final_bwt_scan_left_to_right_16u(T, SA, induction_bucket, 0, n);
+    }
+#if defined(LIBSAIS_OPENMP)
+    else
+    {
+        fast_sint_t block_start;
+        for (block_start = 0; block_start < n; )
+        {
+            if (SA[block_start] == 0)
+            {
+                block_start++; /* zero entries are skipped one at a time */
+            }
+            else
+            {
+                fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;}
+                fast_sint_t block_end     = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; }
+                fast_sint_t block_size    = block_end - block_start;
+                /* block_end stops at the first zero entry or at block_max_end; runs shorter than 32 entries are handled inline instead of going through the block helper. */
+                if (block_size < 32)
+                {
+                    for (; block_start < block_end; block_start += 1)
+                    { /* Per entry: mask with SAINT_MAX; if it was positive, decrement to p, store T[p] | SAINT_MIN in place and write p (flagged when T[p - (p > 0)] < T[p]) at the next slot of bucket T[p]. */
+                        sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[block_start] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
+                    }
+                }
+                else
+                {
+                    libsais16x64_final_bwt_scan_left_to_right_16u_block_omp(T, SA, induction_bucket, block_start, block_size, threads, thread_state);
+                    block_start = block_end;
+                }
+            }
+        }
+    }
+#else
+    UNUSED(thread_state); /* NOTE(review): without LIBSAIS_OPENMP the else branch is compiled out, so only the seed runs when threads != 1 and n >= 65536; presumably callers pass threads == 1 in such builds - confirm. */
+#endif
+}
+
+static void libsais16x64_final_bwt_aux_scan_left_to_right_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Driver for the left-to-right final BWT scan over SA[0, n) that also fills I for positions p with (p & rm) == 0. It reads T[n - 2], so n must be at least 2. */
+    SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
+    /* Seed done above; if position n - 1 is a sampled one, record the already-incremented bucket value at I[(n - 1) / (rm + 1)] (rm is presumably a power-of-two-minus-one mask - confirm). */
+    if ((((sa_sint_t)n - 1) & rm) == 0) { I[((sa_sint_t)n - 1) / (rm + 1)] = induction_bucket[T[(sa_sint_t)n - 1]]; }
+
+    if (threads == 1 || n < 65536)
+    {
+        libsais16x64_final_bwt_aux_scan_left_to_right_16u(T, SA, rm, I, induction_bucket, 0, n);
+    }
+#if defined(LIBSAIS_OPENMP)
+    else
+    {
+        fast_sint_t block_start;
+        for (block_start = 0; block_start < n; )
+        {
+            if (SA[block_start] == 0)
+            {
+                block_start++; /* zero entries are skipped one at a time */
+            }
+            else
+            {
+                fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;}
+                fast_sint_t block_end     = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; }
+                fast_sint_t block_size    = block_end - block_start;
+                /* block_end stops at the first zero entry or at block_max_end; runs shorter than 32 entries are handled inline instead of going through the block helper. */
+                if (block_size < 32)
+                {
+                    for (; block_start < block_end; block_start += 1)
+                    { /* Per entry: same in-place rewrite and bucket write as the plain BWT scan, then I[p / (rm + 1)] receives the incremented bucket value when (p & rm) == 0. */
+                        sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[block_start] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]]; } }
+                    }
+                }
+                else
+                {
+                    libsais16x64_final_bwt_aux_scan_left_to_right_16u_block_omp(T, SA, rm, I, induction_bucket, block_start, block_size, threads, thread_state);
+                    block_start = block_end;
+                }
+            }
+        }
+    }
+#else
+    UNUSED(thread_state); /* NOTE(review): without LIBSAIS_OPENMP the else branch is compiled out, so only the seed runs when threads != 1 and n >= 65536; presumably callers pass threads == 1 in such builds - confirm. */
+#endif
+}
+
+static void libsais16x64_final_sorting_scan_left_to_right_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Driver for the left-to-right final sorting scan over SA[0, n). It reads T[n - 2], so n must be at least 2. thread_state is only used in OpenMP builds. */
+    SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
+    /* The statement above seeds the scan with position n - 1 in the bucket of T[n - 1]; bit SAINT_BIT - 1 is set when T[n - 2] < T[n - 1]. */
+    if (threads == 1 || n < 65536)
+    {
+        libsais16x64_final_sorting_scan_left_to_right_16u(T, SA, induction_bucket, 0, n);
+    }
+#if defined(LIBSAIS_OPENMP)
+    else
+    {
+        fast_sint_t block_start;
+        for (block_start = 0; block_start < n; )
+        {
+            if (SA[block_start] == 0)
+            {
+                block_start++; /* zero entries are skipped one at a time */
+            }
+            else
+            {
+                fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;}
+                fast_sint_t block_end     = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; }
+                fast_sint_t block_size    = block_end - block_start;
+                /* block_end stops at the first zero entry or at block_max_end; runs shorter than 32 entries are handled inline instead of going through the block helper. */
+                if (block_size < 32)
+                {
+                    for (; block_start < block_end; block_start += 1)
+                    { /* Per entry: the stored value is XORed with SAINT_MIN (not masked); if it was positive, p - 1 is written, flagged when T[p - (p > 0)] < T[p], at the next slot of bucket T[p]. */
+                        sa_sint_t p = SA[block_start]; SA[block_start] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
+                    }
+                }
+                else
+                {
+                    libsais16x64_final_sorting_scan_left_to_right_16u_block_omp(T, SA, induction_bucket, block_start, block_size, threads, thread_state);
+                    block_start = block_end;
+                }
+            }
+        }
+    }
+#else
+    UNUSED(thread_state); /* NOTE(review): without LIBSAIS_OPENMP the else branch is compiled out, so only the seed runs when threads != 1 and n >= 65536; presumably callers pass threads == 1 in such builds - confirm. */
+#endif
+}
+
+static void libsais16x64_final_sorting_scan_left_to_right_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Driver for the left-to-right final sorting scan over SA[0, n) for sa_sint_t text. It reads T[n - 2], so n must be at least 2. */
+    SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));
+    /* The statement above seeds the scan with position n - 1 in the bucket of T[n - 1]; bit SAINT_BIT - 1 is set when T[n - 2] < T[n - 1]. */
+    if (threads == 1 || n < 65536)
+    {
+        libsais16x64_final_sorting_scan_left_to_right_32s(T, SA, induction_bucket, 0, n);
+    }
+#if defined(LIBSAIS_OPENMP)
+    else
+    {
+        fast_sint_t block_start, block_end;
+        for (block_start = 0; block_start < n; block_start = block_end)
+        {
+            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; }
+            /* Fixed-size blocks (the last one clipped to n); every block goes through the block helper, which receives thread 0's cache. */
+            libsais16x64_final_sorting_scan_left_to_right_32s_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, block_start, block_end - block_start, threads);
+        }
+    }
+#else
+    UNUSED(thread_state); /* NOTE(review): without LIBSAIS_OPENMP the else branch is compiled out, so only the seed runs when threads != 1 and n >= 65536; presumably callers pass threads == 1 in such builds - confirm. */
+#endif
+}
+
+static sa_sint_t libsais16x64_final_bwt_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Sequential right-to-left final BWT scan over SA[omp_block_start, omp_block_start + omp_block_size). Returns the position of the last entry seen equal to 0 during the descending scan, or -1 if none was seen. */
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j; sa_sint_t index = -1;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+    {
+        libsais16x64_prefetchw(&SA[i - 2 * prefetch_distance]);
+        /* Look prefetch_distance entries ahead and prefetch the two T elements preceding T[s] when s > 0 (NULL is passed otherwise). */
+        sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL);
+        /* Per entry: remember i if the entry is 0, mask it with SAINT_MAX; if it was positive, decrement to p, overwrite SA[i] with c1 = T[p] and store p (when c0 <= c1) or c0 | SAINT_MIN at the pre-decremented slot of bucket c1, where c0 = T[p - (p > 0)]. */
+        sa_sint_t p0 = SA[i - 0]; index = (p0 == 0) ? (sa_sint_t)(i - 0) : index;
+        SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint16_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; }
+
+        sa_sint_t p1 = SA[i - 1]; index = (p1 == 0) ? (sa_sint_t)(i - 1) : index;
+        SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint16_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t; }
+    }
+    /* Tail: j is rewound to omp_block_start and the remaining entries are processed one at a time without prefetching. */
+    for (j -= prefetch_distance + 1; i >= j; i -= 1)
+    {
+        sa_sint_t p = SA[i]; index = (p == 0) ? (sa_sint_t)i : index;
+        SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; }
+    }
+
+    return index;
+}
+
+static void libsais16x64_final_bwt_aux_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Sequential right-to-left final BWT scan over SA[omp_block_start, omp_block_start + omp_block_size) that also fills I for decremented positions p with (p & rm) == 0. */
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+    {
+        libsais16x64_prefetchw(&SA[i - 2 * prefetch_distance]);
+        /* Look prefetch_distance entries ahead and prefetch the two T elements preceding T[s] when s > 0 (NULL is passed otherwise). */
+        sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL);
+        /* Per entry: mask with SAINT_MAX; if it was positive, decrement to p, overwrite SA[i] with c1 = T[p], store p (when c0 <= c1) or c0 | SAINT_MIN at the pre-decremented slot of bucket c1, then set I[p / (rm + 1)] to that bucket value plus 1 when (p & rm) == 0. */
+        sa_sint_t p0 = SA[i - 0];
+        SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint16_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; if ((p0 & rm) == 0) { I[p0 / (rm + 1)] = induction_bucket[T[p0]] + 1; } }
+
+        sa_sint_t p1 = SA[i - 1];
+        SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint16_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t; if ((p1 & rm) == 0) { I[p1 / (rm + 1)] = induction_bucket[T[p1]] + 1; } }
+    }
+    /* Tail: j is rewound to omp_block_start and the remaining entries are processed one at a time without prefetching. */
+    for (j -= prefetch_distance + 1; i >= j; i -= 1)
+    {
+        sa_sint_t p = SA[i];
+        SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]] + 1; } }
+    }
+}
+
+static void libsais16x64_final_sorting_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Sequential right-to-left final sorting scan over SA[omp_block_start, omp_block_start + omp_block_size), writing induced positions at the pre-decremented slots of induction_bucket. */
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+    {
+        libsais16x64_prefetchw(&SA[i - 2 * prefetch_distance]);
+        /* Look prefetch_distance entries ahead and prefetch the two T elements preceding T[s] when s > 0 (NULL is passed otherwise). */
+        sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL);
+        /* Per entry: mask with SAINT_MAX; if it was positive, decrement to p and store p, with bit SAINT_BIT - 1 set when T[p - (p > 0)] > T[p], at the pre-decremented slot of bucket T[p]. */
+        sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); }
+        sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); }
+    }
+    /* Tail: j is rewound to omp_block_start and the remaining entries are processed one at a time without prefetching. */
+    for (j -= prefetch_distance + 1; i >= j; i -= 1)
+    {
+        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
+    }
+}
+
+static void libsais16x64_final_sorting_scan_right_to_left_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Sequential right-to-left final sorting scan for sa_sint_t text over SA[omp_block_start, omp_block_start + omp_block_size), writing induced positions at the pre-decremented slots of induction_bucket. */
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2)
+    {
+        libsais16x64_prefetchw(&SA[i - 3 * prefetch_distance]);
+        /* Two lookahead stages: at 2 * prefetch_distance the T element before T[s] is prefetched; at 1 * prefetch_distance the bucket slot of T[s - 1] and the address &T[s] - 2 are prefetched (only for s > 0). */
+        sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais16x64_prefetchw(&induction_bucket[T[s2 - 1]]); libsais16x64_prefetchr(&T[s2] - 2); }
+        sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais16x64_prefetchw(&induction_bucket[T[s3 - 1]]); libsais16x64_prefetchr(&T[s3] - 2); }
+        /* Per entry: mask with SAINT_MAX; if it was positive, decrement to p and store p, with bit SAINT_BIT - 1 set when T[p - (p > 0)] > T[p], at the pre-decremented slot of bucket T[p]. */
+        sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); }
+        sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); }
+    }
+    /* Tail: j is rewound to omp_block_start and the remaining entries are processed one at a time without prefetching. */
+    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1)
+    {
+        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
+    }
+}
+
+#if defined(LIBSAIS_OPENMP)
+
+static fast_sint_t libsais16x64_final_bwt_scan_right_to_left_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Prepare step of the blocked right-to-left BWT scan: rewrites SA entries in place, but records each induced (symbol, value) pair in cache and counts symbols in buckets instead of writing to SA. Returns the number of cache entries written. */
+   const fast_sint_t prefetch_distance = 32;
+
+   memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
+   /* buckets now holds zeros; below it accumulates one increment per cached entry, indexed by the entry's symbol. */
+   fast_sint_t i, j, count = 0;
+   for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+   {
+       libsais16x64_prefetchw(&SA[i - 2 * prefetch_distance]);
+       /* Look prefetch_distance entries ahead and prefetch the two T elements preceding T[s] when s > 0 (NULL is passed otherwise). */
+       sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL);
+       sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL);
+       /* Per entry: mask with SAINT_MAX; if it was positive, decrement to p, overwrite SA[i] with c1 = T[p], and cache symbol c1 with value p (when c0 <= c1) or c0 | SAINT_MIN, where c0 = T[p - (p > 0)]. */
+       sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint16_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p0 : t; }
+       sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint16_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p1 : t; }
+   }
+   /* Tail: j is rewound to omp_block_start and the remaining entries are processed one at a time without prefetching. */
+   for (j -= prefetch_distance + 1; i >= j; i -= 1)
+   {
+       sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p : t; }
+   }
+
+   return count;
+}
+
+static fast_sint_t libsais16x64_final_bwt_aux_scan_right_to_left_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Prepare step of the blocked right-to-left BWT scan with auxiliary sampling: each induced entry takes two cache slots (symbol and value, then the plain decremented position). Returns the number of cache slots used, which is always even. */
+   const fast_sint_t prefetch_distance = 32;
+
+   memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
+   /* buckets now holds zeros; below it accumulates one increment per induced entry (not per cache slot), indexed by the entry's symbol. */
+   fast_sint_t i, j, count = 0;
+   for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+   {
+       libsais16x64_prefetchw(&SA[i - 2 * prefetch_distance]);
+       /* Look prefetch_distance entries ahead and prefetch the two T elements preceding T[s] when s > 0 (NULL is passed otherwise). */
+       sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL);
+       sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL);
+       /* Per entry: mask with SAINT_MAX; if it was positive, decrement to p, overwrite SA[i] with c1 = T[p]; slot count gets symbol c1 and value p (when c0 <= c1) or c0 | SAINT_MIN, slot count + 1 gets p itself. */
+       sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint16_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p0 : t; cache[count + 1].index = p0; count += 2; }
+       sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint16_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p1 : t; cache[count + 1].index = p1; count += 2; }
+   }
+   /* Tail: j is rewound to omp_block_start and the remaining entries are processed one at a time without prefetching. */
+   for (j -= prefetch_distance + 1; i >= j; i -= 1)
+   {
+       sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p : t; cache[count + 1].index = p; count += 2; }
+   }
+
+   return count;
+}
+
+static fast_sint_t libsais16x64_final_sorting_scan_right_to_left_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Prepare step of the blocked right-to-left final sorting scan: masks SA entries in place, records each induced (symbol, value) pair in cache and counts symbols in buckets instead of writing to SA. Returns the number of cache entries written. */
+   const fast_sint_t prefetch_distance = 32;
+
+   memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
+   /* buckets now holds zeros; below it accumulates one increment per cached entry, indexed by the entry's symbol. */
+   fast_sint_t i, j, count = 0;
+   for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+   {
+       libsais16x64_prefetchw(&SA[i - 2 * prefetch_distance]);
+       /* Look prefetch_distance entries ahead and prefetch the two T elements preceding T[s] when s > 0 (NULL is passed otherwise). */
+       sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL);
+       sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL);
+       /* Per entry: mask with SAINT_MAX; if it was positive, decrement to p and cache symbol T[p] with value p, flagged at bit SAINT_BIT - 1 when T[p - (p > 0)] > T[p]. */
+       sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); }
+       sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); }
+   }
+   /* Tail: j is rewound to omp_block_start and the remaining entries are processed one at a time without prefetching. */
+   for (j -= prefetch_distance + 1; i >= j; i -= 1)
+   {
+       sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
+   }
+
+   return count;
+}
+
+static void libsais16x64_final_order_scan_right_to_left_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count)
+{ /* Place step: replays the first count cache entries in order, storing each entry's index at the pre-decremented slot of buckets[symbol] in SA. */
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = 0, j = count - 3; i < j; i += 4)
+    {
+        libsais16x64_prefetchr(&cache[i + prefetch_distance]);
+        /* Unrolled by four; the bound j = count - 3 keeps i + 3 below count. */
+        SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index;
+        SA[--buckets[cache[i + 1].symbol]] = cache[i + 1].index;
+        SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index;
+        SA[--buckets[cache[i + 3].symbol]] = cache[i + 3].index;
+    }
+    /* Tail: j is restored to count and the remaining entries (at most three) are placed one at a time. */
+    for (j += 3; i < j; i += 1)
+    {
+        SA[--buckets[cache[i].symbol]] = cache[i].index;
+    }
+}
+
+static void libsais16x64_final_bwt_aux_scan_right_to_left_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count)
+{ /* Place step with auxiliary sampling: cache slots are consumed in pairs; the even slot's index is stored at the pre-decremented slot of buckets[symbol], and when the odd slot's index q has (q & rm) == 0, I[q / (rm + 1)] is set to that bucket value plus 1. */
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = 0, j = count - 6; i < j; i += 8)
+    {
+        libsais16x64_prefetchr(&cache[i + prefetch_distance]);
+        /* Unrolled by four pairs (eight slots); with an even count the bound j = count - 6 keeps i + 7 below count. */
+        SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index; if ((cache[i + 1].index & rm) == 0) { I[cache[i + 1].index / (rm + 1)] = buckets[cache[i + 0].symbol] + 1; }
+        SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index; if ((cache[i + 3].index & rm) == 0) { I[cache[i + 3].index / (rm + 1)] = buckets[cache[i + 2].symbol] + 1; }
+        SA[--buckets[cache[i + 4].symbol]] = cache[i + 4].index; if ((cache[i + 5].index & rm) == 0) { I[cache[i + 5].index / (rm + 1)] = buckets[cache[i + 4].symbol] + 1; }
+        SA[--buckets[cache[i + 6].symbol]] = cache[i + 6].index; if ((cache[i + 7].index & rm) == 0) { I[cache[i + 7].index / (rm + 1)] = buckets[cache[i + 6].symbol] + 1; }
+    }
+    /* Tail: remaining pairs one at a time. NOTE(review): only this loop masks the odd-slot index with SAINT_MAX before dividing; the matching prepare step stores a non-negative position there, so the mask looks redundant - confirm. */
+    for (j += 6; i < j; i += 2)
+    {
+        SA[--buckets[cache[i].symbol]] = cache[i].index; if ((cache[i + 1].index & rm) == 0) { I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol] + 1; }
+    }
+}
+
+static void libsais16x64_final_sorting_scan_right_to_left_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Gather step: walks SA[omp_block_start, omp_block_start + omp_block_size) forward, masks each entry in place and fills cache[i] (same index i as SA) with the induced value and its symbol, or with symbol SAINT_MIN when nothing is induced. */
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
+    {
+        libsais16x64_prefetchw(&SA[i + 2 * prefetch_distance]);
+        /* Look prefetch_distance entries ahead and prefetch the two T elements preceding T[s] when s > 0 (NULL is passed otherwise). */
+        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL);
+
+        libsais16x64_prefetchw(&cache[i + prefetch_distance]);
+        /* Per entry: mask SA[i] with SAINT_MAX; if it was positive, decrement to p, set cache[i].index to p flagged at bit SAINT_BIT - 1 when T[p - (p > 0)] > T[p], and use T[p] as the symbol; cache[i].symbol is always written. */
+        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; cache[i + 0].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); symbol0 = T[p0]; } cache[i + 0].symbol = symbol0;
+        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; cache[i + 1].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); symbol1 = T[p1]; } cache[i + 1].symbol = symbol1;
+    }
+    /* Tail: j is moved to the end of the slice and the remaining entries are processed one at a time without prefetching. */
+    for (j += prefetch_distance + 1; i < j; i += 1)
+    {
+        sa_sint_t symbol = SAINT_MIN, p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; cache[i].index = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); symbol = T[p]; } cache[i].symbol = symbol;
+    }
+}
+
+static void libsais16x64_final_sorting_scan_right_to_left_32s_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Sort step: walks cache[omp_block_start, omp_block_start + omp_block_size) downward, replaces each non-negative symbol by its target position (pre-decremented induction_bucket value) and, when that target is at or above omp_block_start, induces the follow-up entry directly into the cache. */
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
+    {
+        libsais16x64_prefetchw(&cache[i - 2 * prefetch_distance]);
+        /* Look prefetch_distance entries ahead and prefetch the bucket slot of each upcoming symbol when it is non-negative (NULL is passed otherwise). */
+        sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais16x64_prefetchw(s0 >= 0 ? Is0 : NULL);
+        sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais16x64_prefetchw(s1 >= 0 ? Is1 : NULL);
+        /* Entries with a negative symbol are skipped. For the rest: ni is the target position and np the cached value; cache[i].index is masked with SAINT_MAX and, if np > 0, cache[ni] receives the flagged np - 1 and symbol T[np - 1]. */
+        sa_sint_t v0 = cache[i - 0].symbol;
+        if (v0 >= 0)
+        {
+            cache[i - 0].symbol = --induction_bucket[v0];
+            if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; cache[i - 0].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
+        }
+
+        sa_sint_t v1 = cache[i - 1].symbol;
+        if (v1 >= 0)
+        {
+            cache[i - 1].symbol = --induction_bucket[v1];
+            if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; cache[i - 1].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
+        }
+    }
+    /* Tail: j is rewound to omp_block_start and the remaining entries are processed one at a time without prefetching. */
+    for (j -= prefetch_distance + 1; i >= j; i -= 1)
+    {
+        sa_sint_t v = cache[i].symbol;
+        if (v >= 0)
+        {
+            cache[i].symbol = --induction_bucket[v];
+            if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; cache[i].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
+        }
+    }
+}
+
+static void libsais16x64_final_bwt_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Runs the right-to-left final BWT scan over SA[block_start, block_start + block_size), splitting the block across OpenMP threads when the parallel region below is active. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        /* The stride is rounded down to a multiple of 16 and the last thread takes the remainder; the slice start is block-relative until the next line rebases it. */
+        omp_block_start += block_start;
+
+        if (omp_num_threads == 1)
+        { /* The index returned by the sequential scan is discarded here. */
+            libsais16x64_final_bwt_scan_right_to_left_16u(T, SA, induction_bucket, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            { /* Phase 1: every thread runs the prepare step on its own slice with its private buckets and cache, and stores the returned count. */
+                thread_state[omp_thread_num].state.count = libsais16x64_final_bwt_scan_right_to_left_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
+            }
+
+            #pragma omp barrier
+            /* Phase 2 (master only): visiting threads from last to first, each thread's buckets[c] is replaced by the current induction_bucket[c], which is then lowered by that thread's per-symbol count. */
+            #pragma omp master
+            {
+                fast_sint_t t;
+                for (t = omp_num_threads - 1; t >= 0; --t)
+                {
+                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
+                    fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; }
+                }
+            }
+
+            #pragma omp barrier
+            /* Phase 3: every thread replays its cached entries into SA using its own rebased buckets. */
+            {
+                libsais16x64_final_order_scan_right_to_left_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
+            }
+        }
+#endif
+    }
+}
+
+static void libsais16x64_final_bwt_aux_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Processes SA[block_start, block_start + block_size) for the right-to-left "bwt aux" scan. The parallel region is only entered when threads > 1, block_size >= 64 * ALPHABET_SIZE and OpenMP dynamic adjustment is off; otherwise the serial scan handles the whole block. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16); /* per-thread share, rounded down to a multiple of 16 */
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; /* the last thread also takes the remainder */
+
+        omp_block_start += block_start; /* from block-relative offset to absolute SA position */
+
+        if (omp_num_threads == 1)
+        {
+            libsais16x64_final_bwt_aux_scan_right_to_left_16u(T, SA, rm, I, induction_bucket, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            { /* Phase 1 (all threads): prepare this thread's slice into its private buckets and cache; the returned value is kept in state.count for phase 3. */
+                thread_state[omp_thread_num].state.count = libsais16x64_final_bwt_aux_scan_right_to_left_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
+            }
+
+            #pragma omp barrier
+
+            #pragma omp master
+            { /* Phase 2 (master only): visit threads from last to first; for every symbol the thread's bucket value B is replaced by the current shared value A, and the shared value becomes A - B. */
+                fast_sint_t t;
+                for (t = omp_num_threads - 1; t >= 0; --t)
+                {
+                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
+                    fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; }
+                }
+            }
+
+            #pragma omp barrier
+
+            { /* Phase 3 (all threads): place the cached entries using the per-thread buckets rewritten in phase 2; rm and I are only consumed here. */
+                libsais16x64_final_bwt_aux_scan_right_to_left_16u_block_place(SA, rm, I, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
+            }
+        }
+#endif
+    }
+}
+
+static void libsais16x64_final_sorting_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Processes SA[block_start, block_start + block_size) for the right-to-left final sorting scan. The parallel region is only entered when threads > 1, block_size >= 64 * ALPHABET_SIZE and OpenMP dynamic adjustment is off; otherwise the serial scan handles the whole block. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16); /* per-thread share, rounded down to a multiple of 16 */
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; /* the last thread also takes the remainder */
+
+        omp_block_start += block_start; /* from block-relative offset to absolute SA position */
+
+        if (omp_num_threads == 1)
+        {
+            libsais16x64_final_sorting_scan_right_to_left_16u(T, SA, induction_bucket, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            { /* Phase 1 (all threads): prepare this thread's slice into its private buckets and cache; the returned value is kept in state.count for phase 3. */
+                thread_state[omp_thread_num].state.count = libsais16x64_final_sorting_scan_right_to_left_16u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
+            }
+
+            #pragma omp barrier
+
+            #pragma omp master
+            { /* Phase 2 (master only): visit threads from last to first; for every symbol the thread's bucket value B is replaced by the current shared value A, and the shared value becomes A - B. */
+                fast_sint_t t;
+                for (t = omp_num_threads - 1; t >= 0; --t)
+                {
+                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
+                    fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; }
+                }
+            }
+
+            #pragma omp barrier
+
+            { /* Phase 3 (all threads): place the cached entries into SA using the per-thread buckets rewritten in phase 2. */
+                libsais16x64_final_order_scan_right_to_left_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
+            }
+        }
+#endif
+    }
+}
+
+static void libsais16x64_final_sorting_scan_right_to_left_32s_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
+{ /* Processes SA[block_start, block_start + block_size) for the right-to-left final sorting scan over an sa_sint_t text. The parallel region is only entered when threads > 1 and block_size >= 16384; otherwise the serial scan handles the whole block. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(cache);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16); /* per-thread share, rounded down to a multiple of 16 */
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; /* the last thread also takes the remainder */
+
+        omp_block_start += block_start; /* from block-relative offset to absolute SA position */
+
+        if (omp_num_threads == 1)
+        {
+            libsais16x64_final_sorting_scan_right_to_left_32s(T, SA, buckets, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            { /* Phase 1 (all threads): gather this thread's slice into the shared cache. The cache pointer is passed shifted by -block_start, presumably so the helpers can index it with absolute SA positions - TODO confirm against the helpers. */
+                libsais16x64_final_sorting_scan_right_to_left_32s_block_sort_placeholder_guard: ;
+            }
+        }
+#endif
+    }
+}
+
+#endif
+
+static sa_sint_t libsais16x64_final_bwt_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Driver for the right-to-left BWT scan over SA[0, n). Serial when threads == 1 or n < 65536; otherwise SA is walked from the end in runs delimited by zero entries and each long enough run is handed to the block-parallel helper. Returns the serial scan's result, or the position of the last zero entry seen (-1 if none). */
+    sa_sint_t index = -1;
+
+    if (threads == 1 || n < 65536)
+    {
+        index = libsais16x64_final_bwt_scan_right_to_left_16u(T, SA, induction_bucket, 0, n);
+    }
+#if defined(LIBSAIS_OPENMP)
+    else
+    {
+        fast_sint_t block_start;
+        for (block_start = (fast_sint_t)n - 1; block_start >= 0; ) /* block_start is the highest position not yet processed; it is advanced inside the body */
+        {
+            if (SA[block_start] == 0)
+            {
+                index = (sa_sint_t)block_start--; /* remember where the zero entry sits, then step over it */
+            }
+            else
+            {
+                fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < 0) { block_max_end = -1; } /* lowest position (exclusive) a single run may reach, clamped to -1 */
+                fast_sint_t block_end     = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } /* extend the run downwards until a zero entry or the cap */
+                fast_sint_t block_size    = block_start - block_end;
+
+                if (block_size < 32)
+                { /* Short run: handled inline, one entry at a time. */
+                    for (; block_start > block_end; block_start -= 1)
+                    { /* Clears the top bit of the entry; for p > 0 it stores c1 = T[p - 1] back into the entry and pushes either p - 1 (c0 <= c1) or c0 | SAINT_MIN to the front of bucket c1, where c0 is the symbol before c1 (or c1 itself at text position 0). */
+                        sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[block_start] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; }
+                    }
+                }
+                else
+                {
+                    libsais16x64_final_bwt_scan_right_to_left_16u_block_omp(T, SA, induction_bucket, block_end + 1, block_size, threads, thread_state);
+                    block_start = block_end; /* continue below the run just processed */
+                }
+            }
+        }
+    }
+#else
+    UNUSED(thread_state);
+#endif
+
+    return index;
+}
+
+static void libsais16x64_final_bwt_aux_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Driver for the right-to-left "bwt aux" scan over SA[0, n). Serial when threads == 1 or n < 65536; otherwise SA is walked from the end in runs delimited by zero entries and each long enough run is handed to the block-parallel helper. */
+    if (threads == 1 || n < 65536)
+    {
+        libsais16x64_final_bwt_aux_scan_right_to_left_16u(T, SA, rm, I, induction_bucket, 0, n);
+    }
+#if defined(LIBSAIS_OPENMP)
+    else
+    {
+        fast_sint_t block_start;
+        for (block_start = (fast_sint_t)n - 1; block_start >= 0; ) /* block_start is the highest position not yet processed; it is advanced inside the body */
+        {
+            if (SA[block_start] == 0)
+            {
+                block_start--; /* zero entries are skipped */
+            }
+            else
+            {
+                fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * ((LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads) / 2); if (block_max_end < 0) { block_max_end = -1; } /* run length cap; note the per-thread budget is halved compared with the non-aux drivers */
+                fast_sint_t block_end     = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } /* extend the run downwards until a zero entry or the cap */
+                fast_sint_t block_size    = block_start - block_end;
+
+                if (block_size < 32)
+                { /* Short run: handled inline, one entry at a time. */
+                    for (; block_start > block_end; block_start -= 1)
+                    { /* Same per-entry step as the plain BWT driver's inline loop, plus: when (p & rm) == 0 after the decrement, I[p / (rm + 1)] receives the just-written bucket position + 1. */
+                        sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[block_start] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]] + 1; } }
+                    }
+                }
+                else
+                {
+                    libsais16x64_final_bwt_aux_scan_right_to_left_16u_block_omp(T, SA, rm, I, induction_bucket, block_end + 1, block_size, threads, thread_state);
+                    block_start = block_end; /* continue below the run just processed */
+                }
+            }
+        }
+    }
+#else
+    UNUSED(thread_state);
+#endif
+}
+
+static void libsais16x64_final_sorting_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Driver for the right-to-left final sorting scan over SA[0, n). Serial when threads == 1 or n < 65536; otherwise SA is walked from the end in runs delimited by zero entries and each long enough run is handed to the block-parallel helper. */
+    if (threads == 1 || n < 65536)
+    {
+        libsais16x64_final_sorting_scan_right_to_left_16u(T, SA, induction_bucket, 0, n);
+    }
+#if defined(LIBSAIS_OPENMP)
+    else
+    {
+        fast_sint_t block_start;
+        for (block_start = (fast_sint_t)n - 1; block_start >= 0; ) /* block_start is the highest position not yet processed; it is advanced inside the body */
+        {
+            if (SA[block_start] == 0)
+            {
+                block_start--; /* zero entries are skipped */
+            }
+            else
+            {
+                fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < -1) { block_max_end = -1; } /* lowest position (exclusive) a single run may reach, clamped to -1 */
+                fast_sint_t block_end     = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } /* extend the run downwards until a zero entry or the cap */
+                fast_sint_t block_size    = block_start - block_end;
+
+                if (block_size < 32)
+                { /* Short run: handled inline, one entry at a time. */
+                    for (; block_start > block_end; block_start -= 1)
+                    { /* Clears the top bit of the entry; for p > 0 it pushes p - 1 to the front of bucket T[p - 1], with the top bit set when the preceding symbol is greater than T[p - 1]. */
+                        sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
+                    }
+                }
+                else
+                {
+                    libsais16x64_final_sorting_scan_right_to_left_16u_block_omp(T, SA, induction_bucket, block_end + 1, block_size, threads, thread_state);
+                    block_start = block_end; /* continue below the run just processed */
+                }
+            }
+        }
+    }
+#else
+    UNUSED(thread_state);
+#endif
+}
+
+static void libsais16x64_final_sorting_scan_right_to_left_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Driver for the right-to-left final sorting scan over an sa_sint_t text. Serial when threads == 1 or n < 65536; otherwise SA[0, n) is cut, from the end, into fixed-size blocks of at most threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries. */
+    if (threads == 1 || n < 65536)
+    {
+        libsais16x64_final_sorting_scan_right_to_left_32s(T, SA, induction_bucket, 0, n);
+    }
+#if defined(LIBSAIS_OPENMP)
+    else
+    {
+        fast_sint_t block_start, block_end;
+        for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end)
+        {
+            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; } /* exclusive lower end of this block, clamped to -1 */
+
+            libsais16x64_final_sorting_scan_right_to_left_32s_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads); /* only thread 0's cache pointer is passed on */
+        }
+    }
+#else
+    UNUSED(thread_state);
+#endif
+}
+
+static void libsais16x64_clear_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT bucket_start, sa_sint_t * RESTRICT bucket_end, sa_sint_t threads)
+{
+    fast_sint_t symbol; /* Zero-fills SA[bucket_start[s], bucket_end[s]) for every s in [0, k); ranges that are empty or inverted are left untouched. */
+
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && n >= 65536)
+#else
+    UNUSED(threads); UNUSED(n);
+#endif
+    for (symbol = 0; symbol < k; ++symbol)
+    {
+        const sa_sint_t range_begin = bucket_start[symbol];
+        const sa_sint_t range_end   = bucket_end[symbol];
+
+        if (range_end > range_begin) { memset(SA + range_begin, 0, (size_t)(range_end - range_begin) * sizeof(sa_sint_t)); }
+    }
+}
+
+static sa_sint_t libsais16x64_induce_final_order_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+    sa_sint_t * const l2r_buckets = buckets + 6 * ALPHABET_SIZE; /* table handed to the left-to-right scans */
+    sa_sint_t * const r2l_buckets = buckets + 7 * ALPHABET_SIZE; /* table handed to the right-to-left scans */
+    const int clear_between_scans = (threads > 1) && (n >= 65536); /* the clear step runs only for multi-threaded, large inputs */
+
+    if (bwt && I == NULL)
+    {
+        libsais16x64_final_bwt_scan_left_to_right_16u_omp(T, SA, n, l2r_buckets, threads, thread_state);
+        if (clear_between_scans) { libsais16x64_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, l2r_buckets, r2l_buckets, threads); }
+        return libsais16x64_final_bwt_scan_right_to_left_16u_omp(T, SA, n, r2l_buckets, threads, thread_state); /* plain BWT: the scan's result is the return value */
+    }
+
+    if (bwt) { libsais16x64_final_bwt_aux_scan_left_to_right_16u_omp(T, SA, n, r - 1, I, l2r_buckets, threads, thread_state); }
+    else     { libsais16x64_final_sorting_scan_left_to_right_16u_omp(T, SA, n, l2r_buckets, threads, thread_state); }
+
+    if (clear_between_scans) { libsais16x64_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, l2r_buckets, r2l_buckets, threads); }
+
+    if (bwt) { libsais16x64_final_bwt_aux_scan_right_to_left_16u_omp(T, SA, n, r - 1, I, r2l_buckets, threads, thread_state); }
+    else     { libsais16x64_final_sorting_scan_right_to_left_16u_omp(T, SA, n, r2l_buckets, threads, thread_state); }
+
+    return 0; /* suffix-array and BWT-with-aux modes always report 0 */
+}
+
+static void libsais16x64_induce_final_order_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Final induction for the "6k" bucket layout: left-to-right scan with the table at buckets[4k], then right-to-left scan with the table at buckets[5k]. */
+    libsais16x64_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[4 * (fast_sint_t)k], threads, thread_state);
+    libsais16x64_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[5 * (fast_sint_t)k], threads, thread_state);
+}
+
+static void libsais16x64_induce_final_order_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Final induction for the "4k" bucket layout: left-to-right scan with the table at buckets[2k], then right-to-left scan with the table at buckets[3k]. */
+    libsais16x64_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[2 * (fast_sint_t)k], threads, thread_state);
+    libsais16x64_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[3 * (fast_sint_t)k], threads, thread_state);
+}
+
+static void libsais16x64_induce_final_order_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Final induction for the "2k" bucket layout: note the order of the tables is reversed here - the left-to-right scan uses buckets[k] and the right-to-left scan uses buckets[0]. */
+    libsais16x64_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[1 * (fast_sint_t)k], threads, thread_state);
+    libsais16x64_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[0 * (fast_sint_t)k], threads, thread_state);
+}
+
+static void libsais16x64_induce_final_order_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Final induction for the "1k" layout: a single bucket table is reused, so it is recounted and re-initialised before each of the two scans. */
+    libsais16x64_count_suffixes_32s(T, n, k, buckets);
+    libsais16x64_initialize_buckets_start_32s_1k(k, buckets); /* "start" initialisation precedes the left-to-right scan */
+    libsais16x64_final_sorting_scan_left_to_right_32s_omp(T, SA, n, buckets, threads, thread_state);
+
+    libsais16x64_count_suffixes_32s(T, n, k, buckets);
+    libsais16x64_initialize_buckets_end_32s_1k(k, buckets); /* "end" initialisation precedes the right-to-left scan */
+    libsais16x64_final_sorting_scan_right_to_left_32s_omp(T, SA, n, buckets, threads, thread_state);
+}
+
+static sa_sint_t libsais16x64_renumber_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t f, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* For every i in [omp_block_start, omp_block_start + omp_block_size): p = SA[i], s = SA[m + (p >> 1)]. A negative s increments f, ORs SAINT_MIN into T[p] and is rewritten as i + SAINT_MIN; any other s is rewritten as s - f. Returns the updated f (the incoming f is the running count before this block). */
+    const fast_sint_t prefetch_distance = 32;
+
+    sa_sint_t * RESTRICT SAm = &SA[m];
+
+    sa_sint_t i, j;
+    for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 2 * (sa_sint_t)prefetch_distance - 3; i < j; i += 4) /* 4x unrolled main loop; stops 2 * prefetch_distance + 3 entries before the block end, which bounds the look-ahead loads of SA[i + 2 * prefetch_distance + 3] */
+    {
+        libsais16x64_prefetchr(&SA[i + 3 * prefetch_distance]);
+
+        libsais16x64_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 0]) >> 1]);
+        libsais16x64_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 1]) >> 1]);
+        libsais16x64_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 2]) >> 1]);
+        libsais16x64_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 3]) >> 1]);
+
+        sa_uint_t q0 = (sa_uint_t)SA[i + prefetch_distance + 0]; const sa_sint_t * Tq0 = &T[q0]; libsais16x64_prefetchw(SAm[q0 >> 1] < 0 ? Tq0 : NULL); /* T[q] is prefetched only when it is going to be written; otherwise NULL is passed to the prefetch helper */
+        sa_uint_t q1 = (sa_uint_t)SA[i + prefetch_distance + 1]; const sa_sint_t * Tq1 = &T[q1]; libsais16x64_prefetchw(SAm[q1 >> 1] < 0 ? Tq1 : NULL);
+        sa_uint_t q2 = (sa_uint_t)SA[i + prefetch_distance + 2]; const sa_sint_t * Tq2 = &T[q2]; libsais16x64_prefetchw(SAm[q2 >> 1] < 0 ? Tq2 : NULL);
+        sa_uint_t q3 = (sa_uint_t)SA[i + prefetch_distance + 3]; const sa_sint_t * Tq3 = &T[q3]; libsais16x64_prefetchw(SAm[q3 >> 1] < 0 ? Tq3 : NULL);
+
+        sa_uint_t p0 = (sa_uint_t)SA[i + 0]; sa_sint_t s0 = SAm[p0 >> 1]; if (s0 < 0) { T[p0] |= SAINT_MIN; f++; s0 = i + 0 + SAINT_MIN + f; } SAm[p0 >> 1] = s0 - f; /* the "+ f" inside the branch cancels the unconditional "- f", leaving i + SAINT_MIN */
+        sa_uint_t p1 = (sa_uint_t)SA[i + 1]; sa_sint_t s1 = SAm[p1 >> 1]; if (s1 < 0) { T[p1] |= SAINT_MIN; f++; s1 = i + 1 + SAINT_MIN + f; } SAm[p1 >> 1] = s1 - f;
+        sa_uint_t p2 = (sa_uint_t)SA[i + 2]; sa_sint_t s2 = SAm[p2 >> 1]; if (s2 < 0) { T[p2] |= SAINT_MIN; f++; s2 = i + 2 + SAINT_MIN + f; } SAm[p2 >> 1] = s2 - f;
+        sa_uint_t p3 = (sa_uint_t)SA[i + 3]; sa_sint_t s3 = SAm[p3 >> 1]; if (s3 < 0) { T[p3] |= SAINT_MIN; f++; s3 = i + 3 + SAINT_MIN + f; } SAm[p3 >> 1] = s3 - f;
+    }
+
+    for (j += 2 * (sa_sint_t)prefetch_distance + 3; i < j; i += 1) /* scalar tail up to the real end of the block, without prefetching */
+    {
+        sa_uint_t p = (sa_uint_t)SA[i]; sa_sint_t s = SAm[p >> 1]; if (s < 0) { T[p] |= SAINT_MIN; f++; s = i + SAINT_MIN + f; } SAm[p >> 1] = s - f;
+    }
+
+    return f;
+}
+
+static void libsais16x64_compact_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t * pl, fast_sint_t * pr, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Walks SA[m + omp_block_start, m + omp_block_start + omp_block_size) from high to low and splits its entries into two downward-growing output runs inside SA: negative entries (top bit cleared) end just below *pl, positive entries (minus one) end just below *pr; zero entries are dropped. On return *pl and *pr hold the first index of each run. */
+    const fast_sint_t prefetch_distance = 32;
+
+    sa_sint_t * RESTRICT SAl = &SA[0];
+    sa_sint_t * RESTRICT SAr = &SA[0];
+
+    fast_sint_t i, j, l = *pl - 1, r = *pr - 1; /* l and r are the next slots to write in each run */
+    for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j; i -= 4) /* 4x unrolled main loop */
+    {
+        libsais16x64_prefetchr(&SA[i - prefetch_distance]);
+
+        sa_sint_t p0 = SA[i - 0]; SAl[l] = p0 & SAINT_MAX; l -= p0 < 0; SAr[r] = p0 - 1; r -= p0 > 0; /* branch-free: both slots are always written, but a cursor only moves when its condition holds, so an unwanted write is overwritten later */
+        sa_sint_t p1 = SA[i - 1]; SAl[l] = p1 & SAINT_MAX; l -= p1 < 0; SAr[r] = p1 - 1; r -= p1 > 0;
+        sa_sint_t p2 = SA[i - 2]; SAl[l] = p2 & SAINT_MAX; l -= p2 < 0; SAr[r] = p2 - 1; r -= p2 > 0;
+        sa_sint_t p3 = SA[i - 3]; SAl[l] = p3 & SAINT_MAX; l -= p3 < 0; SAr[r] = p3 - 1; r -= p3 > 0;
+    }
+
+    for (j -= 3; i >= j; i -= 1) /* scalar tail down to the first entry of the block */
+    {
+        sa_sint_t p = SA[i]; SAl[l] = p & SAINT_MAX; l -= p < 0; SAr[r] = p - 1; r -= p > 0;
+    }
+    
+    *pl = l + 1; *pr = r + 1;
+}
+
+
+#if defined(LIBSAIS_OPENMP)
+
+static sa_sint_t libsais16x64_count_unique_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Returns how many i in [omp_block_start, omp_block_start + omp_block_size) have a negative SA[m + (SA[i] >> 1)]; this is the same condition that makes the renumber pass increment f. Read-only with respect to SA. */
+    const fast_sint_t prefetch_distance = 32;
+
+    sa_sint_t * RESTRICT SAm = &SA[m];
+
+    fast_sint_t i, j; sa_sint_t f0 = 0, f1 = 0, f2 = 0, f3 = 0; /* four independent accumulators, one per unrolled lane */
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) /* stops prefetch_distance + 3 entries early, which bounds the look-ahead loads of SA[i + prefetch_distance + 3] */
+    {
+        libsais16x64_prefetchr(&SA[i + 2 * prefetch_distance]);
+
+        libsais16x64_prefetchr(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
+        libsais16x64_prefetchr(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
+        libsais16x64_prefetchr(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]);
+        libsais16x64_prefetchr(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]);
+
+        f0 += SAm[((sa_uint_t)SA[i + 0]) >> 1] < 0;
+        f1 += SAm[((sa_uint_t)SA[i + 1]) >> 1] < 0;
+        f2 += SAm[((sa_uint_t)SA[i + 2]) >> 1] < 0;
+        f3 += SAm[((sa_uint_t)SA[i + 3]) >> 1] < 0;
+    }
+
+    for (j += prefetch_distance + 3; i < j; i += 1) /* scalar tail up to the real end of the block */
+    {
+        f0 += SAm[((sa_uint_t)SA[i]) >> 1] < 0;
+    }
+
+    return f0 + f1 + f2 + f3;
+}
+
+#endif
+
+static sa_sint_t libsais16x64_renumber_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Runs the renumber pass over SA[0, m) and returns the final value of its counter f. Parallel only when threads > 1 and m >= 65536: threads first count their own contribution so that each one can start the real pass with the correct running value of f. */
+    sa_sint_t f = 0;
+
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (m / omp_num_threads) & (-16); /* per-thread share, rounded down to a multiple of 16 */
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start; /* the last thread also takes the remainder */
+
+        if (omp_num_threads == 1)
+        {
+            f = libsais16x64_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, 0, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            { /* Phase 1 (all threads): count this slice's contribution to f without modifying anything. */
+                thread_state[omp_thread_num].state.count = libsais16x64_count_unique_suffixes(SA, m, omp_block_start, omp_block_size);
+            }
+
+            #pragma omp barrier
+
+            { /* Phase 2 (all threads): sum the counts of all lower-numbered threads to get this slice's starting f, then run the real pass. */
+                fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }
+
+                if (omp_thread_num == omp_num_threads - 1)
+                { /* only the last thread knows the grand total, so it alone writes the shared result */
+                    f = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
+                }
+
+                libsais16x64_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
+            }
+        }
+#endif
+    }
+
+    return f;
+}
+
+static void libsais16x64_compact_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Runs the compaction pass over the n >> 1 entries that start at SA[m], producing one run that ends at SA[m] and one that ends at SA[n + fs], then copies f entries from SA[m - f] to SA[n + fs - m]. Parallel only when threads > 1, n >= 131072 and m < fs. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072 && m < fs)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); /* per-thread share of the n >> 1 entries, rounded down to a multiple of 16 */
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; /* the last thread also takes the remainder */
+
+        if (omp_num_threads == 1)
+        {
+            fast_sint_t l = m, r = (fast_sint_t)n + (fast_sint_t)fs; /* serial case: both runs are written straight to their final end positions */
+            libsais16x64_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &l, &r, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            { /* Phase 1 (all threads): compact this slice into thread-specific staging positions; state.position / state.count start as the exclusive end of each staging run and come back as its first index. */
+                thread_state[omp_thread_num].state.position   = (fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_start + omp_block_size;
+                thread_state[omp_thread_num].state.count      = (fast_sint_t)m + omp_block_start + omp_block_size;
+
+                libsais16x64_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &thread_state[omp_thread_num].state.position, &thread_state[omp_thread_num].state.count, omp_block_start, omp_block_size);
+            }
+
+            #pragma omp barrier
+
+            #pragma omp master
+            { /* Phase 2 (master only): concatenate the staged runs, last thread first, so that the results end exactly at SA[m] and SA[n + fs]. */
+                fast_sint_t t, position;
+
+                for (position = m, t = omp_num_threads - 1; t >= 0; --t)
+                { /* first run: staged ends are m + (n >> 1) + (end of thread t's slice) */
+                    fast_sint_t omp_block_end     = t < omp_num_threads - 1 ? omp_block_stride * (t + 1) : ((fast_sint_t)n >> 1);
+                    fast_sint_t count             = ((fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_end - thread_state[t].state.position);
+
+                    if (count > 0)
+                    {
+                        position -= count; memcpy(&SA[position], &SA[thread_state[t].state.position], (size_t)count * sizeof(sa_sint_t));
+                    }
+                }
+
+                for (position = (fast_sint_t)n + (fast_sint_t)fs, t = omp_num_threads - 1; t >= 0; --t)
+                { /* second run: staged ends are m + (end of thread t's slice) */
+                    fast_sint_t omp_block_end     = t < omp_num_threads - 1 ? omp_block_stride * (t + 1) : ((fast_sint_t)n >> 1);
+                    fast_sint_t count             = ((fast_sint_t)m + omp_block_end - thread_state[t].state.count);
+
+                    if (count > 0)
+                    {
+                        position -= count; memcpy(&SA[position], &SA[thread_state[t].state.count], (size_t)count * sizeof(sa_sint_t));
+                    }
+                }
+            }
+        }
+#endif
+    }
+
+    memcpy(&SA[(fast_sint_t)n + (fast_sint_t)fs - (fast_sint_t)m], &SA[(fast_sint_t)m - (fast_sint_t)f], (size_t)f * sizeof(sa_sint_t)); /* runs after the parallel region in both the serial and the parallel case */
+}
+
+static sa_sint_t libsais16x64_compact_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Two-step wrapper: renumber first, then compact using the count the renumber step produced; that count is also the return value. */
+    sa_sint_t f = libsais16x64_renumber_unique_and_nonunique_lms_suffixes_32s_omp(T, SA, m, threads, thread_state);
+    libsais16x64_compact_unique_and_nonunique_lms_suffixes_32s_omp(SA, n, m, fs, f, threads, thread_state);
+
+    return f;
+}
+
+static void libsais16x64_merge_unique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Scans T[omp_block_start, omp_block_start + omp_block_size) for entries with the top bit set. Each hit has its top bit cleared and its text position i stored at SA[tmp], where the destinations tmp are read sequentially from SA starting at index n - m - 1 + l (l = number of hits in earlier blocks). */
+    const fast_sint_t prefetch_distance = 32;
+
+    const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];
+
+    sa_sint_t i, j; fast_sint_t tmp = *SAnm++; /* the next destination is always fetched one step ahead of its use */
+    for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 6; i < j; i += 4) /* the bound leaves 6 entries of slack: T[i + 3] is read after up to three in-body i++ */
+    {
+        libsais16x64_prefetchr(&T[i + prefetch_distance]);
+
+        sa_sint_t c0 = T[i + 0]; if (c0 < 0) { T[i + 0] = c0 & SAINT_MAX; SA[tmp] = i + 0; i++; tmp = *SAnm++; } /* the i++ after a hit skips the position right behind it, and shifts every later lane of this iteration */
+        sa_sint_t c1 = T[i + 1]; if (c1 < 0) { T[i + 1] = c1 & SAINT_MAX; SA[tmp] = i + 1; i++; tmp = *SAnm++; }
+        sa_sint_t c2 = T[i + 2]; if (c2 < 0) { T[i + 2] = c2 & SAINT_MAX; SA[tmp] = i + 2; i++; tmp = *SAnm++; }
+        sa_sint_t c3 = T[i + 3]; if (c3 < 0) { T[i + 3] = c3 & SAINT_MAX; SA[tmp] = i + 3; i++; tmp = *SAnm++; }
+    }
+
+    for (j += 6; i < j; i += 1) /* scalar tail up to the real end of the block */
+    {
+        sa_sint_t c = T[i]; if (c < 0) { T[i] = c & SAINT_MAX; SA[tmp] = i; i++; tmp = *SAnm++; }
+    }
+}
+
+static void libsais16x64_merge_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Fills every zero entry of SA[omp_block_start, omp_block_start + omp_block_size), in increasing index order, with consecutive values read from SA starting at index n - m - 1 + l (l = number of values consumed before this block). */
+    const fast_sint_t prefetch_distance = 32;
+
+    const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];
+
+    fast_sint_t i, j; sa_sint_t tmp = *SAnm++; /* the next value is always fetched one step ahead of its use */
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4) /* 4x unrolled main loop */
+    {
+        libsais16x64_prefetchr(&SA[i + prefetch_distance]);
+
+        if (SA[i + 0] == 0) { SA[i + 0] = tmp; tmp = *SAnm++; }
+        if (SA[i + 1] == 0) { SA[i + 1] = tmp; tmp = *SAnm++; }
+        if (SA[i + 2] == 0) { SA[i + 2] = tmp; tmp = *SAnm++; }
+        if (SA[i + 3] == 0) { SA[i + 3] = tmp; tmp = *SAnm++; }
+    }
+
+    for (j += 3; i < j; i += 1) /* scalar tail up to the real end of the block */
+    {
+        if (SA[i] == 0) { SA[i] = tmp; tmp = *SAnm++; }
+    }
+}
+
+static void libsais16x64_merge_unique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Runs the unique-suffix merge over T[0, n). Parallel only when threads > 1 and n >= 65536: threads first count the marked entries of their slice so that each one knows where its destinations start in the source list. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16); /* per-thread share, rounded down to a multiple of 16 */
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; /* the last thread also takes the remainder */
+
+        if (omp_num_threads == 1)
+        {
+            libsais16x64_merge_unique_lms_suffixes_32s(T, SA, n, m, 0, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            { /* Phase 1 (all threads): count the entries of this slice that the merge will consume a destination for. */
+                thread_state[omp_thread_num].state.count = libsais16x64_count_negative_marked_suffixes(T, omp_block_start, omp_block_size);
+            }
+
+            #pragma omp barrier
+
+            { /* Phase 2 (all threads): the sum over lower-numbered threads is this slice's offset into the destination list. */
+                fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }
+
+                libsais16x64_merge_unique_lms_suffixes_32s(T, SA, n, m, count, omp_block_start, omp_block_size);
+            }
+        }
+#endif
+    }
+}
+
+static void libsais16x64_merge_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{ /* Runs the non-unique merge over SA[0, m), starting f values into the source list. Parallel only when threads > 1 and m >= 65536: threads first count the zero entries of their slice so that each one knows where its values start. */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads); UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (m / omp_num_threads) & (-16); /* per-thread share, rounded down to a multiple of 16 */
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start; /* the last thread also takes the remainder */
+
+        if (omp_num_threads == 1)
+        {
+            libsais16x64_merge_nonunique_lms_suffixes_32s(SA, n, m, f, omp_block_start, omp_block_size);
+        }
+#if defined(LIBSAIS_OPENMP)
+        else
+        {
+            { /* Phase 1 (all threads): count the entries of this slice that the merge will fill. */
+                thread_state[omp_thread_num].state.count = libsais16x64_count_zero_marked_suffixes(SA, omp_block_start, omp_block_size);
+            }
+
+            #pragma omp barrier
+
+            { /* Phase 2 (all threads): f plus the sum over lower-numbered threads is this slice's offset into the source list. */
+                fast_sint_t t, count = f; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }
+
+                libsais16x64_merge_nonunique_lms_suffixes_32s(SA, n, m, count, omp_block_start, omp_block_size);
+            }
+        }
+#endif
+    }
+}
+
+/**
+ * Runs the two merge passes back to back: libsais16x64_merge_unique_lms_suffixes_32s_omp over
+ * (T, SA, n, m) followed by libsais16x64_merge_nonunique_lms_suffixes_32s_omp over
+ * (SA, n, m, f). Both passes share the same thread count and thread_state scratch area.
+ */
+static void libsais16x64_merge_compacted_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+    /* Pass 1: entries handled by the "unique" merge. */
+    libsais16x64_merge_unique_lms_suffixes_32s_omp(T, SA, n, m, threads, thread_state);
+
+    /* Pass 2: the remaining entries, starting at offset f. */
+    libsais16x64_merge_nonunique_lms_suffixes_32s_omp(SA, n, m, f, threads, thread_state);
+}
+
+/**
+ * Rebuilds the LMS suffix list in SA after the recursive call has returned (variant that
+ * also refreshes the 2k bucket counters).
+ *
+ * When f <= 0 the suffixes are simply counted/gathered again and reconstructed. When f > 0
+ * the f entries stored at SA[n + fs - m ..] are first moved to SA[n - m - 1 ..], the compacted
+ * suffixes are gathered and reconstructed (m - f of them), copied behind the moved entries,
+ * the first m slots of SA are cleared, and the two groups are merged.
+ */
+static void libsais16x64_reconstruct_compacted_lms_suffixes_32s_2k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+    if (f <= 0)
+    {
+        libsais16x64_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
+        libsais16x64_reconstruct_lms_suffixes_omp(SA, n, m, threads);
+        return;
+    }
+
+    const size_t moved_bytes  = (size_t)f * sizeof(sa_sint_t);
+    const size_t sorted_bytes = ((size_t)m - (size_t)f) * sizeof(sa_sint_t);
+    const size_t head_bytes   = (size_t)m * sizeof(sa_sint_t);
+
+    /* Source and destination may overlap, hence memmove. */
+    memmove(&SA[n - m - 1], &SA[n + fs - m], moved_bytes);
+
+    libsais16x64_count_and_gather_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
+    libsais16x64_reconstruct_lms_suffixes_omp(SA, n, m - f, threads);
+
+    memcpy(&SA[n - m - 1 + f], &SA[0], sorted_bytes);
+    memset(&SA[0], 0, head_bytes);
+
+    libsais16x64_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state);
+}
+
+/**
+ * Rebuilds the LMS suffix list in SA after the recursive call has returned (variant without
+ * bucket counters).
+ *
+ * When f <= 0 the suffixes are simply gathered again and reconstructed. When f > 0 the f
+ * entries stored at SA[n + fs - m ..] are first moved to SA[n - m - 1 ..], the compacted
+ * suffixes are gathered and reconstructed (m - f of them), copied behind the moved entries,
+ * the first m slots of SA are cleared, and the two groups are merged.
+ */
+static void libsais16x64_reconstruct_compacted_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+    if (f <= 0)
+    {
+        libsais16x64_gather_lms_suffixes_32s(T, SA, n);
+        libsais16x64_reconstruct_lms_suffixes_omp(SA, n, m, threads);
+        return;
+    }
+
+    const size_t moved_bytes  = (size_t)f * sizeof(sa_sint_t);
+    const size_t sorted_bytes = ((size_t)m - (size_t)f) * sizeof(sa_sint_t);
+    const size_t head_bytes   = (size_t)m * sizeof(sa_sint_t);
+
+    /* Source and destination may overlap, hence memmove. */
+    memmove(&SA[n - m - 1], &SA[n + fs - m], moved_bytes);
+
+    libsais16x64_gather_compacted_lms_suffixes_32s(T, SA, n);
+    libsais16x64_reconstruct_lms_suffixes_omp(SA, n, m - f, threads);
+
+    memcpy(&SA[n - m - 1 + f], &SA[0], sorted_bytes);
+    memset(&SA[0], 0, head_bytes);
+
+    libsais16x64_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state);
+}
+
+/**
+ * Zero-extends 32-bit values into a separate 64-bit array: D[i] = S[i] for every i in
+ * [omp_block_start, omp_block_start + omp_block_size). S and D are declared RESTRICT, so the
+ * caller must make sure the two accessed ranges do not overlap.
+ */
+static void libsais16x64_convert_32u_to_64u(uint32_t * RESTRICT S, uint64_t * RESTRICT D, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    const fast_sint_t block_end = omp_block_start + omp_block_size;
+    fast_sint_t pos = omp_block_start;
+
+    while (pos < block_end)
+    {
+        D[pos] = (uint64_t)S[pos];
+        pos += 1;
+    }
+}
+
+/**
+ * Widens 32-bit values to 64-bit values inside the same buffer.
+ *
+ * V is addressed as an array of 32-bit words. For every i in
+ * [omp_block_start, omp_block_start + omp_block_size), visited from the highest index down,
+ * the word V[i] is expanded into the word pair V[2i], V[2i + 1]: the value goes into the low
+ * half of the 64-bit slot and the other half is set to zero. Which word is the low half is
+ * selected by __LITTLE_ENDIAN__.
+ *
+ * Walking downwards guarantees that a pair is written only after every source word at or
+ * above it has been consumed. The one aliasing case is i == 0, where V[2i] is V[i] itself;
+ * the source word is therefore loaded into a temporary before either half is stored
+ * (the previous big-endian branch zeroed V[0] first and then copied that zero into V[1]).
+ */
+static void libsais16x64_convert_inplace_32u_to_64u(uint32_t * V, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+    fast_sint_t i, j;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start; i >= j; i -= 1)
+    {
+        /* Load first: for i == 0 the destination word V[i + i] is the source word. */
+        const uint32_t value = V[i];
+
+#if defined(__LITTLE_ENDIAN__)
+        V[i + i + 0] = value; V[i + i + 1] = 0;
+#else
+        V[i + i + 0] = 0; V[i + i + 1] = value;
+#endif
+    }
+}
+
+/**
+ * Narrows 64-bit values to 32-bit values inside the same buffer.
+ *
+ * V is addressed as an array of 32-bit words. For every i in
+ * [omp_block_start, omp_block_start + omp_block_size), visited in ascending order, V[i]
+ * receives the low 32-bit half of the 64-bit slot occupying words 2i and 2i + 1. The low half
+ * is word 2i when __LITTLE_ENDIAN__ is defined and word 2i + 1 otherwise; the high half is
+ * discarded.
+ */
+static void libsais16x64_convert_inplace_64u_to_32u(uint32_t * V, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{
+#if defined(__LITTLE_ENDIAN__)
+    const fast_sint_t low_word = 0;
+#else
+    const fast_sint_t low_word = 1;
+#endif
+    const fast_sint_t block_end = omp_block_start + omp_block_size;
+    fast_sint_t pos;
+
+    /* Ascending order: the word read (index >= 2 * pos) has not been overwritten yet. */
+    for (pos = omp_block_start; pos < block_end; pos += 1)
+    {
+        V[pos] = V[pos + pos + low_word];
+    }
+}
+
+/**
+ * Widens the first n 32-bit words of V into n 64-bit values in place, using threads for the
+ * bulk of the work.
+ *
+ * While at least 65536 elements remain, the upper half of the remaining elements is widened
+ * with libsais16x64_convert_32u_to_64u: its sources are words [n, n + half) and its
+ * destinations are 64-bit slots [n, n + half), i.e. words from 2n upwards, so the ranges do
+ * not overlap (half <= n) and the block can be split across threads. The remaining prefix of
+ * fewer than 65536 elements is widened by the sequential in-place routine.
+ */
+static void libsais16x64_convert_inplace_32u_to_64u_omp(uint32_t * V, sa_sint_t n, sa_sint_t threads)
+{
+    while (n >= 65536)
+    {
+        const fast_sint_t upper_half = n >> 1;
+
+        /* n now counts the elements that are still waiting below the half being converted. */
+        n -= upper_half;
+
+#if defined(LIBSAIS_OPENMP)
+        #pragma omp parallel num_threads(threads) if(threads > 1)
+#endif
+        {
+#if defined(LIBSAIS_OPENMP)
+            const fast_sint_t tid         = omp_get_thread_num();
+            const fast_sint_t team_size   = omp_get_num_threads();
+#else
+            UNUSED(threads);
+
+            const fast_sint_t tid         = 0;
+            const fast_sint_t team_size   = 1;
+#endif
+            const fast_sint_t stride      = (upper_half / team_size) & (-16);
+            const fast_sint_t part_start  = tid * stride;
+            const fast_sint_t part_size   = (tid < team_size - 1) ? stride : upper_half - part_start;
+
+            libsais16x64_convert_32u_to_64u(((uint32_t *)(void *)V) + n, ((uint64_t *)(void *)V) + n, part_start, part_size);
+        }
+    }
+
+    libsais16x64_convert_inplace_32u_to_64u(V, 0, n);
+}
+
+static sa_sint_t libsais16x64_main_32s_recursion(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state, sa_sint_t * RESTRICT local_buffer)
+{   /* Recursive worker for an input T of n integer symbols with alphabet size k; SA has n
+     * slots plus fs slots of free space behind them.
+     *
+     * Dispatch, in order:
+     *   - n <= INT32_MAX and enough free space: T is narrowed in place and the work is handed
+     *     to the 32-bit libsais16_int / libsais16_int_omp, whose result is returned as is;
+     *   - otherwise one of four strategies, chosen by how many k-sized bucket arrays
+     *     (6, 4, 2 or 1) fit into the free space or into local_buffer.
+     *
+     * Every strategy names the LMS suffixes and, when names < m, calls itself on the reduced
+     * problem stored in the tail of SA. The four strategies return 0 on success and -2 when a
+     * nested call reports a non-zero result or a bucket allocation fails.
+     */
+    fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n); /* clamp so that n + fs does not exceed SAINT_MAX */
+
+    if (n <= INT32_MAX)
+    {
+        /* Free space offered to the 32-bit routine: fs + fs + n, capped at INT32_MAX - n
+         * (presumably because SA viewed as 32-bit words holds 2 * (n + fs) slots - confirm). */
+        sa_sint_t new_fs = (fs + fs + n + n) <= INT32_MAX ? (fs + fs + n) : INT32_MAX - n;
+        if ((new_fs / k >= 4) || (new_fs >= fs)) /* NOTE(review): divides by k without the k > 0 guard used by the branches below - confirm callers never pass k == 0 */
+        {
+            libsais16x64_convert_inplace_64u_to_32u((uint32_t *)(void *)T, 0, n); /* T stays narrowed from here on, also on failure */
+
+#if defined(LIBSAIS_OPENMP)
+            sa_sint_t index = libsais16_int_omp((int32_t *)T, (int32_t *)SA, (int32_t)n, (int32_t)k, (int32_t)new_fs, (int32_t)threads);
+#else
+            sa_sint_t index = libsais16_int((int32_t *)T, (int32_t *)SA, (int32_t)n, (int32_t)k, (int32_t)new_fs);
+#endif
+            if (index >= 0)
+            {
+                libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)SA, n, threads); /* widen the 32-bit result back to sa_sint_t */
+            }
+
+            return index;
+        }
+    }
+
+    if (k > 0 && ((fs / k >= 6) || (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 6))) /* strategy "6k": room for six k-sized bucket arrays */
+    {
+        /* buckets live in local_buffer when it is large enough, otherwise in the last 6 * k
+         * slots of the free space, aligned up when the spare room allows it. */
+        sa_sint_t alignment = (fs - 1024) / k >= 6 ? (sa_sint_t)1024 : (sa_sint_t)16;
+        sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 6 ? (sa_sint_t *)libsais16x64_align_up(&SA[n + fs - 6 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 6 * (fast_sint_t)k];
+        buckets = (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 6) ? local_buffer : buckets;
+
+        sa_sint_t m = libsais16x64_count_and_gather_lms_suffixes_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state);
+        if (m > 1)
+        {
+            memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t)); /* clear everything in front of the m gathered entries */
+
+            sa_sint_t first_lms_suffix    = SA[n - m];
+            sa_sint_t left_suffixes_count = libsais16x64_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(T, k, buckets, first_lms_suffix);
+
+            libsais16x64_radix_sort_lms_suffixes_32s_6k_omp(T, SA, n, m, &buckets[4 * (fast_sint_t)k], threads, thread_state);
+
+            if ((n / 8192) < k) { libsais16x64_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * (fast_sint_t)k], threads); } /* same test selects the renumbering and compaction variants below */
+            if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); }
+
+            libsais16x64_initialize_buckets_for_partial_sorting_32s_6k(T, k, buckets, first_lms_suffix, left_suffixes_count);
+            libsais16x64_induce_partial_order_32s_6k_omp(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state);
+
+            sa_sint_t names = (n / 8192) < k
+                ? libsais16x64_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state)
+                : libsais16x64_renumber_and_gather_lms_suffixes_omp(SA, n, m, fs, threads, thread_state);
+
+            if (names < m) /* fewer names than LMS suffixes: solve the reduced problem recursively */
+            {
+                sa_sint_t f = (n / 8192) < k
+                    ? libsais16x64_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state)
+                    : 0;
+
+                /* Reduced problem: text at SA + n + fs - m + f, length m - f, alphabet
+                 * names - f, free space fs + n - 2 * m + f; its output goes to the start of SA. */
+                if (libsais16x64_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0)
+                {
+                    return -2;
+                }
+
+                libsais16x64_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads, thread_state);
+            }
+            else
+            {
+                libsais16x64_count_lms_suffixes_32s_2k(T, n, k, buckets);
+            }
+
+            libsais16x64_initialize_buckets_start_and_end_32s_4k(k, buckets);
+            libsais16x64_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets);
+            libsais16x64_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state);
+        }
+        else
+        {
+            SA[0] = SA[n - 1]; /* m <= 1: take the entry gathered into the last slot of SA */
+
+            libsais16x64_initialize_buckets_start_and_end_32s_6k(k, buckets);
+            libsais16x64_place_lms_suffixes_histogram_32s_6k(SA, n, k, m, buckets);
+            libsais16x64_induce_final_order_32s_6k(T, SA, n, k, buckets, threads, thread_state);
+        }
+
+        return 0;
+    }
+    else if (k > 0 && ((fs / k >= 4) || (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 4))) /* strategy "4k": room for four k-sized bucket arrays */
+    {
+        sa_sint_t alignment = (fs - 1024) / k >= 4 ? (sa_sint_t)1024 : (sa_sint_t)16;
+        sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 4 ? (sa_sint_t *)libsais16x64_align_up(&SA[n + fs - 4 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 4 * (fast_sint_t)k];
+        buckets = (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 4) ? local_buffer : buckets;
+
+        sa_sint_t m = libsais16x64_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
+        if (m > 1)
+        {
+            libsais16x64_initialize_buckets_for_radix_and_partial_sorting_32s_4k(T, k, buckets, SA[n - m]);
+
+            libsais16x64_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state);
+            libsais16x64_radix_sort_set_markers_32s_4k_omp(SA, k, &buckets[1], threads);
+            
+            libsais16x64_place_lms_suffixes_interval_32s_4k(SA, n, k, m - 1, buckets);
+            libsais16x64_induce_partial_order_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state);
+
+            sa_sint_t names = libsais16x64_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state);
+            if (names < m) /* fewer names than LMS suffixes: solve the reduced problem recursively */
+            {
+                sa_sint_t f = libsais16x64_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
+
+                if (libsais16x64_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0)
+                {
+                    return -2;
+                }
+
+                libsais16x64_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads, thread_state);
+            }
+            else
+            {
+                libsais16x64_count_lms_suffixes_32s_2k(T, n, k, buckets);
+            }
+        }
+        else
+        {
+            SA[0] = SA[n - 1]; /* m <= 1: take the entry gathered into the last slot of SA */
+        }
+
+        libsais16x64_initialize_buckets_start_and_end_32s_4k(k, buckets);
+        libsais16x64_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets);
+        libsais16x64_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state);
+
+        return 0;
+    }
+    else if (k > 0 && ((fs / k >= 2) || (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 2))) /* strategy "2k": room for two k-sized bucket arrays */
+    {
+        sa_sint_t alignment = (fs - 1024) / k >= 2 ? (sa_sint_t)1024 : (sa_sint_t)16;
+        sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 2 ? (sa_sint_t *)libsais16x64_align_up(&SA[n + fs - 2 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 2 * (fast_sint_t)k];
+        buckets = (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 2) ? local_buffer : buckets;
+
+        sa_sint_t m = libsais16x64_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
+        if (m > 1)
+        {
+            libsais16x64_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(T, k, buckets, SA[n - m]);
+
+            libsais16x64_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state);
+            libsais16x64_place_lms_suffixes_interval_32s_2k(SA, n, k, m - 1, buckets);
+
+            libsais16x64_initialize_buckets_start_and_end_32s_2k(k, buckets);
+            libsais16x64_induce_partial_order_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
+
+            sa_sint_t names = libsais16x64_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads);
+            if (names < m) /* fewer names than LMS suffixes: solve the reduced problem recursively */
+            {
+                sa_sint_t f = libsais16x64_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
+
+                if (libsais16x64_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0)
+                {
+                    return -2;
+                }
+
+                libsais16x64_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads, thread_state);
+            }
+            else
+            {
+                libsais16x64_count_lms_suffixes_32s_2k(T, n, k, buckets);
+            }
+        }
+        else
+        {
+            SA[0] = SA[n - 1]; /* m <= 1: take the entry gathered into the last slot of SA */
+        }
+
+        libsais16x64_initialize_buckets_end_32s_2k(k, buckets);
+        libsais16x64_place_lms_suffixes_histogram_32s_2k(SA, n, k, m, buckets);
+
+        libsais16x64_initialize_buckets_start_and_end_32s_2k(k, buckets);
+        libsais16x64_induce_final_order_32s_2k(T, SA, n, k, buckets, threads, thread_state);
+
+        return 0;
+    }
+    else /* strategy "1k": a single k-sized bucket array, heap-allocated when fs < k */
+    {
+        sa_sint_t * buffer = fs < k ? (sa_sint_t *)libsais16x64_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096) : (sa_sint_t *)NULL;
+
+        sa_sint_t alignment = fs - 1024 >= k ? (sa_sint_t)1024 : (sa_sint_t)16;
+        sa_sint_t * RESTRICT buckets = fs - alignment >= k ? (sa_sint_t *)libsais16x64_align_up(&SA[n + fs - k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : fs >= k ? &SA[n + fs - k] : buffer;
+
+        if (buckets == NULL) { return -2; } /* only reachable when the heap allocation above failed */
+
+        memset(SA, 0, (size_t)n * sizeof(sa_sint_t));
+
+        libsais16x64_count_suffixes_32s(T, n, k, buckets); 
+        libsais16x64_initialize_buckets_end_32s_1k(k, buckets);
+
+        sa_sint_t m = libsais16x64_radix_sort_lms_suffixes_32s_1k(T, SA, n, buckets);
+        if (m > 1)
+        {
+            libsais16x64_induce_partial_order_32s_1k_omp(T, SA, n, k, buckets, threads, thread_state);
+
+            sa_sint_t names = libsais16x64_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads);
+            if (names < m)
+            {
+                /* Release the heap buckets for the duration of the nested call; buckets == NULL
+                 * marks that they must be allocated again afterwards. */
+                if (buffer != NULL) { libsais16x64_free_aligned(buffer); buckets = NULL; }
+
+                sa_sint_t f = libsais16x64_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
+
+                if (libsais16x64_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0)
+                {
+                    return -2; /* buffer, if any, was already freed above */
+                }
+
+                libsais16x64_reconstruct_compacted_lms_suffixes_32s_1k_omp(T, SA, n, m, fs, f, threads, thread_state);
+
+                if (buckets == NULL) { buckets = buffer = (sa_sint_t *)libsais16x64_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096); }
+                if (buckets == NULL) { return -2; }
+            }
+            
+            libsais16x64_count_suffixes_32s(T, n, k, buckets); /* recount: the bucket contents were consumed or reallocated above */
+            libsais16x64_initialize_buckets_end_32s_1k(k, buckets);
+            libsais16x64_place_lms_suffixes_interval_32s_1k(T, SA, k, m, buckets);
+        }
+
+        libsais16x64_induce_final_order_32s_1k(T, SA, n, k, buckets, threads, thread_state);
+        libsais16x64_free_aligned(buffer); /* called unconditionally, so buffer may be NULL here */
+
+        return 0;
+    }
+}
+
+/**
+ * Entry point of the integer-alphabet worker: reserves a scratch array of
+ * LIBSAIS_LOCAL_BUFFER_SIZE elements on the stack and forwards every argument, together with
+ * that array, to libsais16x64_main_32s_recursion. The worker's result is returned unchanged.
+ */
+static sa_sint_t libsais16x64_main_32s_entry(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{
+    /* Shared by all recursion levels; it lives exactly as long as this call. */
+    sa_sint_t local_buffer[LIBSAIS_LOCAL_BUFFER_SIZE];
+
+    sa_sint_t result = libsais16x64_main_32s_recursion(T, SA, n, k, fs, threads, thread_state, local_buffer);
+
+    return result;
+}
+
+static sa_sint_t libsais16x64_main_16u(const uint16_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t bwt, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
+{   /* Top-level pipeline for a 16-bit input T of length n; SA has n slots plus fs free slots.
+     *
+     * Steps: count and gather the LMS suffixes, radix-sort them, induce their partial order,
+     * renumber them, solve the reduced problem through libsais16x64_main_32s_entry when names
+     * repeat, place the LMS suffixes, and finally induce the full order. bwt, r and I are only
+     * passed on to the final induction step, and freq only to the bucket initialisation.
+     *
+     * Returns -2 when the reduced problem fails, otherwise whatever
+     * libsais16x64_induce_final_order_16u_omp returns.
+     */
+    fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n); /* clamp so that n + fs does not exceed SAINT_MAX */
+
+    sa_sint_t m = libsais16x64_count_and_gather_lms_suffixes_16u_omp(T, SA, n, buckets, threads, thread_state);
+
+    libsais16x64_initialize_buckets_start_and_end_16u(buckets, freq);
+
+    if (m > 0)
+    {
+        sa_sint_t first_lms_suffix    = SA[n - m]; /* the m gathered entries occupy the tail of SA */
+        sa_sint_t left_suffixes_count = libsais16x64_initialize_buckets_for_lms_suffixes_radix_sort_16u(T, buckets, first_lms_suffix);
+
+        if (threads > 1 && n >= 65536) { memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t)); }
+        libsais16x64_radix_sort_lms_suffixes_16u_omp(T, SA, n, m, buckets, threads, thread_state);
+        if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); }
+
+        libsais16x64_initialize_buckets_for_partial_sorting_16u(T, buckets, first_lms_suffix, left_suffixes_count);
+        libsais16x64_induce_partial_order_16u_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state);
+
+        sa_sint_t names = libsais16x64_renumber_and_gather_lms_suffixes_omp(SA, n, m, fs, threads, thread_state);
+        if (names < m) /* fewer names than LMS suffixes: solve the reduced problem */
+        {
+            /* Reduced problem: text at SA + n + fs - m, length m, alphabet names,
+             * free space fs + n - 2 * m; its output goes to the start of SA. */
+            if (libsais16x64_main_32s_entry(SA + n + fs - m, SA, m, names, fs + n - 2 * m, threads, thread_state) != 0)
+            {
+                return -2;
+            }
+
+            libsais16x64_gather_lms_suffixes_16u_omp(T, SA, n, threads, thread_state);
+            libsais16x64_reconstruct_lms_suffixes_omp(SA, n, m, threads);
+        }
+
+        libsais16x64_place_lms_suffixes_interval_16u(SA, n, m, buckets);
+    }
+    else
+    {
+        memset(SA, 0, (size_t)n * sizeof(sa_sint_t)); /* no LMS suffixes: start the final induction from an all-zero SA */
+    }
+
+    return libsais16x64_induce_final_order_16u_omp(T, SA, n, bwt, r, I, buckets, threads, thread_state);
+}
+
+/**
+ * Allocates the working memory for a 16-bit input and runs libsais16x64_main_16u.
+ *
+ * A thread-state block is allocated only when threads > 1; the bucket area holds
+ * 8 * ALPHABET_SIZE counters aligned to 4096 bytes. Both are released before returning.
+ *
+ * Returns the result of libsais16x64_main_16u, or -2 when an allocation failed or when no
+ * thread state exists and threads is not 1.
+ */
+static sa_sint_t libsais16x64_main(const uint16_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads)
+{
+    LIBSAIS_THREAD_STATE *  RESTRICT thread_state   = threads > 1 ? libsais16x64_alloc_thread_state(threads) : NULL;
+    sa_sint_t *             RESTRICT buckets        = (sa_sint_t *)libsais16x64_alloc_aligned((size_t)8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
+
+    sa_sint_t index = -2;
+
+    if ((buckets != NULL) && ((thread_state != NULL) || (threads == 1)))
+    {
+        index = libsais16x64_main_16u(T, SA, n, buckets, bwt, r, I, fs, freq, threads, thread_state);
+    }
+
+    /* Both release helpers are invoked unconditionally, including with NULL. */
+    libsais16x64_free_aligned(buckets);
+    libsais16x64_free_thread_state(thread_state);
+
+    return index;
+}
+
+/**
+ * Allocates the per-thread state (only when threads > 1) and runs
+ * libsais16x64_main_32s_entry on the integer input T with alphabet size k.
+ *
+ * Returns the result of libsais16x64_main_32s_entry, or -2 when no thread state exists and
+ * threads is not 1. The thread state is released before returning.
+ */
+static sa_sint_t libsais16x64_main_long(sa_sint_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads)
+{
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais16x64_alloc_thread_state(threads) : NULL;
+
+    sa_sint_t index = -2;
+
+    if ((thread_state != NULL) || (threads == 1))
+    {
+        index = libsais16x64_main_32s_entry(T, SA, n, k, fs, threads, thread_state);
+    }
+
+    libsais16x64_free_thread_state(thread_state);
+
+    return index;
+}
+
+/**
+ * Copies the first n entries of A into U, truncating each one to 16 bits.
+ *
+ * The bulk of the work is done eight entries at a time, with a read prefetch issued 32
+ * entries ahead of the current position; the up to seven leftover entries are copied one by
+ * one. Nothing is copied when n <= 0.
+ */
+static void libsais16x64_bwt_copy_16u(uint16_t * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n)
+{
+    const fast_sint_t prefetch_distance = 32;
+    const fast_sint_t count             = (fast_sint_t)n;
+    const fast_sint_t unrolled_end      = count - 7;
+
+    fast_sint_t pos = 0;
+
+    while (pos < unrolled_end)
+    {
+        libsais16x64_prefetchr(&A[pos + prefetch_distance]);
+
+        U[pos + 0] = (uint16_t)A[pos + 0]; U[pos + 1] = (uint16_t)A[pos + 1];
+        U[pos + 2] = (uint16_t)A[pos + 2]; U[pos + 3] = (uint16_t)A[pos + 3];
+        U[pos + 4] = (uint16_t)A[pos + 4]; U[pos + 5] = (uint16_t)A[pos + 5];
+        U[pos + 6] = (uint16_t)A[pos + 6]; U[pos + 7] = (uint16_t)A[pos + 7];
+
+        pos += 8;
+    }
+
+    /* Tail: fewer than eight entries remain. */
+    while (pos < count)
+    {
+        U[pos] = (uint16_t)A[pos];
+        pos += 1;
+    }
+}
+
+#if defined(LIBSAIS_OPENMP)
+
+/**
+ * Multi-threaded version of libsais16x64_bwt_copy_16u (built only with LIBSAIS_OPENMP).
+ *
+ * The n entries are split into one block per thread; every block start is a multiple of 16
+ * and the last thread also receives the remainder. The parallel region is active only when
+ * threads > 1 and n >= 65536; otherwise a single thread copies everything.
+ */
+static void libsais16x64_bwt_copy_16u_omp(uint16_t * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n, sa_sint_t threads)
+{
+    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+    {
+        const fast_sint_t tid         = omp_get_thread_num();
+        const fast_sint_t team_size   = omp_get_num_threads();
+        const fast_sint_t stride      = ((fast_sint_t)n / team_size) & (-16);
+        const fast_sint_t block_start = tid * stride;
+        const fast_sint_t block_size  = (tid < team_size - 1) ? stride : (fast_sint_t)n - block_start;
+
+        libsais16x64_bwt_copy_16u(U + block_start, A + block_start, (sa_sint_t)block_size);
+    }
+}
+
+#endif
+
+/**
+ * Constructs the suffix array SA of the 16-bit input T of length n (single-threaded).
+ *
+ * fs is the number of extra 64-bit slots available behind SA[n - 1]; freq, when not NULL,
+ * receives ALPHABET_SIZE symbol counts.
+ *
+ * Inputs with n <= INT32_MAX are delegated to the 32-bit libsais16, which treats SA and freq
+ * as int32_t arrays; on success both are widened back to 64-bit in place. Larger inputs go
+ * through libsais16x64_main.
+ *
+ * Returns -1 for a NULL T or SA or a negative n or fs, 0 for n < 2, and otherwise the result
+ * of the routine the work was delegated to.
+ */
+int64_t libsais16x64(const uint16_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq)
+{
+    if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) { return -1; }
+
+    if (n < 2)
+    {
+        if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int64_t)); }
+
+        if (n == 1)
+        {
+            SA[0] = 0;
+            if (freq != NULL) { freq[T[0]]++; }
+        }
+
+        return 0;
+    }
+
+    if (n > INT32_MAX)
+    {
+        return libsais16x64_main(T, SA, n, 0, 0, NULL, fs, freq, 1);
+    }
+
+    /* Free space handed to the 32-bit routine: fs + fs + n, capped at INT32_MAX - n. */
+    sa_sint_t new_fs;
+    if ((fs + fs + n + n) <= INT32_MAX) { new_fs = fs + fs + n; } else { new_fs = INT32_MAX - n; }
+
+    sa_sint_t index = libsais16(T, (int32_t *)SA, (int32_t)n, (int32_t)new_fs, (int32_t *)freq);
+    if (index < 0) { return index; }
+
+    libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)SA, n, 1);
+    if (freq != NULL) { libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, ALPHABET_SIZE, 1); }
+
+    return index;
+}
+
+/**
+ * Constructs the suffix array SA of the 64-bit integer input T of length n over an alphabet
+ * of size k (single-threaded). fs is the number of extra slots available behind SA[n - 1].
+ *
+ * Returns -1 for a NULL T or SA, a negative n or fs, or - once n >= 2 - a non-positive k;
+ * returns 0 for n < 2 (SA[0] is set to 0 when n == 1); otherwise returns the result of
+ * libsais16x64_main_long.
+ */
+int64_t libsais16x64_long(int64_t * T, int64_t * SA, int64_t n, int64_t k, int64_t fs)
+{
+    if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0))
+    {
+        return -1;
+    }
+    else if (n < 2)
+    {
+        if (n == 1) { SA[0] = 0; }
+        return 0;
+    }
+
+    /* The worker divides by k (new_fs / k) when n <= INT32_MAX, so k == 0 would be a division
+     * by zero; a text of two or more symbols cannot have an empty alphabet anyway. */
+    if (k <= 0)
+    {
+        return -1;
+    }
+
+    return libsais16x64_main_long(T, SA, n, k, fs, 1);
+}
+
+/**
+ * Computes the Burrows-Wheeler transform of the 16-bit input T of length n into U, using A
+ * (n slots plus fs free slots) as working space (single-threaded). freq, when not NULL,
+ * receives ALPHABET_SIZE symbol counts.
+ *
+ * Inputs with n <= INT32_MAX are delegated to the 32-bit libsais16_bwt; on success freq is
+ * widened back to 64-bit in place. Larger inputs go through libsais16x64_main, after which U
+ * is assembled from T[n - 1] and the two parts of A on either side of the returned index.
+ *
+ * Returns -1 for a NULL T, U or A or a negative n or fs, n itself for n <= 1, and otherwise
+ * the index produced by the delegated routine (incremented by one on the 64-bit path) or its
+ * negative error code.
+ */
+int64_t libsais16x64_bwt(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq)
+{
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0)) { return -1; }
+
+    if (n <= 1)
+    {
+        if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int64_t)); }
+
+        if (n == 1)
+        {
+            U[0] = T[0];
+            if (freq != NULL) { freq[T[0]]++; }
+        }
+
+        return n;
+    }
+
+    if (n <= INT32_MAX)
+    {
+        /* Free space handed to the 32-bit routine: fs + fs + n, capped at INT32_MAX - n. */
+        sa_sint_t new_fs;
+        if ((fs + fs + n + n) <= INT32_MAX) { new_fs = fs + fs + n; } else { new_fs = INT32_MAX - n; }
+
+        sa_sint_t index32 = libsais16_bwt(T, U, (int32_t *)A, (int32_t)n, (int32_t)new_fs, (int32_t *)freq);
+
+        if ((index32 >= 0) && (freq != NULL))
+        {
+            libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, ALPHABET_SIZE, 1);
+        }
+
+        return index32;
+    }
+
+    sa_sint_t index = libsais16x64_main(T, A, n, 1, 0, NULL, fs, freq, 1);
+    if (index < 0) { return index; }
+
+    index++;
+
+    /* U = T[n - 1], then A[0 .. index - 2], then A[index .. n - 1]. */
+    U[0] = T[n - 1];
+    libsais16x64_bwt_copy_16u(U + 1, A, index - 1);
+    libsais16x64_bwt_copy_16u(U + index, A + index, n - index);
+
+    return index;
+}
+
+/**
+ * Computes the Burrows-Wheeler transform of the 16-bit input T of length n into U together
+ * with auxiliary indexes I sampled at rate r (single-threaded). A (n slots plus fs free
+ * slots) is working space; freq, when not NULL, receives ALPHABET_SIZE symbol counts.
+ *
+ * r must be a power of two and at least 2. When both n and r fit in INT32_MAX the work is
+ * delegated to the 32-bit libsais16_bwt_aux; on success the 1 + (n - 1) / r entries of I and
+ * freq are widened back to 64-bit in place. Otherwise libsais16x64_main is used and U is
+ * assembled from T[n - 1] and the two parts of A on either side of I[0].
+ *
+ * Returns -1 for invalid arguments, 0 for n <= 1 (with I[0] = n), -2 when libsais16x64_main
+ * fails, and otherwise 0 or the result of the 32-bit routine.
+ */
+int64_t libsais16x64_bwt_aux(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq, int64_t r, int64_t * I)
+{
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) { return -1; }
+
+    if (n <= 1)
+    {
+        if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int64_t)); }
+
+        if (n == 1)
+        {
+            U[0] = T[0];
+            if (freq != NULL) { freq[T[0]]++; }
+        }
+
+        I[0] = n;
+        return 0;
+    }
+
+    if (n <= INT32_MAX && r <= INT32_MAX)
+    {
+        /* Free space handed to the 32-bit routine: fs + fs + n, capped at INT32_MAX - n. */
+        sa_sint_t new_fs;
+        if ((fs + fs + n + n) <= INT32_MAX) { new_fs = fs + fs + n; } else { new_fs = INT32_MAX - n; }
+
+        sa_sint_t index32 = libsais16_bwt_aux(T, U, (int32_t *)A, (int32_t)n, (int32_t)new_fs, (int32_t *)freq, (int32_t)r, (int32_t *)I);
+        if (index32 < 0) { return index32; }
+
+        libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)I, 1 + ((n - 1) / r), 1);
+        if (freq != NULL) { libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, ALPHABET_SIZE, 1); }
+
+        return index32;
+    }
+
+    if (libsais16x64_main(T, A, n, 1, r, I, fs, freq, 1) != 0) { return -2; }
+
+    /* U = T[n - 1], then A[0 .. I[0] - 2], then A[I[0] .. n - 1]. */
+    U[0] = T[n - 1];
+    libsais16x64_bwt_copy_16u(U + 1, A, I[0] - 1);
+    libsais16x64_bwt_copy_16u(U + I[0], A + I[0], n - I[0]);
+
+    return 0;
+}
+
+#if defined(LIBSAIS_OPENMP)
+
+/**
+ * Multi-threaded counterpart of libsais16x64: constructs the suffix array SA of the 16-bit
+ * input T of length n. threads == 0 selects omp_get_max_threads().
+ *
+ * Inputs with n <= INT32_MAX are delegated to the 32-bit libsais16_omp, which treats SA and
+ * freq as int32_t arrays; on success both are widened back to 64-bit in place. Larger inputs
+ * go through libsais16x64_main.
+ *
+ * Returns -1 for a NULL T or SA or a negative n, fs or threads, 0 for n < 2, and otherwise
+ * the result of the routine the work was delegated to.
+ */
+int64_t libsais16x64_omp(const uint16_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq, int64_t threads)
+{
+    if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0) || (threads < 0)) { return -1; }
+
+    if (n < 2)
+    {
+        if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int64_t)); }
+
+        if (n == 1)
+        {
+            SA[0] = 0;
+            if (freq != NULL) { freq[T[0]]++; }
+        }
+
+        return 0;
+    }
+
+    if (threads <= 0) { threads = omp_get_max_threads(); }
+
+    if (n > INT32_MAX)
+    {
+        return libsais16x64_main(T, SA, n, 0, 0, NULL, fs, freq, threads);
+    }
+
+    /* Free space handed to the 32-bit routine: fs + fs + n, capped at INT32_MAX - n. */
+    sa_sint_t new_fs;
+    if ((fs + fs + n + n) <= INT32_MAX) { new_fs = fs + fs + n; } else { new_fs = INT32_MAX - n; }
+
+    sa_sint_t index = libsais16_omp(T, (int32_t *)SA, (int32_t)n, (int32_t)new_fs, (int32_t *)freq, (int32_t)threads);
+    if (index < 0) { return index; }
+
+    libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)SA, n, threads);
+    if (freq != NULL) { libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, ALPHABET_SIZE, threads); }
+
+    return index;
+}
+
+/**
+ * Multi-threaded counterpart of libsais16x64_long: constructs the suffix array SA of the
+ * 64-bit integer input T of length n over an alphabet of size k. fs is the number of extra
+ * slots available behind SA[n - 1]; threads == 0 selects omp_get_max_threads().
+ *
+ * Returns -1 for a NULL T or SA, a negative n, fs or threads, or - once n >= 2 - a
+ * non-positive k; returns 0 for n < 2 (SA[0] is set to 0 when n == 1); otherwise returns the
+ * result of libsais16x64_main_long.
+ */
+int64_t libsais16x64_long_omp(int64_t * T, int64_t * SA, int64_t n, int64_t k, int64_t fs, int64_t threads)
+{
+    if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0) || (threads < 0))
+    {
+        return -1;
+    }
+    else if (n < 2)
+    {
+        if (n == 1) { SA[0] = 0; }
+        return 0;
+    }
+
+    /* The worker divides by k (new_fs / k) when n <= INT32_MAX, so k == 0 would be a division
+     * by zero; a text of two or more symbols cannot have an empty alphabet anyway. */
+    if (k <= 0)
+    {
+        return -1;
+    }
+
+    threads = threads > 0 ? threads : omp_get_max_threads();
+
+    return libsais16x64_main_long(T, SA, n, k, fs, threads);
+}
+
+/**
+ * Multi-threaded counterpart of libsais16x64_bwt: computes the Burrows-Wheeler transform of
+ * the 16-bit input T of length n into U, using A (n slots plus fs free slots) as working
+ * space. threads == 0 selects omp_get_max_threads().
+ *
+ * Inputs with n <= INT32_MAX are delegated to the 32-bit libsais16_bwt_omp; on success freq
+ * is widened back to 64-bit in place. Larger inputs go through libsais16x64_main, after which
+ * U is assembled from T[n - 1] and the two parts of A on either side of the returned index.
+ *
+ * Returns -1 for invalid arguments, n itself for n <= 1, and otherwise the index produced by
+ * the delegated routine (incremented by one on the 64-bit path) or its negative error code.
+ */
+int64_t libsais16x64_bwt_omp(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq, int64_t threads)
+{
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (threads < 0)) { return -1; }
+
+    if (n <= 1)
+    {
+        if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int64_t)); }
+
+        if (n == 1)
+        {
+            U[0] = T[0];
+            if (freq != NULL) { freq[T[0]]++; }
+        }
+
+        return n;
+    }
+
+    if (threads <= 0) { threads = omp_get_max_threads(); }
+
+    if (n <= INT32_MAX)
+    {
+        /* Free space handed to the 32-bit routine: fs + fs + n, capped at INT32_MAX - n. */
+        sa_sint_t new_fs;
+        if ((fs + fs + n + n) <= INT32_MAX) { new_fs = fs + fs + n; } else { new_fs = INT32_MAX - n; }
+
+        sa_sint_t index32 = libsais16_bwt_omp(T, U, (int32_t *)A, (int32_t)n, (int32_t)new_fs, (int32_t *)freq, (int32_t)threads);
+
+        if ((index32 >= 0) && (freq != NULL))
+        {
+            libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, ALPHABET_SIZE, threads);
+        }
+
+        return index32;
+    }
+
+    sa_sint_t index = libsais16x64_main(T, A, n, 1, 0, NULL, fs, freq, threads);
+    if (index < 0) { return index; }
+
+    index++;
+
+    /* U = T[n - 1], then A[0 .. index - 2], then A[index .. n - 1]. */
+    U[0] = T[n - 1];
+    libsais16x64_bwt_copy_16u_omp(U + 1, A, index - 1, threads);
+    libsais16x64_bwt_copy_16u_omp(U + index, A + index, n - index, threads);
+
+    return index;
+}
+
+/**
+ * Multi-threaded counterpart of libsais16x64_bwt_aux: computes the Burrows-Wheeler transform
+ * of the 16-bit input T of length n into U together with auxiliary indexes I sampled at rate
+ * r. threads == 0 selects omp_get_max_threads().
+ *
+ * r must be a power of two and at least 2. When both n and r fit in INT32_MAX the work is
+ * delegated to the 32-bit libsais16_bwt_aux_omp; on success the 1 + (n - 1) / r entries of I
+ * and freq are widened back to 64-bit in place. Otherwise libsais16x64_main is used and U is
+ * assembled from T[n - 1] and the two parts of A on either side of I[0].
+ *
+ * Returns -1 for invalid arguments, 0 for n <= 1 (with I[0] = n), -2 when libsais16x64_main
+ * fails, and otherwise 0 or the result of the 32-bit routine.
+ */
+int64_t libsais16x64_bwt_aux_omp(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq, int64_t r, int64_t * I, int64_t threads)
+{
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL) || (threads < 0)) { return -1; }
+
+    if (n <= 1)
+    {
+        if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int64_t)); }
+
+        if (n == 1)
+        {
+            U[0] = T[0];
+            if (freq != NULL) { freq[T[0]]++; }
+        }
+
+        I[0] = n;
+        return 0;
+    }
+
+    if (threads <= 0) { threads = omp_get_max_threads(); }
+
+    if (n <= INT32_MAX && r <= INT32_MAX)
+    {
+        /* Free space handed to the 32-bit routine: fs + fs + n, capped at INT32_MAX - n. */
+        sa_sint_t new_fs;
+        if ((fs + fs + n + n) <= INT32_MAX) { new_fs = fs + fs + n; } else { new_fs = INT32_MAX - n; }
+
+        sa_sint_t index32 = libsais16_bwt_aux_omp(T, U, (int32_t *)A, (int32_t)n, (int32_t)new_fs, (int32_t *)freq, (int32_t)r, (int32_t *)I, (int32_t)threads);
+        if (index32 < 0) { return index32; }
+
+        libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)I, 1 + ((n - 1) / r), threads);
+        if (freq != NULL) { libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, ALPHABET_SIZE, threads); }
+
+        return index32;
+    }
+
+    if (libsais16x64_main(T, A, n, 1, r, I, fs, freq, threads) != 0) { return -2; }
+
+    /* U = T[n - 1], then A[0 .. I[0] - 2], then A[I[0] .. n - 1]. */
+    U[0] = T[n - 1];
+    libsais16x64_bwt_copy_16u_omp(U + 1, A, I[0] - 1, threads);
+    libsais16x64_bwt_copy_16u_omp(U + I[0], A + I[0], n - I[0], threads);
+
+    return 0;
+}
+
+#endif
+
+/**
+ * Adds the symbol frequencies of T[0 .. n - 1] to count: count[c] is incremented once for
+ * every occurrence of symbol c. count is not cleared here, so the caller decides its initial
+ * contents.
+ */
+static void libsais16x64_unbwt_compute_histogram(const uint16_t * RESTRICT T, fast_sint_t n, sa_uint_t * RESTRICT count)
+{
+    fast_sint_t pos = 0;
+
+    while (pos < n)
+    {
+        count[T[pos]]++;
+        pos += 1;
+    }
+}
+
+/**
+ * Turns the symbol counts in bucket2 into starting positions and fills the fastbits lookup
+ * table.
+ *
+ * A running total starts at 1. For each symbol w in ascending order, bucket2[w] is replaced
+ * by the total accumulated before w. When w occurs at all, every not yet filled fastbits slot
+ * up to and including ((new total - 1) >> shift) is set to w.
+ */
+static void libsais16x64_unbwt_calculate_fastbits(sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift)
+{
+    fast_uint_t next_slot = 0;
+    fast_uint_t running   = 1;
+    fast_uint_t symbol;
+
+    for (symbol = 0; symbol < ALPHABET_SIZE; ++symbol)
+    {
+        const fast_uint_t first = running;
+
+        running += bucket2[symbol];
+        bucket2[symbol] = (sa_uint_t)first;
+
+        /* Symbols that did not move the running total own no fastbits slots. */
+        if (first == running) { continue; }
+
+        while (next_slot <= ((running - 1) >> shift))
+        {
+            fastbits[next_slot] = (uint16_t)symbol;
+            next_slot += 1;
+        }
+    }
+}
+
+/**
+ * Distributes the positions of the block [omp_block_start, omp_block_end) of T into P,
+ * grouped by symbol: for each position the next free slot of its symbol, bucket2[c], is
+ * taken and then advanced.
+ *
+ * Positions below index are stored as they are. Positions at or above index are stored
+ * shifted up by one, so the value index itself is never written.
+ *
+ * The second pass indexes T directly and stores i + 1. This produces the same values as
+ * decrementing T and iterating over i + 1, but never forms a pointer in front of the first
+ * element of T, which the C standard does not allow.
+ */
+static void libsais16x64_unbwt_calculate_P(const uint16_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, fast_uint_t index, fast_sint_t omp_block_start, fast_sint_t omp_block_end)
+{
+    const fast_sint_t primary = (fast_sint_t)index;
+
+    {
+        /* Part of the block that lies below index. */
+        fast_sint_t i = omp_block_start, j = omp_block_end < primary ? omp_block_end : primary;
+        for (; i < j; ++i) { fast_uint_t c = T[i]; P[bucket2[c]++] = (sa_uint_t)i; }
+    }
+
+    {
+        /* Part of the block at or above index: recorded one position higher. */
+        fast_sint_t i = omp_block_start > primary ? omp_block_start : primary, j = omp_block_end;
+        for (; i < j; ++i) { fast_uint_t c = T[i]; P[bucket2[c]++] = (sa_uint_t)(i + 1); }
+    }
+}
+
+static void libsais16x64_unbwt_init_single(const uint16_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits)
+{ /* Single-threaded set-up for the inverse transform: fills bucket2[] (from freq when supplied, otherwise by counting T), then builds fastbits[] and P. */
+    const fast_uint_t primary = I[0];
+    fast_uint_t bits = 0; while ((n >> bits) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { bits += 1; }
+
+    if (freq == NULL)
+    {
+        memset(bucket2, 0, ALPHABET_SIZE * sizeof(sa_uint_t));
+        libsais16x64_unbwt_compute_histogram(T, n, bucket2);
+    }
+    else
+    {
+        memcpy(bucket2, freq, ALPHABET_SIZE * sizeof(sa_uint_t));
+    }
+
+    libsais16x64_unbwt_calculate_fastbits(bucket2, fastbits, bits);
+    libsais16x64_unbwt_calculate_P(T, P, bucket2, primary, 0, n);
+}
+
+#if defined(LIBSAIS_OPENMP)
+
+static void libsais16x64_unbwt_init_parallel(const uint16_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_uint_t * RESTRICT buckets, sa_sint_t threads)
+{ /* Multi-threaded counterpart of libsais16x64_unbwt_init_single: builds bucket2[], fastbits[] and P using one ALPHABET_SIZE-sized row of buckets[] per thread. */
+    fast_uint_t index = I[0];
+    fast_uint_t shift = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; } /* smallest shift with (n >> shift) <= 1 << UNBWT_FASTBITS */
+
+    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+    {
+        fast_sint_t omp_thread_num  = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+
+        if (omp_num_threads == 1)
+        { /* the region ended up with a single thread: use the sequential routine (the only path that consults freq) */
+            libsais16x64_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits);
+        }
+        else
+        {
+            { /* phase 1: every thread counts the symbols of its own slice of T into its private row */
+                sa_uint_t * RESTRICT bucket2_local  = buckets + omp_thread_num * ALPHABET_SIZE;
+                fast_sint_t omp_block_stride        = (n / omp_num_threads) & (-16); /* slice length rounded down to a multiple of 16 */
+                fast_sint_t omp_block_start         = omp_thread_num * omp_block_stride;
+                fast_sint_t omp_block_size          = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; /* last thread also takes the leftover */
+
+                memset(bucket2_local, 0, ALPHABET_SIZE * sizeof(sa_uint_t));
+                libsais16x64_unbwt_compute_histogram(T + omp_block_start, omp_block_size, bucket2_local);
+            }
+
+            #pragma omp barrier
+
+            { /* phase 2: the alphabet is split across threads; each thread merges the per-thread counts for its range of symbols */
+                sa_uint_t * RESTRICT bucket2_temp   = buckets;
+                fast_sint_t omp_block_stride        = (ALPHABET_SIZE / omp_num_threads) & (-16);
+                fast_sint_t omp_block_start         = omp_thread_num * omp_block_stride;
+                fast_sint_t omp_block_size          = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ALPHABET_SIZE - omp_block_start;
+
+                memset(bucket2 + omp_block_start, 0, (size_t)omp_block_size * sizeof(sa_uint_t));
+
+                fast_sint_t t;
+                for (t = 0; t < omp_num_threads; ++t, bucket2_temp += ALPHABET_SIZE)
+                { /* bucket2[c] accumulates the total for c; row t is overwritten with the count of c contributed by rows 0..t-1 */
+                    fast_sint_t c; for (c = omp_block_start; c < omp_block_start + omp_block_size; c += 1) { sa_uint_t A = bucket2[c], B = bucket2_temp[c]; bucket2[c] = A + B; bucket2_temp[c] = A; }
+                }
+            }
+
+            #pragma omp barrier
+
+            #pragma omp master
+            { /* phase 3: one thread converts the totals into start offsets and fills fastbits[] */
+                libsais16x64_unbwt_calculate_fastbits(bucket2, fastbits, shift);
+            }
+
+            #pragma omp barrier
+
+            { /* phase 4: each thread turns its row into absolute write cursors and scatters its slice of positions into P */
+                sa_uint_t * RESTRICT bucket2_local  = buckets + omp_thread_num * ALPHABET_SIZE;
+                fast_sint_t omp_block_stride        = (n / omp_num_threads) & (-16);
+                fast_sint_t omp_block_start         = omp_thread_num * omp_block_stride;
+                fast_sint_t omp_block_size          = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+
+                fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_uint_t A = bucket2[c], B = bucket2_local[c]; bucket2_local[c] = A + B; } /* global start offset + count in earlier slices */
+
+                libsais16x64_unbwt_calculate_P(T, P, bucket2_local, index, omp_block_start, omp_block_start + omp_block_size);
+            }
+
+            #pragma omp barrier
+
+            #pragma omp master
+            { /* the last thread's cursors have been advanced past every occurrence; copy that row into bucket2 */
+                memcpy(bucket2, buckets + (omp_num_threads - 1) * ALPHABET_SIZE, ALPHABET_SIZE * sizeof(sa_uint_t));
+            }
+        }
+    }
+}
+
+#endif
+
+static void libsais16x64_unbwt_decode_1(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t * i0, fast_uint_t k)
+{ /* Decodes k symbols of a single stream into U[0..k), starting from position *i0; the position reached is written back to *i0. */
+    uint16_t * RESTRICT U0 = U;
+
+    fast_uint_t i, p0 = *i0;
+
+    for (i = 0; i != k; ++i)
+    { /* per step: start from the hint fastbits[p >> shift], advance the symbol while bucket2[c] <= p, follow P[p], store the symbol */
+        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
+    }
+
+    *i0 = p0;
+}
+
+static void libsais16x64_unbwt_decode_2(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t k)
+{ /* Decodes k symbols for each of 2 streams in lock-step; stream j writes to U + j * r and starts from *ij, which receives the position reached. */
+    uint16_t * RESTRICT U0 = U;
+    uint16_t * RESTRICT U1 = U0 + r;
+
+    fast_uint_t i, p0 = *i0, p1 = *i1;
+
+    for (i = 0; i != k; ++i)
+    { /* per stream: hint from fastbits[p >> shift], advance while bucket2[c] <= p, follow P[p], store the symbol */
+        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
+        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
+    }
+
+    *i0 = p0; *i1 = p1;
+}
+
+static void libsais16x64_unbwt_decode_3(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t k)
+{ /* Decodes k symbols for each of 3 streams in lock-step; stream j writes to U + j * r and starts from *ij, which receives the position reached. */
+    uint16_t * RESTRICT U0 = U;
+    uint16_t * RESTRICT U1 = U0 + r;
+    uint16_t * RESTRICT U2 = U1 + r;
+
+    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2;
+
+    for (i = 0; i != k; ++i)
+    { /* per stream: hint from fastbits[p >> shift], advance while bucket2[c] <= p, follow P[p], store the symbol */
+        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
+        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
+        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2;
+    }
+
+    *i0 = p0; *i1 = p1; *i2 = p2;
+}
+
+static void libsais16x64_unbwt_decode_4(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t k)
+{ /* Decodes k symbols for each of 4 streams in lock-step; stream j writes to U + j * r and starts from *ij, which receives the position reached. */
+    uint16_t * RESTRICT U0 = U;
+    uint16_t * RESTRICT U1 = U0 + r;
+    uint16_t * RESTRICT U2 = U1 + r;
+    uint16_t * RESTRICT U3 = U2 + r;
+
+    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3;
+
+    for (i = 0; i != k; ++i)
+    { /* per stream: hint from fastbits[p >> shift], advance while bucket2[c] <= p, follow P[p], store the symbol */
+        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
+        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
+        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2;
+        uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3;
+    }
+
+    *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3;
+}
+
+static void libsais16x64_unbwt_decode_5(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t k)
+{ /* Decodes k symbols for each of 5 streams in lock-step; stream j writes to U + j * r and starts from *ij, which receives the position reached. */
+    uint16_t * RESTRICT U0 = U;
+    uint16_t * RESTRICT U1 = U0 + r;
+    uint16_t * RESTRICT U2 = U1 + r;
+    uint16_t * RESTRICT U3 = U2 + r;
+    uint16_t * RESTRICT U4 = U3 + r;
+
+    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4;
+
+    for (i = 0; i != k; ++i)
+    { /* per stream: hint from fastbits[p >> shift], advance while bucket2[c] <= p, follow P[p], store the symbol */
+        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
+        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
+        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2;
+        uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3;
+        uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = c4;
+    }
+
+    *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4;
+}
+
+static void libsais16x64_unbwt_decode_6(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t k)
+{ /* Decodes k symbols for each of 6 streams in lock-step; stream j writes to U + j * r and starts from *ij, which receives the position reached. */
+    uint16_t * RESTRICT U0 = U;
+    uint16_t * RESTRICT U1 = U0 + r;
+    uint16_t * RESTRICT U2 = U1 + r;
+    uint16_t * RESTRICT U3 = U2 + r;
+    uint16_t * RESTRICT U4 = U3 + r;
+    uint16_t * RESTRICT U5 = U4 + r;
+
+    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5;
+
+    for (i = 0; i != k; ++i)
+    { /* per stream: hint from fastbits[p >> shift], advance while bucket2[c] <= p, follow P[p], store the symbol */
+        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
+        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
+        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2;
+        uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3;
+        uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = c4;
+        uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = c5;
+    }
+
+    *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5;
+}
+
+static void libsais16x64_unbwt_decode_7(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6, fast_uint_t k)
+{ /* Decodes k symbols for each of 7 streams in lock-step; stream j writes to U + j * r and starts from *ij, which receives the position reached. */
+    uint16_t * RESTRICT U0 = U;
+    uint16_t * RESTRICT U1 = U0 + r;
+    uint16_t * RESTRICT U2 = U1 + r;
+    uint16_t * RESTRICT U3 = U2 + r;
+    uint16_t * RESTRICT U4 = U3 + r;
+    uint16_t * RESTRICT U5 = U4 + r;
+    uint16_t * RESTRICT U6 = U5 + r;
+
+    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6;
+
+    for (i = 0; i != k; ++i)
+    { /* per stream: hint from fastbits[p >> shift], advance while bucket2[c] <= p, follow P[p], store the symbol */
+        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
+        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
+        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2;
+        uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3;
+        uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = c4;
+        uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = c5;
+        uint16_t c6 = fastbits[p6 >> shift]; if (bucket2[c6] <= p6) { do { c6++; } while (bucket2[c6] <= p6); } p6 = P[p6]; U6[i] = c6;
+    }
+
+    *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; *i6 = p6;
+}
+
+static void libsais16x64_unbwt_decode_8(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6, fast_uint_t * i7, fast_uint_t k)
+{ /* Decodes k symbols for each of 8 streams in lock-step; stream j writes to U + j * r and starts from *ij, which receives the position reached. */
+    uint16_t * RESTRICT U0 = U;
+    uint16_t * RESTRICT U1 = U0 + r;
+    uint16_t * RESTRICT U2 = U1 + r;
+    uint16_t * RESTRICT U3 = U2 + r;
+    uint16_t * RESTRICT U4 = U3 + r;
+    uint16_t * RESTRICT U5 = U4 + r;
+    uint16_t * RESTRICT U6 = U5 + r;
+    uint16_t * RESTRICT U7 = U6 + r;
+
+    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6, p7 = *i7;
+
+    for (i = 0; i != k; ++i)
+    { /* per stream: hint from fastbits[p >> shift], advance while bucket2[c] <= p, follow P[p], store the symbol */
+        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
+        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
+        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2;
+        uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3;
+        uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = c4;
+        uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = c5;
+        uint16_t c6 = fastbits[p6 >> shift]; if (bucket2[c6] <= p6) { do { c6++; } while (bucket2[c6] <= p6); } p6 = P[p6]; U6[i] = c6;
+        uint16_t c7 = fastbits[p7 >> shift]; if (bucket2[c7] <= p7) { do { c7++; } while (bucket2[c7] <= p7); } p7 = P[p7]; U7[i] = c7;
+    }
+
+    *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; *i6 = p6; *i7 = p7;
+}
+
+static void libsais16x64_unbwt_decode(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_sint_t blocks, fast_uint_t remainder)
+{ /* Decodes 'blocks' consecutive blocks of r symbols into U, block j starting from I[j]; the last block is only 'remainder' symbols long. Up to 8 blocks are decoded in lock-step. */
+    fast_uint_t shift       = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; } /* smallest shift with (n >> shift) <= 1 << UNBWT_FASTBITS */
+    fast_uint_t offset      = 0;
+
+    while (blocks > 8)
+    { /* while more than 8 blocks remain, the next 8 are all full: decode r symbols for each */
+        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7];
+        libsais16x64_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, &i7, (fast_uint_t)r);
+        I += 8; blocks -= 8; offset += 8 * (fast_uint_t)r;
+    }
+
+    if (blocks == 1)
+    { /* a single (last) block: only 'remainder' symbols */
+        fast_uint_t i0 = I[0];
+        libsais16x64_unbwt_decode_1(U + offset, P, bucket2, fastbits, shift, &i0, remainder);
+    }
+    else if (blocks == 2)
+    { /* every branch below: first 'remainder' symbols for all streams, then the remaining r - remainder symbols for all streams but the last */
+        fast_uint_t i0 = I[0], i1 = I[1];
+        libsais16x64_unbwt_decode_2(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, remainder);
+        libsais16x64_unbwt_decode_1(U + offset + remainder, P, bucket2, fastbits, shift, &i0, ((fast_uint_t)r) - remainder);
+    }
+    else if (blocks == 3)
+    {
+        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2];
+        libsais16x64_unbwt_decode_3(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, remainder);
+        libsais16x64_unbwt_decode_2(U + offset + remainder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, ((fast_uint_t)r) - remainder);
+    }
+    else if (blocks == 4)
+    {
+        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3];
+        libsais16x64_unbwt_decode_4(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, remainder);
+        libsais16x64_unbwt_decode_3(U + offset + remainder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, ((fast_uint_t)r) - remainder);
+    }
+    else if (blocks == 5)
+    {
+        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4];
+        libsais16x64_unbwt_decode_5(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, remainder);
+        libsais16x64_unbwt_decode_4(U + offset + remainder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, ((fast_uint_t)r) - remainder);
+    }
+    else if (blocks == 6)
+    {
+        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5];
+        libsais16x64_unbwt_decode_6(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, remainder);
+        libsais16x64_unbwt_decode_5(U + offset + remainder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, ((fast_uint_t)r) - remainder);
+    }
+    else if (blocks == 7)
+    {
+        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6];
+        libsais16x64_unbwt_decode_7(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, remainder);
+        libsais16x64_unbwt_decode_6(U + offset + remainder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, ((fast_uint_t)r) - remainder);
+    }
+    else
+    { /* reached for every other value of blocks (8 after the loop above); reads I[0..7] unconditionally */
+        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7];
+        libsais16x64_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, &i7, remainder);
+        libsais16x64_unbwt_decode_7(U + offset + remainder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, ((fast_uint_t)r) - remainder);
+    }
+}
+
+static void libsais16x64_unbwt_decode_omp(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_sint_t threads)
+{ /* Splits the output into blocks of r symbols and hands a contiguous run of blocks to each thread (a single run when built without LIBSAIS_OPENMP). */
+    fast_sint_t blocks      = 1 + (((fast_sint_t)n - 1) / (fast_sint_t)r); /* number of blocks, the last one possibly shorter */
+    fast_uint_t remainder   = (fast_uint_t)n - ((fast_uint_t)r * ((fast_uint_t)blocks - 1)); /* length of the last block */
+
+#if defined(LIBSAIS_OPENMP)
+    fast_sint_t max_threads = blocks < threads ? blocks : threads; /* never request more threads than there are blocks */
+    #pragma omp parallel num_threads(max_threads) if(max_threads > 1 && n >= 65536)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num      = omp_get_thread_num();
+        fast_sint_t omp_num_threads     = omp_get_num_threads();
+#else
+        UNUSED(threads);
+
+        fast_sint_t omp_thread_num      = 0;
+        fast_sint_t omp_num_threads     = 1;
+#endif
+
+        fast_sint_t omp_block_stride    = blocks / omp_num_threads;
+        fast_sint_t omp_block_remainder = blocks % omp_num_threads;
+        fast_sint_t omp_block_size      = omp_block_stride + (omp_thread_num < omp_block_remainder); /* the first omp_block_remainder threads take one extra block */
+        fast_sint_t omp_block_start     = omp_block_stride * omp_thread_num + (omp_thread_num < omp_block_remainder ? omp_thread_num : omp_block_remainder);
+
+        libsais16x64_unbwt_decode(U + r * omp_block_start, P, n, r, I + omp_block_start, bucket2, fastbits, omp_block_size, omp_thread_num < omp_num_threads - 1 ? (fast_uint_t)r : remainder); /* only the last thread's run ends with the short block */
+    }
+}
+
+static sa_sint_t libsais16x64_unbwt_core(const uint16_t * RESTRICT T, uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_uint_t * RESTRICT buckets, sa_sint_t threads)
+{ /* Runs the two stages of the inverse transform with caller-provided scratch tables: table initialisation, then decoding into U. Always returns 0. */
+#if defined(LIBSAIS_OPENMP)
+    if (threads > 1 && n >= 262144)
+    { /* parallel initialisation; same threshold as the one used to allocate 'buckets' in libsais16x64_unbwt_main */
+        libsais16x64_unbwt_init_parallel(T, P, n, freq, I, bucket2, fastbits, buckets, threads);
+    }
+    else
+#else
+    UNUSED(buckets);
+#endif
+    { /* with OpenMP this block is the else-branch; without it the block runs unconditionally */
+        libsais16x64_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits);
+    }
+
+    libsais16x64_unbwt_decode_omp(U, P, n, r, I, bucket2, fastbits, threads);
+    return 0;
+}
+
+static sa_sint_t libsais16x64_unbwt_main(const uint16_t * T, uint16_t * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I, sa_sint_t threads)
+{ /* Allocates the scratch tables for the inverse transform, runs libsais16x64_unbwt_core and frees the tables; yields -2 when a required allocation failed. */
+    fast_uint_t shift; for (shift = 0; (n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS); shift += 1) { }
+
+    sa_uint_t * RESTRICT bucket2  = (sa_uint_t *)libsais16x64_alloc_aligned(ALPHABET_SIZE * sizeof(sa_uint_t), 4096);
+    uint16_t *  RESTRICT fastbits = (uint16_t *)libsais16x64_alloc_aligned(((size_t)1 + (size_t)(n >> shift)) * sizeof(uint16_t), 4096);
+    sa_uint_t * RESTRICT buckets  = NULL; /* per-thread rows, only needed for the parallel initialisation */
+    if (threads > 1 && n >= 262144) { buckets = (sa_uint_t *)libsais16x64_alloc_aligned((size_t)threads * ALPHABET_SIZE * sizeof(sa_uint_t), 4096); }
+
+    sa_sint_t index = -2;
+    if (bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1 || n < 262144)) { index = libsais16x64_unbwt_core(T, U, P, n, freq, r, I, bucket2, fastbits, buckets, threads); }
+
+    libsais16x64_free_aligned(buckets);
+    libsais16x64_free_aligned(fastbits);
+    libsais16x64_free_aligned(bucket2);
+
+    return index;
+}
+
+int64_t libsais16x64_unbwt(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t i)
+{ /* Single-index form: forwards to libsais16x64_unbwt_aux with r equal to n and the address of i as the one-element index array. */
+    const int64_t whole_input = n; return libsais16x64_unbwt_aux(T, U, A, n, freq, whole_input, &i);
+}
+
+int64_t libsais16x64_unbwt_aux(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t r, const int64_t * I)
+{ /* Validates the arguments (-1 on failure), handles n <= 1 directly, forwards small inputs to the 32-bit libsais16 routine and everything else to libsais16x64_unbwt_main with one thread. */
+    const int r_is_valid = (r == n) || ((r >= 2) && ((r & (r - 1)) == 0)); /* r must equal n or be a power of two >= 2 */
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (I == NULL) || (n < 0) || !r_is_valid) { return -1; }
+
+    if (n <= 1)
+    {
+        if (I[0] != n) { return -1; }
+        if (n == 1) { U[0] = T[0]; }
+        return 0;
+    }
+
+    const int64_t last_block = (n - 1) / r; fast_sint_t t; /* indexes I[0..last_block] are consumed */
+    for (t = 0; t <= last_block; t += 1) { if (!(I[t] > 0 && I[t] <= n)) { return -1; } }
+
+    if (n <= INT32_MAX && r <= INT32_MAX && last_block < 1024)
+    {
+        int32_t indexes[1024]; for (t = 0; t <= last_block; t += 1) { indexes[t] = (int32_t)I[t]; } /* narrowed copy; freq is not forwarded (NULL is passed) */
+
+        return libsais16_unbwt_aux(T, U, (int32_t *)A, (int32_t)n, NULL, (int32_t)r, indexes);
+    }
+
+    return libsais16x64_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, 1);
+}
+
+#if defined(LIBSAIS_OPENMP)
+
+int64_t libsais16x64_unbwt_omp(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t i, int64_t threads)
+{ /* Single-index form: forwards to libsais16x64_unbwt_aux_omp with r equal to n and the address of i as the one-element index array. */
+    const int64_t whole_input = n; return libsais16x64_unbwt_aux_omp(T, U, A, n, freq, whole_input, &i, threads);
+}
+
+int64_t libsais16x64_unbwt_aux_omp(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t r, const int64_t * I, int64_t threads)
+{ /* Validates the arguments (-1 on failure), handles n <= 1 directly, forwards small inputs to the 32-bit libsais16 routine and everything else to libsais16x64_unbwt_main; threads == 0 selects omp_get_max_threads(). */
+    const int r_is_valid = (r == n) || ((r >= 2) && ((r & (r - 1)) == 0)); /* r must equal n or be a power of two >= 2 */
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (I == NULL) || (n < 0) || (threads < 0) || !r_is_valid) { return -1; }
+
+    if (n <= 1)
+    {
+        if (I[0] != n) { return -1; }
+        if (n == 1) { U[0] = T[0]; }
+        return 0;
+    }
+
+    const int64_t last_block = (n - 1) / r; fast_sint_t t; /* indexes I[0..last_block] are consumed */
+    for (t = 0; t <= last_block; t += 1) { if (!(I[t] > 0 && I[t] <= n)) { return -1; } }
+
+    if (n <= INT32_MAX && r <= INT32_MAX && last_block < 1024)
+    {
+        int32_t indexes[1024]; for (t = 0; t <= last_block; t += 1) { indexes[t] = (int32_t)I[t]; } /* narrowed copy; freq is not forwarded (NULL is passed) */
+
+        return libsais16_unbwt_aux_omp(T, U, (int32_t *)A, (int32_t)n, NULL, (int32_t)r, indexes, (int32_t)threads);
+    }
+
+    if (threads <= 0) { threads = omp_get_max_threads(); }
+    return libsais16x64_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, threads);
+}
+
+#endif
+
+static void libsais16x64_compute_phi(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP, sa_sint_t n, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* For every i in [omp_block_start, omp_block_start + omp_block_size) stores SA[i - 1] into PLCP[SA[i]]; for i == 0 the stored value is n. */
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j; sa_sint_t k = omp_block_start > 0 ? SA[omp_block_start - 1] : n; /* k always holds the previous SA entry */
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
+    { /* 4x unrolled main loop; the bound keeps SA[i + prefetch_distance + 3] inside the block. prefetchr/prefetchw are defined elsewhere - presumably read/write prefetch hints. */
+        libsais16x64_prefetchr(&SA[i + 2 * prefetch_distance]);
+
+        libsais16x64_prefetchw(&PLCP[SA[i + prefetch_distance + 0]]);
+        libsais16x64_prefetchw(&PLCP[SA[i + prefetch_distance + 1]]);
+
+        PLCP[SA[i + 0]] = k; k = SA[i + 0];
+        PLCP[SA[i + 1]] = k; k = SA[i + 1];
+
+        libsais16x64_prefetchw(&PLCP[SA[i + prefetch_distance + 2]]);
+        libsais16x64_prefetchw(&PLCP[SA[i + prefetch_distance + 3]]);
+
+        PLCP[SA[i + 2]] = k; k = SA[i + 2];
+        PLCP[SA[i + 3]] = k; k = SA[i + 3];
+    }
+
+    for (j += prefetch_distance + 3; i < j; i += 1)
+    { /* tail: the remaining entries of the block, without prefetching */
+        PLCP[SA[i]] = k; k = SA[i];
+    }
+}
+
+static void libsais16x64_compute_phi_omp(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP, sa_sint_t n, sa_sint_t threads)
+{ /* Runs libsais16x64_compute_phi over [0, n), split into one contiguous slice per thread (a single slice when built without LIBSAIS_OPENMP). */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16); /* slice length rounded down to a multiple of 16 */
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; /* last thread also takes the leftover */
+
+        libsais16x64_compute_phi(SA, PLCP, n, omp_block_start, omp_block_size);
+    }
+}
+
+static void libsais16x64_compute_plcp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, fast_sint_t n, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* For every i in the block, reads k = PLCP[i] and overwrites PLCP[i] with the length of the common prefix of T[i..n) and T[k..n). */
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j, l = 0; /* l carries the match length over from the previous position */
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance; i < j; i += 1)
+    { /* main loop; the bound keeps the PLCP[i + prefetch_distance] read inside the block */
+        libsais16x64_prefetchw(&PLCP[i + 2 * prefetch_distance]);
+        libsais16x64_prefetchr(&T[PLCP[i + prefetch_distance] + l]);
+
+        fast_sint_t k = PLCP[i], m = n - (i > k ? i : k); /* m: longest match that stays within n for both suffixes */
+        while (l < m && T[i + l] == T[k + l]) { l++; }
+
+        PLCP[i] = (sa_sint_t)l; l -= (l != 0); /* the next position starts comparing at l - 1 (or 0) */
+    }
+
+    for (j += prefetch_distance; i < j; i += 1)
+    { /* tail: same computation without prefetching */
+        fast_sint_t k = PLCP[i], m = n - (i > k ? i : k);
+        while (l < m && T[i + l] == T[k + l]) { l++; }
+
+        PLCP[i] = (sa_sint_t)l; l -= (l != 0);
+    }
+}
+
+static void libsais16x64_compute_plcp_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, sa_sint_t n, sa_sint_t threads)
+{ /* Runs libsais16x64_compute_plcp over [0, n), split into one contiguous slice per thread (a single slice when built without LIBSAIS_OPENMP). */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16); /* slice length rounded down to a multiple of 16 */
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; /* last thread also takes the leftover */
+
+        libsais16x64_compute_plcp(T, PLCP, n, omp_block_start, omp_block_size);
+    }
+}
+
+static void libsais16x64_compute_lcp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT LCP, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
+{ /* Sets LCP[i] = PLCP[SA[i]] for every i in [omp_block_start, omp_block_start + omp_block_size). */
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
+    { /* 4x unrolled main loop; the bound keeps SA[i + prefetch_distance + 3] inside the block. prefetchr/prefetchw are defined elsewhere - presumably read/write prefetch hints. */
+        libsais16x64_prefetchr(&SA[i + 2 * prefetch_distance]);
+        libsais16x64_prefetchw(&LCP[i + prefetch_distance]);
+
+        libsais16x64_prefetchr(&PLCP[SA[i + prefetch_distance + 0]]);
+        libsais16x64_prefetchr(&PLCP[SA[i + prefetch_distance + 1]]);
+
+        LCP[i + 0] = PLCP[SA[i + 0]];
+        LCP[i + 1] = PLCP[SA[i + 1]];
+
+        libsais16x64_prefetchr(&PLCP[SA[i + prefetch_distance + 2]]);
+        libsais16x64_prefetchr(&PLCP[SA[i + prefetch_distance + 3]]);
+
+        LCP[i + 2] = PLCP[SA[i + 2]];
+        LCP[i + 3] = PLCP[SA[i + 3]];
+    }
+
+    for (j += prefetch_distance + 3; i < j; i += 1)
+    { /* tail: the remaining entries of the block, without prefetching */
+        LCP[i] = PLCP[SA[i]];
+    }
+}
+
+static void libsais16x64_compute_lcp_omp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT LCP, sa_sint_t n, sa_sint_t threads)
+{ /* Runs libsais16x64_compute_lcp over [0, n), split into one contiguous slice per thread (a single slice when built without LIBSAIS_OPENMP). */
+#if defined(LIBSAIS_OPENMP)
+    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+#endif
+    {
+#if defined(LIBSAIS_OPENMP)
+        fast_sint_t omp_thread_num    = omp_get_thread_num();
+        fast_sint_t omp_num_threads   = omp_get_num_threads();
+#else
+        UNUSED(threads);
+
+        fast_sint_t omp_thread_num    = 0;
+        fast_sint_t omp_num_threads   = 1;
+#endif
+        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16); /* slice length rounded down to a multiple of 16 */
+        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; /* last thread also takes the leftover */
+
+        libsais16x64_compute_lcp(PLCP, SA, LCP, omp_block_start, omp_block_size);
+    }
+}
+
+int64_t libsais16x64_plcp(const uint16_t * T, const int64_t * SA, int64_t * PLCP, int64_t n)
+{
+    /* Fills PLCP from T and SA using one thread; returns -1 on a NULL pointer or negative n, 0 otherwise. */
+    if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0)) { return -1; }
+
+    if (n > 1)
+    {
+        libsais16x64_compute_phi_omp(SA, PLCP, n, 1);
+        libsais16x64_compute_plcp_omp(T, PLCP, n, 1);
+    }
+    else if (n == 1)
+    {
+        PLCP[0] = 0;
+    }
+
+    return 0;
+}
+
+int64_t libsais16x64_lcp(const int64_t * PLCP, const int64_t * SA, int64_t * LCP, int64_t n)
+{
+    /* Fills LCP[i] = PLCP[SA[i]] using one thread; returns -1 on a NULL pointer or negative n, 0 otherwise. */
+    if ((PLCP == NULL) || (SA == NULL) || (LCP == NULL) || (n < 0)) { return -1; }
+
+    if (n > 1)
+    {
+        libsais16x64_compute_lcp_omp(PLCP, SA, LCP, n, 1);
+    }
+    else if (n == 1)
+    {
+        LCP[0] = PLCP[SA[0]];
+    }
+
+    return 0;
+}
+
+#if defined(LIBSAIS_OPENMP)
+
+int64_t libsais16x64_plcp_omp(const uint16_t * T, const int64_t * SA, int64_t * PLCP, int64_t n, int64_t threads)
+{
+    /* Fills PLCP from T and SA with the requested number of threads (0 selects omp_get_max_threads()). */
+    /* Returns -1 on a NULL pointer, negative n or negative threads; 0 otherwise. */
+    if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0) || (threads < 0)) { return -1; }
+
+    if (n <= 1)
+    {
+        if (n == 1) { PLCP[0] = 0; }
+        return 0;
+    }
+
+    const int64_t workers = (threads > 0) ? threads : omp_get_max_threads();
+
+    libsais16x64_compute_phi_omp(SA, PLCP, n, workers);
+    libsais16x64_compute_plcp_omp(T, PLCP, n, workers);
+
+    return 0;
+}
+
+int64_t libsais16x64_lcp_omp(const int64_t * PLCP, const int64_t * SA, int64_t * LCP, int64_t n, int64_t threads)
+{
+    /* Fills LCP[i] = PLCP[SA[i]] with the requested number of threads (0 selects omp_get_max_threads()). */
+    /* Returns -1 on a NULL pointer, negative n or negative threads; 0 otherwise. */
+    if ((PLCP == NULL) || (SA == NULL) || (LCP == NULL) || (n < 0) || (threads < 0)) { return -1; }
+
+    if (n <= 1)
+    {
+        if (n == 1) { LCP[0] = PLCP[SA[0]]; }
+        return 0;
+    }
+
+    const int64_t workers = (threads > 0) ? threads : omp_get_max_threads();
+
+    libsais16x64_compute_lcp_omp(PLCP, SA, LCP, n, workers);
+
+    return 0;
+}
+
+#endif