Skip to content

Commit

Permalink
Improved performance of suffix array and Burrows-Wheeler transform construction on degenerate inputs.
Browse files Browse the repository at this point in the history
  • Loading branch information
IlyaGrebnov committed Feb 27, 2024
1 parent 381e3e5 commit 4283ec2
Show file tree
Hide file tree
Showing 10 changed files with 61 additions and 43 deletions.
4 changes: 3 additions & 1 deletion CHANGES
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
Changes in 2.7.4 (February 23, 2024)
Changes in 2.7.5 (February 26, 2024)
- Improved performance of suffix array and Burrows-Wheeler transform construction on degenerate inputs.

Changes in 2.7.4 (February 23, 2024)
- Resolved a strict aliasing violation that resulted in invalid code generation by the Intel compiler.

Changes in 2.7.3 (April 21, 2023)
Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.10)

project(libsais VERSION 2.7.4 LANGUAGES C DESCRIPTION "libsais is a library for linear time suffix array, longest common prefix array and burrows wheeler transform construction based on induced sorting algorithm.")
project(libsais VERSION 2.7.5 LANGUAGES C DESCRIPTION "libsais is a library for linear time suffix array, longest common prefix array and burrows wheeler transform construction based on induced sorting algorithm.")

set(CMAKE_C_STANDARD 99)
set(CMAKE_C_STANDARD_REQUIRED ON)
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ The libsais provides simple C99 API to construct suffix array and Burrows-Wheele
The libsais is released under the [Apache License Version 2.0](LICENSE "Apache license")

## Changes
* February 23, 2024 (2.7.4)
* February 26, 2024 (2.7.5)
* Improved performance of suffix array and Burrows-Wheeler transform construction on degenerate inputs.
* February 23, 2024 (2.7.4)
* Resolved a strict aliasing violation that resulted in invalid code generation by the Intel compiler.
* April 21, 2023 (2.7.3)
* CMake script for library build and integration with other projects.
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.7.4
2.7.5
4 changes: 2 additions & 2 deletions include/libsais.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ Please see the file LICENSE for full copyright information.

#define LIBSAIS_VERSION_MAJOR 2
#define LIBSAIS_VERSION_MINOR 7
#define LIBSAIS_VERSION_PATCH 4
#define LIBSAIS_VERSION_STRING "2.7.4"
#define LIBSAIS_VERSION_PATCH 5
#define LIBSAIS_VERSION_STRING "2.7.5"

#ifdef _WIN32
#ifdef LIBSAIS_SHARED
Expand Down
4 changes: 2 additions & 2 deletions include/libsais16.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ Please see the file LICENSE for full copyright information.

#define LIBSAIS16_VERSION_MAJOR 2
#define LIBSAIS16_VERSION_MINOR 7
#define LIBSAIS16_VERSION_PATCH 4
#define LIBSAIS16_VERSION_STRING "2.7.4"
#define LIBSAIS16_VERSION_PATCH 5
#define LIBSAIS16_VERSION_STRING "2.7.5"

#ifdef _WIN32
#ifdef LIBSAIS_SHARED
Expand Down
4 changes: 2 additions & 2 deletions include/libsais64.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ Please see the file LICENSE for full copyright information.

#define LIBSAIS64_VERSION_MAJOR 2
#define LIBSAIS64_VERSION_MINOR 7
#define LIBSAIS64_VERSION_PATCH 4
#define LIBSAIS64_VERSION_STRING "2.7.4"
#define LIBSAIS64_VERSION_PATCH 5
#define LIBSAIS64_VERSION_STRING "2.7.5"

#ifdef _WIN32
#ifdef LIBSAIS_SHARED
Expand Down
27 changes: 16 additions & 11 deletions src/libsais.c
Original file line number Diff line number Diff line change
Expand Up @@ -3881,7 +3881,7 @@ static sa_sint_t libsais_renumber_lms_suffixes_8u(sa_sint_t * RESTRICT SA, sa_si
return name;
}

static fast_sint_t libsais_gather_marked_suffixes_8u(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
static fast_sint_t libsais_gather_marked_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
const fast_sint_t prefetch_distance = 32;

Expand Down Expand Up @@ -3959,7 +3959,7 @@ static sa_sint_t libsais_renumber_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, s
return name;
}

static void libsais_gather_marked_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
static void libsais_gather_marked_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
#pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072)
Expand All @@ -3980,20 +3980,20 @@ static void libsais_gather_marked_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, s

if (omp_num_threads == 1)
{
libsais_gather_marked_suffixes_8u(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size);
libsais_gather_marked_lms_suffixes(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size);
}
#if defined(LIBSAIS_OPENMP)
else
{
{
if (omp_thread_num < omp_num_threads - 1)
{
thread_state[omp_thread_num].state.position = libsais_gather_marked_suffixes_8u(SA, m, (fast_sint_t)m + omp_block_start + omp_block_size, omp_block_start, omp_block_size);
thread_state[omp_thread_num].state.position = libsais_gather_marked_lms_suffixes(SA, m, (fast_sint_t)m + omp_block_start + omp_block_size, omp_block_start, omp_block_size);
thread_state[omp_thread_num].state.count = (fast_sint_t)m + omp_block_start + omp_block_size - thread_state[omp_thread_num].state.position;
}
else
{
thread_state[omp_thread_num].state.position = libsais_gather_marked_suffixes_8u(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size);
thread_state[omp_thread_num].state.position = libsais_gather_marked_lms_suffixes(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size);
thread_state[omp_thread_num].state.count = (fast_sint_t)n + (fast_sint_t)fs - thread_state[omp_thread_num].state.position;
}
}
Expand All @@ -4018,14 +4018,14 @@ static void libsais_gather_marked_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, s
}
}

static sa_sint_t libsais_renumber_and_gather_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
static sa_sint_t libsais_renumber_and_gather_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));

sa_sint_t name = libsais_renumber_lms_suffixes_8u_omp(SA, m, threads, thread_state);
if (name < m)
{
libsais_gather_marked_lms_suffixes_8u_omp(SA, n, m, fs, threads, thread_state);
libsais_gather_marked_lms_suffixes_omp(SA, n, m, fs, threads, thread_state);
}
else
{
Expand Down Expand Up @@ -6279,17 +6279,22 @@ static sa_sint_t libsais_main_32s_recursion(sa_sint_t * RESTRICT T, sa_sint_t *
sa_sint_t left_suffixes_count = libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(T, k, buckets, first_lms_suffix);

libsais_radix_sort_lms_suffixes_32s_6k_omp(T, SA, n, m, &buckets[4 * (fast_sint_t)k], threads, thread_state);
libsais_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * (fast_sint_t)k], threads);

if ((n / 8192) < k) { libsais_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * (fast_sint_t)k], threads); }
if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); }

libsais_initialize_buckets_for_partial_sorting_32s_6k(T, k, buckets, first_lms_suffix, left_suffixes_count);
libsais_induce_partial_order_32s_6k_omp(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state);

sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state);
sa_sint_t names = (n / 8192) < k
? libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state)
: libsais_renumber_and_gather_lms_suffixes_omp(SA, n, m, fs, threads, thread_state);

if (names < m)
{
sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
sa_sint_t f = (n / 8192) < k
? libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state)
: 0;

if (libsais_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0)
{
Expand Down Expand Up @@ -6486,7 +6491,7 @@ static sa_sint_t libsais_main_8u(const uint8_t * T, sa_sint_t * SA, sa_sint_t n,
libsais_initialize_buckets_for_partial_sorting_8u(T, buckets, first_lms_suffix, left_suffixes_count);
libsais_induce_partial_order_8u_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state);

sa_sint_t names = libsais_renumber_and_gather_lms_suffixes_8u_omp(SA, n, m, fs, threads, thread_state);
sa_sint_t names = libsais_renumber_and_gather_lms_suffixes_omp(SA, n, m, fs, threads, thread_state);
if (names < m)
{
if (libsais_main_32s_entry(SA + n + fs - m, SA, m, names, fs + n - 2 * m, threads, thread_state) != 0)
Expand Down
27 changes: 16 additions & 11 deletions src/libsais16.c
Original file line number Diff line number Diff line change
Expand Up @@ -3859,7 +3859,7 @@ static sa_sint_t libsais16_renumber_lms_suffixes_16u(sa_sint_t * RESTRICT SA, sa
return name;
}

static fast_sint_t libsais16_gather_marked_suffixes_16u(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
static fast_sint_t libsais16_gather_marked_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
const fast_sint_t prefetch_distance = 32;

Expand Down Expand Up @@ -3937,7 +3937,7 @@ static sa_sint_t libsais16_renumber_lms_suffixes_16u_omp(sa_sint_t * RESTRICT SA
return name;
}

static void libsais16_gather_marked_lms_suffixes_16u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
static void libsais16_gather_marked_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
#pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072)
Expand All @@ -3958,20 +3958,20 @@ static void libsais16_gather_marked_lms_suffixes_16u_omp(sa_sint_t * RESTRICT SA

if (omp_num_threads == 1)
{
libsais16_gather_marked_suffixes_16u(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size);
libsais16_gather_marked_lms_suffixes(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size);
}
#if defined(LIBSAIS_OPENMP)
else
{
{
if (omp_thread_num < omp_num_threads - 1)
{
thread_state[omp_thread_num].state.position = libsais16_gather_marked_suffixes_16u(SA, m, (fast_sint_t)m + omp_block_start + omp_block_size, omp_block_start, omp_block_size);
thread_state[omp_thread_num].state.position = libsais16_gather_marked_lms_suffixes(SA, m, (fast_sint_t)m + omp_block_start + omp_block_size, omp_block_start, omp_block_size);
thread_state[omp_thread_num].state.count = (fast_sint_t)m + omp_block_start + omp_block_size - thread_state[omp_thread_num].state.position;
}
else
{
thread_state[omp_thread_num].state.position = libsais16_gather_marked_suffixes_16u(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size);
thread_state[omp_thread_num].state.position = libsais16_gather_marked_lms_suffixes(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size);
thread_state[omp_thread_num].state.count = (fast_sint_t)n + (fast_sint_t)fs - thread_state[omp_thread_num].state.position;
}
}
Expand All @@ -3996,14 +3996,14 @@ static void libsais16_gather_marked_lms_suffixes_16u_omp(sa_sint_t * RESTRICT SA
}
}

static sa_sint_t libsais16_renumber_and_gather_lms_suffixes_16u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
static sa_sint_t libsais16_renumber_and_gather_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));

sa_sint_t name = libsais16_renumber_lms_suffixes_16u_omp(SA, m, threads, thread_state);
if (name < m)
{
libsais16_gather_marked_lms_suffixes_16u_omp(SA, n, m, fs, threads, thread_state);
libsais16_gather_marked_lms_suffixes_omp(SA, n, m, fs, threads, thread_state);
}
else
{
Expand Down Expand Up @@ -6257,17 +6257,22 @@ static sa_sint_t libsais16_main_32s_recursion(sa_sint_t * RESTRICT T, sa_sint_t
sa_sint_t left_suffixes_count = libsais16_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(T, k, buckets, first_lms_suffix);

libsais16_radix_sort_lms_suffixes_32s_6k_omp(T, SA, n, m, &buckets[4 * (fast_sint_t)k], threads, thread_state);
libsais16_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * (fast_sint_t)k], threads);

if ((n / 8192) < k) { libsais16_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * (fast_sint_t)k], threads); }
if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); }

libsais16_initialize_buckets_for_partial_sorting_32s_6k(T, k, buckets, first_lms_suffix, left_suffixes_count);
libsais16_induce_partial_order_32s_6k_omp(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state);

sa_sint_t names = libsais16_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state);
sa_sint_t names = (n / 8192) < k
? libsais16_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state)
: libsais16_renumber_and_gather_lms_suffixes_omp(SA, n, m, fs, threads, thread_state);

if (names < m)
{
sa_sint_t f = libsais16_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
sa_sint_t f = (n / 8192) < k
? libsais16_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state)
: 0;

if (libsais16_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0)
{
Expand Down Expand Up @@ -6464,7 +6469,7 @@ static sa_sint_t libsais16_main_16u(const uint16_t * T, sa_sint_t * SA, sa_sint_
libsais16_initialize_buckets_for_partial_sorting_16u(T, buckets, first_lms_suffix, left_suffixes_count);
libsais16_induce_partial_order_16u_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state);

sa_sint_t names = libsais16_renumber_and_gather_lms_suffixes_16u_omp(SA, n, m, fs, threads, thread_state);
sa_sint_t names = libsais16_renumber_and_gather_lms_suffixes_omp(SA, n, m, fs, threads, thread_state);
if (names < m)
{
if (libsais16_main_32s_entry(SA + n + fs - m, SA, m, names, fs + n - 2 * m, threads, thread_state) != 0)
Expand Down
Loading

0 comments on commit 4283ec2

Please sign in to comment.