From 3847654b17a3123a084f3a3dea375e00355846a8 Mon Sep 17 00:00:00 2001 From: Ilya Grebnov Date: Tue, 18 Apr 2023 22:18:47 -0700 Subject: [PATCH] Version 2.7.2 Fixed out-of-bound memory access issue for large inputs. --- CHANGES | 3 ++ Makefile | 54 ------------------------ README.md | 2 + VERSION | 2 +- src/libsais.c => libsais.c | 76 +++++++++++++++++----------------- src/libsais.h => libsais.h | 4 +- src/libsais16.c => libsais16.c | 76 +++++++++++++++++----------------- src/libsais16.h => libsais16.h | 4 +- src/libsais64.c => libsais64.c | 76 +++++++++++++++++----------------- src/libsais64.h => libsais64.h | 4 +- 10 files changed, 126 insertions(+), 175 deletions(-) delete mode 100644 Makefile rename src/libsais.c => libsais.c (99%) rename src/libsais.h => libsais.h (99%) rename src/libsais16.c => libsais16.c (99%) rename src/libsais16.h => libsais16.h (99%) rename src/libsais64.c => libsais64.c (99%) rename src/libsais64.h => libsais64.h (99%) diff --git a/CHANGES b/CHANGES index c41b978..ba816ad 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,6 @@ +Changes in 2.7.2 (April 18, 2023) +- Fixed out-of-bound memory access issue for large inputs (libsais64). + Changes in 2.7.1 (June 19, 2022) - Improved cache coherence for ARMv8 architecture. diff --git a/Makefile b/Makefile deleted file mode 100644 index 82480b7..0000000 --- a/Makefile +++ /dev/null @@ -1,54 +0,0 @@ -PROJECT=sais -PLIBNAME=lib$(PROJECT) -PVER=2.7.1 -PSOVER=2 -ifeq ($(OS),Windows_NT) - PLIBSTATIC=$(PROJECT).a - PLIBSHARED=$(PROJECT)-$(PVER).dll -else - PLIBSTATIC=$(PLIBNAME).a - PLIBSHARED=$(PLIBNAME).so.$(PSOVER) -endif -PLIBS=$(PLIBSTATIC) $(PLIBSHARED) -CC=gcc -CFLAGS?=-Wall -O2 -LDFLAGS?=-lm -AR?=ar -INSTALL?=install -RM?=rm -f -RMD?=$(RM) -r -PREFIX?=/usr/local -SRCS=src -DOCS?=share/doc/$(LIBNAME) -LIBS?=lib -INCLUDES?=include -MANS?=man/man1 - -all: $(PLIBS) - -$(SRCS)/$(PLIBNAME).o: $(SRCS)/$(PLIBNAME).c - $(CC) $(CFLAGS) -c -o $@ $^ - -$(PLIBSTATIC): $(SRCS)/$(PLIBNAME).o - $(AR) rcs $@ $^ - -$(PLIBSHARED): $(SRCS)/$(PLIBNAME).o - $(CC) $(CFLAGS) -shared -Wl,-soname,$@ $^ -o $@ - -clean: - $(RM) $(SRCS)/$(PLIBNAME).o $(PLIBS) - -install: - $(INSTALL) -d $(PREFIX)/$(LIBS) - $(INSTALL) -d $(PREFIX)/$(INCLUDES) - $(INSTALL) -d $(PREFIX)/$(MANS) - $(INSTALL) -d $(PREFIX)/$(DOCS) - $(INSTALL) -m 0644 $(PLIBS) $(PREFIX)/$(LIBS) - $(INSTALL) -m 0644 $(SRCS)/$(PLIBNAME).h $(PREFIX)/$(INCLUDES) - $(INSTALL) -m 0644 CHANGES LICENSE README.md VERSION $(PREFIX)/$(DOCS) - -uninstall: - $(RM) $(PREFIX)/$(LIBS)/$(PLIBSTATIC) - $(RM) $(PREFIX)/$(LIBS)/$(PLIBSHARED) - $(RM) $(PREFIX)/$(INCLUDES)/$(SRCS)/$(PLIBNAME).h - $(RMD) $(PREFIX)/$(DOCS) diff --git a/README.md b/README.md index c869a9c..83f993b 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,8 @@ The libsais provides simple C99 API to construct suffix array and Burrows-Wheele The libsais is released under the [Apache License Version 2.0](LICENSE "Apache license") ## Changes +* April 18, 2023 (2.7.2) + * Fixed out-of-bound memory access issue for large inputs (libsais64). * June 19, 2022 (2.7.1) * Improved cache coherence for ARMv8 architecture. * April 12, 2022 (2.7.0) diff --git a/VERSION b/VERSION index 5588ae8..fbafd6b 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.7.1 \ No newline at end of file +2.7.2 \ No newline at end of file diff --git a/src/libsais.c b/libsais.c similarity index 99% rename from src/libsais.c rename to libsais.c index fcbfe7e..6d4c0e1 100644 --- a/src/libsais.c +++ b/libsais.c @@ -1126,7 +1126,7 @@ static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(const else { fast_sint_t bucket_size = 2 * (fast_sint_t)k; - fast_sint_t bucket_stride = libsais_get_bucket_stride(buckets - &SA[n + n], bucket_size, omp_num_threads); + fast_sint_t bucket_stride = libsais_get_bucket_stride(buckets - &SA[(fast_sint_t)n + (fast_sint_t)n], bucket_size, omp_num_threads); { thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size; @@ -1310,7 +1310,7 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_omp(const sa_sint_ static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(_OPENMP) - sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n + n]) / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; } + sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[(fast_sint_t)n + (fast_sint_t)n]) / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; } if (max_threads > 1 && n >= 65536 && n / k >= 2) { if (max_threads > n / 8 / k) { max_threads = n / 8 / k; } @@ -1381,8 +1381,8 @@ static void libsais_initialize_buckets_start_and_end_8u(sa_sint_t * RESTRICT buc static void libsais_initialize_buckets_start_and_end_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets) { - sa_sint_t * RESTRICT bucket_start = &buckets[4 * k]; - sa_sint_t * RESTRICT bucket_end = &buckets[5 * k]; + sa_sint_t * RESTRICT bucket_start = &buckets[4 * (fast_sint_t)k]; + sa_sint_t * RESTRICT bucket_end = &buckets[5 * (fast_sint_t)k]; fast_sint_t i, j; sa_sint_t sum = 0; for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) @@ -1395,8 +1395,8 @@ static void libsais_initialize_buckets_start_and_end_32s_6k(sa_sint_t k, sa_sint static void libsais_initialize_buckets_start_and_end_32s_4k(sa_sint_t k, sa_sint_t * RESTRICT buckets) { - sa_sint_t * RESTRICT bucket_start = &buckets[2 * k]; - sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + sa_sint_t * RESTRICT bucket_start = &buckets[2 * (fast_sint_t)k]; + sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k]; fast_sint_t i, j; sa_sint_t sum = 0; for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1) @@ -1501,7 +1501,7 @@ static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(c } { - sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k]; fast_sint_t i, j; sa_sint_t sum = 0; for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) @@ -1515,8 +1515,8 @@ static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(c static void libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix) { - sa_sint_t * RESTRICT bucket_start = &buckets[2 * k]; - sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + sa_sint_t * RESTRICT bucket_start = &buckets[2 * (fast_sint_t)k]; + sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k]; buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++; buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--; @@ -2062,7 +2062,7 @@ static void libsais_initialize_buckets_for_partial_sorting_8u(const uint8_t * RE static void libsais_initialize_buckets_for_partial_sorting_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count) { - sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k]; fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0, sum2 = 0; for (first_lms_suffix = T[first_lms_suffix], i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4((fast_sint_t)first_lms_suffix - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) @@ -2350,8 +2350,8 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k(const sa_sint { const fast_sint_t prefetch_distance = 32; - sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k]; - sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + sa_sint_t * RESTRICT induction_bucket = &buckets[2 * (fast_sint_t)k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k]; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) @@ -2523,8 +2523,8 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_sort(co { const fast_sint_t prefetch_distance = 32; - sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k]; - sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + sa_sint_t * RESTRICT induction_bucket = &buckets[2 * (fast_sint_t)k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k]; fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size; for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) @@ -2782,8 +2782,8 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_omp(const sa_ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { - sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k]; - sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + sa_sint_t * RESTRICT induction_bucket = &buckets[2 * (fast_sint_t)k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k]; SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)) | SUFFIX_GROUP_MARKER; distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] < T[n - 1])] = ++d; @@ -2871,7 +2871,7 @@ static void libsais_partial_sorting_shift_markers_32s_6k_omp(sa_sint_t * RESTRIC { const fast_sint_t prefetch_distance = 32; - const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k]; fast_sint_t c; @@ -2923,7 +2923,7 @@ static void libsais_partial_sorting_shift_markers_32s_4k(sa_sint_t * RESTRICT SA static void libsais_partial_sorting_shift_buckets_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets) { - sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k]; fast_sint_t i; for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) @@ -3179,8 +3179,8 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(const sa_sint { const fast_sint_t prefetch_distance = 32; - sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k]; - sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + sa_sint_t * RESTRICT induction_bucket = &buckets[3 * (fast_sint_t)k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k]; fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) @@ -3352,8 +3352,8 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(co { const fast_sint_t prefetch_distance = 32; - sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k]; - sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + sa_sint_t * RESTRICT induction_bucket = &buckets[3 * (fast_sint_t)k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k]; fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) @@ -3832,8 +3832,8 @@ static void libsais_induce_partial_order_32s_4k_omp(const sa_sint_t * RESTRICT T static void libsais_induce_partial_order_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { - libsais_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, &buckets[1 * k], threads, thread_state); - libsais_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, &buckets[0 * k], threads, thread_state); + libsais_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, &buckets[1 * (fast_sint_t)k], threads, thread_state); + libsais_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, &buckets[0 * (fast_sint_t)k], threads, thread_state); libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state); } @@ -4371,7 +4371,7 @@ static void libsais_place_lms_suffixes_interval_8u(sa_sint_t * RESTRICT SA, sa_s static void libsais_place_lms_suffixes_interval_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) { - const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + const sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k]; fast_sint_t c, j = n; for (c = (fast_sint_t)k - 2; c >= 0; --c) @@ -4448,7 +4448,7 @@ static void libsais_place_lms_suffixes_interval_32s_1k(const sa_sint_t * RESTRIC static void libsais_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) { - const sa_sint_t * RESTRICT bucket_end = &buckets[5 * k]; + const sa_sint_t * RESTRICT bucket_end = &buckets[5 * (fast_sint_t)k]; fast_sint_t c, j = n; for (c = (fast_sint_t)k - 2; c >= 0; --c) @@ -4471,7 +4471,7 @@ static void libsais_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT SA, static void libsais_place_lms_suffixes_histogram_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) { - const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + const sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k]; fast_sint_t c, j = n; for (c = (fast_sint_t)k - 2; c >= 0; --c) @@ -5830,20 +5830,20 @@ static sa_sint_t libsais_induce_final_order_8u_omp(const uint8_t * RESTRICT T, s static void libsais_induce_final_order_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { - libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[4 * k], threads, thread_state); - libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[5 * k], threads, thread_state); + libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[4 * (fast_sint_t)k], threads, thread_state); + libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[5 * (fast_sint_t)k], threads, thread_state); } static void libsais_induce_final_order_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { - libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[2 * k], threads, thread_state); - libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[3 * k], threads, thread_state); + libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[2 * (fast_sint_t)k], threads, thread_state); + libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[3 * (fast_sint_t)k], threads, thread_state); } static void libsais_induce_final_order_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { - libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[1 * k], threads, thread_state); - libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[0 * k], threads, thread_state); + libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[1 * (fast_sint_t)k], threads, thread_state); + libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[0 * (fast_sint_t)k], threads, thread_state); } static void libsais_induce_final_order_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) @@ -6266,7 +6266,7 @@ static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT S if (k > 0 && fs / k >= 6) { sa_sint_t alignment = (fs - 1024) / k >= 6 ? 1024 : 16; - sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 6 ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 6 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 6 * k]; + sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 6 ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 6 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 6 * (fast_sint_t)k]; sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state); if (m > 1) @@ -6276,8 +6276,8 @@ static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT S sa_sint_t first_lms_suffix = SA[n - m]; sa_sint_t left_suffixes_count = libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(T, k, buckets, first_lms_suffix); - libsais_radix_sort_lms_suffixes_32s_6k_omp(T, SA, n, m, &buckets[4 * k], threads, thread_state); - libsais_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * k], threads); + libsais_radix_sort_lms_suffixes_32s_6k_omp(T, SA, n, m, &buckets[4 * (fast_sint_t)k], threads, thread_state); + libsais_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * (fast_sint_t)k], threads); if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); } @@ -6319,7 +6319,7 @@ static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT S else if (k > 0 && fs / k >= 4) { sa_sint_t alignment = (fs - 1024) / k >= 4 ? 1024 : 16; - sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 4 ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 4 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 4 * k]; + sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 4 ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 4 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 4 * (fast_sint_t)k]; sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); if (m > 1) @@ -6363,7 +6363,7 @@ static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT S else if (k > 0 && fs / k >= 2) { sa_sint_t alignment = (fs - 1024) / k >= 2 ? 1024 : 16; - sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 2 ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 2 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 2 * k]; + sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 2 ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 2 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 2 * (fast_sint_t)k]; sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); if (m > 1) diff --git a/src/libsais.h b/libsais.h similarity index 99% rename from src/libsais.h rename to libsais.h index c931701..f259fd0 100644 --- a/src/libsais.h +++ b/libsais.h @@ -26,8 +26,8 @@ Please see the file LICENSE for full copyright information. #define LIBSAIS_VERSION_MAJOR 2 #define LIBSAIS_VERSION_MINOR 7 -#define LIBSAIS_VERSION_PATCH 1 -#define LIBSAIS_VERSION_STRING "2.7.1" +#define LIBSAIS_VERSION_PATCH 2 +#define LIBSAIS_VERSION_STRING "2.7.2" #ifdef __cplusplus extern "C" { diff --git a/src/libsais16.c b/libsais16.c similarity index 99% rename from src/libsais16.c rename to libsais16.c index 2c2baca..381db0e 100644 --- a/src/libsais16.c +++ b/libsais16.c @@ -1104,7 +1104,7 @@ static void libsais16_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(cons else { fast_sint_t bucket_size = 2 * (fast_sint_t)k; - fast_sint_t bucket_stride = libsais16_get_bucket_stride(buckets - &SA[n + n], bucket_size, omp_num_threads); + fast_sint_t bucket_stride = libsais16_get_bucket_stride(buckets - &SA[(fast_sint_t)n + (fast_sint_t)n], bucket_size, omp_num_threads); { thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size; @@ -1288,7 +1288,7 @@ static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_2k_omp(const sa_sin static void libsais16_count_and_gather_compacted_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(_OPENMP) - sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n + n]) / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; } + sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[(fast_sint_t)n + (fast_sint_t)n]) / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; } if (max_threads > 1 && n >= 65536 && n / k >= 2) { if (max_threads > n / 8 / k) { max_threads = n / 8 / k; } @@ -1359,8 +1359,8 @@ static void libsais16_initialize_buckets_start_and_end_16u(sa_sint_t * RESTRICT static void libsais16_initialize_buckets_start_and_end_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets) { - sa_sint_t * RESTRICT bucket_start = &buckets[4 * k]; - sa_sint_t * RESTRICT bucket_end = &buckets[5 * k]; + sa_sint_t * RESTRICT bucket_start = &buckets[4 * (fast_sint_t)k]; + sa_sint_t * RESTRICT bucket_end = &buckets[5 * (fast_sint_t)k]; fast_sint_t i, j; sa_sint_t sum = 0; for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) @@ -1373,8 +1373,8 @@ static void libsais16_initialize_buckets_start_and_end_32s_6k(sa_sint_t k, sa_si static void libsais16_initialize_buckets_start_and_end_32s_4k(sa_sint_t k, sa_sint_t * RESTRICT buckets) { - sa_sint_t * RESTRICT bucket_start = &buckets[2 * k]; - sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + sa_sint_t * RESTRICT bucket_start = &buckets[2 * (fast_sint_t)k]; + sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k]; fast_sint_t i, j; sa_sint_t sum = 0; for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1) @@ -1479,7 +1479,7 @@ static sa_sint_t libsais16_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k } { - sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k]; fast_sint_t i, j; sa_sint_t sum = 0; for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) @@ -1493,8 +1493,8 @@ static sa_sint_t libsais16_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k static void libsais16_initialize_buckets_for_radix_and_partial_sorting_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix) { - sa_sint_t * RESTRICT bucket_start = &buckets[2 * k]; - sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + sa_sint_t * RESTRICT bucket_start = &buckets[2 * (fast_sint_t)k]; + sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k]; buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++; buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--; @@ -2040,7 +2040,7 @@ static void libsais16_initialize_buckets_for_partial_sorting_16u(const uint16_t static void libsais16_initialize_buckets_for_partial_sorting_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count) { - sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k]; fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0, sum2 = 0; for (first_lms_suffix = T[first_lms_suffix], i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4((fast_sint_t)first_lms_suffix - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) @@ -2328,8 +2328,8 @@ static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_4k(const sa_si { const fast_sint_t prefetch_distance = 32; - sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k]; - sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + sa_sint_t * RESTRICT induction_bucket = &buckets[2 * (fast_sint_t)k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k]; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) @@ -2501,8 +2501,8 @@ static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_4k_block_sort( { const fast_sint_t prefetch_distance = 32; - sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k]; - sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + sa_sint_t * RESTRICT induction_bucket = &buckets[2 * (fast_sint_t)k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k]; fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size; for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) @@ -2760,8 +2760,8 @@ static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_6k_omp(const s static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { - sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k]; - sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + sa_sint_t * RESTRICT induction_bucket = &buckets[2 * (fast_sint_t)k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k]; SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)) | SUFFIX_GROUP_MARKER; distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] < T[n - 1])] = ++d; @@ -2849,7 +2849,7 @@ static void libsais16_partial_sorting_shift_markers_32s_6k_omp(sa_sint_t * RESTR { const fast_sint_t prefetch_distance = 32; - const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k]; fast_sint_t c; @@ -2901,7 +2901,7 @@ static void libsais16_partial_sorting_shift_markers_32s_4k(sa_sint_t * RESTRICT static void libsais16_partial_sorting_shift_buckets_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets) { - sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k]; fast_sint_t i; for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) @@ -3157,8 +3157,8 @@ static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_4k(const sa_si { const fast_sint_t prefetch_distance = 32; - sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k]; - sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + sa_sint_t * RESTRICT induction_bucket = &buckets[3 * (fast_sint_t)k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k]; fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) @@ -3330,8 +3330,8 @@ static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_4k_block_sort( { const fast_sint_t prefetch_distance = 32; - sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k]; - sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + sa_sint_t * RESTRICT induction_bucket = &buckets[3 * (fast_sint_t)k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k]; fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) @@ -3810,8 +3810,8 @@ static void libsais16_induce_partial_order_32s_4k_omp(const sa_sint_t * RESTRICT static void libsais16_induce_partial_order_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { - libsais16_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, &buckets[1 * k], threads, thread_state); - libsais16_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, &buckets[0 * k], threads, thread_state); + libsais16_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, &buckets[1 * (fast_sint_t)k], threads, thread_state); + libsais16_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, &buckets[0 * (fast_sint_t)k], threads, thread_state); libsais16_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state); } @@ -4349,7 +4349,7 @@ static void libsais16_place_lms_suffixes_interval_16u(sa_sint_t * RESTRICT SA, s static void libsais16_place_lms_suffixes_interval_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) { - const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + const sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k]; fast_sint_t c, j = n; for (c = (fast_sint_t)k - 2; c >= 0; --c) @@ -4426,7 +4426,7 @@ static void libsais16_place_lms_suffixes_interval_32s_1k(const sa_sint_t * RESTR static void libsais16_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) { - const sa_sint_t * RESTRICT bucket_end = &buckets[5 * k]; + const sa_sint_t * RESTRICT bucket_end = &buckets[5 * (fast_sint_t)k]; fast_sint_t c, j = n; for (c = (fast_sint_t)k - 2; c >= 0; --c) @@ -4449,7 +4449,7 @@ static void libsais16_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT S static void libsais16_place_lms_suffixes_histogram_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) { - const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + const sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k]; fast_sint_t c, j = n; for (c = (fast_sint_t)k - 2; c >= 0; --c) @@ -5808,20 +5808,20 @@ static sa_sint_t libsais16_induce_final_order_16u_omp(const uint16_t * RESTRICT static void libsais16_induce_final_order_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { - libsais16_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[4 * k], threads, thread_state); - libsais16_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[5 * k], threads, thread_state); + libsais16_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[4 * (fast_sint_t)k], threads, thread_state); + libsais16_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[5 * (fast_sint_t)k], threads, thread_state); } static void libsais16_induce_final_order_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { - libsais16_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[2 * k], threads, thread_state); - libsais16_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[3 * k], threads, thread_state); + libsais16_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[2 * (fast_sint_t)k], threads, thread_state); + libsais16_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[3 * (fast_sint_t)k], threads, thread_state); } static void libsais16_induce_final_order_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { - libsais16_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[1 * k], threads, thread_state); - libsais16_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[0 * k], threads, thread_state); + libsais16_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[1 * (fast_sint_t)k], threads, thread_state); + libsais16_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[0 * (fast_sint_t)k], threads, thread_state); } static void libsais16_induce_final_order_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) @@ -6244,7 +6244,7 @@ static sa_sint_t libsais16_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT if (k > 0 && fs / k >= 6) { sa_sint_t alignment = (fs - 1024) / k >= 6 ? 1024 : 16; - sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 6 ? (sa_sint_t *)libsais16_align_up(&SA[n + fs - 6 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 6 * k]; + sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 6 ? (sa_sint_t *)libsais16_align_up(&SA[n + fs - 6 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 6 * (fast_sint_t)k]; sa_sint_t m = libsais16_count_and_gather_lms_suffixes_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state); if (m > 1) @@ -6254,8 +6254,8 @@ static sa_sint_t libsais16_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT sa_sint_t first_lms_suffix = SA[n - m]; sa_sint_t left_suffixes_count = libsais16_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(T, k, buckets, first_lms_suffix); - libsais16_radix_sort_lms_suffixes_32s_6k_omp(T, SA, n, m, &buckets[4 * k], threads, thread_state); - libsais16_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * k], threads); + libsais16_radix_sort_lms_suffixes_32s_6k_omp(T, SA, n, m, &buckets[4 * (fast_sint_t)k], threads, thread_state); + libsais16_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * (fast_sint_t)k], threads); if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); } @@ -6297,7 +6297,7 @@ static sa_sint_t libsais16_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT else if (k > 0 && fs / k >= 4) { sa_sint_t alignment = (fs - 1024) / k >= 4 ? 1024 : 16; - sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 4 ? (sa_sint_t *)libsais16_align_up(&SA[n + fs - 4 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 4 * k]; + sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 4 ? (sa_sint_t *)libsais16_align_up(&SA[n + fs - 4 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 4 * (fast_sint_t)k]; sa_sint_t m = libsais16_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); if (m > 1) @@ -6341,7 +6341,7 @@ static sa_sint_t libsais16_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT else if (k > 0 && fs / k >= 2) { sa_sint_t alignment = (fs - 1024) / k >= 2 ? 1024 : 16; - sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 2 ? (sa_sint_t *)libsais16_align_up(&SA[n + fs - 2 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 2 * k]; + sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 2 ? (sa_sint_t *)libsais16_align_up(&SA[n + fs - 2 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 2 * (fast_sint_t)k]; sa_sint_t m = libsais16_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); if (m > 1) diff --git a/src/libsais16.h b/libsais16.h similarity index 99% rename from src/libsais16.h rename to libsais16.h index 11bc94f..2874632 100644 --- a/src/libsais16.h +++ b/libsais16.h @@ -26,8 +26,8 @@ Please see the file LICENSE for full copyright information. #define LIBSAIS16_VERSION_MAJOR 2 #define LIBSAIS16_VERSION_MINOR 7 -#define LIBSAIS16_VERSION_PATCH 1 -#define LIBSAIS16_VERSION_STRING "2.7.1" +#define LIBSAIS16_VERSION_PATCH 2 +#define LIBSAIS16_VERSION_STRING "2.7.2" #ifdef __cplusplus extern "C" { diff --git a/src/libsais64.c b/libsais64.c similarity index 99% rename from src/libsais64.c rename to libsais64.c index 72786f6..0e4defc 100644 --- a/src/libsais64.c +++ b/libsais64.c @@ -1096,7 +1096,7 @@ static void libsais64_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(cons else { fast_sint_t bucket_size = 2 * (fast_sint_t)k; - fast_sint_t bucket_stride = libsais64_get_bucket_stride(buckets - &SA[n + n], bucket_size, omp_num_threads); + fast_sint_t bucket_stride = libsais64_get_bucket_stride(buckets - &SA[(fast_sint_t)n + (fast_sint_t)n], bucket_size, omp_num_threads); { thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size; @@ -1280,7 +1280,7 @@ static sa_sint_t libsais64_count_and_gather_lms_suffixes_32s_2k_omp(const sa_sin static void libsais64_count_and_gather_compacted_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(_OPENMP) - sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n + n]) / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; } + sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[(fast_sint_t)n + (fast_sint_t)n]) / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; } if (max_threads > 1 && n >= 65536 && n / k >= 2) { if (max_threads > n / 8 / k) { max_threads = n / 8 / k; } @@ -1351,8 +1351,8 @@ static void libsais64_initialize_buckets_start_and_end_8u(sa_sint_t * RESTRICT b static void libsais64_initialize_buckets_start_and_end_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets) { - sa_sint_t * RESTRICT bucket_start = &buckets[4 * k]; - sa_sint_t * RESTRICT bucket_end = &buckets[5 * k]; + sa_sint_t * RESTRICT bucket_start = &buckets[4 * (fast_sint_t)k]; + sa_sint_t * RESTRICT bucket_end = &buckets[5 * (fast_sint_t)k]; fast_sint_t i, j; sa_sint_t sum = 0; for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) @@ -1365,8 +1365,8 @@ static void libsais64_initialize_buckets_start_and_end_32s_6k(sa_sint_t k, sa_si static void libsais64_initialize_buckets_start_and_end_32s_4k(sa_sint_t k, sa_sint_t * RESTRICT buckets) { - sa_sint_t * RESTRICT bucket_start = &buckets[2 * k]; - sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + sa_sint_t * RESTRICT bucket_start = &buckets[2 * (fast_sint_t)k]; + sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k]; fast_sint_t i, j; sa_sint_t sum = 0; for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1) @@ -1471,7 +1471,7 @@ static sa_sint_t libsais64_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k } { - sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k]; fast_sint_t i, j; sa_sint_t sum = 0; for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) @@ -1485,8 +1485,8 @@ static sa_sint_t libsais64_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k static void libsais64_initialize_buckets_for_radix_and_partial_sorting_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix) { - sa_sint_t * RESTRICT bucket_start = &buckets[2 * k]; - sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + sa_sint_t * RESTRICT bucket_start = &buckets[2 * (fast_sint_t)k]; + sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k]; buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++; buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--; @@ -2032,7 +2032,7 @@ static void libsais64_initialize_buckets_for_partial_sorting_8u(const uint8_t * static void libsais64_initialize_buckets_for_partial_sorting_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count) { - sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k]; fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0, sum2 = 0; for (first_lms_suffix = T[first_lms_suffix], i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4((fast_sint_t)first_lms_suffix - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) @@ -2320,8 +2320,8 @@ static sa_sint_t libsais64_partial_sorting_scan_left_to_right_32s_4k(const sa_si { const fast_sint_t prefetch_distance = 32; - sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k]; - sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + sa_sint_t * RESTRICT induction_bucket = &buckets[2 * (fast_sint_t)k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k]; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) @@ -2493,8 +2493,8 @@ static sa_sint_t libsais64_partial_sorting_scan_left_to_right_32s_4k_block_sort( { const fast_sint_t prefetch_distance = 32; - sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k]; - sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + sa_sint_t * RESTRICT induction_bucket = &buckets[2 * (fast_sint_t)k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k]; fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size; for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) @@ -2752,8 +2752,8 @@ static sa_sint_t libsais64_partial_sorting_scan_left_to_right_32s_6k_omp(const s static sa_sint_t libsais64_partial_sorting_scan_left_to_right_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { - sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k]; - sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + sa_sint_t * RESTRICT induction_bucket = &buckets[2 * (fast_sint_t)k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k]; SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)) | SUFFIX_GROUP_MARKER; distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] < T[n - 1])] = ++d; @@ -2841,7 +2841,7 @@ static void libsais64_partial_sorting_shift_markers_32s_6k_omp(sa_sint_t * RESTR { const fast_sint_t prefetch_distance = 32; - const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k]; fast_sint_t c; @@ -2893,7 +2893,7 @@ static void libsais64_partial_sorting_shift_markers_32s_4k(sa_sint_t * RESTRICT static void libsais64_partial_sorting_shift_buckets_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets) { - sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; + sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k]; fast_sint_t i; for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) @@ -3149,8 +3149,8 @@ static sa_sint_t libsais64_partial_sorting_scan_right_to_left_32s_4k(const sa_si { const fast_sint_t prefetch_distance = 32; - sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k]; - sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + sa_sint_t * RESTRICT induction_bucket = &buckets[3 * (fast_sint_t)k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k]; fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) @@ -3322,8 +3322,8 @@ static sa_sint_t libsais64_partial_sorting_scan_right_to_left_32s_4k_block_sort( { const fast_sint_t prefetch_distance = 32; - sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k]; - sa_sint_t * RESTRICT distinct_names = &buckets[0 * k]; + sa_sint_t * RESTRICT induction_bucket = &buckets[3 * (fast_sint_t)k]; + sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k]; fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) @@ -3802,8 +3802,8 @@ static void libsais64_induce_partial_order_32s_4k_omp(const sa_sint_t * RESTRICT static void libsais64_induce_partial_order_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { - libsais64_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, &buckets[1 * k], threads, thread_state); - libsais64_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, &buckets[0 * k], threads, thread_state); + libsais64_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, &buckets[1 * (fast_sint_t)k], threads, thread_state); + libsais64_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, &buckets[0 * (fast_sint_t)k], threads, thread_state); libsais64_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state); } @@ -4341,7 +4341,7 @@ static void libsais64_place_lms_suffixes_interval_8u(sa_sint_t * RESTRICT SA, sa static void libsais64_place_lms_suffixes_interval_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) { - const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + const sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k]; fast_sint_t c, j = n; for (c = (fast_sint_t)k - 2; c >= 0; --c) @@ -4418,7 +4418,7 @@ static void libsais64_place_lms_suffixes_interval_32s_1k(const sa_sint_t * RESTR static void libsais64_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) { - const sa_sint_t * RESTRICT bucket_end = &buckets[5 * k]; + const sa_sint_t * RESTRICT bucket_end = &buckets[5 * (fast_sint_t)k]; fast_sint_t c, j = n; for (c = (fast_sint_t)k - 2; c >= 0; --c) @@ -4441,7 +4441,7 @@ static void libsais64_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT S static void libsais64_place_lms_suffixes_histogram_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) { - const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k]; + const sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k]; fast_sint_t c, j = n; for (c = (fast_sint_t)k - 2; c >= 0; --c) @@ -5800,20 +5800,20 @@ static sa_sint_t libsais64_induce_final_order_8u_omp(const uint8_t * RESTRICT T, static void libsais64_induce_final_order_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { - libsais64_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[4 * k], threads, thread_state); - libsais64_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[5 * k], threads, thread_state); + libsais64_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[4 * (fast_sint_t)k], threads, thread_state); + libsais64_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[5 * (fast_sint_t)k], threads, thread_state); } static void libsais64_induce_final_order_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { - libsais64_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[2 * k], threads, thread_state); - libsais64_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[3 * k], threads, thread_state); + libsais64_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[2 * (fast_sint_t)k], threads, thread_state); + libsais64_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[3 * (fast_sint_t)k], threads, thread_state); } static void libsais64_induce_final_order_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { - libsais64_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[1 * k], threads, thread_state); - libsais64_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[0 * k], threads, thread_state); + libsais64_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[1 * (fast_sint_t)k], threads, thread_state); + libsais64_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[0 * (fast_sint_t)k], threads, thread_state); } static void libsais64_induce_final_order_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) @@ -6302,7 +6302,7 @@ static sa_sint_t libsais64_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT if (k > 0 && fs / k >= 6) { sa_sint_t alignment = (fs - 1024) / k >= 6 ? 1024 : 16; - sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 6 ? (sa_sint_t *)libsais64_align_up(&SA[n + fs - 6 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 6 * k]; + sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 6 ? (sa_sint_t *)libsais64_align_up(&SA[n + fs - 6 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 6 * (fast_sint_t)k]; sa_sint_t m = libsais64_count_and_gather_lms_suffixes_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state); if (m > 1) @@ -6312,8 +6312,8 @@ static sa_sint_t libsais64_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT sa_sint_t first_lms_suffix = SA[n - m]; sa_sint_t left_suffixes_count = libsais64_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(T, k, buckets, first_lms_suffix); - libsais64_radix_sort_lms_suffixes_32s_6k_omp(T, SA, n, m, &buckets[4 * k], threads, thread_state); - libsais64_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * k], threads); + libsais64_radix_sort_lms_suffixes_32s_6k_omp(T, SA, n, m, &buckets[4 * (fast_sint_t)k], threads, thread_state); + libsais64_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * (fast_sint_t)k], threads); if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); } @@ -6355,7 +6355,7 @@ static sa_sint_t libsais64_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT else if (k > 0 && fs / k >= 4) { sa_sint_t alignment = (fs - 1024) / k >= 4 ? 1024 : 16; - sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 4 ? (sa_sint_t *)libsais64_align_up(&SA[n + fs - 4 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 4 * k]; + sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 4 ? (sa_sint_t *)libsais64_align_up(&SA[n + fs - 4 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 4 * (fast_sint_t)k]; sa_sint_t m = libsais64_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); if (m > 1) @@ -6399,7 +6399,7 @@ static sa_sint_t libsais64_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT else if (k > 0 && fs / k >= 2) { sa_sint_t alignment = (fs - 1024) / k >= 2 ? 1024 : 16; - sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 2 ? (sa_sint_t *)libsais64_align_up(&SA[n + fs - 2 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 2 * k]; + sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 2 ? (sa_sint_t *)libsais64_align_up(&SA[n + fs - 2 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 2 * (fast_sint_t)k]; sa_sint_t m = libsais64_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); if (m > 1) diff --git a/src/libsais64.h b/libsais64.h similarity index 99% rename from src/libsais64.h rename to libsais64.h index 9384974..786a76c 100644 --- a/src/libsais64.h +++ b/libsais64.h @@ -26,8 +26,8 @@ Please see the file LICENSE for full copyright information. #define LIBSAIS64_VERSION_MAJOR 2 #define LIBSAIS64_VERSION_MINOR 7 -#define LIBSAIS64_VERSION_PATCH 1 -#define LIBSAIS64_VERSION_STRING "2.7.1" +#define LIBSAIS64_VERSION_PATCH 2 +#define LIBSAIS64_VERSION_STRING "2.7.2" #ifdef __cplusplus extern "C" {