diff --git a/CHANGES b/CHANGES index 628721a..e2783b2 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,6 @@ +Changes in 3.3.4 (January, 24 2024) +- Implemented GPU acceleration of reverse Burrows–Wheeler transform. + Changes in 3.3.3 (November, 26 2023) - Fixed out-of-bound memory access issue for large inputs. - Slightly improved compression performance. diff --git a/README b/README index 03e5a06..822c3b2 100644 --- a/README +++ b/README @@ -10,7 +10,7 @@ block-sorting data compression algorithms. libbsc is a library based on bsc, it uses the same algorithms as bsc and enables you to compress memory blocks. -Copyright (c) 2009-2023 Ilya Grebnov +Copyright (c) 2009-2024 Ilya Grebnov See file AUTHORS for a full list of contributors. @@ -21,7 +21,7 @@ See the bsc and libbsc web site: Software License: ----------------- -Copyright (c) 2009-2023 Ilya Grebnov +Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -54,7 +54,8 @@ Compression and decompression requirements are the same and in bytes, can be estimated as 16Mb + 5 x block size x number of blocks processed in parallel. GPU memory usage for NVIDIA CUDA technology is different from CPU memory usage -and can be estimated as 20 x block size for ST and 21 x block size for BWT. +and can be estimated as 20 x block size for ST, 21 x block size for forward BWT +and 7 x block size for inverse BWT. NVIDIA GPU acceleration: diff --git a/VERSION b/VERSION index 3f09e91..2c6109e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.3.3 \ No newline at end of file +3.3.4 \ No newline at end of file diff --git a/bsc.cpp b/bsc.cpp index ee9396e..1925291 100644 --- a/bsc.cpp +++ b/bsc.cpp @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -869,8 +869,8 @@ void ProcessCommandline(int argc, char * argv[]) int main(int argc, char * argv[]) { - fprintf(stdout, "This is bsc, Block Sorting Compressor. Version 3.3.3. 26 November 2023.\n"); - fprintf(stdout, "Copyright (c) 2009-2023 Ilya Grebnov .\n\n"); + fprintf(stdout, "This is bsc, Block Sorting Compressor. Version 3.3.4. 24 January 2024.\n"); + fprintf(stdout, "Copyright (c) 2009-2024 Ilya Grebnov .\n\n"); #if defined(_OPENMP) && defined(__INTEL_COMPILER) diff --git a/libbsc/adler32/adler32.cpp b/libbsc/adler32/adler32.cpp index ea0381a..b09b894 100644 --- a/libbsc/adler32/adler32.cpp +++ b/libbsc/adler32/adler32.cpp @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/adler32/adler32.h b/libbsc/adler32/adler32.h index f9458bf..3f24c97 100644 --- a/libbsc/adler32/adler32.h +++ b/libbsc/adler32/adler32.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/bwt/bwt.cpp b/libbsc/bwt/bwt.cpp index b4b44e9..e1e054e 100644 --- a/libbsc/bwt/bwt.cpp +++ b/libbsc/bwt/bwt.cpp @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -226,13 +226,63 @@ int bsc_bwt_encode(unsigned char * T, int n, unsigned char * num_indexes, int * return LIBBSC_NOT_ENOUGH_MEMORY; } +int bsc_bwt_gpu_decode(unsigned char * T, int n, int index, int features) +{ + int result = -1; + +#ifdef LIBBSC_CUDA_SUPPORT + if (features & LIBBSC_FEATURE_CUDA) + { + int storage_approx_length = (n / 3) | 0x1fffff; + +#ifdef LIBBSC_OPENMP + omp_set_lock(&bwt_cuda_lock); + + if (bwt_cuda_device_storage_size < storage_approx_length) + { + if (bwt_cuda_device_storage != NULL) + { + libcubwt_free_device_storage(bwt_cuda_device_storage); + + bwt_cuda_device_storage = NULL; + bwt_cuda_device_storage_size = 0; + } + + if (libcubwt_allocate_device_storage(&bwt_cuda_device_storage, storage_approx_length + (storage_approx_length / 32)) == LIBCUBWT_NO_ERROR) + { + bwt_cuda_device_storage_size = storage_approx_length + (storage_approx_length / 32); + } + } + + if (bwt_cuda_device_storage_size >= storage_approx_length) + { + result = (int)libcubwt_unbwt(bwt_cuda_device_storage, T, T, n, NULL, index); + } + + omp_unset_lock(&bwt_cuda_lock); +#else + void * bwt_cuda_device_storage = NULL; + + if (libcubwt_allocate_device_storage(&bwt_cuda_device_storage, storage_approx_length) == LIBCUBWT_NO_ERROR) + { + result = (int)libcubwt_unbwt(bwt_cuda_device_storage, T, T, n, NULL, index); + + libcubwt_free_device_storage(bwt_cuda_device_storage); + } +#endif + } +#endif + + return result; +} + int bsc_bwt_decode(unsigned char * T, int n, int index, unsigned char num_indexes, int * indexes, int features) { if ((T == NULL) || (n < 0) || (index <= 0) || (index > n)) { return LIBBSC_BAD_PARAMETER; } - if (n <= 1) + if (n <= 1 || bsc_bwt_gpu_decode(T, n, index, features) == 0) { return LIBBSC_NO_ERROR; } diff --git a/libbsc/bwt/bwt.h b/libbsc/bwt/bwt.h index cddbbaf..6a9f25f 100644 --- a/libbsc/bwt/bwt.h +++ b/libbsc/bwt/bwt.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/bwt/libcubwt/AUTHORS b/libbsc/bwt/libcubwt/AUTHORS index 56fe850..b4fb785 100644 --- a/libbsc/bwt/libcubwt/AUTHORS +++ b/libbsc/bwt/libcubwt/AUTHORS @@ -7,4 +7,5 @@ Leyuan Wang, Sean Baxter, John D. Owens, Yury Shukhrov, Rory Mitchell, Jacopo Pantaleoni, Duane Merrill, Georgy Evtushenko, Allison Vacanti, Robert Crovella, - Mark Harris. + Mark Harris, Vitaly Osipov, Andy Adinets, Elias Stehle, + Michael Maniscalco. diff --git a/libbsc/bwt/libcubwt/CHANGES b/libbsc/bwt/libcubwt/CHANGES index 446d5d8..91b895a 100644 --- a/libbsc/bwt/libcubwt/CHANGES +++ b/libbsc/bwt/libcubwt/CHANGES @@ -1,3 +1,6 @@ +Changes in 1.6.0 (January 24, 2024) +- Inverse Burrows-Wheeler transform. + Changes in 1.5.0 (March 24, 2023) - Reduced memory usage and improved performance. diff --git a/libbsc/bwt/libcubwt/VERSION b/libbsc/bwt/libcubwt/VERSION index 3e1ad72..ce6a70b 100644 --- a/libbsc/bwt/libcubwt/VERSION +++ b/libbsc/bwt/libcubwt/VERSION @@ -1 +1 @@ -1.5.0 \ No newline at end of file +1.6.0 \ No newline at end of file diff --git a/libbsc/bwt/libcubwt/libcubwt.cu b/libbsc/bwt/libcubwt/libcubwt.cu index d52b3d1..5bb96a6 100644 --- a/libbsc/bwt/libcubwt/libcubwt.cu +++ b/libbsc/bwt/libcubwt/libcubwt.cu @@ -1,9 +1,9 @@ /*-- This file is a part of libcubwt, a library for CUDA accelerated -burrows wheeler transform construction. +burrows wheeler transform construction and inversion. - Copyright (c) 2022-2023 Ilya Grebnov + Copyright (c) 2022-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -68,6 +68,19 @@ Please see the file LICENSE for full copyright and license details. #define CUDA_WARP_THREADS (32) #define CUDA_DEVICE_PADDING (12 * 768) +#define ALPHABET_SIZE (256) + +template +struct OffsetToPointerOperator +{ + BaseTypeT * d_base; + + __host__ __device__ __forceinline__ BaseTypeT * operator()(const OffsetTypeT& offset) const + { + return d_base + offset; + } +}; + typedef struct LIBCUBWT_DEVICE_STORAGE { void * device_rsort_temp_storage; @@ -98,7 +111,12 @@ typedef struct LIBCUBWT_DEVICE_STORAGE uint2 * device_descriptors_small; void * device_storage; + size_t device_storage_size; + int32_t device_L2_cache_bits; + int32_t device_multiprocessor_count; + int32_t device_multiprocessor_max_blocks; + int32_t device_multiprocessor_max_threads; void * host_pinned_storage; size_t host_pinned_storage_size; @@ -136,7 +154,7 @@ static cudaError_t libcubwt_cuda_safe_call(const char * filename, int32_t line, } template -__device__ __forceinline__ T libcubwt_warp_reduce_sum(T value) +static __device__ __forceinline__ T libcubwt_warp_reduce_sum(T value) { #if CUDA_DEVICE_ARCH >= 800 && !defined(__CUDA__) return __reduce_add_sync((uint32_t)-1, value); @@ -153,7 +171,7 @@ __device__ __forceinline__ T libcubwt_warp_reduce_sum(T value) } template -__device__ __forceinline__ T libcubwt_warp_reduce_max(T value) +static __device__ __forceinline__ T libcubwt_warp_reduce_max(T value) { #if CUDA_DEVICE_ARCH >= 800 && !defined(__CUDA__) return __reduce_max_sync((uint32_t)-1, value); @@ -170,7 +188,7 @@ __device__ __forceinline__ T libcubwt_warp_reduce_max(T value) } template -__device__ __forceinline__ void libcubwt_delay_or_prevent_hoisting(T delay) +static __device__ __forceinline__ void libcubwt_delay_or_prevent_hoisting(T delay) { #if CUDA_DEVICE_ARCH >= 700 __nanosleep(delay); @@ -179,6 +197,71 @@ __device__ __forceinline__ void libcubwt_delay_or_prevent_hoisting(T delay) #endif } +template +static __device__ __forceinline__ uint32_t libcubwt_match_any_sync(const uint32_t sync_mask, const uint32_t input) +{ + uint32_t peers_mask = sync_mask; + + #pragma unroll + for (uint32_t bit = 0; bit < INPUT_BITS; bit += 1) + { + uint32_t peers, bit_mask = 1 << bit; + + asm("{\n" + " .reg .pred p;\n" + " and.b32 %0, %1, %2;\n" + " setp.ne.u32 p, %0, 0;\n" + " vote.ballot.sync.b32 %0, p, %3;\n" + " @!p not.b32 %0, %0;\n" + " }" + : "=r"(peers) : "r"(input), "r"(bit_mask), "r"(sync_mask)); + + peers_mask &= peers; + } + + return peers_mask; +} + +static __device__ __forceinline__ uint32_t libcubwt_xxhash32_b32(uint32_t data, uint32_t seed) +{ + uint32_t x = (data * 0xc2b2ae3du) + seed + (uint32_t)sizeof(data) + 0x165667b1u; + + x = (x << 17) | (x >> 15); + x *= 0x27d4eb2fu; x ^= x >> 15; + x *= 0x85ebca77u; x ^= x >> 13; + x *= 0xc2b2ae3du; x ^= x >> 16; + + return x; +} + +static __device__ __forceinline__ uint4 libcubwt_shift_left_b128(uint4 v, uint32_t shift) +{ + if ((shift & 8) > 0) { v.w = v.y; v.z = v.x; v.y = 0 ; v.x = 0; } + if ((shift & 4) > 0) { v.w = v.z; v.z = v.y; v.y = v.x; v.x = 0; } + + shift = (shift << 3) & 0x18u; + v.w = __funnelshift_l(v.z, v.w, shift); + v.z = __funnelshift_l(v.y, v.z, shift); + v.y = __funnelshift_l(v.x, v.y, shift); + v.x = __funnelshift_l(0x0, v.x, shift); + + return v; +} + +static __device__ __forceinline__ uint4 libcubwt_shift_right_b128(uint4 v, uint32_t shift) +{ + if ((shift & 8) > 0) { v.x = v.z; v.y = v.w; v.z = 0 ; v.w = 0; } + if ((shift & 4) > 0) { v.x = v.y; v.y = v.z; v.z = v.w; v.w = 0; } + + shift = (shift << 3) & 0x18u; + v.x = __funnelshift_r(v.x, v.y, shift); + v.y = __funnelshift_r(v.y, v.z, shift); + v.z = __funnelshift_r(v.z, v.w, shift); + v.w = __funnelshift_r(v.w, 0x0, shift); + + return v; +} + __global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS) static void libcubwt_gather_values_uint32_kernel(const uint32_t * device_idx, const uint32_t * RESTRICT device_src, uint32_t * device_dst, uint32_t m) { @@ -1901,7 +1984,7 @@ static void libcubwt_merge_suffixes_kernel( uint2 mod12_suffix = shared_storage.stage1.suffixes_l[mod12_index]; #pragma unroll - for (uint32_t item = 0; item < 5; ++item) + for (uint32_t item = 0; item < 5; item += 1) { bool predicate = libcubwt_compare_suffixes_kernel(mod0l_suffix, mod0h_suffix, mod12_suffix); @@ -1916,7 +1999,7 @@ static void libcubwt_merge_suffixes_kernel( { #pragma unroll - for (uint32_t item = 0; item < 5; ++item) + for (uint32_t item = 0; item < 5; item += 1) { if (suffixes[item] >= 0x01000000u) { @@ -2169,6 +2252,9 @@ int64_t libcubwt_allocate_device_storage(void ** device_storage, int64_t max_len int32_t cuda_device_capability; libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaGetDevice(&cuda_device_ordinal), status); + libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaDeviceGetAttribute(&storage->device_multiprocessor_max_blocks, cudaDevAttrMaxBlocksPerMultiprocessor, cuda_device_ordinal), status); + libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaDeviceGetAttribute(&storage->device_multiprocessor_max_threads, cudaDevAttrMaxThreadsPerMultiProcessor, cuda_device_ordinal), status); + libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaDeviceGetAttribute(&storage->device_multiprocessor_count, cudaDevAttrMultiProcessorCount, cuda_device_ordinal), status); libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaDeviceGetAttribute(&cuda_device_L2_cache_size, cudaDevAttrL2CacheSize, cuda_device_ordinal), status); libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::PtxVersion(cuda_device_capability, cuda_device_ordinal), status); @@ -2176,7 +2262,7 @@ int64_t libcubwt_allocate_device_storage(void ** device_storage, int64_t max_len { storage->device_L2_cache_bits = 0; while (cuda_device_L2_cache_size >>= 1) { storage->device_L2_cache_bits += 1; }; - storage->cuda_block_threads = (cuda_device_capability == 860 || cuda_device_capability == 870 || cuda_device_capability == 890) ? 768 : 512; + storage->cuda_block_threads = (cuda_device_capability == 860 || cuda_device_capability == 870 || cuda_device_capability == 890) ? 768u : 512u; } } @@ -2210,27 +2296,29 @@ int64_t libcubwt_allocate_device_storage(void ** device_storage, int64_t max_len if (status == cudaSuccess) { - size_t device_storage_size = 0; + storage->device_storage_size = 0; - device_storage_size += storage->device_ssort_temp_storage_size; - device_storage_size += storage->device_rsort_temp_storage_size; + storage->device_storage_size += storage->device_ssort_temp_storage_size; + storage->device_storage_size += storage->device_rsort_temp_storage_size; - device_storage_size += (max_expanded_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint8_t); + storage->device_storage_size += (max_expanded_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint8_t); - device_storage_size += (max_reduced_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint8_t); - device_storage_size += (max_reduced_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint64_t); - device_storage_size += (max_reduced_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint64_t); - device_storage_size += (max_reduced_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint64_t); + storage->device_storage_size += (max_reduced_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint8_t); + storage->device_storage_size += (max_reduced_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint64_t); + storage->device_storage_size += (max_reduced_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint64_t); + storage->device_storage_size += (max_reduced_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint64_t); - device_storage_size += (num_descriptors + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint4); - device_storage_size += (num_descriptors + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint4); - device_storage_size += (num_descriptors + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint2); + storage->device_storage_size += (num_descriptors + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint4); + storage->device_storage_size += (num_descriptors + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint4); + storage->device_storage_size += (num_descriptors + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint2); - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMalloc((void **)&storage->device_storage, device_storage_size), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMalloc((void **)&storage->device_storage, storage->device_storage_size), status); if (status == cudaSuccess) { - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMallocHost((void **)&storage->host_pinned_storage, storage->host_pinned_storage_size = 256 * sizeof(uint32_t)), status); + storage->host_pinned_storage_size = (storage->device_multiprocessor_max_threads * storage->device_multiprocessor_count) * 6 * sizeof(uint32_t); + + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMallocHost((void **)&storage->host_pinned_storage, storage->host_pinned_storage_size), status); status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaStreamCreate(&storage->cuda_stream), status); } } @@ -2335,3 +2423,702 @@ int64_t libcubwt_bwt_aux(void * device_storage, const uint8_t * T, uint8_t * L, return libcubwt_get_error_code(status); } + +__global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS) +static void libcubwt_compute_histogram_offsets(const uint8_t * RESTRICT device_L, const uint32_t n, uint32_t * RESTRICT device_offsets) +{ + __shared__ __align__(32) uint32_t histogram[ALPHABET_SIZE]; + + if (threadIdx.x < ALPHABET_SIZE) + { + histogram[threadIdx.x] = 0; + } + + __syncthreads(); + +#if CUDA_DEVICE_ARCH < 800 + uint32_t byte0 = 0, byte1 = 0, count = 0; +#endif + + for (uint32_t thread_index = blockIdx.x * CUDA_BLOCK_THREADS * 16 + threadIdx.x * 16; thread_index < n; thread_index += gridDim.x * CUDA_BLOCK_THREADS * 16) + { + uint4 bytes = __ldg((uint4 *)(device_L + thread_index)); + +#if CUDA_DEVICE_ARCH < 800 + byte0 = __byte_perm(0, bytes.x, 0x0004); if (byte0 != byte1) { atomicAdd(&histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.x, 0x0005); if (byte1 != byte0) { atomicAdd(&histogram[byte0], count); count = 0; } count += 1; + byte0 = __byte_perm(0, bytes.x, 0x0006); if (byte0 != byte1) { atomicAdd(&histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.x, 0x0007); if (byte1 != byte0) { atomicAdd(&histogram[byte0], count); count = 0; } count += 1; + + byte0 = __byte_perm(0, bytes.y, 0x0004); if (byte0 != byte1) { atomicAdd(&histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.y, 0x0005); if (byte1 != byte0) { atomicAdd(&histogram[byte0], count); count = 0; } count += 1; + byte0 = __byte_perm(0, bytes.y, 0x0006); if (byte0 != byte1) { atomicAdd(&histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.y, 0x0007); if (byte1 != byte0) { atomicAdd(&histogram[byte0], count); count = 0; } count += 1; + + byte0 = __byte_perm(0, bytes.z, 0x0004); if (byte0 != byte1) { atomicAdd(&histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.z, 0x0005); if (byte1 != byte0) { atomicAdd(&histogram[byte0], count); count = 0; } count += 1; + byte0 = __byte_perm(0, bytes.z, 0x0006); if (byte0 != byte1) { atomicAdd(&histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.z, 0x0007); if (byte1 != byte0) { atomicAdd(&histogram[byte0], count); count = 0; } count += 1; + + byte0 = __byte_perm(0, bytes.w, 0x0004); if (byte0 != byte1) { atomicAdd(&histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.w, 0x0005); if (byte1 != byte0) { atomicAdd(&histogram[byte0], count); count = 0; } count += 1; + byte0 = __byte_perm(0, bytes.w, 0x0006); if (byte0 != byte1) { atomicAdd(&histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.w, 0x0007); if (byte1 != byte0) { atomicAdd(&histogram[byte0], count); count = 0; } count += 1; +#else + atomicAdd(&histogram[__byte_perm(0, bytes.x, 0x0004)], 1); + atomicAdd(&histogram[__byte_perm(0, bytes.x, 0x0005)], 1); + atomicAdd(&histogram[__byte_perm(0, bytes.x, 0x0006)], 1); + atomicAdd(&histogram[__byte_perm(0, bytes.x, 0x0007)], 1); + + atomicAdd(&histogram[__byte_perm(0, bytes.y, 0x0004)], 1); + atomicAdd(&histogram[__byte_perm(0, bytes.y, 0x0005)], 1); + atomicAdd(&histogram[__byte_perm(0, bytes.y, 0x0006)], 1); + atomicAdd(&histogram[__byte_perm(0, bytes.y, 0x0007)], 1); + + atomicAdd(&histogram[__byte_perm(0, bytes.z, 0x0004)], 1); + atomicAdd(&histogram[__byte_perm(0, bytes.z, 0x0005)], 1); + atomicAdd(&histogram[__byte_perm(0, bytes.z, 0x0006)], 1); + atomicAdd(&histogram[__byte_perm(0, bytes.z, 0x0007)], 1); + + atomicAdd(&histogram[__byte_perm(0, bytes.w, 0x0004)], 1); + atomicAdd(&histogram[__byte_perm(0, bytes.w, 0x0005)], 1); + atomicAdd(&histogram[__byte_perm(0, bytes.w, 0x0006)], 1); + atomicAdd(&histogram[__byte_perm(0, bytes.w, 0x0007)], 1); +#endif + } + +#if CUDA_DEVICE_ARCH < 800 + atomicAdd(&histogram[byte1], count); +#endif + + __syncthreads(); + + if (threadIdx.x < ALPHABET_SIZE) + { + uint32_t offset; + + { + typedef cub::BlockScan BlockScan; + __shared__ __align__(32) typename BlockScan::TempStorage temp_storage; + + BlockScan(temp_storage).ExclusiveSum(histogram[threadIdx.x], offset); + } + + if (blockIdx.x == 0) + { + offset += (threadIdx.x == 0 ? 0x80000001 : 0x80000000); + } + + if (offset > 0) + { + atomicAdd(&device_offsets[threadIdx.x], offset); + } + } +} + +__global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS) +static void libcubwt_compute_LF_mapping(const uint8_t * RESTRICT device_L, const uint32_t primary_index, uint32_t * RESTRICT device_descriptors, uint32_t * RESTRICT device_LF) +{ + __shared__ __align__(32) uint32_t shared_histogram[CUDA_BLOCK_THREADS / CUDA_WARP_THREADS][ALPHABET_SIZE]; + __shared__ __align__(32) uint4 shared_bytes[CUDA_BLOCK_THREADS]; + + bool has_primary_index = false; + + { + uint32_t thread_index = blockIdx.x * CUDA_BLOCK_THREADS * 16 + threadIdx.x * 16; + uint4 bytes = __ldg((uint4 *)(device_L + thread_index)); shared_bytes[threadIdx.x] = bytes; + + uint32_t * RESTRICT warp_histogram = &shared_histogram[threadIdx.x / CUDA_WARP_THREADS][0]; + + #pragma unroll + for (uint32_t lane_index = 4 * (threadIdx.x % CUDA_WARP_THREADS), iteration = 0; iteration < ALPHABET_SIZE / (4 * CUDA_WARP_THREADS); lane_index += 4 * CUDA_WARP_THREADS, iteration += 1) + { + *(uint4 *)(warp_histogram + lane_index) = make_uint4(0, 0, 0, 0); + } + + __syncwarp(); + +#if CUDA_DEVICE_ARCH < 800 + uint32_t byte0 = 0, byte1 = 0, count = 0; + + byte0 = __byte_perm(0, bytes.x, 0x0004); count += 1; + byte1 = __byte_perm(0, bytes.x, 0x0005); if (byte1 != byte0) { atomicAdd(&warp_histogram[byte0], count); count = 0; } count += 1; + byte0 = __byte_perm(0, bytes.x, 0x0006); if (byte0 != byte1) { atomicAdd(&warp_histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.x, 0x0007); if (byte1 != byte0) { atomicAdd(&warp_histogram[byte0], count); count = 0; } count += 1; + + byte0 = __byte_perm(0, bytes.y, 0x0004); if (byte0 != byte1) { atomicAdd(&warp_histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.y, 0x0005); if (byte1 != byte0) { atomicAdd(&warp_histogram[byte0], count); count = 0; } count += 1; + byte0 = __byte_perm(0, bytes.y, 0x0006); if (byte0 != byte1) { atomicAdd(&warp_histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.y, 0x0007); if (byte1 != byte0) { atomicAdd(&warp_histogram[byte0], count); count = 0; } count += 1; + + byte0 = __byte_perm(0, bytes.z, 0x0004); if (byte0 != byte1) { atomicAdd(&warp_histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.z, 0x0005); if (byte1 != byte0) { atomicAdd(&warp_histogram[byte0], count); count = 0; } count += 1; + byte0 = __byte_perm(0, bytes.z, 0x0006); if (byte0 != byte1) { atomicAdd(&warp_histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.z, 0x0007); if (byte1 != byte0) { atomicAdd(&warp_histogram[byte0], count); count = 0; } count += 1; + + byte0 = __byte_perm(0, bytes.w, 0x0004); if (byte0 != byte1) { atomicAdd(&warp_histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.w, 0x0005); if (byte1 != byte0) { atomicAdd(&warp_histogram[byte0], count); count = 0; } count += 1; + byte0 = __byte_perm(0, bytes.w, 0x0006); if (byte0 != byte1) { atomicAdd(&warp_histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.w, 0x0007); if (byte1 != byte0) { atomicAdd(&warp_histogram[byte0], count); count = 0; } count += 1; + + atomicAdd(&warp_histogram[byte1], count); +#else + atomicAdd(&warp_histogram[__byte_perm(0, bytes.x, 0x0004)], 1); + atomicAdd(&warp_histogram[__byte_perm(0, bytes.x, 0x0005)], 1); + atomicAdd(&warp_histogram[__byte_perm(0, bytes.x, 0x0006)], 1); + atomicAdd(&warp_histogram[__byte_perm(0, bytes.x, 0x0007)], 1); + + atomicAdd(&warp_histogram[__byte_perm(0, bytes.y, 0x0004)], 1); + atomicAdd(&warp_histogram[__byte_perm(0, bytes.y, 0x0005)], 1); + atomicAdd(&warp_histogram[__byte_perm(0, bytes.y, 0x0006)], 1); + atomicAdd(&warp_histogram[__byte_perm(0, bytes.y, 0x0007)], 1); + + atomicAdd(&warp_histogram[__byte_perm(0, bytes.z, 0x0004)], 1); + atomicAdd(&warp_histogram[__byte_perm(0, bytes.z, 0x0005)], 1); + atomicAdd(&warp_histogram[__byte_perm(0, bytes.z, 0x0006)], 1); + atomicAdd(&warp_histogram[__byte_perm(0, bytes.z, 0x0007)], 1); + + atomicAdd(&warp_histogram[__byte_perm(0, bytes.w, 0x0004)], 1); + atomicAdd(&warp_histogram[__byte_perm(0, bytes.w, 0x0005)], 1); + atomicAdd(&warp_histogram[__byte_perm(0, bytes.w, 0x0006)], 1); + atomicAdd(&warp_histogram[__byte_perm(0, bytes.w, 0x0007)], 1); +#endif + + __syncwarp(); + + if ((primary_index - thread_index) < 16u) + { + has_primary_index = true; warp_histogram[0] -= 1; + } + } + + __syncthreads(); + + if (threadIdx.x < ALPHABET_SIZE) + { + uint32_t prefix_sum = 0; + uint32_t byte_count = 0; + + { + #pragma unroll + for (uint32_t warp_index = 0; warp_index < CUDA_BLOCK_THREADS / CUDA_WARP_THREADS; warp_index += 1) + { + byte_count += shared_histogram[warp_index][threadIdx.x]; + } + } + + { + { + cub::ThreadStore(device_descriptors + blockIdx.x * ALPHABET_SIZE + threadIdx.x, byte_count | 0x40000000u); + } + + { + uint32_t * RESTRICT descriptors_lookback = device_descriptors + blockIdx.x * ALPHABET_SIZE + threadIdx.x; + + uint32_t descriptor, delay = 8; + do + { + descriptors_lookback -= ALPHABET_SIZE; + + do + { + libcubwt_delay_or_prevent_hoisting(delay <<= 1); + + descriptor = cub::ThreadLoad(descriptors_lookback); + + } while (descriptor == 0); + + delay = 0; prefix_sum += descriptor & (~0xc0000000u); + + } while ((descriptor & 0x80000000u) == 0); + + __syncwarp(); + } + + { + cub::ThreadStore(device_descriptors + blockIdx.x * ALPHABET_SIZE + threadIdx.x, (prefix_sum + byte_count) | 0x80000000u); + } + } + + { + #pragma unroll + for (uint32_t warp_index = 0; warp_index < CUDA_BLOCK_THREADS / CUDA_WARP_THREADS; warp_index += 1) + { + byte_count = shared_histogram[warp_index][threadIdx.x]; + shared_histogram[warp_index][threadIdx.x] = prefix_sum; + + prefix_sum += byte_count; + } + } + } + + __syncthreads(); + + if (__any_sync((uint32_t)-1, has_primary_index)) + { + uint32_t thread_index = (threadIdx.x / CUDA_WARP_THREADS) * (CUDA_WARP_THREADS * 16) + (threadIdx.x % CUDA_WARP_THREADS); + + uint32_t * RESTRICT warp_histogram = &shared_histogram[threadIdx.x / CUDA_WARP_THREADS][0]; + uint8_t * RESTRICT thread_bytes = ((uint8_t *)shared_bytes) + thread_index; + + thread_index += blockIdx.x * CUDA_BLOCK_THREADS * 16; + + #pragma unroll + for (uint32_t byte_index = 0; byte_index < 16; byte_index += 1, thread_bytes += CUDA_WARP_THREADS, thread_index += CUDA_WARP_THREADS) + { + uint32_t byte = thread_bytes[0]; if (primary_index == thread_index) { byte = 256; } + + uint32_t peers_mask = libcubwt_match_any_sync<9>((uint32_t)-1, byte); + uint32_t peers_offset = __popc(peers_mask & cub::LaneMaskLt()); + uint32_t warp_offset = (byte < 256 ? warp_histogram[byte] : 0); + + device_LF[thread_index] = warp_offset + peers_offset; + + if (byte_index < 15) + { + __syncwarp(); + + if (byte < 256 && peers_offset == 0) { warp_histogram[byte] = warp_offset + __popc(peers_mask); } + + __syncwarp(); + } + } + } + else + { + uint32_t thread_index = (threadIdx.x / CUDA_WARP_THREADS) * (CUDA_WARP_THREADS * 16) + (threadIdx.x % CUDA_WARP_THREADS); + + uint32_t * RESTRICT warp_histogram = &shared_histogram[threadIdx.x / CUDA_WARP_THREADS][0]; + uint8_t * RESTRICT thread_bytes = ((uint8_t *)shared_bytes) + thread_index; + + thread_index += blockIdx.x * CUDA_BLOCK_THREADS * 16; + + #pragma unroll + for (uint32_t byte_index = 0; byte_index < 16; byte_index += 1, thread_bytes += CUDA_WARP_THREADS, thread_index += CUDA_WARP_THREADS) + { + uint32_t byte = thread_bytes[0]; + + uint32_t peers_mask = libcubwt_match_any_sync<8>((uint32_t)-1, byte); + uint32_t peers_offset = __popc(peers_mask & cub::LaneMaskLt()); + uint32_t warp_offset = warp_histogram[byte]; + + device_LF[thread_index] = warp_offset + peers_offset; + + if (byte_index < 15) + { + __syncwarp(); + + if (peers_offset == 0) { warp_histogram[byte] = warp_offset + __popc(peers_mask); } + + __syncwarp(); + } + } + } +} + +template +__global__ __launch_bounds__(cuda_block_threads) +static void libcubwt_mark_segments(const uint32_t segments_quotient, const uint32_t segments_remainder, uint32_t * RESTRICT device_LF, uint2 * RESTRICT device_segments_positions) +{ + uint32_t thread_index = blockIdx.x * cuda_block_threads + threadIdx.x; + uint32_t F_position = (thread_index * segments_quotient) + ((thread_index < segments_remainder) ? thread_index : segments_remainder); + + if (thread_index > 0) + { + F_position += libcubwt_xxhash32_b32(thread_index, segments_quotient ^ segments_remainder) % (segments_quotient + (thread_index < segments_remainder)); + } + + uint32_t L_position = device_LF[F_position]; device_LF[F_position] = thread_index | 0x80000000u; + + device_segments_positions[thread_index] = make_uint2(F_position, L_position); +} + +template +__global__ __launch_bounds__(cuda_block_threads) +static void libcubwt_survey_segments(const uint32_t * RESTRICT device_LF, const uint32_t * RESTRICT device_offsets, uint2 * RESTRICT device_segments_positions, uint2 * RESTRICT device_segments_surveys, uint8_t * RESTRICT device_staging_area, uint32_t staging_size) +{ + __shared__ __align__(32) uint32_t shared_offsets[ALPHABET_SIZE]; + + { + if (cuda_block_threads == 32) + { + ((uint4 *)shared_offsets)[threadIdx.x + 0] = ((uint4 *)device_offsets)[threadIdx.x + 0]; + ((uint4 *)shared_offsets)[threadIdx.x + 32] = ((uint4 *)device_offsets)[threadIdx.x + 32]; + } + else if (cuda_block_threads == 64) + { + ((uint4 *)shared_offsets)[threadIdx.x] = ((uint4 *)device_offsets)[threadIdx.x]; + } + else if (threadIdx.x < 64) + { + ((uint4 *)shared_offsets)[threadIdx.x] = ((uint4 *)device_offsets)[threadIdx.x]; + } + + __syncthreads(); + } + + uint32_t thread_index = blockIdx.x * cuda_block_threads + threadIdx.x; + uint32_t F_position = device_segments_positions[thread_index].x; + uint32_t L_position = device_segments_positions[thread_index].y; + uint32_t count = 0; + + const uint32_t offset64 = shared_offsets[64], offset128 = shared_offsets[128], offset192 = shared_offsets[192]; + uint8_t * RESTRICT thread_staging = device_staging_area + ((uint64_t)thread_index * staging_size); + + uint4 T = make_uint4(0, 0, 0, 0); + + do + { + F_position |= 0x80000000u; + + uint32_t next_position = cub::ThreadLoad(device_LF + L_position); + uint32_t val = (F_position >= offset128) ? 128 : 0; + uint32_t offsetX64 = (F_position >= offset128) ? offset192 : offset64; + + if (F_position >= offsetX64 ) { val += 64; } + if (F_position >= shared_offsets[val + 32]) { val += 32; } + if (F_position >= shared_offsets[val + 16]) { val += 16; } + if (F_position >= shared_offsets[val + 8]) { val += 8; } + if (F_position >= shared_offsets[val + 4]) { val += 4; } + if (F_position >= shared_offsets[val + 2]) { val += 2; } + if (F_position >= shared_offsets[val + 1]) { val += 1; } + + T.w = __byte_perm(T.z, T.w, 0x6543); T.z = __byte_perm(T.y, T.z, 0x6543); + T.y = __byte_perm(T.x, T.y, 0x6543); T.x = __byte_perm(val, T.x, 0x6540); + + F_position = L_position; L_position = next_position; count += 1; + + if ((staging_size != 0) && (count % 16 == 0)) + { + staging_size -= 16; *(uint4 *)(thread_staging + staging_size) = T; + + if (staging_size == 0) + { + device_segments_positions[thread_index] = make_uint2(F_position, L_position); + } + } + + } while ((int32_t)L_position >= 0); + + __syncwarp(); + + device_segments_surveys[thread_index] = make_uint2(L_position ^ 0x80000000u, count); + + if (staging_size != 0) + { + device_segments_positions[thread_index] = make_uint2(F_position, L_position); + + if (count % 16 != 0) + { + staging_size -= 16; *(uint4 *)(thread_staging + staging_size) = libcubwt_shift_left_b128(T, -count); + } + } +} + +template +__global__ __launch_bounds__(cuda_block_threads) +static void libcubwt_decode_segments_fallback(const uint32_t * RESTRICT device_LF, const uint32_t * RESTRICT device_offsets, const uint2 * RESTRICT device_segments_positions, const uint2 * RESTRICT device_segments_surveys, uint8_t * RESTRICT device_output_area) +{ + __shared__ __align__(32) uint32_t shared_offsets[ALPHABET_SIZE]; + + { + if (cuda_block_threads == 32) + { + ((uint4 *)shared_offsets)[threadIdx.x + 0] = ((uint4 *)device_offsets)[threadIdx.x + 0]; + ((uint4 *)shared_offsets)[threadIdx.x + 32] = ((uint4 *)device_offsets)[threadIdx.x + 32]; + } + else if (cuda_block_threads == 64) + { + ((uint4 *)shared_offsets)[threadIdx.x] = ((uint4 *)device_offsets)[threadIdx.x]; + } + else if (threadIdx.x < 64) + { + ((uint4 *)shared_offsets)[threadIdx.x] = ((uint4 *)device_offsets)[threadIdx.x]; + } + + __syncthreads(); + } + + uint32_t thread_index = blockIdx.x * cuda_block_threads + threadIdx.x; + uint32_t F_position = device_segments_positions[thread_index].x; + uint32_t L_position = device_segments_positions[thread_index].y; + uint32_t offset = device_segments_surveys[thread_index].x; + + if ((int32_t)L_position < 0) + { + return; + } + + const uint32_t offset64 = shared_offsets[64], offset128 = shared_offsets[128], offset192 = shared_offsets[192]; + uint4 T = libcubwt_shift_right_b128((offset % 16 != 0) ? *(uint4 *)(device_output_area + (offset & (-16u))) : make_uint4(0, 0, 0, 0), offset); + + do + { + F_position |= 0x80000000u; + + uint32_t next_position = cub::ThreadLoad(device_LF + L_position); + uint32_t val = (F_position >= offset128) ? 128 : 0; + uint32_t offsetX64 = (F_position >= offset128) ? offset192 : offset64; + + if (F_position >= offsetX64 ) { val += 64; } + if (F_position >= shared_offsets[val + 32]) { val += 32; } + if (F_position >= shared_offsets[val + 16]) { val += 16; } + if (F_position >= shared_offsets[val + 8]) { val += 8; } + if (F_position >= shared_offsets[val + 4]) { val += 4; } + if (F_position >= shared_offsets[val + 2]) { val += 2; } + if (F_position >= shared_offsets[val + 1]) { val += 1; } + + T.w = __byte_perm(T.z, T.w, 0x6543); T.z = __byte_perm(T.y, T.z, 0x6543); + T.y = __byte_perm(T.x, T.y, 0x6543); T.x = __byte_perm(val, T.x, 0x6540); + + F_position = L_position; L_position = next_position; offset -= 1; + + if (offset % 16 == 0) + { + *(uint4 *)(device_output_area + offset) = T; + } + + } while ((int32_t)L_position >= 0); + + if (offset % 16 != 0) + { + uint4 P = *(uint4 *)(device_output_area + (offset & (-16u))); + uint4 M = make_uint4(~0u, ~0u, ~0u, ~0u); + + T = libcubwt_shift_left_b128(T, offset); + M = libcubwt_shift_left_b128(M, offset); + + T.x = (T.x & M.x) | (P.x & ~M.x); + T.y = (T.y & M.y) | (P.y & ~M.y); + T.z = (T.z & M.z) | (P.z & ~M.z); + T.w = (T.w & M.w) | (P.w & ~M.w); + + *(uint4 *)(device_output_area + (offset & (-16u))) = T; + } +} + +static bool libcubwt_rank_segments( + uint2 * RESTRICT segments_surveys, + uint64_t * RESTRICT memcpy_from, + uint32_t * RESTRICT memcpy_to, + uint32_t * RESTRICT memcpy_size, + const uint32_t n, + const uint32_t num_segments, + const uint32_t staging_size) +{ + uint32_t sum = n, max_count = 0; + + for (uint64_t p = 0, segment = num_segments; segment != 0; segment -= 2) + { + { + uint32_t next = segments_surveys[p].x; + uint32_t count = segments_surveys[p].y; + uint32_t size = count < staging_size ? count : staging_size; + + segments_surveys[p].x = sum - size; + + memcpy_from[segment - 1] = ((uint64_t)staging_size * (p + 1)) - size; + memcpy_to [segment - 1] = sum - size; + memcpy_size[segment - 1] = size; + + sum -= count; p = next; max_count = max_count < count ? count : max_count; + } + + { + uint32_t next = segments_surveys[p].x; + uint32_t count = segments_surveys[p].y; + uint32_t size = count < staging_size ? count : staging_size; + + segments_surveys[p].x = sum - size; + + memcpy_from[segment - 2] = ((uint64_t)staging_size * (p + 1)) - size; + memcpy_to [segment - 2] = sum - size; + memcpy_size[segment - 2] = size; + + sum -= count; p = next; max_count = max_count < count ? count : max_count; + } + } + + return max_count > staging_size; +} + +int64_t libcubwt_unbwt(void * device_storage, const uint8_t * T, uint8_t * U, int64_t n, const int32_t * freq, int32_t i) +{ + LIBCUBWT_DEVICE_STORAGE * storage = (LIBCUBWT_DEVICE_STORAGE *)device_storage; + + cudaError_t status = cudaSuccess; + int64_t num_descriptors = (n + 1 + storage->cuda_block_threads * 16 - 1) / (storage->cuda_block_threads * 16); + int64_t aligned_n = num_descriptors * (storage->cuda_block_threads * 16); + int64_t num_segments = std::min(storage->device_multiprocessor_max_threads * storage->device_multiprocessor_count, ((int32_t)(n + 1) / 48) & (-128)); + + if ((storage == NULL) || (T == NULL) || (U == NULL) || (n < 128 * 1024) || (i <= 0) || (i > n) || (n > 5 * storage->max_length) || (aligned_n >= 1023 * 1024 * 1024)) + { + return LIBCUBWT_BAD_PARAMETER; + } + + size_t device_batched_memcpy_temp_storage_size; + + { + uint8_t * d_staging = NULL; + uint64_t * d_memcpy_from = NULL; + uint32_t * d_memcpy_to = NULL; + uint32_t * d_memcpy_size = NULL; + + OffsetToPointerOperator memcpy_from_operator { d_staging }; + OffsetToPointerOperator memcpy_to_operator { d_staging }; + + cub::TransformInputIterator, uint64_t *> memcpy_from_iterator (d_memcpy_from, memcpy_from_operator); + cub::TransformInputIterator, uint32_t *> memcpy_to_iterator (d_memcpy_to , memcpy_to_operator ); + + device_batched_memcpy_temp_storage_size = 0; + + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceCopy::Batched( + NULL, device_batched_memcpy_temp_storage_size, + memcpy_from_iterator, + memcpy_to_iterator, + d_memcpy_size, + (uint32_t)num_segments)); + + if (status != cudaSuccess) { return libcubwt_get_error_code(status); } + + device_batched_memcpy_temp_storage_size = (device_batched_memcpy_temp_storage_size + (size_t)1023) & (size_t)(-1024); + } + + uint8_t * device_alloc = (uint8_t *)storage->device_storage; + uint8_t * device_L = (uint8_t *)device_alloc; device_alloc += aligned_n; + uint32_t * device_LF = (uint32_t *)device_alloc; device_alloc += aligned_n * sizeof(uint32_t); + uint32_t * device_offsets = (uint32_t *)device_alloc; device_alloc += ALPHABET_SIZE * sizeof(uint32_t); + + void * device_batched_memcpy_temp_storage = device_alloc; + uint2 * device_segments_positions = (uint2 * )(device_alloc + device_batched_memcpy_temp_storage_size); + uint2 * device_segments_surveys = (uint2 * )(device_segments_positions + num_segments); + uint64_t * device_batched_memcpy_from = (uint64_t *)(device_segments_surveys + num_segments); + uint32_t * device_batched_memcpy_to = (uint32_t *)(device_batched_memcpy_from + num_segments); + uint32_t * device_batched_memcpy_size = (uint32_t *)(device_batched_memcpy_to + num_segments); + + device_alloc += std::max(num_descriptors * ALPHABET_SIZE * sizeof(uint32_t), device_batched_memcpy_temp_storage_size + num_segments * (8 + 4) * sizeof(uint32_t)); + + if ((size_t)(device_alloc - (uint8_t *)storage->device_storage) > storage->device_storage_size) + { + return LIBCUBWT_NOT_ENOUGH_MEMORY; + } + + { + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(device_L , T , (size_t)(i ), cudaMemcpyHostToDevice, storage->cuda_stream), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(device_L + i + 1, T + i, (size_t)(n - i), cudaMemcpyHostToDevice, storage->cuda_stream), status); + + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemsetAsync(device_L + i + 0, 0 , 1 , storage->cuda_stream), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemsetAsync(device_L + n + 1, 255, (size_t)(aligned_n - n - 1), storage->cuda_stream), status); + + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemsetAsync(device_offsets, 0, (1 + num_descriptors) * ALPHABET_SIZE * sizeof(uint32_t), storage->cuda_stream), status); + } + + if (status == cudaSuccess) + { + if (freq != NULL) + { + uint32_t * offsets = (uint32_t *)storage->host_pinned_storage; + + for (uint32_t sum = 0x80000001, c = 0; c < ALPHABET_SIZE; c += 1) { offsets[c] = sum; sum += freq[c]; } + + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(device_offsets, offsets, ALPHABET_SIZE * sizeof(uint32_t), cudaMemcpyHostToDevice, storage->cuda_stream), status); + } + else + { + int64_t n_histogram_blocks = std::min((uint32_t)num_descriptors, (storage->device_multiprocessor_max_threads / storage->cuda_block_threads) * storage->device_multiprocessor_count); + + libcubwt_compute_histogram_offsets<<<(uint32_t)n_histogram_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>(device_L, (uint32_t)aligned_n, device_offsets); + } + + if (status == cudaSuccess) + { + libcubwt_compute_LF_mapping<<<(uint32_t)num_descriptors, storage->cuda_block_threads, 0, storage->cuda_stream>>>(device_L, (uint32_t)i, device_offsets + ALPHABET_SIZE, device_LF); + } + } + + if (status == cudaSuccess) + { + uint8_t * device_staging_area = (uint8_t *)(device_batched_memcpy_size + num_segments); + uint8_t * device_staging_end = (uint8_t *)storage->device_storage + storage->device_storage_size; + uint32_t staging_size = (uint32_t)(((device_staging_end - device_staging_area) / num_segments) & (-16)); + + { + if (32 * storage->device_multiprocessor_max_blocks * storage->device_multiprocessor_count >= std::min((int32_t)num_segments, storage->device_multiprocessor_max_threads * storage->device_multiprocessor_count)) + { + libcubwt_mark_segments<32><<<(uint32_t)num_segments / 32, 32, 0, storage->cuda_stream>>>((uint32_t)((n + 1) / num_segments), (uint32_t)((n + 1) % num_segments), device_LF, device_segments_positions); + libcubwt_survey_segments<32><<<(uint32_t)num_segments / 32, 32, 0, storage->cuda_stream>>>(device_LF, device_offsets, device_segments_positions, device_segments_surveys, device_staging_area, staging_size); + } + else if (64 * storage->device_multiprocessor_max_blocks * storage->device_multiprocessor_count >= std::min((int32_t)num_segments, storage->device_multiprocessor_max_threads * storage->device_multiprocessor_count)) + { + libcubwt_mark_segments<64><<<(uint32_t)num_segments / 64, 64, 0, storage->cuda_stream>>>((uint32_t)((n + 1) / num_segments), (uint32_t)((n + 1) % num_segments), device_LF, device_segments_positions); + libcubwt_survey_segments<64><<<(uint32_t)num_segments / 64, 64, 0, storage->cuda_stream>>>(device_LF, device_offsets, device_segments_positions, device_segments_surveys, device_staging_area, staging_size); + } + else + { + libcubwt_mark_segments<128><<<(uint32_t)num_segments / 128, 128, 0, storage->cuda_stream>>>((uint32_t)((n + 1) / num_segments), (uint32_t)((n + 1) % num_segments), device_LF, device_segments_positions); + libcubwt_survey_segments<128><<<(uint32_t)num_segments / 128, 128, 0, storage->cuda_stream>>>(device_LF, device_offsets, device_segments_positions, device_segments_surveys, device_staging_area, staging_size); + } + } + + { + uint2 * host_segments_surveys = (uint2 * )storage->host_pinned_storage; + uint64_t * host_batched_memcpy_from = (uint64_t *)(host_segments_surveys + num_segments); + uint32_t * host_batched_memcpy_to = (uint32_t *)(host_batched_memcpy_from + num_segments); + uint32_t * host_batched_memcpy_size = (uint32_t *)(host_batched_memcpy_to + num_segments); + + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(host_segments_surveys, device_segments_surveys, num_segments * 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, storage->cuda_stream), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaStreamSynchronize(storage->cuda_stream), status); + + if (status == cudaSuccess) + { + bool overflow = libcubwt_rank_segments(host_segments_surveys, host_batched_memcpy_from, host_batched_memcpy_to, host_batched_memcpy_size, (uint32_t)(n + 1), (uint32_t)num_segments, (uint32_t)staging_size); + + status = overflow + ? libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(device_segments_surveys, host_segments_surveys, num_segments * 6 * sizeof(uint32_t), cudaMemcpyHostToDevice, storage->cuda_stream), status) + : libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(device_batched_memcpy_from, host_batched_memcpy_from, num_segments * 4 * sizeof(uint32_t), cudaMemcpyHostToDevice, storage->cuda_stream), status); + + if (status == cudaSuccess) + { + OffsetToPointerOperator memcpy_from_operator { device_staging_area }; + OffsetToPointerOperator memcpy_to_operator { device_L }; + + cub::TransformInputIterator, uint64_t *> memcpy_from_iterator (device_batched_memcpy_from, memcpy_from_operator); + cub::TransformInputIterator, uint32_t *> memcpy_to_iterator (device_batched_memcpy_to , memcpy_to_operator ); + + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceCopy::Batched( + device_batched_memcpy_temp_storage, + device_batched_memcpy_temp_storage_size, + memcpy_from_iterator, + memcpy_to_iterator, + device_batched_memcpy_size, + (uint32_t)num_segments, + storage->cuda_stream)); + } + + if (status == cudaSuccess && overflow) + { + if (32 * storage->device_multiprocessor_max_blocks * storage->device_multiprocessor_count >= std::min((int32_t)num_segments, storage->device_multiprocessor_max_threads * storage->device_multiprocessor_count)) + { + libcubwt_decode_segments_fallback<32><<<(uint32_t)num_segments / 32, 32, 0, storage->cuda_stream>>>(device_LF, device_offsets, device_segments_positions, device_segments_surveys, device_L); + } + else if (64 * storage->device_multiprocessor_max_blocks * storage->device_multiprocessor_count >= std::min((int32_t)num_segments, storage->device_multiprocessor_max_threads * storage->device_multiprocessor_count)) + { + libcubwt_decode_segments_fallback<64><<<(uint32_t)num_segments / 64, 64, 0, storage->cuda_stream>>>(device_LF, device_offsets, device_segments_positions, device_segments_surveys, device_L); + } + else + { + libcubwt_decode_segments_fallback<128><<<(uint32_t)num_segments / 128, 128, 0, storage->cuda_stream>>>(device_LF, device_offsets, device_segments_positions, device_segments_surveys, device_L); + } + } + + if (status == cudaSuccess) + { + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(U, device_L, (size_t)n, cudaMemcpyDeviceToHost, storage->cuda_stream), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaStreamSynchronize(storage->cuda_stream), status); + } + } + } + } + + return status != cudaSuccess ? libcubwt_get_error_code(status) : LIBCUBWT_NO_ERROR; +} diff --git a/libbsc/bwt/libcubwt/libcubwt.cuh b/libbsc/bwt/libcubwt/libcubwt.cuh index f303803..f969c3c 100644 --- a/libbsc/bwt/libcubwt/libcubwt.cuh +++ b/libbsc/bwt/libcubwt/libcubwt.cuh @@ -1,9 +1,9 @@ /*-- This file is a part of libcubwt, a library for CUDA accelerated -burrows wheeler transform construction. +burrows wheeler transform construction and inversion. - Copyright (c) 2022-2023 Ilya Grebnov + Copyright (c) 2022-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -25,9 +25,9 @@ Please see the file LICENSE for full copyright and license details. #define LIBCUBWT_CUH 1 #define LIBCUBWT_VERSION_MAJOR 1 -#define LIBCUBWT_VERSION_MINOR 5 +#define LIBCUBWT_VERSION_MINOR 6 #define LIBCUBWT_VERSION_PATCH 0 -#define LIBCUBWT_VERSION_STRING "1.5.0" +#define LIBCUBWT_VERSION_STRING "1.6.0" #define LIBCUBWT_NO_ERROR 0 #define LIBCUBWT_BAD_PARAMETER -1 @@ -45,7 +45,16 @@ extern "C" { /** * Allocates storage on the CUDA device that allows reusing allocated memory with each libcubwt operation. - * @param max_length The maximum length of string to support. + * @param device_storage A reference to the memory pointer where the allocated device storage will be saved. + * @param max_length This parameter controls the amount of allocated memory, ensuring that libcubwt operations + * can accommodate strings of lengths up to this value for both forward and reverse Burrows-Wheeler Transforms. + * The method currently allocates approximately 20.5 times the string length, which is the necessary amount of memory + * for the forward Burrows-Wheeler Transform of a string at maximum length. This allocation is also sufficient and + * optimal for the reverse Burrows-Wheeler Transform. However, if performance is less critical, or if device memory is limited, + * 'max_length' can be lowered. Allocating memory at approximately 6.8 times the string length should still yield about 90% + * of the optimal performance for the reverse Burrows-Wheeler Transform. This effectively means that reverse + * Burrows-Wheeler Transform operations can be performed with storage allocated using a 'max_length' parameter + * at a third of the input string's maximum length. * @return LIBCUBWT_NO_ERROR if no error occurred, libcubwt error code otherwise. */ int64_t libcubwt_allocate_device_storage(void ** device_storage, int64_t max_length); @@ -79,6 +88,18 @@ extern "C" { */ int64_t libcubwt_bwt_aux(void * device_storage, const uint8_t * T, uint8_t * L, int64_t n, int64_t r, uint32_t * I); + /** + * Reconstructs the original string from a given burrows-wheeler transformed string (BWT) with primary index. + * @param device_storage The previously allocated storage on the CUDA device. + * @param T [0..n-1] The input string. + * @param U [0..n-1] The output string (can be T). + * @param n The length of the given string. + * @param freq [0..255] The input symbol frequency table (can be NULL). + * @param i The primary index. + * @return LIBCUBWT_NO_ERROR if no error occurred, libcubwt error code otherwise. + */ + int64_t libcubwt_unbwt(void * device_storage, const uint8_t * T, uint8_t * U, int64_t n, const int32_t * freq, int32_t i); + #ifdef __cplusplus } #endif diff --git a/libbsc/coder/coder.cpp b/libbsc/coder/coder.cpp index 5fcd2c2..4a5ad3d 100644 --- a/libbsc/coder/coder.cpp +++ b/libbsc/coder/coder.cpp @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/coder/coder.h b/libbsc/coder/coder.h index 97bc040..cd80697 100644 --- a/libbsc/coder/coder.h +++ b/libbsc/coder/coder.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/coder/common/predictor.h b/libbsc/coder/common/predictor.h index 34ca792..09267f4 100644 --- a/libbsc/coder/common/predictor.h +++ b/libbsc/coder/common/predictor.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/coder/common/rangecoder.h b/libbsc/coder/common/rangecoder.h index 5df8282..29180fb 100644 --- a/libbsc/coder/common/rangecoder.h +++ b/libbsc/coder/common/rangecoder.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/coder/common/tables.h b/libbsc/coder/common/tables.h index a11bc9c..c017d92 100644 --- a/libbsc/coder/common/tables.h +++ b/libbsc/coder/common/tables.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/coder/qlfc/qlfc.cpp b/libbsc/coder/qlfc/qlfc.cpp index f4d3469..7ba5069 100644 --- a/libbsc/coder/qlfc/qlfc.cpp +++ b/libbsc/coder/qlfc/qlfc.cpp @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/coder/qlfc/qlfc.h b/libbsc/coder/qlfc/qlfc.h index b6f2714..ac0b2be 100644 --- a/libbsc/coder/qlfc/qlfc.h +++ b/libbsc/coder/qlfc/qlfc.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/coder/qlfc/qlfc_model.cpp b/libbsc/coder/qlfc/qlfc_model.cpp index 78fc09a..70c6665 100644 --- a/libbsc/coder/qlfc/qlfc_model.cpp +++ b/libbsc/coder/qlfc/qlfc_model.cpp @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/coder/qlfc/qlfc_model.h b/libbsc/coder/qlfc/qlfc_model.h index 110ae05..c154d7c 100644 --- a/libbsc/coder/qlfc/qlfc_model.h +++ b/libbsc/coder/qlfc/qlfc_model.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/filters.h b/libbsc/filters.h index bf732dc..cbc3b49 100644 --- a/libbsc/filters.h +++ b/libbsc/filters.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/filters/detectors.cpp b/libbsc/filters/detectors.cpp index 09c445c..895ffbd 100644 --- a/libbsc/filters/detectors.cpp +++ b/libbsc/filters/detectors.cpp @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/filters/preprocessing.cpp b/libbsc/filters/preprocessing.cpp index 7003ae5..56e5c5a 100644 --- a/libbsc/filters/preprocessing.cpp +++ b/libbsc/filters/preprocessing.cpp @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/filters/tables.h b/libbsc/filters/tables.h index ebf9f08..0780e87 100644 --- a/libbsc/filters/tables.h +++ b/libbsc/filters/tables.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/libbsc.h b/libbsc/libbsc.h index c14cb94..254e314 100644 --- a/libbsc/libbsc.h +++ b/libbsc/libbsc.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -35,8 +35,8 @@ See also the bsc and libbsc web site: #define LIBBSC_VERSION_MAJOR 3 #define LIBBSC_VERSION_MINOR 3 -#define LIBBSC_VERSION_PATCH 3 -#define LIBBSC_VERSION_STRING "3.3.3" +#define LIBBSC_VERSION_PATCH 4 +#define LIBBSC_VERSION_STRING "3.3.4" #define LIBBSC_NO_ERROR 0 #define LIBBSC_BAD_PARAMETER -1 diff --git a/libbsc/libbsc/libbsc.cpp b/libbsc/libbsc/libbsc.cpp index 763d042..7939b99 100644 --- a/libbsc/libbsc/libbsc.cpp +++ b/libbsc/libbsc/libbsc.cpp @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/lzp/lzp.cpp b/libbsc/lzp/lzp.cpp index 93f530c..69ab54c 100644 --- a/libbsc/lzp/lzp.cpp +++ b/libbsc/lzp/lzp.cpp @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/lzp/lzp.h b/libbsc/lzp/lzp.h index e4814a4..e8e3743 100644 --- a/libbsc/lzp/lzp.h +++ b/libbsc/lzp/lzp.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/platform/platform.cpp b/libbsc/platform/platform.cpp index 6a2f992..13fd565 100644 --- a/libbsc/platform/platform.cpp +++ b/libbsc/platform/platform.cpp @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/platform/platform.h b/libbsc/platform/platform.h index 23e2d98..e7a7d59 100644 --- a/libbsc/platform/platform.h +++ b/libbsc/platform/platform.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/st/st.cpp b/libbsc/st/st.cpp index 82123fe..5016c56 100644 --- a/libbsc/st/st.cpp +++ b/libbsc/st/st.cpp @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/st/st.cu b/libbsc/st/st.cu index 531e82a..0b54f25 100644 --- a/libbsc/st/st.cu +++ b/libbsc/st/st.cu @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/st/st.cuh b/libbsc/st/st.cuh index b3c0e62..70f1cb2 100644 --- a/libbsc/st/st.cuh +++ b/libbsc/st/st.cuh @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/st/st.h b/libbsc/st/st.h index f405e4c..90cf6d1 100644 --- a/libbsc/st/st.h +++ b/libbsc/st/st.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.