From be844a76f4d4ed3927fcb240a36c6d52bb3908a3 Mon Sep 17 00:00:00 2001 From: Kal Conley Date: Fri, 4 Aug 2023 03:17:31 +0200 Subject: [PATCH] Initial commit --- .bazelrc | 52 + .bazelversion | 1 + .clang-format | 9 + .clang-tidy | 15 + .github/workflows/main.yml | 256 +++++ .gitignore | 1 + BUILD.bazel | 43 + LICENSE-APACHE | 202 ++++ LICENSE-MIT | 19 + README.md | 134 +++ WORKSPACE | 23 + src/bitshuffle.c | 1709 +++++++++++++++++++++++++++++++++ src/bitshuffle.h | 68 ++ tests/bitshuffle_benchmark.cc | 75 ++ tests/bitshuffle_test.cc | 146 +++ third_party/BUILD.bazel | 0 third_party/bitshuffle.BUILD | 16 + 17 files changed, 2769 insertions(+) create mode 100644 .bazelrc create mode 100644 .bazelversion create mode 100644 .clang-format create mode 100644 .clang-tidy create mode 100644 .github/workflows/main.yml create mode 100644 .gitignore create mode 100644 BUILD.bazel create mode 100644 LICENSE-APACHE create mode 100644 LICENSE-MIT create mode 100644 README.md create mode 100644 WORKSPACE create mode 100644 src/bitshuffle.c create mode 100644 src/bitshuffle.h create mode 100644 tests/bitshuffle_benchmark.cc create mode 100644 tests/bitshuffle_test.cc create mode 100644 third_party/BUILD.bazel create mode 100644 third_party/bitshuffle.BUILD diff --git a/.bazelrc b/.bazelrc new file mode 100644 index 0000000..875f2e3 --- /dev/null +++ b/.bazelrc @@ -0,0 +1,52 @@ +common --enable_platform_specific_config + +build --features=layering_check +build --repo_env=BAZEL_CXXOPTS=-std=c++14 + +build:ci --keep_going +build:ci --test_output=all + +build:generic_gcc --copt=-Wall +build:generic_gcc --copt=-Wextra +build:generic_gcc --copt=-Wpointer-arith +build:generic_gcc --per_file_copt=^//@-Wundef,-Werror +build:generic_gcc --copt=-Wno-unused-parameter + +build:clang --config=generic_gcc +build:clang --repo_env=BAZEL_COMPILER=clang +build:clang --repo_env=CC=clang +build:clang --repo_env=CXX=clang++ +# https://github.com/bazelbuild/bazel/issues/11122 +# https://github.com/bazelbuild/bazel/issues/12797 +build:clang --linkopt=-fsanitize-link-c++-runtime + +build:libc++ --cxxopt=-stdlib=libc++ +build:libc++ --linkopt=-stdlib=libc++ + +build:libc++-static --config=libc++ +build:libc++-static --repo_env=BAZEL_LINKLIBS=-l%:libc++.a:-l%:libc++abi.a:-lm +build:libc++-static --repo_env=BAZEL_LINKOPTS=-pthread + +build:macos --cxxopt=-std=c++14 + +build:macos-x86_64 --copt=-arch --copt=x86_64 +build:macos-x86_64 --linkopt=-arch --linkopt=x86_64 + +build:msvc --per_file_copt=^external/@/W2 +build:msvc --per_file_copt=^//@/W4,/WX +build:msvc --conlyopt=/Za +build:msvc --cxxopt=/std:c++14 + +build:asan --features=asan +build:asan --test_env=ASAN_OPTIONS=check_initialization_order=1:detect_invalid_pointer_pairs=1:detect_stack_use_after_return=1:strict_init_order=1:strict_string_checks=1 + +build:asan-msvc --config=asan +build:asan-msvc --copt=/fsanitize=address +build:asan-msvc --features=frame_pointer +build:asan-msvc --features=static_link_msvcrt + +build:ubsan --features=ubsan +build:ubsan --test_env=UBSAN_OPTIONS=halt_on_error=1:print_stacktrace=1 + +build:ubsan-extra --config=ubsan +build:ubsan-extra --copt=-fsanitize=implicit-signed-integer-truncation diff --git a/.bazelversion b/.bazelversion new file mode 100644 index 0000000..dc0208a --- /dev/null +++ b/.bazelversion @@ -0,0 +1 @@ +6.3.1 diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..28afbba --- /dev/null +++ b/.clang-format @@ -0,0 +1,9 @@ +BasedOnStyle: Chromium +AccessModifierOffset: -4 +AlwaysBreakBeforeMultilineStrings: 
false +BraceWrapping: + AfterControlStatement: MultiLine +BreakBeforeBraces: Custom +ColumnLimit: 100 +IndentWidth: 4 +SpacesBeforeTrailingComments: 1 diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 0000000..e43f5a3 --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,15 @@ +--- +Checks: "\ +bugprone-*,\ +readability-inconsistent-declaration-parameter-name,\ +readability-redundant-*,\ +readability-uppercase-literal-suffix,\ +-bugprone-easily-swappable-parameters,\ +-bugprone-implicit-widening-of-multiplication-result,\ +-bugprone-narrowing-conversions,\ +-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling" +CheckOptions: + readability-inconsistent-declaration-parameter-name.IgnoreMacros: false + readability-inconsistent-declaration-parameter-name.Strict: true +WarningsAsErrors: '*' +... diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..7a376de --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,256 @@ +name: Main + +on: + push: + branches: [main, ci] + +jobs: + bazel-linux-arm64-clang: + runs-on: [self-hosted, linux, arm64] + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Configure + run: | + echo "build --config=ci --config=clang" | tee job.rc + - name: Build + run: | + bazel --bazelrc=job.rc build -c opt //... + - name: Build (libc++) + run: | + bazel --bazelrc=job.rc build -c opt --config=libc++ //... + - name: Test + run: | + bazel --bazelrc=job.rc test --copt=-O3 --config=asan --config=ubsan-extra //:bitshuffle_test --test_arg=--gtest_break_on_failure + - name: Bench + run: | + bazel --bazelrc=job.rc run -c opt --copt=-O3 //:bitshuffle_benchmark -- --benchmark_filter='BM_bitshuffle<.*>/.*/8192' + bazel-linux-arm64-gcc: + runs-on: [self-hosted, linux, arm64] + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Configure + run: | + echo "build --config=ci --config=generic_gcc" | tee job.rc + - name: Build + run: | + bazel --bazelrc=job.rc build -c opt //... + - name: Test + run: | + bazel --bazelrc=job.rc test --copt=-O3 --config=asan --config=ubsan //:bitshuffle_test --test_arg=--gtest_break_on_failure + - name: Bench + run: | + bazel --bazelrc=job.rc run -c opt --copt=-O3 //:bitshuffle_benchmark -- --benchmark_filter='BM_bitshuffle<.*>/.*/8192' + bazel-linux-x86_64-icelake-clang: + strategy: + matrix: + m: + - "sse2" + - "avx2" + - "avx512bw avx512vl" + - "avx512bw avx512vl avx512vbmi gfni" + use_ifunc: [0, 1] + runs-on: [self-hosted, linux, x64] + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Configure + run: | + m="${{ matrix.m }}" + echo "build --config=ci --config=clang --copt=-m${m// / --copt=-m} --copt=-DBITSHUF_USE_IFUNC=${{ matrix.use_ifunc }}" | tee job.rc + - name: Build + run: | + bazel --bazelrc=job.rc build -c opt //... + - name: Build (libc++) + run: | + bazel --bazelrc=job.rc build -c opt --config=libc++ //... 
+ - name: Test + run: | + bazel --bazelrc=job.rc test --copt=-O3 --config=asan --config=ubsan-extra //:bitshuffle_test --test_arg=--gtest_break_on_failure + - name: Bench + run: | + bazel --bazelrc=job.rc run -c opt --copt=-O3 //:bitshuffle_benchmark -- --benchmark_filter='BM_bitshuffle<.*>/.*/8192' + bazel-linux-x86_64-icelake-gcc: + strategy: + matrix: + m: + - "no-sse2" + - "sse2" + - "avx2" + - "avx512bw avx512vl" + - "avx512bw avx512vl avx512vbmi gfni" + use_ifunc: [0, 1] + runs-on: [self-hosted, linux, x64] + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Configure + run: | + m="${{ matrix.m }}" + echo "build --config=ci --config=generic_gcc --copt=-m${m// / --copt=-m} --copt=-DBITSHUF_USE_IFUNC=${{ matrix.use_ifunc }}" | tee job.rc + - name: Build + run: | + bazel --bazelrc=job.rc build -c opt //... + - name: Test + run: | + bazel --bazelrc=job.rc test --copt=-O3 --config=asan --config=ubsan //:bitshuffle_test --test_arg=--gtest_break_on_failure + - name: Bench + run: | + bazel --bazelrc=job.rc run -c opt --copt=-O3 //:bitshuffle_benchmark -- --benchmark_filter='BM_bitshuffle<.*>/.*/8192' + bazel-linux-x86_64-clang: + strategy: + matrix: + os: [ubuntu-20.04, ubuntu-latest] + m: + - "sse2" + - "avx2" + use_ifunc: [0, 1] + runs-on: ${{ matrix.os }} + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Configure + run: | + m="${{ matrix.m }}" + echo "build --config=ci --config=clang --copt=-m${m// / --copt=-m} --copt=-DBITSHUF_USE_IFUNC=${{ matrix.use_ifunc }}" | tee job.rc + - name: Build + run: | + bazel --bazelrc=job.rc build -c opt //... + - name: Test + run: | + bazel --bazelrc=job.rc test --copt=-O3 --config=asan --config=ubsan-extra //:bitshuffle_test --test_arg=--gtest_break_on_failure + bazel-linux-x86_64-gcc: + strategy: + matrix: + os: [ubuntu-20.04, ubuntu-latest] + m: + - "no-sse2" + - "sse2" + - "avx2" + use_ifunc: [0, 1] + runs-on: ${{ matrix.os }} + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Install + run: | + # Bazel prefers LLD to GNU gold. Install LLD to avoid gold relocation overflow bug on ubuntu-20.04. + # https://mail.gnu.org/archive/html/bug-binutils/2020-04/msg00329.html + sudo apt-get update + sudo apt-get install lld + - name: Configure + run: | + m="${{ matrix.m }}" + echo "build --config=ci --config=generic_gcc --copt=-m${m// / --copt=-m} --copt=-DBITSHUF_USE_IFUNC=${{ matrix.use_ifunc }}" | tee job.rc + - name: Build + run: | + bazel --bazelrc=job.rc build -c opt //... + - name: Test + run: | + bazel --bazelrc=job.rc test --copt=-O3 --config=asan --config=ubsan //:bitshuffle_test --test_arg=--gtest_break_on_failure + bazel-macos-x86_64: + strategy: + matrix: + os: [macos-11, macos-12, macos-13] + m: + - "sse2" + - "avx2" + exclude: + - {os: macos-11, m: "avx2"} + - {os: macos-12, m: "avx2"} + runs-on: ${{ matrix.os }} + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Configure + run: | + m="${{ matrix.m }}" + echo "build --config=ci --config=generic_gcc --copt=-m${m// / --copt=-m}" | tee job.rc + - name: Build + run: | + bazel --bazelrc=job.rc build -c opt //... + - name: Test + run: | + bazel --bazelrc=job.rc test --copt=-O3 --config=asan --config=ubsan-extra //:bitshuffle_test --test_arg=--gtest_break_on_failure + bazel-windows-msvc: + strategy: + matrix: + # FIXME(kal): Add windows-2022 once it works with bazel. 
+ # https://github.com/bazelbuild/bazel/issues/18592 + os: [windows-2019] + cpu: [x64, x64_x86] + runs-on: ${{ matrix.os }} + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Configure + run: | + "build --cpu=${{ matrix.cpu }}_windows --config=ci --config=msvc" | Out-File -FilePath job.rc + - name: Build + run: | + bazel --bazelrc=job.rc build -c opt //... + - name: Test + run: | + bazel --bazelrc=job.rc test --copt=/O2 --config=asan-msvc //:bitshuffle_test --test_arg=--gtest_break_on_failure + bazel-windows-msvc-arch: + strategy: + matrix: + include: + - {cpu: x64, arch: AVX} + - {cpu: x64, arch: AVX2} + - {cpu: x64_x86, arch: IA32} + - {cpu: x64_x86, arch: SSE} + - {cpu: x64_x86, arch: SSE2} + - {cpu: x64_x86, arch: AVX} + - {cpu: x64_x86, arch: AVX2} + runs-on: windows-2019 + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Configure + run: | + "build --cpu=${{ matrix.cpu }}_windows --config=ci --config=msvc --copt=/arch:${{ matrix.arch }}" | Out-File -FilePath job.rc + - name: Build + run: | + bazel --bazelrc=job.rc build -c opt //... + - name: Test + run: | + bazel --bazelrc=job.rc test --copt=/O2 --config=asan-msvc //:bitshuffle_test --test_arg=--gtest_break_on_failure + clang-format: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Clang-Format + run: | + set -eu + srcs=$(git ls-files -- '*.c' '*.cc' '*.h') + clang-format --dry-run --Werror --verbose -- ${srcs} + clang-tidy: + strategy: + matrix: + m: + - "no-sse2" + - "sse2" + - "avx2" + - "avx512bw avx512vl" + - "avx512bw avx512vl avx512vbmi gfni" + use_ifunc: [0, 1] + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Install + run: | + sudo apt-get update + sudo apt-get install clang-tidy-15 + - name: Configure + run: | + m="${{ matrix.m }}" + echo "CFLAGS=-m${m// / -m} -DBITSHUF_USE_IFUNC=${{ matrix.use_ifunc }}" >>"${GITHUB_ENV}" + - name: Clang-Tidy + run: | + set -eu + clang-tidy-15 --warnings-as-errors='*' src/bitshuffle.c -- ${CFLAGS} + clang-tidy-15 --warnings-as-errors='*' src/bitshuffle.c -- ${CFLAGS} -DNDEBUG diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a6ef824 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/bazel-* diff --git a/BUILD.bazel b/BUILD.bazel new file mode 100644 index 0000000..ed7c113 --- /dev/null +++ b/BUILD.bazel @@ -0,0 +1,43 @@ +load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test") + +package(default_visibility = ["//visibility:public"]) + +cc_library( + name = "bitshuffle", + srcs = [ + "src/bitshuffle.c", + ], + hdrs = [ + "src/bitshuffle.h", + ], +) + +cc_binary( + name = "bitshuffle_benchmark", + srcs = [ + "tests/bitshuffle_benchmark.cc", + ], + copts = [ + "-Iexternal", + ], + linkstatic = True, + deps = [ + ":bitshuffle", + "@benchmark", + "@benchmark//:benchmark_main", + "@bitshuffle//:bitshuffle_core", + ], +) + +cc_test( + name = "bitshuffle_test", + srcs = [ + "tests/bitshuffle_test.cc", + ], + linkstatic = True, + deps = [ + ":bitshuffle", + "@googletest//:gtest", + "@googletest//:gtest_main", + ], +) diff --git a/LICENSE-APACHE b/LICENSE-APACHE new file mode 100644 index 0000000..ab55200 --- /dev/null +++ b/LICENSE-APACHE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2023 Kal Conley + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/LICENSE-MIT b/LICENSE-MIT
new file mode 100644
index 0000000..0df1f0c
--- /dev/null
+++ b/LICENSE-MIT
@@ -0,0 +1,19 @@
+Copyright (c) 2023 Kal Conley
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8edef50
--- /dev/null
+++ b/README.md
@@ -0,0 +1,134 @@
+# Bitshuffle
+
+Transpose bits to improve data compression.
+
+[![Build Status][actions-badge]][actions-url]
+
+[actions-badge]: https://github.com/kalcutter/bitshuffle/actions/workflows/main.yml/badge.svg
+[actions-url]: https://github.com/kalcutter/bitshuffle/actions/workflows/main.yml?query=branch%3Amain
+
+## Overview
+
+Bitshuffle is a lossless filter for improving the compression of typed data. It works by
+transposing the bits of the data to reduce entropy.
+
+This repository contains a highly optimized Bitshuffle implementation for modern processors.
+
+For more information about Bitshuffle, refer to the classic implementation here:
+<https://github.com/kiyo-masui/bitshuffle>.
+
+## Scope
+
+This library only implements the core Bitshuffle transpose operation (exposed as a C interface).
+
+A Python interface is not provided.
+
+## Features
+
+* Implemented in C99.
+* Optimized with SIMD instructions including SSE2, AVX2, and AVX-512.
+* Runtime dispatch based on available CPU features (requires GNU IFUNC support).
+* Does not allocate memory.
+* Support for Clang, GCC, ICC, and MSVC.
+* Tested on Linux, macOS, and Windows.
+
+## Performance
+
+The performance of this implementation is excellent.
+
+Compared to [`bshuf_trans_bit_elem`][bshuf_trans_bit_elem] and
+[`bshuf_untrans_bit_elem`][bshuf_untrans_bit_elem] from the classic implementation, this code
+yields a typical speedup between **1.3x** and **10x** depending on the CPU architecture, function,
+and arguments.
+
+## API
+
+The public interface is declared in the header file [bitshuffle.h](src/bitshuffle.h). Two public
+functions are defined:
+
+```c++
+/* Transpose bits for compression.
+ *
+ * This function performs Bitshuffle transposition of a single block. The block
+ * size in bytes is given by the product of `size` and `elem_size`.
+ *
+ * If required, the `scratch` argument must point to a buffer that the function
+ * uses for scratch purposes. The size of this buffer is given by the block
+ * size.
+ *
+ * On success, the function returns 0; otherwise, -1 is returned to indicate an
+ * error. In case of error, the memory pointed to by `out` and `scratch` is left
+ * unmodified.
+ *
+ * Errors
+ * ------
+ * The function returns -1 to indicate an error if:
+ *
+ * - The `scratch` argument is `NULL` and a scratch buffer is required for the
+ *   specified element size.
+ * - The `size` argument is not a multiple of 8.
+ */
+int bitshuf_encode_block(char* restrict out,
+                         const char* restrict in,
+                         char* restrict scratch,
+                         size_t size,
+                         size_t elem_size);
+```
+
+```c++
+/* Untranspose bits after decompression.
+ *
+ * This function performs the inverse of `bitshuf_encode_block()`.
+ *
+ * If required, the `scratch` argument must point to a buffer that the function
+ * uses for scratch purposes. The size of this buffer is given by the block
+ * size.
+ *
+ * On success, the function returns 0; otherwise, -1 is returned to indicate an
+ * error. In case of error, the memory pointed to by `out` and `scratch` is left
+ * unmodified.
+ *
+ * Errors
+ * ------
+ * The function returns -1 to indicate an error if:
+ *
+ * - The `scratch` argument is `NULL` and a scratch buffer is required for the
+ *   specified element size.
+ * - The `size` argument is not a multiple of 8.
+ */
+int bitshuf_decode_block(char* restrict out,
+                         const char* restrict in,
+                         char* restrict scratch,
+                         size_t size,
+                         size_t elem_size);
+```
+
+These functions perform the same operations as [`bshuf_trans_bit_elem`][bshuf_trans_bit_elem] and
+[`bshuf_untrans_bit_elem`][bshuf_untrans_bit_elem], respectively.
+
+The header file is compatible with both C89 and C++.
+
+[bshuf_trans_bit_elem]: https://github.com/kiyo-masui/bitshuffle/blob/b9a1546133959298c56eee686932dbb18ff80f7a/src/bitshuffle_internals.h#L50
+[bshuf_untrans_bit_elem]: https://github.com/kiyo-masui/bitshuffle/blob/b9a1546133959298c56eee686932dbb18ff80f7a/src/bitshuffle_internals.h#L59
+
+## Caveats
+
+Only little-endian architectures are supported. Support for big-endian machines
+is not planned.
+
+## License
+
+This repository is licensed under either of
+
+* Apache License, Version 2.0
+  ([LICENSE-APACHE](LICENSE-APACHE) or <http://www.apache.org/licenses/LICENSE-2.0>)
+* MIT license
+  ([LICENSE-MIT](LICENSE-MIT) or <http://opensource.org/licenses/MIT>)
+
+at your option.
+
+## Contribution
+
+Unless you explicitly state otherwise, any contribution intentionally submitted
+for inclusion in the work by you, as defined in the Apache-2.0 license, shall be
+dual licensed as above, without any additional terms or conditions.
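A minimal round-trip sketch of the two functions documented in the API section above. The 64-byte block of 16 four-byte elements, the demo values, and the unconditionally provided scratch buffer are illustrative choices, not requirements of the library:

```c
#include <stdio.h>
#include <string.h>

#include "bitshuffle.h"

int main(void) {
    /* 16 elements of 4 bytes each: size = 16 (a multiple of 8, as required),
     * elem_size = 4, so the block size is 16 * 4 = 64 bytes. */
    char in[64], encoded[64], decoded[64], scratch[64];
    for (size_t i = 0; i < sizeof(in); i++)
        in[i] = (char)(i & 0x7f);

    /* A block-sized scratch buffer is passed unconditionally here; the library
     * only requires one for certain element sizes. */
    if (bitshuf_encode_block(encoded, in, scratch, 16, 4) != 0)
        return 1; /* size not a multiple of 8, or scratch missing when needed */

    /* `encoded` now holds the bit-transposed block, ready for a general-purpose
     * compressor. Undo the transform after decompression: */
    if (bitshuf_decode_block(decoded, encoded, scratch, 16, 4) != 0)
        return 1;

    printf("round trip %s\n", memcmp(in, decoded, sizeof(in)) == 0 ? "ok" : "failed");
    return 0;
}
```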
diff --git a/WORKSPACE b/WORKSPACE
new file mode 100644
index 0000000..322c2bc
--- /dev/null
+++ b/WORKSPACE
@@ -0,0 +1,23 @@
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+http_archive(
+    name = "benchmark",
+    sha256 = "6430e4092653380d9dc4ccb45a1e2dc9259d581f4866dc0759713126056bc1d7",
+    strip_prefix = "benchmark-1.7.1",
+    url = "https://github.com/google/benchmark/archive/refs/tags/v1.7.1.tar.gz",
+)
+
+http_archive(
+    name = "bitshuffle",
+    build_file = "//third_party:bitshuffle.BUILD",
+    sha256 = "2631aaa5d4c24e51415c7b1827d4f9dcf505ad8db03738210da9ce6dab8f5870",
+    strip_prefix = "bitshuffle-0.5.1",
+    url = "https://github.com/kiyo-masui/bitshuffle/archive/refs/tags/0.5.1.tar.gz",
+)
+
+http_archive(
+    name = "googletest",
+    sha256 = "ad7fdba11ea011c1d925b3289cf4af2c66a352e18d4c7264392fead75e919363",
+    strip_prefix = "googletest-1.13.0",
+    url = "https://github.com/google/googletest/archive/refs/tags/v1.13.0.tar.gz",
+)
diff --git a/src/bitshuffle.c b/src/bitshuffle.c
new file mode 100644
index 0000000..0ee0b9b
--- /dev/null
+++ b/src/bitshuffle.c
@@ -0,0 +1,1709 @@
+// SPDX-License-Identifier: MIT OR Apache-2.0
+// Copyright (c) 2023 Kal Conley
+#include "bitshuffle.h"
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+#include <immintrin.h>
+#endif
+
+#ifndef __has_attribute
+#define __has_attribute(x) 0
+#endif
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+#if defined(__INTEL_COMPILER)
+#pragma warning(disable : 177) // entity-kind "entity" was declared but never referenced
+#elif defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+
+#if defined(_MSC_VER)
+#pragma warning(disable : 4244) // conversion from 'type1' to 'type2', possible loss of data
+#endif
+
+#if defined(_MSC_VER)
+#if defined(_M_IX86) && _M_IX86_FP == 2 || defined(_M_X64)
+#ifndef __SSE2__
+#define __SSE2__ 1
+#endif
+#endif
+#endif
+
+#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
+    __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ || \
+    defined(__BIG_ENDIAN__)
+#error big endian not supported
+#endif
+
+#ifndef BITSHUF_USE_IFUNC
+#if defined(__INTEL_COMPILER) || defined(__clang__) && __clang_major__ < 8
+#define BITSHUF_USE_IFUNC 0 // GFNI not supported by compiler.
+#endif +#endif +#ifndef BITSHUF_USE_IFUNC +#if (__has_attribute(ifunc) && __has_attribute(target) && __has_builtin(__builtin_cpu_init) && \ + __has_builtin(__builtin_cpu_is) && __has_builtin(__builtin_cpu_supports)) || \ + (defined(__GNUC__) && __GNUC__ >= 8) +#define BITSHUF_USE_IFUNC 1 +#else +#define BITSHUF_USE_IFUNC 0 +#endif +#endif + +#define STRINGIZE(x) #x + +#if __has_attribute(target) && !defined(__INTEL_COMPILER) +#define ATTRIBUTE_TARGET(x) __attribute__((target(x))) +#else +#define ATTRIBUTE_TARGET(x) +#endif + +#if __has_attribute(always_inline) || defined(__GNUC__) +#define ALWAYS_INLINE inline __attribute__((always_inline)) +#elif defined(_MSC_VER) +#define ALWAYS_INLINE __forceinline +#else +#define ALWAYS_INLINE inline +#endif + +#if __has_attribute(noinline) || defined(__GNUC__) +#define NO_INLINE __attribute__((noinline)) +#elif defined(_MSC_VER) +#define NO_INLINE __declspec(noinline) +#else +#define NO_INLINE +#endif + +#if __has_attribute(no_sanitize_address) +#define NO_SANITIZE_ADDRESS __attribute__((no_sanitize_address)) +#else +#define NO_SANITIZE_ADDRESS +#endif + +#if __has_attribute(no_sanitize_memory) +#define NO_SANITIZE_MEMORY __attribute__((no_sanitize_memory)) +#else +#define NO_SANITIZE_MEMORY +#endif + +#if __has_attribute(no_sanitize_thread) +#define NO_SANITIZE_THREAD __attribute__((no_sanitize_thread)) +#else +#define NO_SANITIZE_THREAD +#endif + +#if __has_attribute(disable_sanitizer_instrumentation) +#define DISABLE_SANITIZER_INSTRUMENTATION __attribute__((disable_sanitizer_instrumentation)) +#else +#define DISABLE_SANITIZER_INSTRUMENTATION +#endif + +#if __has_attribute(fallthrough) +#define FALLTHROUGH __attribute__((fallthrough)) +#else +#define FALLTHROUGH +#endif + +#if __has_builtin(__builtin_expect) || defined(__GNUC__) +#define UNLIKELY(x) __builtin_expect(!!(x), 0) +#else +#define UNLIKELY(x) (x) +#endif + +// clang-format off +#define IMPLEMENT_IFUNC(NAME, PARAMS) \ + __attribute__((ifunc(STRINGIZE(NAME##_resolver)))) \ + static void NAME PARAMS; \ + \ + DISABLE_SANITIZER_INSTRUMENTATION \ + NO_SANITIZE_ADDRESS NO_SANITIZE_MEMORY NO_SANITIZE_THREAD \ + static void (*NAME##_resolver(void))PARAMS +// clang-format on + +#define IMPLEMENT_LOAD_FUNCTION(NAME, TYPE) \ + static ALWAYS_INLINE TYPE NAME(const void* mem_addr) { \ + TYPE a; \ + memcpy(&a, mem_addr, sizeof(a)); \ + return a; \ + } + +#define IMPLEMENT_STORE_FUNCTION(NAME, TYPE) \ + static ALWAYS_INLINE void NAME(void* mem_addr, const TYPE a) { \ + memcpy(mem_addr, &a, sizeof(a)); \ + } + +#if !defined(__SSE2__) +IMPLEMENT_LOAD_FUNCTION(LOAD_U64, uint64_t) +#endif +IMPLEMENT_STORE_FUNCTION(STORE_U8, uint8_t) +IMPLEMENT_STORE_FUNCTION(STORE_U64, uint64_t) + +// Computes the transpose of an 8x8 bit matrix. +// Ref: "Hacker's Delight" 7-3 by Henry Warren. 
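+// Byte k of the argument is row k of the matrix; the transpose moves bit j of
+// byte i to bit i of byte j, so byte k of the result collects bit k of all
+// eight input bytes. Each masked delta-swap step below exchanges sub-blocks
+// across the main diagonal: bits within 2x2 blocks, then 2x2 blocks within
+// 4x4 blocks, then the two off-diagonal 4x4 blocks.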
+static uint64_t transpose8(uint64_t x) { + uint64_t t; + t = (x ^ (x >> 7)) & 0x00aa00aa00aa00aa; + x = (x ^ t ^ (t << 7)); + t = (x ^ (x >> 14)) & 0x0000cccc0000cccc; + x = (x ^ t ^ (t << 14)); + t = (x ^ (x >> 28)) & 0x00000000f0f0f0f0; + x = (x ^ t ^ (t << 28)); + return x; +} + +#if !defined(__SSE2__) +NO_INLINE +static void bitshuf_trans_bit(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + size /= 8; + + for (size_t i = 0; i < size; i++) { + const uint64_t a = LOAD_U64(&in[i * 8]); + const uint64_t x = transpose8(a); + STORE_U8(&out[0 * size + i], x); + STORE_U8(&out[1 * size + i], x >> 8 * 1); + STORE_U8(&out[2 * size + i], x >> 8 * 2); + STORE_U8(&out[3 * size + i], x >> 8 * 3); + STORE_U8(&out[4 * size + i], x >> 8 * 4); + STORE_U8(&out[5 * size + i], x >> 8 * 5); + STORE_U8(&out[6 * size + i], x >> 8 * 6); + STORE_U8(&out[7 * size + i], x >> 8 * 7); + } +} + +NO_INLINE +static void bitshuf_trans_byte(char* restrict out, + const char* restrict in, + size_t size, + size_t elem_size) { + assert(size % 8 == 0); + + for (size_t i = 0; i < size; i += 8) { + for (size_t j = 0; j < elem_size; j++) { + for (size_t k = 0; k < 8; k++) { + out[j * size + (i + k)] = in[(i + k) * elem_size + j]; + } + } + } +} + +NO_INLINE +static void bitshuf_trans_byte_2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + size = size / 8 * 8; + + for (size_t i = 0; i < size; i++) { + out[0 * size + i] = in[i * 2 + 0]; + out[1 * size + i] = in[i * 2 + 1]; + } +} + +NO_INLINE +static void bitshuf_trans_byte_4(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + size = size / 8 * 8; + + for (size_t i = 0; i < size; i++) { + out[0 * size + i] = in[i * 4 + 0]; + out[1 * size + i] = in[i * 4 + 1]; + out[2 * size + i] = in[i * 4 + 2]; + out[3 * size + i] = in[i * 4 + 3]; + } +} + +static void bitshuf_trans_byte_8(char* restrict out, const char* restrict in, size_t size) { + bitshuf_trans_byte(out, in, size, 8); +} +#endif + +static void bitshuf_untrans_bit_tail(char* restrict out, + const char* restrict in, + size_t size, + size_t index) { + assert(size % 8 == 0); + size /= 8; + + for (size_t i = index; i < size; i++) { + const uint64_t a = (uint64_t)(uint8_t)in[0 * size + i] | + (uint64_t)(uint8_t)in[1 * size + i] << 8 * 1 | + (uint64_t)(uint8_t)in[2 * size + i] << 8 * 2 | + (uint64_t)(uint8_t)in[3 * size + i] << 8 * 3 | + (uint64_t)(uint8_t)in[4 * size + i] << 8 * 4 | + (uint64_t)(uint8_t)in[5 * size + i] << 8 * 5 | + (uint64_t)(uint8_t)in[6 * size + i] << 8 * 6 | + (uint64_t)(uint8_t)in[7 * size + i] << 8 * 7; + STORE_U64(&out[i * 8], transpose8(a)); + } +} + +#if !defined(__SSE2__) +NO_INLINE +static void bitshuf_untrans_bit(char* restrict out, const char* restrict in, size_t size) { + bitshuf_untrans_bit_tail(out, in, size, 0); +} + +NO_INLINE +static void bitshuf_untrans_byte(char* restrict out, + const char* restrict in, + size_t size, + size_t elem_size) { + assert(size % 8 == 0); + + for (size_t i = 0; i < size; i += 8) { + for (size_t j = 0; j < elem_size; j++) { + for (size_t k = 0; k < 8; k++) { + out[(i + k) * elem_size + j] = in[j * size + (i + k)]; + } + } + } +} + +NO_INLINE +static void bitshuf_untrans_byte_2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + size = size / 8 * 8; + + for (size_t i = 0; i < size; i++) { + out[i * 2 + 0] = in[0 * size + i]; + out[i * 2 + 1] = in[1 * size + i]; + } +} + +NO_INLINE +static void bitshuf_untrans_byte_4(char* 
restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + size = size / 8 * 8; + + for (size_t i = 0; i < size; i++) { + out[i * 4 + 0] = in[0 * size + i]; + out[i * 4 + 1] = in[1 * size + i]; + out[i * 4 + 2] = in[2 * size + i]; + out[i * 4 + 3] = in[3 * size + i]; + } +} + +static void bitshuf_untrans_byte_8(char* restrict out, const char* restrict in, size_t size) { + bitshuf_untrans_byte(out, in, size, 8); +} +#endif + +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) + +IMPLEMENT_LOAD_FUNCTION(LOAD_I64, int64_t) +IMPLEMENT_STORE_FUNCTION(STORE_U16, uint16_t) +IMPLEMENT_STORE_FUNCTION(STORE_U32, uint32_t) + +#define MM256_SETR_M128I(lo, hi) _mm256_inserti128_si256(_mm256_castsi128_si256(lo), (hi), 1) + +#if defined(__clang__) +#define X(A) \ + ({ \ + __asm__("" : "+x"(A)); \ + (A); \ + }) +#else +#define X(A) (A) +#endif + +#if defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__GFNI__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("avx512vbmi,avx512vl,gfni") +static void bitshuf_trans_bit_avx512vbmi_gfni(char* restrict out, + const char* restrict in, + size_t size) { + assert(size % 8 == 0); + size /= 8; + + const __m512i BSWAP64 = _mm512_set_epi64( + 0x08090a0b0c0d0e0f, 0x0001020304050607, 0x08090a0b0c0d0e0f, 0x0001020304050607, + 0x08090a0b0c0d0e0f, 0x0001020304050607, 0x08090a0b0c0d0e0f, 0x0001020304050607); + const __m512i C0 = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0x3931292119110901, 0x3830282018100800); + const __m512i C1 = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0x3b332b231b130b03, 0x3a322a221a120a02); + const __m512i C2 = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0x3d352d251d150d05, 0x3c342c241c140c04); + const __m512i C3 = _mm512_set_epi64(0, 0, 0, 0, 0, 0, 0x3f372f271f170f07, 0x3e362e261e160e06); + const __m512i I8 = _mm512_set1_epi64(0x8040201008040201); + size_t i = 0; + for (; i + 8 <= size; i += 8) { + const __m512i a = _mm512_loadu_si512(&in[i * 8]); + const __m512i u = _mm512_gf2p8affine_epi64_epi8(I8, _mm512_shuffle_epi8(a, BSWAP64), 0x00); + const __m128i u0 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(C0, u)); + const __m128i u1 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(C1, u)); + const __m128i u2 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(C2, u)); + const __m128i u3 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(C3, u)); + _mm_storel_epi64((__m128i*)&out[0 * size + i], u0); + _mm_storel_epi64((__m128i*)&out[1 * size + i], _mm_srli_si128(u0, 8)); + _mm_storel_epi64((__m128i*)&out[2 * size + i], u1); + _mm_storel_epi64((__m128i*)&out[3 * size + i], _mm_srli_si128(u1, 8)); + _mm_storel_epi64((__m128i*)&out[4 * size + i], u2); + _mm_storel_epi64((__m128i*)&out[5 * size + i], _mm_srli_si128(u2, 8)); + _mm_storel_epi64((__m128i*)&out[6 * size + i], u3); + _mm_storel_epi64((__m128i*)&out[7 * size + i], _mm_srli_si128(u3, 8)); + } + if (i < size) { + const __mmask8 k = (1U << (size - i)) - 1; + const __m512i a = _mm512_maskz_loadu_epi64(k, &in[i * 8]); + const __m512i u = _mm512_gf2p8affine_epi64_epi8(I8, _mm512_shuffle_epi8(a, BSWAP64), 0x00); + const __m128i u0 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(C0, u)); + const __m128i u1 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(C1, u)); + const __m128i u2 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(C2, u)); + const __m128i u3 = _mm512_castsi512_si128(_mm512_permutexvar_epi8(C3, u)); + _mm_mask_storeu_epi8(&out[0 * size + i], k, u0); + _mm_mask_storeu_epi8(&out[1 * size + i], k, _mm_srli_si128(u0, 8)); + _mm_mask_storeu_epi8(&out[2 * 
size + i], k, u1); + _mm_mask_storeu_epi8(&out[3 * size + i], k, _mm_srli_si128(u1, 8)); + _mm_mask_storeu_epi8(&out[4 * size + i], k, u2); + _mm_mask_storeu_epi8(&out[5 * size + i], k, _mm_srli_si128(u2, 8)); + _mm_mask_storeu_epi8(&out[6 * size + i], k, u3); + _mm_mask_storeu_epi8(&out[7 * size + i], k, _mm_srli_si128(u3, 8)); + } +} +#endif + +#if defined(__AVX512BW__) && defined(__AVX512VL__) || BITSHUF_USE_IFUNC +IMPLEMENT_STORE_FUNCTION(STORE_MASK64, __mmask64) + +NO_INLINE +ATTRIBUTE_TARGET("avx512bw,avx512vl") +static void bitshuf_trans_bit_avx512bw(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + size /= 8; + + const __m512i C0 = _mm512_set1_epi8(0x01); + const __m512i C1 = _mm512_set1_epi8(0x02); + const __m512i C2 = _mm512_set1_epi8(0x04); + const __m512i C3 = _mm512_set1_epi8(0x08); + const __m512i C4 = _mm512_set1_epi8(0x10); + const __m512i C5 = _mm512_set1_epi8(0x20); + const __m512i C6 = _mm512_set1_epi8(0x40); + size_t i = 0; + for (; i + 8 <= size; i += 8) { + const __m512i a = _mm512_loadu_si512(&in[i * 8]); + STORE_MASK64(&out[0 * size + i], _mm512_test_epi8_mask(a, C0)); + STORE_MASK64(&out[1 * size + i], _mm512_test_epi8_mask(a, C1)); + STORE_MASK64(&out[2 * size + i], _mm512_test_epi8_mask(a, C2)); + STORE_MASK64(&out[3 * size + i], _mm512_test_epi8_mask(a, C3)); + STORE_MASK64(&out[4 * size + i], _mm512_test_epi8_mask(a, C4)); + STORE_MASK64(&out[5 * size + i], _mm512_test_epi8_mask(a, C5)); + STORE_MASK64(&out[6 * size + i], _mm512_test_epi8_mask(a, C6)); + STORE_MASK64(&out[7 * size + i], _mm512_movepi8_mask(a)); + } + if (i < size) { + const __mmask8 k = (1U << (size - i)) - 1; + const __m512i a = _mm512_maskz_loadu_epi64(k, &in[i * 8]); + // clang-format off + _mm_mask_storeu_epi8(&out[0 * size + i], k, _mm_set_epi64x(0, _mm512_test_epi8_mask(a, C0))); + _mm_mask_storeu_epi8(&out[1 * size + i], k, _mm_set_epi64x(0, _mm512_test_epi8_mask(a, C1))); + _mm_mask_storeu_epi8(&out[2 * size + i], k, _mm_set_epi64x(0, _mm512_test_epi8_mask(a, C2))); + _mm_mask_storeu_epi8(&out[3 * size + i], k, _mm_set_epi64x(0, _mm512_test_epi8_mask(a, C3))); + _mm_mask_storeu_epi8(&out[4 * size + i], k, _mm_set_epi64x(0, _mm512_test_epi8_mask(a, C4))); + _mm_mask_storeu_epi8(&out[5 * size + i], k, _mm_set_epi64x(0, _mm512_test_epi8_mask(a, C5))); + _mm_mask_storeu_epi8(&out[6 * size + i], k, _mm_set_epi64x(0, _mm512_test_epi8_mask(a, C6))); + _mm_mask_storeu_epi8(&out[7 * size + i], k, _mm_set_epi64x(0, _mm512_movepi8_mask(a))); + // clang-format on + } +} +#endif + +#if defined(__AVX2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("avx2") +static void bitshuf_trans_bit_avx2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + size /= 8; + + size_t i = 0; + for (; i + 4 <= size; i += 4) { + const __m256i a = _mm256_loadu_si256((const __m256i*)&in[i * 8]); + __m256i u; + STORE_U32(&out[7 * size + i], _mm256_movemask_epi8(u = a)); + STORE_U32(&out[6 * size + i], _mm256_movemask_epi8(u = _mm256_add_epi8(X(u), u))); + STORE_U32(&out[5 * size + i], _mm256_movemask_epi8(u = _mm256_add_epi8(X(u), u))); + STORE_U32(&out[4 * size + i], _mm256_movemask_epi8(u = _mm256_add_epi8(X(u), u))); + STORE_U32(&out[3 * size + i], _mm256_movemask_epi8(u = _mm256_add_epi8(X(u), u))); + STORE_U32(&out[2 * size + i], _mm256_movemask_epi8(u = _mm256_add_epi8(X(u), u))); + STORE_U32(&out[1 * size + i], _mm256_movemask_epi8(u = _mm256_add_epi8(X(u), u))); + STORE_U32(&out[0 * size + i], _mm256_movemask_epi8(_mm256_add_epi8(X(u), u))); 
+ } + if (i + 2 <= size) { + const __m128i a = _mm_loadu_si128((const __m128i*)&in[i * 8]); + __m128i u; + STORE_U16(&out[7 * size + i], _mm_movemask_epi8(u = a)); + STORE_U16(&out[6 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U16(&out[5 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U16(&out[4 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U16(&out[3 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U16(&out[2 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U16(&out[1 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U16(&out[0 * size + i], _mm_movemask_epi8(_mm_add_epi8(X(u), u))); + i += 2; + } + if (i < size) { + const __m128i a = _mm_loadl_epi64((const __m128i*)&in[i * 8]); + __m128i u; + STORE_U8(&out[7 * size + i], _mm_movemask_epi8(u = a)); + STORE_U8(&out[6 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U8(&out[5 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U8(&out[4 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U8(&out[3 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U8(&out[2 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U8(&out[1 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U8(&out[0 * size + i], _mm_movemask_epi8(_mm_add_epi8(X(u), u))); + } +} +#endif + +#if defined(__SSE2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("sse2") +static void bitshuf_trans_bit_sse2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + size /= 8; + + size_t i = 0; + for (; i + 2 <= size; i += 2) { + const __m128i a = _mm_loadu_si128((const __m128i*)&in[i * 8]); + __m128i u; + STORE_U16(&out[7 * size + i], _mm_movemask_epi8(u = a)); + STORE_U16(&out[6 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U16(&out[5 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U16(&out[4 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U16(&out[3 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U16(&out[2 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U16(&out[1 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U16(&out[0 * size + i], _mm_movemask_epi8(_mm_add_epi8(X(u), u))); + } + if (i < size) { + const __m128i a = _mm_loadl_epi64((const __m128i*)&in[i * 8]); + __m128i u; + STORE_U8(&out[7 * size + i], _mm_movemask_epi8(u = a)); + STORE_U8(&out[6 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U8(&out[5 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U8(&out[4 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U8(&out[3 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U8(&out[2 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U8(&out[1 * size + i], _mm_movemask_epi8(u = _mm_add_epi8(X(u), u))); + STORE_U8(&out[0 * size + i], _mm_movemask_epi8(_mm_add_epi8(X(u), u))); + } +} +#endif + +#if defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__GFNI__) +#define bitshuf_trans_bit bitshuf_trans_bit_avx512vbmi_gfni +#elif BITSHUF_USE_IFUNC +IMPLEMENT_IFUNC(bitshuf_trans_bit_ifunc, + (char* restrict out, const char* restrict in, size_t size)) { + __builtin_cpu_init(); + + if (__builtin_cpu_supports("avx512vbmi") && __builtin_cpu_supports("avx512vl") && + __builtin_cpu_supports("gfni") && !__builtin_cpu_is("intel")) + { 
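+        // The ifunc resolver runs once at load time and returns the kernel to
+        // use. The VBMI+GFNI variant is chosen only on non-Intel CPUs that
+        // report these features; otherwise control falls through to the
+        // AVX-512BW, AVX2, SSE2, and scalar selections below.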
+ return bitshuf_trans_bit_avx512vbmi_gfni; + } +#if defined(__AVX512BW__) && defined(__AVX512VL__) + return bitshuf_trans_bit_avx512bw; +#else + if (__builtin_cpu_supports("avx512bw") && __builtin_cpu_supports("avx512vl")) + return bitshuf_trans_bit_avx512bw; +#if defined(__AVX2__) + return bitshuf_trans_bit_avx2; +#else + if (__builtin_cpu_supports("avx2")) + return bitshuf_trans_bit_avx2; +#if defined(__SSE2__) + return bitshuf_trans_bit_sse2; +#else + if (__builtin_cpu_supports("sse2")) + return bitshuf_trans_bit_sse2; + + return bitshuf_trans_bit; +#endif +#endif +#endif +} +#define bitshuf_trans_bit bitshuf_trans_bit_ifunc +#elif defined(__AVX512BW__) && defined(__AVX512VL__) +#define bitshuf_trans_bit bitshuf_trans_bit_avx512bw +#elif defined(__AVX2__) +#define bitshuf_trans_bit bitshuf_trans_bit_avx2 +#elif defined(__SSE2__) +#define bitshuf_trans_bit bitshuf_trans_bit_sse2 +#endif + +#if defined(__SSE2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("sse2") +static void bitshuf_trans_byte_sse2(char* restrict out, + const char* restrict in, + size_t size, + size_t elem_size) { + assert(size % 8 == 0); + + size_t j = 0; + for (; j + 8 <= elem_size; j += 8) { + for (size_t i = 0; i < size; i += 8) { + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[(i + 0) * elem_size + j]); + const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[(i + 1) * elem_size + j]); + const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[(i + 2) * elem_size + j]); + const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[(i + 3) * elem_size + j]); + const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[(i + 4) * elem_size + j]); + const __m128i a5 = _mm_loadl_epi64((const __m128i*)&in[(i + 5) * elem_size + j]); + const __m128i a6 = _mm_loadl_epi64((const __m128i*)&in[(i + 6) * elem_size + j]); + const __m128i a7 = _mm_loadl_epi64((const __m128i*)&in[(i + 7) * elem_size + j]); + __m128i u0 = _mm_unpacklo_epi8(a0, a1); + __m128i u1 = _mm_unpacklo_epi8(a2, a3); + __m128i u2 = _mm_unpacklo_epi8(a4, a5); + __m128i u3 = _mm_unpacklo_epi8(a6, a7); + const __m128i v0 = _mm_unpacklo_epi16(u0, u1); + const __m128i v1 = _mm_unpackhi_epi16(u0, u1); + const __m128i v2 = _mm_unpacklo_epi16(u2, u3); + const __m128i v3 = _mm_unpackhi_epi16(u2, u3); + u0 = _mm_unpacklo_epi32(v0, v2); + u1 = _mm_unpackhi_epi32(v0, v2); + u2 = _mm_unpacklo_epi32(v1, v3); + u3 = _mm_unpackhi_epi32(v1, v3); + _mm_storel_epi64((__m128i*)&out[(j + 0) * size + i], u0); + _mm_storel_epi64((__m128i*)&out[(j + 1) * size + i], _mm_srli_si128(u0, 8)); + _mm_storel_epi64((__m128i*)&out[(j + 2) * size + i], u1); + _mm_storel_epi64((__m128i*)&out[(j + 3) * size + i], _mm_srli_si128(u1, 8)); + _mm_storel_epi64((__m128i*)&out[(j + 4) * size + i], u2); + _mm_storel_epi64((__m128i*)&out[(j + 5) * size + i], _mm_srli_si128(u2, 8)); + _mm_storel_epi64((__m128i*)&out[(j + 6) * size + i], u3); + _mm_storel_epi64((__m128i*)&out[(j + 7) * size + i], _mm_srli_si128(u3, 8)); + } + } + if (j < elem_size) { + for (size_t i = 0; i + 8 < size; i += 8) { + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[(i + 0) * elem_size + j]); + const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[(i + 1) * elem_size + j]); + const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[(i + 2) * elem_size + j]); + const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[(i + 3) * elem_size + j]); + const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[(i + 4) * elem_size + j]); + const __m128i a5 = _mm_loadl_epi64((const __m128i*)&in[(i + 5) * elem_size + j]); + const 
__m128i a6 = _mm_loadl_epi64((const __m128i*)&in[(i + 6) * elem_size + j]); + const __m128i a7 = _mm_loadl_epi64((const __m128i*)&in[(i + 7) * elem_size + j]); + __m128i u0 = _mm_unpacklo_epi8(a0, a1); + __m128i u1 = _mm_unpacklo_epi8(a2, a3); + __m128i u2 = _mm_unpacklo_epi8(a4, a5); + __m128i u3 = _mm_unpacklo_epi8(a6, a7); + const __m128i v0 = _mm_unpacklo_epi16(u0, u1); + const __m128i v1 = _mm_unpackhi_epi16(u0, u1); + const __m128i v2 = _mm_unpacklo_epi16(u2, u3); + const __m128i v3 = _mm_unpackhi_epi16(u2, u3); + u0 = _mm_unpacklo_epi32(v0, v2); + u1 = _mm_unpackhi_epi32(v0, v2); + u2 = _mm_unpacklo_epi32(v1, v3); + u3 = _mm_unpackhi_epi32(v1, v3); + switch (elem_size - j) { + case 7: + _mm_storel_epi64((__m128i*)&out[(j + 6) * size + i], u3); + FALLTHROUGH; + case 6: + _mm_storel_epi64((__m128i*)&out[(j + 5) * size + i], _mm_srli_si128(u2, 8)); + FALLTHROUGH; + case 5: + _mm_storel_epi64((__m128i*)&out[(j + 4) * size + i], u2); + FALLTHROUGH; + case 4: + _mm_storel_epi64((__m128i*)&out[(j + 3) * size + i], _mm_srli_si128(u1, 8)); + FALLTHROUGH; + case 3: + _mm_storel_epi64((__m128i*)&out[(j + 2) * size + i], u1); + FALLTHROUGH; + case 2: + _mm_storel_epi64((__m128i*)&out[(j + 1) * size + i], _mm_srli_si128(u0, 8)); + FALLTHROUGH; + default: + _mm_storel_epi64((__m128i*)&out[(j + 0) * size + i], u0); + } + } + for (; j < elem_size; j++) { + for (size_t i = size - 8; i < size; i++) + out[j * size + i] = in[i * elem_size + j]; + } + } +} +#endif + +#if defined(__SSE2__) +#define bitshuf_trans_byte bitshuf_trans_byte_sse2 +#elif BITSHUF_USE_IFUNC +IMPLEMENT_IFUNC(bitshuf_trans_byte_ifunc, + (char* restrict out, const char* restrict in, size_t size, size_t elem_size)) { + __builtin_cpu_init(); + + if (__builtin_cpu_supports("sse2")) + return bitshuf_trans_byte_sse2; + + return bitshuf_trans_byte; +} +#define bitshuf_trans_byte bitshuf_trans_byte_ifunc +#endif + +#if defined(__AVX2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("avx2") +static void bitshuf_trans_byte_2_avx2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + + const __m256i MASK = _mm256_set1_epi16(0x00ff); + size_t i = 0; + for (; i + 32 <= size; i += 32) { + const __m256i a0 = _mm256_loadu_si256((const __m256i*)&in[i * 2]); + const __m256i a1 = _mm256_loadu_si256((const __m256i*)&in[i * 2 + 32]); + __m256i u0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a1), 1); + __m256i u1 = _mm256_permute2x128_si256(a0, a1, 0x31); + const __m256i v0 = _mm256_and_si256(u0, MASK); + const __m256i v1 = _mm256_and_si256(u1, MASK); + const __m256i v2 = _mm256_srli_epi16(u0, 8); + const __m256i v3 = _mm256_srli_epi16(u1, 8); + u0 = _mm256_packus_epi16(v0, v1); + u1 = _mm256_packus_epi16(v2, v3); + _mm256_storeu_si256((__m256i*)&out[0 * size + i], u0); + _mm256_storeu_si256((__m256i*)&out[1 * size + i], u1); + } + if (i + 16 <= size) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[i * 2]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[i * 2 + 16]); + const __m128i u0 = _mm_and_si128(a0, _mm256_castsi256_si128(MASK)); + const __m128i u1 = _mm_and_si128(a1, _mm256_castsi256_si128(MASK)); + const __m128i u2 = _mm_srli_epi16(a0, 8); + const __m128i u3 = _mm_srli_epi16(a1, 8); + const __m128i v0 = _mm_packus_epi16(u0, u1); + const __m128i v1 = _mm_packus_epi16(u2, u3); + _mm_storeu_si128((__m128i*)&out[0 * size + i], v0); + _mm_storeu_si128((__m128i*)&out[1 * size + i], v1); + i += 16; + } + if (i < size) { + const __m128i a = _mm_loadu_si128((const __m128i*)&in[i * 2]); + 
const __m128i u0 = _mm_and_si128(a, _mm256_castsi256_si128(MASK)); + const __m128i u1 = _mm_srli_epi16(a, 8); + const __m128i u = _mm_packus_epi16(u0, u1); + _mm_storel_epi64((__m128i*)&out[0 * size + i], u); + _mm_storel_epi64((__m128i*)&out[1 * size + i], _mm_srli_si128(u, 8)); + } +} +#endif + +#if defined(__SSE2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("sse2") +static void bitshuf_trans_byte_2_sse2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + + const __m128i MASK = _mm_set1_epi16(0x00ff); + size_t i = 0; + for (; i + 16 <= size; i += 16) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[i * 2]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[i * 2 + 16]); + const __m128i u0 = _mm_packus_epi16(_mm_and_si128(a0, MASK), _mm_and_si128(a1, MASK)); + const __m128i u1 = _mm_packus_epi16(_mm_srli_epi16(a0, 8), _mm_srli_epi16(a1, 8)); + _mm_storeu_si128((__m128i*)&out[0 * size + i], u0); + _mm_storeu_si128((__m128i*)&out[1 * size + i], u1); + } + if (i < size) { + const __m128i a = _mm_loadu_si128((const __m128i*)&in[i * 2]); + const __m128i u = _mm_packus_epi16(_mm_and_si128(a, MASK), _mm_srli_epi16(a, 8)); + _mm_storel_epi64((__m128i*)&out[0 * size + i], u); + _mm_storel_epi64((__m128i*)&out[1 * size + i], _mm_srli_si128(u, 8)); + } +} +#endif + +#if defined(__AVX2__) +#define bitshuf_trans_byte_2 bitshuf_trans_byte_2_avx2 +#elif BITSHUF_USE_IFUNC +IMPLEMENT_IFUNC(bitshuf_trans_byte_2_ifunc, + (char* restrict out, const char* restrict in, size_t size)) { + __builtin_cpu_init(); + + if (__builtin_cpu_supports("avx2")) + return bitshuf_trans_byte_2_avx2; +#if defined(__SSE2__) + return bitshuf_trans_byte_2_sse2; +#else + if (__builtin_cpu_supports("sse2")) + return bitshuf_trans_byte_2_sse2; + + return bitshuf_trans_byte_2; +#endif +} +#define bitshuf_trans_byte_2 bitshuf_trans_byte_2_ifunc +#elif defined(__SSE2__) +#define bitshuf_trans_byte_2 bitshuf_trans_byte_2_sse2 +#endif + +#if defined(__AVX2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("avx2") +static void bitshuf_trans_byte_4_avx2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + + const __m256i SHUF = _mm256_set_epi64x(0x0f0b07030e0a0602, 0x0d0905010c080400, + 0x0f0b07030e0a0602, 0x0d0905010c080400); + const __m256i PERM = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + size_t i = 0; + for (; i + 16 <= size; i += 16) { + const __m256i a0 = _mm256_loadu_si256((const __m256i*)&in[i * 4]); + const __m256i a1 = _mm256_loadu_si256((const __m256i*)&in[i * 4 + 32]); + const __m256i u0 = _mm256_shuffle_epi8(a0, SHUF); + const __m256i u1 = _mm256_shuffle_epi8(a1, SHUF); + const __m256i v0 = _mm256_permutevar8x32_epi32(_mm256_unpacklo_epi32(u0, u1), PERM); + const __m256i v1 = _mm256_permutevar8x32_epi32(_mm256_unpackhi_epi32(u0, u1), PERM); + _mm_storeu_si128((__m128i*)&out[0 * size + i], _mm256_castsi256_si128(v0)); + _mm_storeu_si128((__m128i*)&out[1 * size + i], _mm256_extracti128_si256(v0, 1)); + _mm_storeu_si128((__m128i*)&out[2 * size + i], _mm256_castsi256_si128(v1)); + _mm_storeu_si128((__m128i*)&out[3 * size + i], _mm256_extracti128_si256(v1, 1)); + } + if (i + 8 <= size) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[i * 4]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[i * 4 + 16]); + const __m128i u0 = _mm_shuffle_epi8(a0, _mm256_castsi256_si128(SHUF)); + const __m128i u1 = _mm_shuffle_epi8(a1, _mm256_castsi256_si128(SHUF)); + const __m128i v0 = _mm_unpacklo_epi32(u0, u1); + const __m128i v1 = 
_mm_unpackhi_epi32(u0, u1); + _mm_storel_epi64((__m128i*)&out[0 * size + i], v0); + _mm_storel_epi64((__m128i*)&out[1 * size + i], _mm_srli_si128(v0, 8)); + _mm_storel_epi64((__m128i*)&out[2 * size + i], v1); + _mm_storel_epi64((__m128i*)&out[3 * size + i], _mm_srli_si128(v1, 8)); + } +} +#endif + +#if defined(__SSE2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("sse2") +static void bitshuf_trans_byte_4_sse2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + + const __m128i MASK = _mm_set1_epi16(0x00ff); + size_t i = 0; + for (; i + 16 <= size; i += 16) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[i * 4 + 16 * 0]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[i * 4 + 16 * 1]); + const __m128i a2 = _mm_loadu_si128((const __m128i*)&in[i * 4 + 16 * 2]); + const __m128i a3 = _mm_loadu_si128((const __m128i*)&in[i * 4 + 16 * 3]); + const __m128i u0 = _mm_packus_epi16(_mm_and_si128(a0, MASK), _mm_and_si128(a1, MASK)); + const __m128i u1 = _mm_packus_epi16(_mm_and_si128(a2, MASK), _mm_and_si128(a3, MASK)); + const __m128i u2 = _mm_packus_epi16(_mm_srli_epi16(a0, 8), _mm_srli_epi16(a1, 8)); + const __m128i u3 = _mm_packus_epi16(_mm_srli_epi16(a2, 8), _mm_srli_epi16(a3, 8)); + const __m128i v0 = _mm_packus_epi16(_mm_and_si128(u0, MASK), _mm_and_si128(u1, MASK)); + const __m128i v1 = _mm_packus_epi16(_mm_and_si128(u2, MASK), _mm_and_si128(u3, MASK)); + const __m128i v2 = _mm_packus_epi16(_mm_srli_epi16(u0, 8), _mm_srli_epi16(u1, 8)); + const __m128i v3 = _mm_packus_epi16(_mm_srli_epi16(u2, 8), _mm_srli_epi16(u3, 8)); + _mm_storeu_si128((__m128i*)&out[0 * size + i], v0); + _mm_storeu_si128((__m128i*)&out[1 * size + i], v1); + _mm_storeu_si128((__m128i*)&out[2 * size + i], v2); + _mm_storeu_si128((__m128i*)&out[3 * size + i], v3); + } + if (i + 8 <= size) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[i * 4]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[i * 4 + 16]); + const __m128i u0 = _mm_packus_epi16(_mm_and_si128(a0, MASK), _mm_and_si128(a1, MASK)); + const __m128i u1 = _mm_packus_epi16(_mm_srli_epi16(a0, 8), _mm_srli_epi16(a1, 8)); + const __m128i v0 = _mm_packus_epi16(_mm_and_si128(u0, MASK), _mm_and_si128(u1, MASK)); + const __m128i v1 = _mm_packus_epi16(_mm_srli_epi16(u0, 8), _mm_srli_epi16(u1, 8)); + _mm_storel_epi64((__m128i*)&out[0 * size + i], v0); + _mm_storel_epi64((__m128i*)&out[1 * size + i], _mm_srli_si128(v0, 8)); + _mm_storel_epi64((__m128i*)&out[2 * size + i], v1); + _mm_storel_epi64((__m128i*)&out[3 * size + i], _mm_srli_si128(v1, 8)); + } +} +#endif + +#if defined(__AVX2__) +#define bitshuf_trans_byte_4 bitshuf_trans_byte_4_avx2 +#elif BITSHUF_USE_IFUNC +IMPLEMENT_IFUNC(bitshuf_trans_byte_4_ifunc, + (char* restrict out, const char* restrict in, size_t size)) { + __builtin_cpu_init(); + + if (__builtin_cpu_supports("avx2")) + return bitshuf_trans_byte_4_avx2; +#if defined(__SSE2__) + return bitshuf_trans_byte_4_sse2; +#else + if (__builtin_cpu_supports("sse2")) + return bitshuf_trans_byte_4_sse2; + + return bitshuf_trans_byte_4; +#endif +} +#define bitshuf_trans_byte_4 bitshuf_trans_byte_4_ifunc +#elif defined(__SSE2__) +#define bitshuf_trans_byte_4 bitshuf_trans_byte_4_sse2 +#endif + +#if defined(__SSE2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("sse2") +static void bitshuf_trans_byte_8_sse2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + + size_t i = 0; + for (; i + 16 <= size; i += 16) { + const __m128i a0 = _mm_loadu_si128((const 
__m128i*)&in[i * 8 + 16 * 0]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 1]); + const __m128i a2 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 2]); + const __m128i a3 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 3]); + const __m128i a4 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 4]); + const __m128i a5 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 5]); + const __m128i a6 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 6]); + const __m128i a7 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 7]); + __m128i u0 = _mm_unpacklo_epi8(a0, a1); + __m128i u1 = _mm_unpackhi_epi8(a0, a1); + __m128i u2 = _mm_unpacklo_epi8(a2, a3); + __m128i u3 = _mm_unpackhi_epi8(a2, a3); + __m128i u4 = _mm_unpacklo_epi8(a4, a5); + __m128i u5 = _mm_unpackhi_epi8(a4, a5); + __m128i u6 = _mm_unpacklo_epi8(a6, a7); + __m128i u7 = _mm_unpackhi_epi8(a6, a7); + __m128i v0 = _mm_unpacklo_epi8(u0, u1); + __m128i v1 = _mm_unpackhi_epi8(u0, u1); + __m128i v2 = _mm_unpacklo_epi8(u2, u3); + __m128i v3 = _mm_unpackhi_epi8(u2, u3); + __m128i v4 = _mm_unpacklo_epi8(u4, u5); + __m128i v5 = _mm_unpackhi_epi8(u4, u5); + __m128i v6 = _mm_unpacklo_epi8(u6, u7); + __m128i v7 = _mm_unpackhi_epi8(u6, u7); + u0 = _mm_unpacklo_epi32(v0, v2); + u1 = _mm_unpackhi_epi32(v0, v2); + u2 = _mm_unpacklo_epi32(v1, v3); + u3 = _mm_unpackhi_epi32(v1, v3); + u4 = _mm_unpacklo_epi32(v4, v6); + u5 = _mm_unpackhi_epi32(v4, v6); + u6 = _mm_unpacklo_epi32(v5, v7); + u7 = _mm_unpackhi_epi32(v5, v7); + v0 = _mm_unpacklo_epi64(u0, u4); + v1 = _mm_unpackhi_epi64(u0, u4); + v2 = _mm_unpacklo_epi64(u1, u5); + v3 = _mm_unpackhi_epi64(u1, u5); + v4 = _mm_unpacklo_epi64(u2, u6); + v5 = _mm_unpackhi_epi64(u2, u6); + v6 = _mm_unpacklo_epi64(u3, u7); + v7 = _mm_unpackhi_epi64(u3, u7); + _mm_storeu_si128((__m128i*)&out[0 * size + i], v0); + _mm_storeu_si128((__m128i*)&out[1 * size + i], v1); + _mm_storeu_si128((__m128i*)&out[2 * size + i], v2); + _mm_storeu_si128((__m128i*)&out[3 * size + i], v3); + _mm_storeu_si128((__m128i*)&out[4 * size + i], v4); + _mm_storeu_si128((__m128i*)&out[5 * size + i], v5); + _mm_storeu_si128((__m128i*)&out[6 * size + i], v6); + _mm_storeu_si128((__m128i*)&out[7 * size + i], v7); + } + if (i + 8 <= size) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 0]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 1]); + const __m128i a2 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 2]); + const __m128i a3 = _mm_loadu_si128((const __m128i*)&in[i * 8 + 16 * 3]); + __m128i u0 = _mm_unpacklo_epi8(a0, a1); + __m128i u1 = _mm_unpackhi_epi8(a0, a1); + __m128i u2 = _mm_unpacklo_epi8(a2, a3); + __m128i u3 = _mm_unpackhi_epi8(a2, a3); + const __m128i v0 = _mm_unpacklo_epi8(u0, u1); + const __m128i v1 = _mm_unpackhi_epi8(u0, u1); + const __m128i v2 = _mm_unpacklo_epi8(u2, u3); + const __m128i v3 = _mm_unpackhi_epi8(u2, u3); + u0 = _mm_unpacklo_epi32(v0, v2); + u1 = _mm_unpackhi_epi32(v0, v2); + u2 = _mm_unpacklo_epi32(v1, v3); + u3 = _mm_unpackhi_epi32(v1, v3); + _mm_storel_epi64((__m128i*)&out[0 * size + i], u0); + _mm_storel_epi64((__m128i*)&out[1 * size + i], _mm_srli_si128(u0, 8)); + _mm_storel_epi64((__m128i*)&out[2 * size + i], u1); + _mm_storel_epi64((__m128i*)&out[3 * size + i], _mm_srli_si128(u1, 8)); + _mm_storel_epi64((__m128i*)&out[4 * size + i], u2); + _mm_storel_epi64((__m128i*)&out[5 * size + i], _mm_srli_si128(u2, 8)); + _mm_storel_epi64((__m128i*)&out[6 * size + i], u3); + _mm_storel_epi64((__m128i*)&out[7 * size + i], _mm_srli_si128(u3, 8)); 
+ } +} +#endif + +#if defined(__SSE2__) +#define bitshuf_trans_byte_8 bitshuf_trans_byte_8_sse2 +#elif BITSHUF_USE_IFUNC +IMPLEMENT_IFUNC(bitshuf_trans_byte_8_ifunc, + (char* restrict out, const char* restrict in, size_t size)) { + __builtin_cpu_init(); + + if (__builtin_cpu_supports("sse2")) + return bitshuf_trans_byte_8_sse2; + + return bitshuf_trans_byte_8; +} +#define bitshuf_trans_byte_8 bitshuf_trans_byte_8_ifunc +#endif + +#if defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__GFNI__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("avx512vbmi,avx512vl,gfni") +static void bitshuf_untrans_bit_avx512vbmi_gfni(char* restrict out, + const char* restrict in, + size_t size) { + assert(size % 8 == 0); + size /= 8; + + const __m512i C = _mm512_set_epi64(0x070f171f474f575f, 0x060e161e464e565e, 0x050d151d454d555d, + 0x040c141c444c545c, 0x030b131b434b535b, 0x020a121a424a525a, + 0x0109111941495159, 0x0008101840485058); + const __m512i I8 = _mm512_set1_epi64(0x8040201008040201); + size_t i = 0; + for (; i + 8 <= size; i += 8) { +#if defined(__x86_64__) || defined(_M_X64) + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]); + const int64_t a1 = LOAD_I64(&in[1 * size + i]); + const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[2 * size + i]); + const int64_t a3 = LOAD_I64(&in[3 * size + i]); + const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[4 * size + i]); + const int64_t a5 = LOAD_I64(&in[5 * size + i]); + const __m128i a6 = _mm_loadl_epi64((const __m128i*)&in[6 * size + i]); + const int64_t a7 = LOAD_I64(&in[7 * size + i]); + const __m128i u0 = _mm_insert_epi64(a0, a1, 1); + const __m128i u1 = _mm_insert_epi64(a2, a3, 1); + const __m128i u2 = _mm_insert_epi64(a4, a5, 1); + const __m128i u3 = _mm_insert_epi64(a6, a7, 1); +#else + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]); + const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[1 * size + i]); + const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[2 * size + i]); + const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[3 * size + i]); + const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[4 * size + i]); + const __m128i a5 = _mm_loadl_epi64((const __m128i*)&in[5 * size + i]); + const __m128i a6 = _mm_loadl_epi64((const __m128i*)&in[6 * size + i]); + const __m128i a7 = _mm_loadl_epi64((const __m128i*)&in[7 * size + i]); + const __m128i u0 = _mm_unpacklo_epi64(a0, a1); + const __m128i u1 = _mm_unpacklo_epi64(a2, a3); + const __m128i u2 = _mm_unpacklo_epi64(a4, a5); + const __m128i u3 = _mm_unpacklo_epi64(a6, a7); +#endif + const __m256i v0 = _mm256_inserti128_si256(_mm256_castsi128_si256(u0), u1, 1); + const __m256i v1 = _mm256_inserti128_si256(_mm256_castsi128_si256(u2), u3, 1); + __m512i u; + u = _mm512_permutex2var_epi8(_mm512_castsi256_si512(v0), C, _mm512_castsi256_si512(v1)); + u = _mm512_gf2p8affine_epi64_epi8(I8, u, 0x00); + _mm512_storeu_si512(&out[i * 8], u); + } + if (i < size) { + const __mmask8 k = (1U << (size - i)) - 1; + const __m128i a0 = _mm_maskz_loadu_epi8(k, &in[0 * size + i]); + const __m128i a1 = _mm_maskz_loadu_epi8(k, &in[1 * size + i]); + const __m128i a2 = _mm_maskz_loadu_epi8(k, &in[2 * size + i]); + const __m128i a3 = _mm_maskz_loadu_epi8(k, &in[3 * size + i]); + const __m128i a4 = _mm_maskz_loadu_epi8(k, &in[4 * size + i]); + const __m128i a5 = _mm_maskz_loadu_epi8(k, &in[5 * size + i]); + const __m128i a6 = _mm_maskz_loadu_epi8(k, &in[6 * size + i]); + const __m128i a7 = _mm_maskz_loadu_epi8(k, &in[7 * size + i]); + const __m128i u0 = 
_mm_unpacklo_epi64(a0, a1); + const __m128i u1 = _mm_unpacklo_epi64(a2, a3); + const __m128i u2 = _mm_unpacklo_epi64(a4, a5); + const __m128i u3 = _mm_unpacklo_epi64(a6, a7); + const __m256i v0 = _mm256_inserti128_si256(_mm256_castsi128_si256(u0), u1, 1); + const __m256i v1 = _mm256_inserti128_si256(_mm256_castsi128_si256(u2), u3, 1); + __m512i u; + u = _mm512_permutex2var_epi8(_mm512_castsi256_si512(v0), C, _mm512_castsi256_si512(v1)); + u = _mm512_gf2p8affine_epi64_epi8(I8, u, 0x00); + _mm512_mask_storeu_epi64(&out[i * 8], k, u); + } +} +#endif + +#if defined(__AVX512BW__) && defined(__AVX512VL__) || BITSHUF_USE_IFUNC +IMPLEMENT_LOAD_FUNCTION(LOAD_MASK64, __mmask64) + +ATTRIBUTE_TARGET("sse2") +static ALWAYS_INLINE __mmask64 MM_CVTSI128_MASK64(__m128i a) { +#if defined(__x86_64__) || defined(_M_X64) + return _mm_cvtsi128_si64(a); +#else + __mmask64 k; + _mm_storel_epi64((__m128i*)&k, a); + return k; +#endif +} + +NO_INLINE +ATTRIBUTE_TARGET("avx512bw,avx512vl") +static void bitshuf_untrans_bit_avx512bw(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + size /= 8; + + const __m512i C0 = _mm512_set1_epi8(0x01); + const __m512i C1 = _mm512_set1_epi8(0x02); + const __m512i C2 = _mm512_set1_epi8(0x04); + const __m512i C3 = _mm512_set1_epi8(0x08); + const __m512i C4 = _mm512_set1_epi8(0x10); + const __m512i C5 = _mm512_set1_epi8(0x20); + const __m512i C6 = _mm512_set1_epi8(0x40); + const __m512i C7 = _mm512_set1_epi8(-128); + size_t i = 0; + for (; i + 8 <= size; i += 8) { + __m512i u = _mm512_maskz_mov_epi8(LOAD_MASK64(&in[0 * size + i]), C0); + u = _mm512_mask_add_epi8(X(u), LOAD_MASK64(&in[1 * size + i]), u, C1); + u = _mm512_mask_add_epi8(X(u), LOAD_MASK64(&in[2 * size + i]), u, C2); + u = _mm512_mask_add_epi8(X(u), LOAD_MASK64(&in[3 * size + i]), u, C3); + u = _mm512_mask_add_epi8(X(u), LOAD_MASK64(&in[4 * size + i]), u, C4); + u = _mm512_mask_add_epi8(X(u), LOAD_MASK64(&in[5 * size + i]), u, C5); + u = _mm512_mask_add_epi8(X(u), LOAD_MASK64(&in[6 * size + i]), u, C6); + u = _mm512_mask_add_epi8(X(u), LOAD_MASK64(&in[7 * size + i]), u, C7); + _mm512_storeu_si512(&out[i * 8], u); + } + if (i < size) { + const __mmask8 k = (1U << (size - i)) - 1; + const __mmask64 a0 = MM_CVTSI128_MASK64(_mm_maskz_loadu_epi8(k, &in[0 * size + i])); + const __mmask64 a1 = MM_CVTSI128_MASK64(_mm_maskz_loadu_epi8(k, &in[1 * size + i])); + const __mmask64 a2 = MM_CVTSI128_MASK64(_mm_maskz_loadu_epi8(k, &in[2 * size + i])); + const __mmask64 a3 = MM_CVTSI128_MASK64(_mm_maskz_loadu_epi8(k, &in[3 * size + i])); + const __mmask64 a4 = MM_CVTSI128_MASK64(_mm_maskz_loadu_epi8(k, &in[4 * size + i])); + const __mmask64 a5 = MM_CVTSI128_MASK64(_mm_maskz_loadu_epi8(k, &in[5 * size + i])); + const __mmask64 a6 = MM_CVTSI128_MASK64(_mm_maskz_loadu_epi8(k, &in[6 * size + i])); + const __mmask64 a7 = MM_CVTSI128_MASK64(_mm_maskz_loadu_epi8(k, &in[7 * size + i])); + __m512i u = _mm512_maskz_mov_epi8(a0, C0); + u = _mm512_mask_add_epi8(X(u), a1, u, C1); + u = _mm512_mask_add_epi8(X(u), a2, u, C2); + u = _mm512_mask_add_epi8(X(u), a3, u, C3); + u = _mm512_mask_add_epi8(X(u), a4, u, C4); + u = _mm512_mask_add_epi8(X(u), a5, u, C5); + u = _mm512_mask_add_epi8(X(u), a6, u, C6); + u = _mm512_mask_add_epi8(X(u), a7, u, C7); + _mm512_mask_storeu_epi64(&out[i * 8], k, u); + } +} +#endif + +#if defined(__AVX2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("avx2") +static void bitshuf_untrans_bit_avx2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + size /= 
8; + + const __m256i PERM = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + const __m256i MASK0 = _mm256_set1_epi64x(0x00aa00aa00aa00aa); + const __m256i MASK1 = _mm256_set1_epi64x(0x0000cccc0000cccc); + const __m256i MASK2 = _mm256_set1_epi64x(0x00000000f0f0f0f0); + size_t i = 0; + for (; i + 8 <= size; i += 8) { + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]); + const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[1 * size + i]); + const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[2 * size + i]); + const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[3 * size + i]); + const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[4 * size + i]); + const __m128i a5 = _mm_loadl_epi64((const __m128i*)&in[5 * size + i]); + const __m128i a6 = _mm_loadl_epi64((const __m128i*)&in[6 * size + i]); + const __m128i a7 = _mm_loadl_epi64((const __m128i*)&in[7 * size + i]); + __m256i u0 = MM256_SETR_M128I(_mm_unpacklo_epi8(a0, a1), _mm_unpacklo_epi8(a4, a5)); + __m256i u1 = MM256_SETR_M128I(_mm_unpacklo_epi8(a2, a3), _mm_unpacklo_epi8(a6, a7)); + __m256i v0 = _mm256_unpacklo_epi16(u0, u1); + __m256i v1 = _mm256_unpackhi_epi16(u0, u1); + u0 = _mm256_permutevar8x32_epi32(v0, PERM); + u1 = _mm256_permutevar8x32_epi32(v1, PERM); + v0 = _mm256_and_si256(_mm256_xor_si256(u0, _mm256_srli_epi64(u0, 07)), MASK0); + v1 = _mm256_and_si256(_mm256_xor_si256(u1, _mm256_srli_epi64(u1, 07)), MASK0); + u0 = _mm256_xor_si256(_mm256_xor_si256(u0, _mm256_slli_epi64(v0, 07)), v0); + u1 = _mm256_xor_si256(_mm256_xor_si256(u1, _mm256_slli_epi64(v1, 07)), v1); + v0 = _mm256_and_si256(_mm256_xor_si256(u0, _mm256_srli_epi64(u0, 14)), MASK1); + v1 = _mm256_and_si256(_mm256_xor_si256(u1, _mm256_srli_epi64(u1, 14)), MASK1); + u0 = _mm256_xor_si256(_mm256_xor_si256(u0, _mm256_slli_epi64(v0, 14)), v0); + u1 = _mm256_xor_si256(_mm256_xor_si256(u1, _mm256_slli_epi64(v1, 14)), v1); + v0 = _mm256_and_si256(_mm256_xor_si256(u0, _mm256_srli_epi64(u0, 28)), MASK2); + v1 = _mm256_and_si256(_mm256_xor_si256(u1, _mm256_srli_epi64(u1, 28)), MASK2); + u0 = _mm256_xor_si256(_mm256_xor_si256(u0, _mm256_slli_epi64(v0, 28)), v0); + u1 = _mm256_xor_si256(_mm256_xor_si256(u1, _mm256_slli_epi64(v1, 28)), v1); + _mm256_storeu_si256((__m256i*)&out[i * 8], u0); + _mm256_storeu_si256((__m256i*)&out[i * 8 + 32], u1); + } + if (i < size) + bitshuf_untrans_bit_tail(out, in, size * 8, i); +} +#endif + +#if defined(__SSE2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("sse2") +static void bitshuf_untrans_bit_sse2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + size /= 8; + + const __m128i MASK0 = _mm_set1_epi64x(0x00aa00aa00aa00aa); + const __m128i MASK1 = _mm_set1_epi64x(0x0000cccc0000cccc); + const __m128i MASK2 = _mm_set1_epi64x(0x00000000f0f0f0f0); + size_t i = 0; + for (; i + 8 <= size; i += 8) { + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]); + const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[1 * size + i]); + const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[2 * size + i]); + const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[3 * size + i]); + const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[4 * size + i]); + const __m128i a5 = _mm_loadl_epi64((const __m128i*)&in[5 * size + i]); + const __m128i a6 = _mm_loadl_epi64((const __m128i*)&in[6 * size + i]); + const __m128i a7 = _mm_loadl_epi64((const __m128i*)&in[7 * size + i]); + __m128i u0 = _mm_unpacklo_epi8(a0, a1); + __m128i u1 = _mm_unpacklo_epi8(a2, a3); + __m128i u2 = _mm_unpacklo_epi8(a4, 
a5); + __m128i u3 = _mm_unpacklo_epi8(a6, a7); + __m128i v0 = _mm_unpacklo_epi16(u0, u1); + __m128i v1 = _mm_unpackhi_epi16(u0, u1); + __m128i v2 = _mm_unpacklo_epi16(u2, u3); + __m128i v3 = _mm_unpackhi_epi16(u2, u3); + u0 = _mm_unpacklo_epi32(v0, v2); + u1 = _mm_unpackhi_epi32(v0, v2); + u2 = _mm_unpacklo_epi32(v1, v3); + u3 = _mm_unpackhi_epi32(v1, v3); + v0 = _mm_and_si128(_mm_xor_si128(u0, _mm_srli_epi64(u0, 07)), MASK0); + v1 = _mm_and_si128(_mm_xor_si128(u1, _mm_srli_epi64(u1, 07)), MASK0); + v2 = _mm_and_si128(_mm_xor_si128(u2, _mm_srli_epi64(u2, 07)), MASK0); + v3 = _mm_and_si128(_mm_xor_si128(u3, _mm_srli_epi64(u3, 07)), MASK0); + u0 = _mm_xor_si128(_mm_xor_si128(u0, _mm_slli_epi64(v0, 07)), v0); + u1 = _mm_xor_si128(_mm_xor_si128(u1, _mm_slli_epi64(v1, 07)), v1); + u2 = _mm_xor_si128(_mm_xor_si128(u2, _mm_slli_epi64(v2, 07)), v2); + u3 = _mm_xor_si128(_mm_xor_si128(u3, _mm_slli_epi64(v3, 07)), v3); + v0 = _mm_and_si128(_mm_xor_si128(u0, _mm_srli_epi64(u0, 14)), MASK1); + v1 = _mm_and_si128(_mm_xor_si128(u1, _mm_srli_epi64(u1, 14)), MASK1); + v2 = _mm_and_si128(_mm_xor_si128(u2, _mm_srli_epi64(u2, 14)), MASK1); + v3 = _mm_and_si128(_mm_xor_si128(u3, _mm_srli_epi64(u3, 14)), MASK1); + u0 = _mm_xor_si128(_mm_xor_si128(u0, _mm_slli_epi64(v0, 14)), v0); + u1 = _mm_xor_si128(_mm_xor_si128(u1, _mm_slli_epi64(v1, 14)), v1); + u2 = _mm_xor_si128(_mm_xor_si128(u2, _mm_slli_epi64(v2, 14)), v2); + u3 = _mm_xor_si128(_mm_xor_si128(u3, _mm_slli_epi64(v3, 14)), v3); + v0 = _mm_and_si128(_mm_xor_si128(u0, _mm_srli_epi64(u0, 28)), MASK2); + v1 = _mm_and_si128(_mm_xor_si128(u1, _mm_srli_epi64(u1, 28)), MASK2); + v2 = _mm_and_si128(_mm_xor_si128(u2, _mm_srli_epi64(u2, 28)), MASK2); + v3 = _mm_and_si128(_mm_xor_si128(u3, _mm_srli_epi64(u3, 28)), MASK2); + u0 = _mm_xor_si128(_mm_xor_si128(u0, _mm_slli_epi64(v0, 28)), v0); + u1 = _mm_xor_si128(_mm_xor_si128(u1, _mm_slli_epi64(v1, 28)), v1); + u2 = _mm_xor_si128(_mm_xor_si128(u2, _mm_slli_epi64(v2, 28)), v2); + u3 = _mm_xor_si128(_mm_xor_si128(u3, _mm_slli_epi64(v3, 28)), v3); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 0], u0); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 1], u1); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 2], u2); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 3], u3); + } + if (i < size) + bitshuf_untrans_bit_tail(out, in, size * 8, i); +} +#endif + +#if defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__GFNI__) +#define bitshuf_untrans_bit bitshuf_untrans_bit_avx512vbmi_gfni +#elif BITSHUF_USE_IFUNC +IMPLEMENT_IFUNC(bitshuf_untrans_bit_ifunc, + (char* restrict out, const char* restrict in, size_t size)) { + __builtin_cpu_init(); + + if (__builtin_cpu_supports("avx512vbmi") && __builtin_cpu_supports("avx512vl") && + __builtin_cpu_supports("gfni")) + { + return bitshuf_untrans_bit_avx512vbmi_gfni; + } +#if defined(__AVX512BW__) && defined(__AVX512VL__) + return bitshuf_untrans_bit_avx512bw; +#else + if (__builtin_cpu_supports("avx512bw") && __builtin_cpu_supports("avx512vl")) + return bitshuf_untrans_bit_avx512bw; +#if defined(__AVX2__) + return bitshuf_untrans_bit_avx2; +#else + if (__builtin_cpu_supports("avx2")) + return bitshuf_untrans_bit_avx2; +#if defined(__SSE2__) + return bitshuf_untrans_bit_sse2; +#else + if (__builtin_cpu_supports("sse2")) + return bitshuf_untrans_bit_sse2; + + return bitshuf_untrans_bit; +#endif +#endif +#endif +} +#define bitshuf_untrans_bit bitshuf_untrans_bit_ifunc +#elif defined(__AVX512BW__) && defined(__AVX512VL__) +#define bitshuf_untrans_bit bitshuf_untrans_bit_avx512bw +#elif 
defined(__AVX2__) +#define bitshuf_untrans_bit bitshuf_untrans_bit_avx2 +#elif defined(__SSE2__) +#define bitshuf_untrans_bit bitshuf_untrans_bit_sse2 +#endif + +#if defined(__SSE2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("sse2") +static void bitshuf_untrans_byte_sse2(char* restrict out, + const char* restrict in, + size_t size, + size_t elem_size) { + assert(size % 8 == 0); + + size_t j = 0; + for (; j + 8 <= elem_size; j += 8) { + for (size_t i = 0; i < size; i += 8) { + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[(j + 0) * size + i]); + const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[(j + 1) * size + i]); + const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[(j + 2) * size + i]); + const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[(j + 3) * size + i]); + const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[(j + 4) * size + i]); + const __m128i a5 = _mm_loadl_epi64((const __m128i*)&in[(j + 5) * size + i]); + const __m128i a6 = _mm_loadl_epi64((const __m128i*)&in[(j + 6) * size + i]); + const __m128i a7 = _mm_loadl_epi64((const __m128i*)&in[(j + 7) * size + i]); + __m128i u0 = _mm_unpacklo_epi8(a0, a1); + __m128i u1 = _mm_unpacklo_epi8(a2, a3); + __m128i u2 = _mm_unpacklo_epi8(a4, a5); + __m128i u3 = _mm_unpacklo_epi8(a6, a7); + const __m128i v0 = _mm_unpacklo_epi16(u0, u1); + const __m128i v1 = _mm_unpackhi_epi16(u0, u1); + const __m128i v2 = _mm_unpacklo_epi16(u2, u3); + const __m128i v3 = _mm_unpackhi_epi16(u2, u3); + u0 = _mm_unpacklo_epi32(v0, v2); + u1 = _mm_unpackhi_epi32(v0, v2); + u2 = _mm_unpacklo_epi32(v1, v3); + u3 = _mm_unpackhi_epi32(v1, v3); + _mm_storel_epi64((__m128i*)&out[(i + 0) * elem_size + j], u0); + _mm_storel_epi64((__m128i*)&out[(i + 1) * elem_size + j], _mm_srli_si128(u0, 8)); + _mm_storel_epi64((__m128i*)&out[(i + 2) * elem_size + j], u1); + _mm_storel_epi64((__m128i*)&out[(i + 3) * elem_size + j], _mm_srli_si128(u1, 8)); + _mm_storel_epi64((__m128i*)&out[(i + 4) * elem_size + j], u2); + _mm_storel_epi64((__m128i*)&out[(i + 5) * elem_size + j], _mm_srli_si128(u2, 8)); + _mm_storel_epi64((__m128i*)&out[(i + 6) * elem_size + j], u3); + _mm_storel_epi64((__m128i*)&out[(i + 7) * elem_size + j], _mm_srli_si128(u3, 8)); + } + } + if (j < elem_size) { + const size_t j0 = (j + 0) * size; + const size_t j1 = (j + 1) < elem_size ? 
(j + 1) * size : 1; + const size_t j2 = (j + 2) % elem_size * size + (j + 2) / elem_size; + const size_t j3 = (j + 3) % elem_size * size + (j + 3) / elem_size; + const size_t j4 = (j + 4) % elem_size * size + (j + 4) / elem_size; + const size_t j5 = (j + 5) % elem_size * size + (j + 5) / elem_size; + const size_t j6 = (j + 6) % elem_size * size + (j + 6) / elem_size; + const size_t j7 = (j + 7) % elem_size * size + (j + 7) / elem_size; + for (size_t i = 0; i + 8 < size; i += 8) { + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[j0 + i]); + const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[j1 + i]); + const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[j2 + i]); + const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[j3 + i]); + const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[j4 + i]); + const __m128i a5 = _mm_loadl_epi64((const __m128i*)&in[j5 + i]); + const __m128i a6 = _mm_loadl_epi64((const __m128i*)&in[j6 + i]); + const __m128i a7 = _mm_loadl_epi64((const __m128i*)&in[j7 + i]); + __m128i u0 = _mm_unpacklo_epi8(a0, a1); + __m128i u1 = _mm_unpacklo_epi8(a2, a3); + __m128i u2 = _mm_unpacklo_epi8(a4, a5); + __m128i u3 = _mm_unpacklo_epi8(a6, a7); + const __m128i v0 = _mm_unpacklo_epi16(u0, u1); + const __m128i v1 = _mm_unpackhi_epi16(u0, u1); + const __m128i v2 = _mm_unpacklo_epi16(u2, u3); + const __m128i v3 = _mm_unpackhi_epi16(u2, u3); + u0 = _mm_unpacklo_epi32(v0, v2); + u1 = _mm_unpackhi_epi32(v0, v2); + u2 = _mm_unpacklo_epi32(v1, v3); + u3 = _mm_unpackhi_epi32(v1, v3); + _mm_storel_epi64((__m128i*)&out[(i + 0) * elem_size + j], u0); + _mm_storel_epi64((__m128i*)&out[(i + 1) * elem_size + j], _mm_srli_si128(u0, 8)); + _mm_storel_epi64((__m128i*)&out[(i + 2) * elem_size + j], u1); + _mm_storel_epi64((__m128i*)&out[(i + 3) * elem_size + j], _mm_srli_si128(u1, 8)); + _mm_storel_epi64((__m128i*)&out[(i + 4) * elem_size + j], u2); + _mm_storel_epi64((__m128i*)&out[(i + 5) * elem_size + j], _mm_srli_si128(u2, 8)); + _mm_storel_epi64((__m128i*)&out[(i + 6) * elem_size + j], u3); + _mm_storel_epi64((__m128i*)&out[(i + 7) * elem_size + j], _mm_srli_si128(u3, 8)); + } + for (; j < elem_size; j++) { + for (size_t i = size - 8; i < size; i++) + out[i * elem_size + j] = in[j * size + i]; + } + } +} +#endif + +#if defined(__SSE2__) +#define bitshuf_untrans_byte bitshuf_untrans_byte_sse2 +#elif BITSHUF_USE_IFUNC +IMPLEMENT_IFUNC(bitshuf_untrans_byte_ifunc, + (char* restrict out, const char* restrict in, size_t size, size_t elem_size)) { + __builtin_cpu_init(); + + if (__builtin_cpu_supports("sse2")) + return bitshuf_untrans_byte_sse2; + + return bitshuf_untrans_byte; +} +#define bitshuf_untrans_byte bitshuf_untrans_byte_ifunc +#endif + +#if defined(__AVX2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("avx2") +static void bitshuf_untrans_byte_2_avx2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + + size_t i = 0; + for (; i + 32 <= size; i += 32) { + const __m256i a0 = _mm256_loadu_si256((const __m256i*)&in[0 * size + i]); + const __m256i a1 = _mm256_loadu_si256((const __m256i*)&in[1 * size + i]); + const __m256i u0 = _mm256_permute4x64_epi64(a0, 0xd8); + const __m256i u1 = _mm256_permute4x64_epi64(a1, 0xd8); + const __m256i v0 = _mm256_unpacklo_epi8(u0, u1); + const __m256i v1 = _mm256_unpackhi_epi8(u0, u1); + _mm256_storeu_si256((__m256i*)&out[i * 2], v0); + _mm256_storeu_si256((__m256i*)&out[i * 2 + 32], v1); + } + if (i + 16 <= size) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[0 * size + i]); + const __m128i a1 = 
_mm_loadu_si128((const __m128i*)&in[1 * size + i]); + const __m128i u0 = _mm_unpacklo_epi8(a0, a1); + const __m128i u1 = _mm_unpackhi_epi8(a0, a1); + _mm_storeu_si128((__m128i*)&out[i * 2], u0); + _mm_storeu_si128((__m128i*)&out[i * 2 + 16], u1); + i += 16; + } + if (i + 8 <= size) { + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]); + const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[1 * size + i]); + const __m128i u = _mm_unpacklo_epi8(a0, a1); + _mm_storeu_si128((__m128i*)&out[i * 2], u); + } +} +#endif + +#if defined(__SSE2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("sse2") +static void bitshuf_untrans_byte_2_sse2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + + size_t i = 0; + for (; i + 16 <= size; i += 16) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[0 * size + i]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[1 * size + i]); + const __m128i u0 = _mm_unpacklo_epi8(a0, a1); + const __m128i u1 = _mm_unpackhi_epi8(a0, a1); + _mm_storeu_si128((__m128i*)&out[i * 2], u0); + _mm_storeu_si128((__m128i*)&out[i * 2 + 16], u1); + } + if (i + 8 <= size) { + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]); + const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[1 * size + i]); + const __m128i u = _mm_unpacklo_epi8(a0, a1); + _mm_storeu_si128((__m128i*)&out[i * 2], u); + } +} +#endif + +#if defined(__AVX2__) +#define bitshuf_untrans_byte_2 bitshuf_untrans_byte_2_avx2 +#elif BITSHUF_USE_IFUNC +IMPLEMENT_IFUNC(bitshuf_untrans_byte_2_ifunc, + (char* restrict out, const char* restrict in, size_t size)) { + __builtin_cpu_init(); + + if (__builtin_cpu_supports("avx2")) + return bitshuf_untrans_byte_2_avx2; +#if defined(__SSE2__) + return bitshuf_untrans_byte_2_sse2; +#else + if (__builtin_cpu_supports("sse2")) + return bitshuf_untrans_byte_2_sse2; + + return bitshuf_untrans_byte_2; +#endif +} +#define bitshuf_untrans_byte_2 bitshuf_untrans_byte_2_ifunc +#elif defined(__SSE2__) +#define bitshuf_untrans_byte_2 bitshuf_untrans_byte_2_sse2 +#endif + +#if defined(__AVX2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("avx2") +static void bitshuf_untrans_byte_4_avx2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + + size_t i = 0; + for (; i + 32 <= size; i += 32) { + const __m256i a0 = _mm256_loadu_si256((const __m256i*)&in[0 * size + i]); + const __m256i a1 = _mm256_loadu_si256((const __m256i*)&in[1 * size + i]); + const __m256i a2 = _mm256_loadu_si256((const __m256i*)&in[2 * size + i]); + const __m256i a3 = _mm256_loadu_si256((const __m256i*)&in[3 * size + i]); + __m256i u0 = _mm256_unpacklo_epi8(a0, a1); + __m256i u1 = _mm256_unpackhi_epi8(a0, a1); + __m256i u2 = _mm256_unpacklo_epi8(a2, a3); + __m256i u3 = _mm256_unpackhi_epi8(a2, a3); + const __m256i v0 = _mm256_unpacklo_epi16(u0, u2); + const __m256i v1 = _mm256_unpackhi_epi16(u0, u2); + const __m256i v2 = _mm256_unpacklo_epi16(u1, u3); + const __m256i v3 = _mm256_unpackhi_epi16(u1, u3); + u0 = _mm256_inserti128_si256(v0, _mm256_castsi256_si128(v1), 1); + u1 = _mm256_inserti128_si256(v2, _mm256_castsi256_si128(v3), 1); + u2 = _mm256_permute2x128_si256(v0, v1, 0x31); + u3 = _mm256_permute2x128_si256(v2, v3, 0x31); + _mm256_storeu_si256((__m256i*)&out[i * 4 + 32 * 0], u0); + _mm256_storeu_si256((__m256i*)&out[i * 4 + 32 * 1], u1); + _mm256_storeu_si256((__m256i*)&out[i * 4 + 32 * 2], u2); + _mm256_storeu_si256((__m256i*)&out[i * 4 + 32 * 3], u3); + } + if (i + 16 <= size) { + const 
__m128i a0 = _mm_loadu_si128((const __m128i*)&in[0 * size + i]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[1 * size + i]); + const __m128i a2 = _mm_loadu_si128((const __m128i*)&in[2 * size + i]); + const __m128i a3 = _mm_loadu_si128((const __m128i*)&in[3 * size + i]); + const __m128i u0 = _mm_unpacklo_epi8(a0, a1); + const __m128i u1 = _mm_unpackhi_epi8(a0, a1); + const __m128i u2 = _mm_unpacklo_epi8(a2, a3); + const __m128i u3 = _mm_unpackhi_epi8(a2, a3); + const __m128i v0 = _mm_unpacklo_epi16(u0, u2); + const __m128i v1 = _mm_unpackhi_epi16(u0, u2); + const __m128i v2 = _mm_unpacklo_epi16(u1, u3); + const __m128i v3 = _mm_unpackhi_epi16(u1, u3); + _mm_storeu_si128((__m128i*)&out[i * 4 + 16 * 0], v0); + _mm_storeu_si128((__m128i*)&out[i * 4 + 16 * 1], v1); + _mm_storeu_si128((__m128i*)&out[i * 4 + 16 * 2], v2); + _mm_storeu_si128((__m128i*)&out[i * 4 + 16 * 3], v3); + i += 16; + } + if (i + 8 <= size) { + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]); + const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[1 * size + i]); + const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[2 * size + i]); + const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[3 * size + i]); + const __m128i u0 = _mm_unpacklo_epi8(a0, a1); + const __m128i u1 = _mm_unpacklo_epi8(a2, a3); + const __m128i v0 = _mm_unpacklo_epi16(u0, u1); + const __m128i v1 = _mm_unpackhi_epi16(u0, u1); + _mm_storeu_si128((__m128i*)&out[i * 4], v0); + _mm_storeu_si128((__m128i*)&out[i * 4 + 16], v1); + } +} +#endif + +#if defined(__SSE2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("sse2") +static void bitshuf_untrans_byte_4_sse2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + + size_t i = 0; + for (; i + 16 <= size; i += 16) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[0 * size + i]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[1 * size + i]); + const __m128i a2 = _mm_loadu_si128((const __m128i*)&in[2 * size + i]); + const __m128i a3 = _mm_loadu_si128((const __m128i*)&in[3 * size + i]); + const __m128i u0 = _mm_unpacklo_epi8(a0, a1); + const __m128i u1 = _mm_unpackhi_epi8(a0, a1); + const __m128i u2 = _mm_unpacklo_epi8(a2, a3); + const __m128i u3 = _mm_unpackhi_epi8(a2, a3); + const __m128i v0 = _mm_unpacklo_epi16(u0, u2); + const __m128i v1 = _mm_unpackhi_epi16(u0, u2); + const __m128i v2 = _mm_unpacklo_epi16(u1, u3); + const __m128i v3 = _mm_unpackhi_epi16(u1, u3); + _mm_storeu_si128((__m128i*)&out[i * 4 + 16 * 0], v0); + _mm_storeu_si128((__m128i*)&out[i * 4 + 16 * 1], v1); + _mm_storeu_si128((__m128i*)&out[i * 4 + 16 * 2], v2); + _mm_storeu_si128((__m128i*)&out[i * 4 + 16 * 3], v3); + } + if (i + 8 <= size) { + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]); + const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[1 * size + i]); + const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[2 * size + i]); + const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[3 * size + i]); + const __m128i u0 = _mm_unpacklo_epi8(a0, a1); + const __m128i u1 = _mm_unpacklo_epi8(a2, a3); + const __m128i v0 = _mm_unpacklo_epi16(u0, u1); + const __m128i v1 = _mm_unpackhi_epi16(u0, u1); + _mm_storeu_si128((__m128i*)&out[i * 4], v0); + _mm_storeu_si128((__m128i*)&out[i * 4 + 16], v1); + } +} +#endif + +#if defined(__AVX2__) +#define bitshuf_untrans_byte_4 bitshuf_untrans_byte_4_avx2 +#elif BITSHUF_USE_IFUNC +IMPLEMENT_IFUNC(bitshuf_untrans_byte_4_ifunc, + (char* restrict out, const char* restrict in, size_t size)) { + 
__builtin_cpu_init(); + + if (__builtin_cpu_supports("avx2")) + return bitshuf_untrans_byte_4_avx2; +#if defined(__SSE2__) + return bitshuf_untrans_byte_4_sse2; +#else + if (__builtin_cpu_supports("sse2")) + return bitshuf_untrans_byte_4_sse2; + + return bitshuf_untrans_byte_4; +#endif +} +#define bitshuf_untrans_byte_4 bitshuf_untrans_byte_4_ifunc +#elif defined(__SSE2__) +#define bitshuf_untrans_byte_4 bitshuf_untrans_byte_4_sse2 +#endif + +#if defined(__SSE2__) || BITSHUF_USE_IFUNC +NO_INLINE +ATTRIBUTE_TARGET("sse2") +static void bitshuf_untrans_byte_8_sse2(char* restrict out, const char* restrict in, size_t size) { + assert(size % 8 == 0); + + size_t i = 0; + for (; i + 16 <= size; i += 16) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)&in[0 * size + i]); + const __m128i a1 = _mm_loadu_si128((const __m128i*)&in[1 * size + i]); + const __m128i a2 = _mm_loadu_si128((const __m128i*)&in[2 * size + i]); + const __m128i a3 = _mm_loadu_si128((const __m128i*)&in[3 * size + i]); + const __m128i a4 = _mm_loadu_si128((const __m128i*)&in[4 * size + i]); + const __m128i a5 = _mm_loadu_si128((const __m128i*)&in[5 * size + i]); + const __m128i a6 = _mm_loadu_si128((const __m128i*)&in[6 * size + i]); + const __m128i a7 = _mm_loadu_si128((const __m128i*)&in[7 * size + i]); + __m128i u0 = _mm_unpacklo_epi8(a0, a1); + __m128i u1 = _mm_unpackhi_epi8(a0, a1); + __m128i u2 = _mm_unpacklo_epi8(a2, a3); + __m128i u3 = _mm_unpackhi_epi8(a2, a3); + __m128i u4 = _mm_unpacklo_epi8(a4, a5); + __m128i u5 = _mm_unpackhi_epi8(a4, a5); + __m128i u6 = _mm_unpacklo_epi8(a6, a7); + __m128i u7 = _mm_unpackhi_epi8(a6, a7); + const __m128i v0 = _mm_unpacklo_epi16(u0, u2); + const __m128i v1 = _mm_unpackhi_epi16(u0, u2); + const __m128i v2 = _mm_unpacklo_epi16(u1, u3); + const __m128i v3 = _mm_unpackhi_epi16(u1, u3); + const __m128i v4 = _mm_unpacklo_epi16(u4, u6); + const __m128i v5 = _mm_unpackhi_epi16(u4, u6); + const __m128i v6 = _mm_unpacklo_epi16(u5, u7); + const __m128i v7 = _mm_unpackhi_epi16(u5, u7); + u0 = _mm_unpacklo_epi32(v0, v4); + u1 = _mm_unpackhi_epi32(v0, v4); + u2 = _mm_unpacklo_epi32(v1, v5); + u3 = _mm_unpackhi_epi32(v1, v5); + u4 = _mm_unpacklo_epi32(v2, v6); + u5 = _mm_unpackhi_epi32(v2, v6); + u6 = _mm_unpacklo_epi32(v3, v7); + u7 = _mm_unpackhi_epi32(v3, v7); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 0], u0); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 1], u1); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 2], u2); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 3], u3); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 4], u4); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 5], u5); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 6], u6); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 7], u7); + } + if (i + 8 <= size) { + const __m128i a0 = _mm_loadl_epi64((const __m128i*)&in[0 * size + i]); + const __m128i a1 = _mm_loadl_epi64((const __m128i*)&in[1 * size + i]); + const __m128i a2 = _mm_loadl_epi64((const __m128i*)&in[2 * size + i]); + const __m128i a3 = _mm_loadl_epi64((const __m128i*)&in[3 * size + i]); + const __m128i a4 = _mm_loadl_epi64((const __m128i*)&in[4 * size + i]); + const __m128i a5 = _mm_loadl_epi64((const __m128i*)&in[5 * size + i]); + const __m128i a6 = _mm_loadl_epi64((const __m128i*)&in[6 * size + i]); + const __m128i a7 = _mm_loadl_epi64((const __m128i*)&in[7 * size + i]); + __m128i u0 = _mm_unpacklo_epi8(a0, a1); + __m128i u1 = _mm_unpacklo_epi8(a2, a3); + __m128i u2 = _mm_unpacklo_epi8(a4, a5); + __m128i u3 = _mm_unpacklo_epi8(a6, a7); + const __m128i v0 = 
_mm_unpacklo_epi16(u0, u1); + const __m128i v1 = _mm_unpackhi_epi16(u0, u1); + const __m128i v2 = _mm_unpacklo_epi16(u2, u3); + const __m128i v3 = _mm_unpackhi_epi16(u2, u3); + u0 = _mm_unpacklo_epi32(v0, v2); + u1 = _mm_unpackhi_epi32(v0, v2); + u2 = _mm_unpacklo_epi32(v1, v3); + u3 = _mm_unpackhi_epi32(v1, v3); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 0], u0); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 1], u1); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 2], u2); + _mm_storeu_si128((__m128i*)&out[i * 8 + 16 * 3], u3); + } +} +#endif + +#if defined(__SSE2__) +#define bitshuf_untrans_byte_8 bitshuf_untrans_byte_8_sse2 +#elif BITSHUF_USE_IFUNC +IMPLEMENT_IFUNC(bitshuf_untrans_byte_8_ifunc, + (char* restrict out, const char* restrict in, size_t size)) { + __builtin_cpu_init(); + + if (__builtin_cpu_supports("sse2")) + return bitshuf_untrans_byte_8_sse2; + + return bitshuf_untrans_byte_8; +} +#define bitshuf_untrans_byte_8 bitshuf_untrans_byte_8_ifunc +#endif + +#endif + +int bitshuf_encode_block(char* restrict out, + const char* restrict in, + char* restrict scratch, + size_t size, + size_t elem_size) { + if (UNLIKELY(size & 7)) + return -1; + + if (elem_size == 1) { + bitshuf_trans_bit(out, in, size); + } else { + if (UNLIKELY(!scratch && elem_size > 1)) + return -1; + + switch (elem_size) { + case 2: + bitshuf_trans_byte_2(scratch, in, size); + break; + case 4: + bitshuf_trans_byte_4(scratch, in, size); + break; + case 8: + bitshuf_trans_byte_8(scratch, in, size); + break; + default: + bitshuf_trans_byte(scratch, in, size, elem_size); + break; + } + for (size_t i = 0; i < elem_size; i++) + bitshuf_trans_bit(&out[i * size], &scratch[i * size], size); + } + return 0; +} + +int bitshuf_decode_block(char* restrict out, + const char* restrict in, + char* restrict scratch, + size_t size, + size_t elem_size) { + if (UNLIKELY(size & 7)) + return -1; + + if (elem_size == 1) { + bitshuf_untrans_bit(out, in, size); + } else { + if (UNLIKELY(!scratch && elem_size > 1)) + return -1; + + for (size_t i = 0; i < elem_size; i++) + bitshuf_untrans_bit(&scratch[i * size], &in[i * size], size); + + switch (elem_size) { + case 2: + bitshuf_untrans_byte_2(out, scratch, size); + break; + case 4: + bitshuf_untrans_byte_4(out, scratch, size); + break; + case 8: + bitshuf_untrans_byte_8(out, scratch, size); + break; + default: + bitshuf_untrans_byte(out, scratch, size, elem_size); + break; + } + } + return 0; +} diff --git a/src/bitshuffle.h b/src/bitshuffle.h new file mode 100644 index 0000000..3405dd0 --- /dev/null +++ b/src/bitshuffle.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: MIT OR Apache-2.0 */ +/* Copyright (c) 2023 Kal Conley + */ +#ifndef BITSHUFFLE_H_ +#define BITSHUFFLE_H_ + +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +/* Transpose bits for compression. + * + * This function performs Bitshuffle transposition of a single block. The block + * size in bytes is given by the product of `size` and `elem_size`. + * + * If required, the `scratch` argument must point to a buffer that the function + * uses for scratch purposes. The size of this buffer is given by the block + * size. + * + * On success, the function returns 0; otherwise, -1 is returned to indicate an + * error. In case of error, the memory pointed to by `out` and `scratch` is left + * unmodified. + * + * Pointer arguments of this function have C99 `restrict` semantics. If the + * `out`, `in`, or `scratch` buffers overlap, the behavior is undefined. 
+ * + * Errors + * ------ + * The function returns -1 to indicate an error if: + * + * - The `scratch` argument is `NULL` and a scratch buffer is required for the + * specified element size. + * - The `size` argument is not a multiple of 8. + */ +int bitshuf_encode_block(char* out, const char* in, char* scratch, size_t size, size_t elem_size); + +/* Untranspose bits after decompression. + * + * This function performs the inverse of `bitshuf_encode_block()`. + * + * If required, the `scratch` argument must point to a buffer that the function + * uses for scratch purposes. The size of this buffer is given by the block + * size. + * + * On success, the function returns 0; otherwise, -1 is returned to indicate an + * error. In case of error, the memory pointed to by `out` and `scratch` is left + * unmodified. + * + * Pointer arguments of this function have C99 `restrict` semantics. If the + * `out`, `in`, or `scratch` buffers overlap, the behavior is undefined. + * + * Errors + * ------ + * The function returns -1 to indicate an error if: + * + * - The `scratch` argument is `NULL` and a scratch buffer is required for the + * specified element size. + * - The `size` argument is not a multiple of 8. + */ +int bitshuf_decode_block(char* out, const char* in, char* scratch, size_t size, size_t elem_size); + +#if defined(__cplusplus) +} /* extern "C" */ +#endif + +#endif /* BITSHUFFLE_H_ */ diff --git a/tests/bitshuffle_benchmark.cc b/tests/bitshuffle_benchmark.cc new file mode 100644 index 0000000..d8c2d3f --- /dev/null +++ b/tests/bitshuffle_benchmark.cc @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: MIT OR Apache-2.0 +// Copyright (c) 2023 Kal Conley +#include "src/bitshuffle.h" + +#include +#include +#include +#include +#include + +#include "benchmark/benchmark.h" +#include "bitshuffle/src/bitshuffle_internals.h" + +#if defined(_MSC_VER) +#pragma warning(disable : 4100) // unreferenced formal parameter +#pragma warning(disable : 4244) // conversion from 'type1' to 'type2', possible loss of data +#endif + +template +static void BM_bitshuffle(benchmark::State& state) { + const size_t elem_size = state.range(0); + const size_t size_bytes = + state.range(1) / (elem_size * BSHUF_BLOCKED_MULT) * (elem_size * BSHUF_BLOCKED_MULT); + const size_t size = size_bytes / elem_size; + if (size == 0) { + state.SkipWithError("size is zero"); + return; + } + std::unique_ptr out{new char[size_bytes]}; + std::unique_ptr in{new char[size_bytes]}; + std::unique_ptr scratch{new char[size_bytes]}; + for (auto _ : state) { + benchmark::DoNotOptimize(out.get()); + const int r = F(&out[0], &in[0], &scratch[0], size, elem_size); + assert(r == 0); + (void)r; + benchmark::ClobberMemory(); + } +} + +int bm_memcpy(char* out, const char* in, char* scratch, size_t size, size_t elem_size) { + std::memcpy(out, in, size * elem_size); + return 0; +} + +int bm_bshuf_trans_bit_elem(char* out, + const char* in, + char* scratch, + size_t size, + size_t elem_size) { + const int64_t n = bshuf_trans_bit_elem(in, out, size, elem_size); + return static_cast(n) == size * elem_size ? 0 : -1; +} + +int bm_bshuf_untrans_bit_elem(char* out, + const char* in, + char* scratch, + size_t size, + size_t elem_size) { + const int64_t n = bshuf_untrans_bit_elem(in, out, size, elem_size); + return static_cast(n) == size * elem_size ? 
0 : -1; +} + +#define memcpy bm_memcpy +#define bshuf_trans_bit_elem bm_bshuf_trans_bit_elem +#define bshuf_untrans_bit_elem bm_bshuf_untrans_bit_elem +// clang-format off +#define ELEM_SIZES {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 63, 64, 65, 66} +#define SIZES {256, 1024, 4096, 8192, 16384, 1'000'000, 16'000'000, 256'000'000} +// clang-format on +BENCHMARK_TEMPLATE(BM_bitshuffle, memcpy)->ArgsProduct({{1}, SIZES}); +BENCHMARK_TEMPLATE(BM_bitshuffle, bitshuf_encode_block)->ArgsProduct({ELEM_SIZES, SIZES}); +BENCHMARK_TEMPLATE(BM_bitshuffle, bitshuf_decode_block)->ArgsProduct({ELEM_SIZES, SIZES}); +BENCHMARK_TEMPLATE(BM_bitshuffle, bshuf_trans_bit_elem)->ArgsProduct({ELEM_SIZES, SIZES}); +BENCHMARK_TEMPLATE(BM_bitshuffle, bshuf_untrans_bit_elem)->ArgsProduct({ELEM_SIZES, SIZES}); diff --git a/tests/bitshuffle_test.cc b/tests/bitshuffle_test.cc new file mode 100644 index 0000000..053f186 --- /dev/null +++ b/tests/bitshuffle_test.cc @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: MIT OR Apache-2.0 +// Copyright (c) 2023 Kal Conley +#include "src/bitshuffle.h" + +#include +#include +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +#define BSHUF_BLOCKED_MULT 8 + +template +class unaligned_allocator { +public: + using value_type = T; + + unaligned_allocator() = default; + + template + unaligned_allocator(unaligned_allocator const&) noexcept {} + + value_type* allocate(size_t n) { + return static_cast(::operator new((n + 1) * sizeof(value_type))) + 1; + } + + void deallocate(value_type* p, size_t) noexcept { ::operator delete(p - 1); } +}; + +#define EXPECT_BITSHUF(ELEM_SIZE, ...) \ + [](const std::vector>& u, \ + const std::vector>& v, const size_t elem_size) { \ + assert(u.size() == v.size()); \ + assert(u.size() % (BSHUF_BLOCKED_MULT * elem_size) == 0); \ + std::vector> buf(u.size()); \ + std::vector> scratch(u.size()); \ + { \ + std::memset(buf.data(), 0xcc, buf.size()); \ + std::memset(scratch.data(), 0xcc, scratch.size()); \ + const int r = bitshuf_encode_block( \ + reinterpret_cast(buf.data()), reinterpret_cast(u.data()), \ + reinterpret_cast(scratch.data()), buf.size() / elem_size, elem_size); \ + ASSERT_EQ(r, 0); \ + if (std::memcmp(buf.data(), v.data(), buf.size()) != 0) { \ + ASSERT_THAT(buf, ::testing::ElementsAreArray(v)); \ + } \ + } \ + { \ + std::memset(buf.data(), 0xcc, buf.size()); \ + std::memset(scratch.data(), 0xcc, scratch.size()); \ + const int r = bitshuf_decode_block( \ + reinterpret_cast(buf.data()), reinterpret_cast(v.data()), \ + reinterpret_cast(scratch.data()), buf.size() / elem_size, elem_size); \ + ASSERT_EQ(r, 0); \ + if (std::memcmp(buf.data(), u.data(), buf.size()) != 0) { \ + ASSERT_THAT(buf, ::testing::ElementsAreArray(u)); \ + } \ + } \ + }(__VA_ARGS__, ELEM_SIZE) + +TEST(Bitshuffle, ScratchNull) { + char out[8], in[8]; + std::memset(in, 0, sizeof(in)); + for (size_t elem_size : {0, 1}) { + EXPECT_EQ(bitshuf_encode_block(out, in, NULL, 8, elem_size), 0); + EXPECT_EQ(bitshuf_decode_block(out, in, NULL, 8, elem_size), 0); + } +} + +TEST(Bitshuffle, ScratchNullError) { + char out[1], in[1]; + for (size_t elem_size = 2; elem_size <= 257; elem_size++) { + EXPECT_EQ(bitshuf_encode_block(out, in, NULL, 8, elem_size), -1); + EXPECT_EQ(bitshuf_decode_block(out, in, NULL, 8, elem_size), -1); + } +} + +TEST(Bitshuffle, SizeZero) { + char out[1], in[1], scratch[1]; + for (size_t elem_size = 0; elem_size <= 257; elem_size++) { + EXPECT_EQ(bitshuf_encode_block(out, in, scratch, 0, elem_size), 0); + 
EXPECT_EQ(bitshuf_decode_block(out, in, scratch, 0, elem_size), 0); + } +} + +TEST(Bitshuffle, ErrorIfSizeNotMultipleOf8) { + char out[1], in[1], scratch[1]; + for (size_t size : {1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17}) { + EXPECT_EQ(bitshuf_encode_block(out, in, scratch, size, 1), -1); + EXPECT_EQ(bitshuf_decode_block(out, in, scratch, size, 1), -1); + } +} + +TEST(Bitshuffle, ElemSizeZero) { + char out[1], in[1], scratch[1]; + for (size_t size = 0; size <= 256; size += BSHUF_BLOCKED_MULT) { + EXPECT_EQ(bitshuf_encode_block(out, in, scratch, size, 0), 0); + EXPECT_EQ(bitshuf_decode_block(out, in, scratch, size, 0), 0); + } +} + +TEST(Bitshuffle, OneBitSetExhaustive) { + const size_t N = 120; + static_assert(N % BSHUF_BLOCKED_MULT == 0, ""); + + for (size_t size = 0; size <= N; size += BSHUF_BLOCKED_MULT) { + for (size_t elem_size = 0; elem_size <= 17; elem_size++) { + std::vector> u(size * elem_size); + std::vector> v(size * elem_size); + for (size_t i = 0; i < size; i++) { + for (size_t b = 0; b < elem_size * 8; b++) { + u.assign(u.size(), 0); + v.assign(v.size(), 0); + u[(i * elem_size) + b / 8] = 1U << (b % 8); + v[(b * size + i) / 8] = 1U << (i % 8); + EXPECT_BITSHUF(elem_size, u, v); + } + } + } + } +} + +TEST(Bitshuffle, OneBitZeroExhaustive) { + const size_t N = 120; + static_assert(N % BSHUF_BLOCKED_MULT == 0, ""); + + for (size_t size = 0; size <= N; size += BSHUF_BLOCKED_MULT) { + for (size_t elem_size = 0; elem_size <= 17; elem_size++) { + std::vector> u(size * elem_size); + std::vector> v(size * elem_size); + for (size_t i = 0; i < size; i++) { + for (size_t b = 0; b < elem_size * 8; b++) { + u.assign(u.size(), 0xff); + v.assign(v.size(), 0xff); + u[(i * elem_size) + b / 8] = ~(1U << (b % 8)); + v[(b * size + i) / 8] = ~(1U << (i % 8)); + EXPECT_BITSHUF(elem_size, u, v); + } + } + } + } +} diff --git a/third_party/BUILD.bazel b/third_party/BUILD.bazel new file mode 100644 index 0000000..e69de29 diff --git a/third_party/bitshuffle.BUILD b/third_party/bitshuffle.BUILD new file mode 100644 index 0000000..a4ed965 --- /dev/null +++ b/third_party/bitshuffle.BUILD @@ -0,0 +1,16 @@ +load("@rules_cc//cc:defs.bzl", "cc_library") + +package(default_visibility = ["//visibility:public"]) + +cc_library( + name = "bitshuffle_core", + srcs = [ + "src/bitshuffle_core.c", + "src/iochain.c", + ], + hdrs = [ + "src/bitshuffle_core.h", + "src/bitshuffle_internals.h", + "src/iochain.h", + ], +)
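
A minimal usage sketch of the public API declared in src/bitshuffle.h, following the documented contract (size must be a multiple of 8; a caller-provided scratch buffer of size * elem_size bytes is required whenever elem_size is greater than 1). The element count, element size, and buffer contents below are illustrative only.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #include "src/bitshuffle.h"

    int main(void) {
        /* Illustrative block: 64 elements of 4 bytes each. */
        const size_t size = 64;       /* element count, must be a multiple of 8 */
        const size_t elem_size = 4;   /* bytes per element */
        const size_t block_bytes = size * elem_size;

        char* in = malloc(block_bytes);
        char* shuffled = malloc(block_bytes);
        char* restored = malloc(block_bytes);
        char* scratch = malloc(block_bytes); /* required because elem_size > 1 */
        if (!in || !shuffled || !restored || !scratch)
            return 1;
        for (size_t i = 0; i < block_bytes; i++)
            in[i] = (char)i;

        /* Transpose bits before compression; returns -1 on error, e.g. when
           size is not a multiple of 8 or scratch is NULL but needed. */
        if (bitshuf_encode_block(shuffled, in, scratch, size, elem_size) != 0)
            return 1;
        /* Inverse transform after decompression. */
        if (bitshuf_decode_block(restored, shuffled, scratch, size, elem_size) != 0)
            return 1;

        printf("round trip %s\n",
               memcmp(in, restored, block_bytes) == 0 ? "ok" : "FAILED");
        free(in); free(shuffled); free(restored); free(scratch);
        return 0;
    }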
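
For orientation, the bit layout that OneBitSetExhaustive and OneBitZeroExhaustive assert can be written as a plain scalar loop: bit b of element i moves to output bit position b * size + i. The sketch below is a non-vectorized illustration of that mapping, not one of the kernels in the patch, and assumes size is a multiple of 8.

    #include <stddef.h>
    #include <string.h>

    /* Scalar sketch of the mapping the exhaustive tests encode:
       input bit (element i, bit b) -> output bit index b * size + i. */
    static void bitshuf_reference_encode(unsigned char* out, const unsigned char* in,
                                         size_t size, size_t elem_size) {
        memset(out, 0, size * elem_size);
        for (size_t i = 0; i < size; i++) {
            for (size_t b = 0; b < elem_size * 8; b++) {
                const unsigned bit = (in[i * elem_size + b / 8] >> (b % 8)) & 1u;
                const size_t pos = b * size + i;
                out[pos / 8] |= (unsigned char)(bit << (pos % 8));
            }
        }
    }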